From 7b6808b69ca706c724a4258e7cfb460b3c8c25a7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 11 Aug 2019 23:28:13 +0200
Subject: [PATCH 001/210] Increment version to 0.3.8.dev

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index d7d9c2fce..74db77135 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
 project(OpenBLAS C ASM)
 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 7.dev)
+set(OpenBLAS_PATCH_VERSION 8.dev)
 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
 
 # Adhere to GNU filesystem layout conventions

From 02d92039811af88acd7e2b3d9fe4726c9f1008f4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 11 Aug 2019 23:28:47 +0200
Subject: [PATCH 002/210] Increment version to 0.3.8.dev

---
 Makefile.rule | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.rule b/Makefile.rule
index a299588e0..c0941e488 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #
 
 # This library's version
-VERSION = 0.3.7.dev
+VERSION = 0.3.8.dev
 
 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

From 303869f5724bb86d722bc32f254a976625ea2046 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 11 Aug 2019 23:31:36 +0200
Subject: [PATCH 003/210] Update with changes from 0.3.7

---
 Changelog.txt | 42 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 42 insertions(+)

diff --git a/Changelog.txt b/Changelog.txt
index 8df35d5c3..f160a4e13 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,46 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.7
+11-Aug 2019
+
+common:
+        * having the gmake special variables TARGET_ARCH or TARGET_MACH
+          defined no longer causes build failures in ctest or utest
+        * defining NO_AFFINITY or USE_TLS to 0 in gmake builds no longer
+          has the same effect as setting them to 1
+        * a new test program was added to allow checking the library for
+          thread safety
+        * a new option USE_LOCKING was added to ensure thread safety when
+          OpenBLAS itself is built without multithreading but will be 
+          called from multiple threads.
+        * a build failure on Linux with glibc versions earlier than 2.5
+          was fixed
+        * a runtime error with CPU enumeration (and NO_AFFINITY not set)
+          on glibc 2.6 was fixed
+        * NO_AFFINITY was added to the CMAKE options (and defaults to being
+          active on Linux, as in the gmake builds)
+
+x86_64:
+        * the build-time logic for detection of AVX512 availability in
+          the processor and compiler was fixed
+        * gmake builds on OSX now set the internal name of the library to 
+          libopenblas.0.dylib (consistent with CMAKE)
+        * the Haswell DGEMM kernel received a significant speedup through
+          improved prefetch and load instructions
+        * performance of DGEMM, DTRMM, DTRSM and ZDOT on Zen/Zen2 was markedly
+          increased by avoiding vpermpd instructions
+        * the SKYLAKEX (AVX512) DGEMM helper functions have now been disabled
+          to fix remaining errors in DGEMM, DSYMM and DTRMM
+
+## POWER:
+        * added support for building on FreeBSD/powerpc64 and FreeBSD/ppc970
+        * added optimized kernels for POWER9 SGEMM and STRMM
+
+## ARMV7:
+        * fixed the softfp implementations of xAMAX and IxAMAX
+        * removed the predefined -march= flags on both ARMV5 and ARMV6 as
+          they were appropriate for only a subset of platforms
+
 ====================================================================
 Version 0.3.6
 29-Apr-2019

From aef9804089b0c968806a0fc3cfb0219359ce42b2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 13 Aug 2019 10:19:10 +0200
Subject: [PATCH 004/210] Fix unwanted case-sensitivity in x86 LSAME for (AMD)
 processors without CMOV

Problem was already noticed some years ago in #238, but back then the problem was only corrected in one of the #ifdef branches.
Fixes #2214
---
 kernel/x86/lsame.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/x86/lsame.S b/kernel/x86/lsame.S
index 3ac7a7314..2a2ab2bb5 100644
--- a/kernel/x86/lsame.S
+++ b/kernel/x86/lsame.S
@@ -56,13 +56,13 @@
 #ifndef HAVE_CMOV
 	movl	%eax, %ecx
 	subl	$32,  %ecx
-	jle	.L1
+	jl	.L1
 	movl	%ecx, %eax
 .L1:
 
 	movl	%edx, %ecx
 	subl	$32,  %ecx
-	jle	.L2
+	jl	.L2
 	movl	%ecx, %edx
 .L2:
 	subl	%eax, %edx

From a1fce677435a79d3cb577086793556d87ff76552 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 13 Aug 2019 22:29:48 +0200
Subject: [PATCH 005/210] Make the new DGEMM regression test properly depend on
 CBLAS and LAPACKE

fixes #2215
---
 utest/CMakeLists.txt | 5 +++++
 utest/Makefile       | 7 +++++++
 2 files changed, 12 insertions(+)

diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt
index 4e647cadc..1e3051a8f 100644
--- a/utest/CMakeLists.txt
+++ b/utest/CMakeLists.txt
@@ -38,9 +38,14 @@ if (NOT NO_LAPACK)
 set(OpenBLAS_utest_src
   ${OpenBLAS_utest_src}
   test_potrs.c
+  )
+if (NOT NO_CBLAS AND NOT NO_LAPACKE)
+set(OpenBLAS_utest_src
+  ${OpenBLAS_utest_src}
   test_kernel_regress.c
   )
 endif()
+endif()
 
 set(OpenBLAS_utest_bin openblas_utest)
 add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src})
diff --git a/utest/Makefile b/utest/Makefile
index cbe639cdb..8c7e6b9f8 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -1,6 +1,9 @@
 UTEST_CHECK = 1
 TOPDIR	= ..
 
+override TARGET_ARCH=
+override TARGET_MACH=
+
 UTESTBIN=openblas_utest
 
 .PHONY : all
@@ -13,8 +16,12 @@ OBJS=utest_main.o test_amax.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o
 
 ifneq ($(NO_LAPACK), 1)
 OBJS += test_potrs.o
+ifneq ($(NO_CBLAS), 1)
+ifneq ($(NO_LAPACKE), 1)
 OBJS += test_kernel_regress.o
 endif
+endif
+endif
 
 #this does not work with OpenMP nor with native Windows or Android threads
 # FIXME TBD if this works on OSX, SunOS, POWER and zarch

From 9ef96b32a6cc9a41908e832f2f713462bb94f40f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 15 Aug 2019 22:09:12 +0200
Subject: [PATCH 006/210] Add multithreading support to the x86_64 zdot kernel
 (#2222)

* Add multithreading support

copied from the ThunderX2T99 kernel. For #2221
---
 kernel/x86_64/zdot.c | 86 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 72 insertions(+), 14 deletions(-)

diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c
index ef12569c8..48f855b0e 100644
--- a/kernel/x86_64/zdot.c
+++ b/kernel/x86_64/zdot.c
@@ -86,18 +86,26 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
 
 #endif
 
-OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+
+#if defined(SMP)
+extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n,
+        BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb,
+        void *c, BLASLONG ldc, int (*function)(), int nthreads);
+#endif
+                
+                
+
+static void zdot_compute (BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,OPENBLAS_COMPLEX_FLOAT *result)
 {
 	BLASLONG i;
 	BLASLONG ix,iy;
 	FLOAT  dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; 
-
+	
 	if ( n <= 0 ) 
 	{
-//	        CREAL(result) = 0.0 ;
-//        	CIMAG(result) = 0.0 ;
-		OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
-		return(result);
+		OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
+		*result=res;
+		return;
 
 	}
 
@@ -150,18 +158,68 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
 	}
 
 #if !defined(CONJ)
-	OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]);
-//	CREAL(result) = dot[0] - dot[1];
-//	CIMAG(result) = dot[2] + dot[3];
+	OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]-dot[1],dot[2]+dot[3]);
 #else
-	OPENBLAS_COMPLEX_FLOAT result=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]);
-//	CREAL(result) = dot[0] + dot[1];
-//	CIMAG(result) = dot[2] - dot[3];
+	OPENBLAS_COMPLEX_FLOAT res=OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0]+dot[1],dot[2]-dot[3]);
+#endif
+        *result=res;
+	return;
+}
 
+#if defined(SMP)
+static int zdot_thread_function(BLASLONG n, BLASLONG dummy0,
+BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *y,
+BLASLONG inc_y, FLOAT *result, BLASLONG dummy3)
+{
+        zdot_compute(n, x, inc_x, y, inc_y, (void *)result);
+        return 0;
+}
 #endif
 
-	return(result);
+OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+#if defined(SMP)
+	int nthreads;
+	FLOAT dummy_alpha;
+#endif
+	OPENBLAS_COMPLEX_FLOAT zdot;
+       CREAL(zdot) = 0.0;
+       CIMAG(zdot) = 0.0;
 
-}
+#if defined(SMP)
+	if (inc_x == 0 || inc_y == 0 || n <= 10000)
+		nthreads = 1;
+	else
+		nthreads = num_cpu_avail(1);
+
+	if (nthreads == 1) {
+		zdot_compute(n, x, inc_x, y, inc_y, &zdot);
+	} else {
+		int mode, i;
+		char result[MAX_CPU_NUMBER * sizeof(double) * 2];
+		OPENBLAS_COMPLEX_FLOAT *ptr;
+
+#if !defined(DOUBLE)
+		mode = BLAS_SINGLE  | BLAS_COMPLEX;
+#else
+		mode = BLAS_DOUBLE  | BLAS_COMPLEX;
+#endif
+
+		blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
+				   x, inc_x, y, inc_y, result, 0,
+				   ( void *)zdot_thread_function, nthreads);
 
+		ptr = (OPENBLAS_COMPLEX_FLOAT *)result;
+		for (i = 0; i < nthreads; i++) {
+			CREAL(zdot) = CREAL(zdot) + CREAL(*ptr);
+			CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr);
+			ptr = (void *)(((char *)ptr) + sizeof(double) * 2);
+		}
+	}
+#else
+	zdot_compute(n, x, inc_x, y, inc_y, &zdot);
+#endif
+
+	return zdot;
+}
 

From e3d846ab57eabadf5b933e8ca66d0b2c62e23e4c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 16 Aug 2019 08:58:10 +0200
Subject: [PATCH 007/210] Do not use -march=native with the PGI compiler

---
 cmake/system.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/system.cmake b/cmake/system.cmake
index 1c2093efe..4f8011603 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -66,7 +66,7 @@ if (DEFINED TARGET)
 endif ()
 
 # On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
-if (X86_64)
+if (X86_64 AND NOT ${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
   set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native")
 endif ()
 

From a95a5e52b8df842f0ec23c6d0ad9b299c1318ab4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 16 Aug 2019 09:00:11 +0200
Subject: [PATCH 008/210] Fix PGI compiler detection for getarch

---
 Makefile.system | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index 6addbdad5..a54282f6c 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -142,9 +142,9 @@ endif
 endif
 
 
-# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch.
+# On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
 ifeq ($(ARCH), x86_64)
-ifneq ($(C_COMPILER), PGI)
+ifeq ($(findstring pgcc,$(HOSTCC)),)
 GETARCH_FLAGS += -march=native
 endif
 endif

From 6d8595351c8452f32dc015cda30cf1d4983d2447 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 19 Aug 2019 14:19:21 +0200
Subject: [PATCH 009/210] Add Intel Goldmont Plus CPUID

fixes #2227
---
 cpuid_x86.c |  29 +-
 dynamic.c   | 897 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 919 insertions(+), 7 deletions(-)
 create mode 100644 dynamic.c

diff --git a/cpuid_x86.c b/cpuid_x86.c
index 884d4b78a..8c954bf21 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1211,7 +1211,7 @@ int get_cpuname(void){
 	  return CPUTYPE_CORE2;
 	}
 	break;
-      case 1:
+      case 1:  // family 6 exmodel 1
 	switch (model) {
 	case  6:
 	  return CPUTYPE_CORE2;
@@ -1228,7 +1228,7 @@ int get_cpuname(void){
 	  return CPUTYPE_DUNNINGTON;
 	}
 	break;
-      case  2:
+      case  2: // family 6 exmodel 2
 	switch (model) {
 	case 5:
 	  //Intel Core (Clarkdale) / Core (Arrandale)
@@ -1257,7 +1257,7 @@ int get_cpuname(void){
 	  return CPUTYPE_NEHALEM;
 	}
 	break;
-      case 3:
+      case 3: // family 6 exmodel 3
 	switch (model) {
 	case  7:
 	    // Bay Trail	
@@ -1287,7 +1287,7 @@ int get_cpuname(void){
 	    return CPUTYPE_NEHALEM;
 	}
 	break;
-      case 4:
+      case 4: // family 6 exmodel 4
         switch (model) {
         case 5:
 	case 6:
@@ -1321,7 +1321,7 @@ int get_cpuname(void){
 	    return CPUTYPE_NEHALEM;
         }
         break;
-      case 5:
+      case 5:  // family 6 exmodel 5
         switch (model) {
 	case 6:
 	  //Broadwell
@@ -1364,7 +1364,7 @@ int get_cpuname(void){
 	    return CPUTYPE_NEHALEM;
 	}
 	break;
-      case 6:
+      case 6:  // family 6 exmodel 6
         switch (model) {
         case 6: // Cannon Lake
           if(support_avx512())
@@ -1376,7 +1376,22 @@ int get_cpuname(void){
 	  else
 	  return CPUTYPE_NEHALEM;
         }
-      break;  
+      break;
+      case 7: // family 6 exmodel 7
+        switch (model) {
+        case 10: // Goldmont Plus
+          return CPUTYPE_NEHALEM;
+        case 14: // Ice Lake
+          if(support_avx512())
+            return CPUTYPE_SKYLAKEX;
+          if(support_avx2())
+            return CPUTYPE_HASWELL;
+          if(support_avx())
+	    return CPUTYPE_SANDYBRIDGE;
+	  else
+	  return CPUTYPE_NEHALEM;
+        }
+      break;
       case 9:
       case 8:      
         switch (model) {
diff --git a/dynamic.c b/dynamic.c
new file mode 100644
index 000000000..aa2b87621
--- /dev/null
+++ b/dynamic.c
@@ -0,0 +1,897 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include "common.h"
+
+#ifdef _MSC_VER
+#define strncasecmp _strnicmp
+#define strcasecmp _stricmp
+#endif
+
+#ifdef ARCH_X86
+#define EXTERN extern
+#else
+#define EXTERN
+#endif
+
+#ifdef DYNAMIC_LIST
+extern gotoblas_t gotoblas_PRESCOTT;
+
+#ifdef DYN_ATHLON
+extern gotoblas_t gotoblas_ATHLON;
+#else
+#define gotoblas_ATHLON gotoblas_PRESCOTT
+#endif
+#ifdef DYN_KATMAI
+extern gotoblas_t gotoblas_KATMAI;
+#else
+#define gotoblas_KATMAI gotoblas_PRESCOTT
+#endif
+#ifdef DYN_BANIAS
+extern gotoblas_t gotoblas_BANIAS;
+#else
+#define gotoblas_BANIAS gotoblas_PRESCOTT
+#endif
+#ifdef DYN_COPPERMINE
+extern gotoblas_t gotoblas_COPPERMINE;
+#else
+#define gotoblas_COPPERMINE gotoblas_PRESCOTT
+#endif
+#ifdef DYN_NORTHWOOD
+extern gotoblas_t gotoblas_NORTHWOOD;
+#else
+#define gotoblas_NORTHWOOD gotoblas_PRESCOTT
+#endif
+#ifdef DYN_CORE2
+extern gotoblas_t gotoblas_CORE2;
+#else
+#define gotoblas_CORE2 gotoblas_PRESCOTT
+#endif
+#ifdef DYN_NEHALEM
+extern gotoblas_t gotoblas_NEHALEM;
+#else
+#define gotoblas_NEHALEM gotoblas_PRESCOTT
+#endif
+#ifdef DYN_BARCELONA
+extern gotoblas_t gotoblas_BARCELONA;
+#elif defined(DYN_NEHALEM)
+#define gotoblas_BARCELONA gotoblas_NEHALEM
+#else
+#define gotoblas_BARCELONA gotoblas_PRESCOTT
+#endif
+#ifdef DYN_ATOM
+extern gotoblas_t gotoblas_ATOM;
+elif defined(DYN_NEHALEM)
+#define gotoblas_ATOM gotoblas_NEHALEM
+#else
+#define gotoblas_ATOM gotoblas_PRESCOTT
+#endif
+#ifdef DYN_NANO
+extern gotoblas_t gotoblas_NANO;
+#else
+#define gotoblas_NANO gotoblas_PRESCOTT
+#endif
+#ifdef DYN_PENRYN
+extern gotoblas_t gotoblas_PENRYN;
+#else
+#define gotoblas_PENRYN gotoblas_PRESCOTT
+#endif
+#ifdef DYN_DUNNINGTON
+extern gotoblas_t gotoblas_DUNNINGTON;
+#else
+#define gotoblas_DUNNINGTON gotoblas_PRESCOTT
+#endif
+#ifdef DYN_OPTERON
+extern gotoblas_t gotoblas_OPTERON;
+#else
+#define gotoblas_OPTERON gotoblas_PRESCOTT
+#endif
+#ifdef DYN_OPTERON_SSE3
+extern gotoblas_t gotoblas_OPTERON_SSE3;
+#else
+#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT
+#endif
+#ifdef DYN_BOBCAT
+extern gotoblas_t gotoblas_BOBCAT;
+#elif defined(DYN_NEHALEM)
+#define gotoblas_BOBCAT gotoblas_NEHALEM
+#else
+#define gotoblas_BOBCAT gotoblas_PRESCOTT
+#endif
+#ifdef DYN_SANDYBRIDGE
+extern gotoblas_t gotoblas_SANDYBRIDGE;
+#elif defined(DYN_NEHALEM)
+#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
+#else
+#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
+#endif
+#ifdef DYN_BULLDOZER
+extern gotoblas_t gotoblas_BULLDOZER;
+#elif defined(DYN_SANDYBRIDGE)
+#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE
+#elif defined(DYN_NEHALEM)
+#define gotoblas_BULLDOZER gotoblas_NEHALEM
+#else
+#define gotoblas_BULLDOZER gotoblas_PRESCOTT
+#endif
+#ifdef DYN_PILEDRIVER
+extern gotoblas_t gotoblas_PILEDRIVER;
+#elif defined(DYN_SANDYBRIDGE)
+#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE
+#elif defined(DYN_NEHALEM)
+#define gotoblas_PILEDRIVER gotoblas_NEHALEM
+#else
+#define gotoblas_PILEDRIVER gotoblas_PRESCOTT
+#endif
+#ifdef DYN_STEAMROLLER
+extern gotoblas_t gotoblas_STEAMROLLER;
+#elif defined(DYN_SANDYBRIDGE)
+#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE
+#elif defined(DYN_NEHALEM)
+#define gotoblas_STEAMROLLER gotoblas_NEHALEM
+#else
+#define gotoblas_STEAMROLLER gotoblas_PRESCOTT
+#endif
+#ifdef DYN_EXCAVATOR
+extern gotoblas_t gotoblas_EXCAVATOR;
+#elif defined(DYN_SANDYBRIDGE)
+#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE
+#elif defined(DYN_NEHALEM)
+#define gotoblas_EXCAVATOR gotoblas_NEHALEM
+#else
+#define gotoblas_EXCAVATOR gotoblas_PRESCOTT
+#endif
+#ifdef DYN_HASWELL
+extern gotoblas_t gotoblas_HASWELL;
+#elif defined(DYN_SANDYBRIDGE)
+#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
+#elif defined(DYN_NEHALEM)
+#define gotoblas_HASWELL gotoblas_NEHALEM
+#else
+#define gotoblas_HASWELL gotoblas_PRESCOTT
+#endif
+#ifdef DYN_ZEN
+extern gotoblas_t gotoblas_ZEN;
+#elif defined(DYN_HASWELL)
+#define gotoblas_ZEN gotoblas_HASWELL
+#elif defined(DYN_SANDYBRIDGE)
+#define gotoblas_ZEN gotoblas_SANDYBRIDGE
+#elif defined(DYN_NEHALEM)
+#define gotoblas_ZEN gotoblas_NEHALEM
+#else
+#define gotoblas_ZEN gotoblas_PRESCOTT
+#endif
+#ifdef DYN_SKYLAKEX
+extern gotoblas_t gotoblas_SKYLAKEX;
+#elif defined(DYN_HASWELL)
+#define gotoblas_SKYLAKEX gotoblas_HASWELL
+#elif defined(DYN_SANDYBRIDGE)
+#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
+#elif defined(DYN_NEHALEM)
+#define gotoblas_SKYLAKEX gotoblas_NEHALEM
+#else
+#define gotoblas_SKYLAKEX gotoblas_PRESCOTT
+#endif
+
+
+#else // not DYNAMIC_LIST
+EXTERN gotoblas_t  gotoblas_KATMAI;
+EXTERN gotoblas_t  gotoblas_COPPERMINE;
+EXTERN gotoblas_t  gotoblas_NORTHWOOD;
+EXTERN gotoblas_t  gotoblas_BANIAS;
+EXTERN gotoblas_t  gotoblas_ATHLON;
+
+extern gotoblas_t  gotoblas_PRESCOTT;
+extern gotoblas_t  gotoblas_CORE2;
+extern gotoblas_t  gotoblas_NEHALEM;
+extern gotoblas_t  gotoblas_BARCELONA;
+#ifdef DYNAMIC_OLDER
+extern gotoblas_t  gotoblas_ATOM;
+extern gotoblas_t  gotoblas_NANO;
+extern gotoblas_t  gotoblas_PENRYN;
+extern gotoblas_t  gotoblas_DUNNINGTON;
+extern gotoblas_t  gotoblas_OPTERON;
+extern gotoblas_t  gotoblas_OPTERON_SSE3;
+extern gotoblas_t  gotoblas_BOBCAT;
+#else
+#define gotoblas_ATOM gotoblas_NEHALEM
+#define gotoblas_NANO gotoblas_NEHALEM
+#define gotoblas_PENRYN gotoblas_CORE2
+#define gotoblas_DUNNINGTON gotoblas_CORE2
+#define gotoblas_OPTERON gotoblas_CORE2
+#define gotoblas_OPTERON_SSE3 gotoblas_CORE2
+#define gotoblas_BOBCAT gotoblas_CORE2
+#endif
+
+#ifndef NO_AVX
+extern gotoblas_t  gotoblas_SANDYBRIDGE;
+extern gotoblas_t  gotoblas_BULLDOZER;
+extern gotoblas_t  gotoblas_PILEDRIVER;
+extern gotoblas_t  gotoblas_STEAMROLLER;
+extern gotoblas_t  gotoblas_EXCAVATOR;
+#ifdef NO_AVX2
+#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
+#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
+#define gotoblas_ZEN gotoblas_SANDYBRIDGE
+#else
+extern gotoblas_t  gotoblas_HASWELL;
+extern gotoblas_t  gotoblas_ZEN;
+#ifndef NO_AVX512
+extern gotoblas_t  gotoblas_SKYLAKEX;
+#else
+#define gotoblas_SKYLAKEX gotoblas_HASWELL
+#endif
+#endif
+#else
+//Use NEHALEM kernels for sandy bridge
+#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
+#define gotoblas_HASWELL gotoblas_NEHALEM
+#define gotoblas_SKYLAKEX gotoblas_NEHALEM
+#define gotoblas_BULLDOZER gotoblas_BARCELONA
+#define gotoblas_PILEDRIVER gotoblas_BARCELONA
+#define gotoblas_STEAMROLLER gotoblas_BARCELONA
+#define gotoblas_EXCAVATOR gotoblas_BARCELONA
+#define gotoblas_ZEN gotoblas_BARCELONA
+#endif
+
+#endif // DYNAMIC_LIST
+
+#define VENDOR_INTEL      1
+#define VENDOR_AMD        2
+#define VENDOR_CENTAUR    3
+#define VENDOR_HYGON	  4
+#define VENDOR_UNKNOWN   99
+
+#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
+
+#ifndef NO_AVX
+static inline void xgetbv(int op, int * eax, int * edx){
+  //Use binary code for xgetbv
+  __asm__ __volatile__
+    (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
+}
+#endif
+
+int support_avx(){
+#ifndef NO_AVX
+  int eax, ebx, ecx, edx;
+  int ret=0;
+
+  cpuid(1, &eax, &ebx, &ecx, &edx);
+  if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
+    xgetbv(0, &eax, &edx);
+    if((eax & 6) == 6){
+      ret=1;  //OS support AVX
+    }
+  }
+  return ret;
+#else
+  return 0;
+#endif
+}
+
+int support_avx2(){
+#ifndef NO_AVX2
+  int eax, ebx, ecx=0, edx;
+  int ret=0;
+
+  if (!support_avx())
+    return 0;
+  cpuid(7, &eax, &ebx, &ecx, &edx);
+  if((ebx & (1<<7)) != 0)
+      ret=1;  //OS supports AVX2
+  return ret;
+#else
+  return 0;
+#endif
+}
+
+int support_avx512(){
+#if !defined(NO_AVX) && !defined(NO_AVX512)
+  int eax, ebx, ecx, edx;
+  int ret=0;
+
+  if (!support_avx())
+    return 0;
+  cpuid(7, &eax, &ebx, &ecx, &edx);
+  if((ebx & (1<<7)) != 1){
+      ret=0;  //OS does not even support AVX2
+  }
+  if((ebx & (1<<31)) != 0){
+    xgetbv(0, &eax, &edx);
+    if((eax & 0xe0) == 0xe0)
+      ret=1;  //OS supports AVX512VL
+  }
+  return ret;
+#else
+  return 0;
+#endif
+}
+
+extern void openblas_warning(int verbose, const char * msg);
+#define FALLBACK_VERBOSE 1
+#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
+#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n"
+#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n"
+#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"
+
+static int get_vendor(void){
+  int eax, ebx, ecx, edx;
+
+  union
+  {
+        char vchar[16];
+        int  vint[4];
+  } vendor;
+
+  cpuid(0, &eax, &ebx, &ecx, &edx);
+
+  *(&vendor.vint[0]) = ebx;
+  *(&vendor.vint[1]) = edx;
+  *(&vendor.vint[2]) = ecx;
+
+  vendor.vchar[12] = '\0';
+
+  if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
+  if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
+  if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
+  if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
+
+  if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
+
+  return VENDOR_UNKNOWN;
+}
+
+static gotoblas_t *get_coretype(void){
+
+  int eax, ebx, ecx, edx;
+  int family, exfamily, model, vendor, exmodel;
+
+  cpuid(1, &eax, &ebx, &ecx, &edx);
+
+  family   = BITMASK(eax,  8, 0x0f);
+  exfamily = BITMASK(eax, 20, 0xff);
+  model    = BITMASK(eax,  4, 0x0f);
+  exmodel  = BITMASK(eax, 16, 0x0f);
+
+  vendor = get_vendor();
+
+  if (vendor == VENDOR_INTEL){
+    switch (family) {
+    case 0x6:
+      switch (exmodel) {
+      case 0:
+	if (model <= 0x7) return &gotoblas_KATMAI;
+	if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE;
+	if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS;
+	if (model == 14) return &gotoblas_BANIAS;
+	if (model == 15) return &gotoblas_CORE2;
+	return NULL;
+
+      case 1:
+	if (model == 6) return &gotoblas_CORE2;
+	if (model == 7) return &gotoblas_PENRYN;
+	if (model == 13) return &gotoblas_DUNNINGTON;
+	if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM;
+	if (model == 12) return &gotoblas_ATOM;
+	return NULL;
+
+      case 2:
+	//Intel Core (Clarkdale) / Core (Arrandale)
+	// Pentium (Clarkdale) / Pentium Mobile (Arrandale)
+	// Xeon (Clarkdale), 32nm
+	if (model ==  5) return &gotoblas_NEHALEM;
+
+	//Intel Xeon Processor 5600 (Westmere-EP)
+	//Xeon Processor E7 (Westmere-EX)
+	//Xeon E7540
+	if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM;
+
+	//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
+	//Intel Core i7-3000 / Xeon E5
+	if (model == 10 || model == 13) {
+	  if(support_avx())
+	    return &gotoblas_SANDYBRIDGE;
+	  else{
+	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
+	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
+	  }
+	}
+	return NULL;
+      case 3:
+	//Intel Sandy Bridge 22nm (Ivy Bridge?)
+	if (model == 10 || model == 14) {
+	  if(support_avx())
+	    return &gotoblas_SANDYBRIDGE;
+	  else{
+	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
+	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
+	  }
+	}
+	//Intel Haswell
+	if (model == 12 || model == 15) {
+	  if(support_avx2())
+	    return &gotoblas_HASWELL;
+	  if(support_avx()) {
+	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
+	    return &gotoblas_SANDYBRIDGE;
+	  } else {
+	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
+	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
+	  }
+	}
+	//Intel Broadwell
+	if (model == 13) {
+	  if(support_avx2())
+	    return &gotoblas_HASWELL;
+	  if(support_avx()) {
+	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
+	    return &gotoblas_SANDYBRIDGE;
+	  } else {
+	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
+	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
+	  }
+	}
+	if (model == 7) return &gotoblas_ATOM; //Bay Trail	
+	return NULL;
+      case 4:
+		//Intel Haswell
+	if (model == 5 || model == 6) {
+	  if(support_avx2())
+	    return &gotoblas_HASWELL;
+	  if(support_avx()) {
+	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
+	    return &gotoblas_SANDYBRIDGE;
+	  } else {
+	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
+	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
+	  }
+	}
+	//Intel Broadwell
+	if (model == 7 || model == 15) {
+	  if(support_avx2())
+	    return &gotoblas_HASWELL;
+	  if(support_avx()) {
+	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
+	    return &gotoblas_SANDYBRIDGE;
+	  } else {
+	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
+	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
+	  }
+	}
+	//Intel Skylake
+	if (model == 14) {
+	  if(support_avx2())
+	    return &gotoblas_HASWELL;
+	  if(support_avx()) {
+	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
+	    return &gotoblas_SANDYBRIDGE;
+	  } else {
+	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
+	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
+	  }
+	}
+	//Intel Braswell / Avoton
+	if (model == 12 || model == 13) { 
+	  return &gotoblas_NEHALEM;
+	}	
+	return NULL;
+      case 5:
+	//Intel Broadwell
+	if (model == 6) {
+	  if(support_avx2())
+	    return &gotoblas_HASWELL;
+	  if(support_avx()) {
+	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
+	    return &gotoblas_SANDYBRIDGE;
+	  } else {
+	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
+	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
+	  }
+	}
+	if (model == 5) {	
+	// Intel Skylake X
+          if (support_avx512()) 
+	    return &gotoblas_SKYLAKEX;
+	  if(support_avx2()){
+	    openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
+	    return &gotoblas_HASWELL;
+          }
+	  if(support_avx()) {
+	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
+	    return &gotoblas_SANDYBRIDGE;
+	  } else {
+          openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
+          return &gotoblas_NEHALEM;
+          }
+	}
+	//Intel Skylake
+	if (model == 14) {
+	  if(support_avx2())
+	    return &gotoblas_HASWELL;
+	  if(support_avx()) {
+	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
+	    return &gotoblas_SANDYBRIDGE;
+	  } else {
+	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
+	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
+	  }
+	}
+	//Intel Phi Knights Landing
+	if (model == 7) {
+	  if(support_avx2()){
+	    openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
+	    return &gotoblas_HASWELL;
+	  }  
+	  if(support_avx()) {
+	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
+	    return &gotoblas_SANDYBRIDGE;
+	  } else {
+	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
+	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
+	  }
+	}
+	//Apollo Lake or Denverton
+	if (model == 12 || model == 15) { 
+	  return &gotoblas_NEHALEM;
+	}	
+	return NULL;
+      case 6:
+        if (model == 6) {
+          // Cannon Lake
+	  if(support_avx2())
+	    return &gotoblas_HASWELL;
+	  if(support_avx()) {
+	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
+	    return &gotoblas_SANDYBRIDGE;
+	  } else {
+	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
+	    return &gotoblas_NEHALEM;
+	  }
+        }
+        return NULL;  
+      case 7:
+        if (model == 10) // Goldmont plus 
+           return &gotoblas_NEHALEM;
+        if (model == 14) {
+	// Ice Lake
+          if (support_avx512()) 
+	    return &gotoblas_SKYLAKEX;
+	  if(support_avx2()){
+	    openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
+	    return &gotoblas_HASWELL;
+          }
+	  if(support_avx()) {
+	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
+	    return &gotoblas_SANDYBRIDGE;
+	  } else {
+          openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
+          return &gotoblas_NEHALEM;
+          }
+        }
+        return NULL;  
+      case 9:
+      case 8:
+	if (model == 14 ) { // Kaby Lake, Coffee Lake
+	  if(support_avx2())
+	    return &gotoblas_HASWELL;
+	  if(support_avx()) {
+	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
+	    return &gotoblas_SANDYBRIDGE;
+	  } else {
+	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
+	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
+	  }
+	}
+	return NULL;
+      }
+      case 0xf:
+      if (model <= 0x2) return &gotoblas_NORTHWOOD;
+      return &gotoblas_PRESCOTT;
+    }
+  }
+
+  if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){
+    if (family <= 0xe) {
+        // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon
+        cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
+        if ( (eax & 0xffff)  >= 0x01) {
+            cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
+            if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0)
+              return NULL;
+          }
+        else
+          return NULL;
+
+        return &gotoblas_ATHLON;
+      }
+    if (family == 0xf){
+      if ((exfamily == 0) || (exfamily == 2)) {
+	if (ecx & (1 <<  0)) return &gotoblas_OPTERON_SSE3;
+	else return &gotoblas_OPTERON;
+      }  else if (exfamily == 5) {
+	return &gotoblas_BOBCAT;
+      } else if (exfamily == 6) {
+	if(model == 1){
+	  //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
+	  if(support_avx())
+	    return &gotoblas_BULLDOZER;
+	  else{
+	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
+	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
+	  }
+	}else if(model == 2 || model == 3){
+	  //AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300
+	  if(support_avx())
+	    return &gotoblas_PILEDRIVER;
+	  else{
+	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
+	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
+	  }
+	}else if(model == 5){
+	  if(support_avx())
+	    return &gotoblas_EXCAVATOR;
+	  else{
+	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
+	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
+	  }
+	}else if(model == 0 || model == 8){
+	  if (exmodel == 1) {
+	    //AMD Trinity
+	    if(support_avx())
+	      return &gotoblas_PILEDRIVER;
+	    else{
+	      openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
+	      return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
+	    }
+	   }else if (exmodel == 3) {
+	    //AMD STEAMROLLER
+	    if(support_avx())
+	      return &gotoblas_STEAMROLLER;
+	    else{
+	      openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
+	      return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
+	    }
+	  }else if (exmodel == 6) {
+	    if(support_avx())
+	      return &gotoblas_EXCAVATOR;
+	    else{
+	      openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
+	      return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
+	    }
+
+	  }
+	}
+      } else if (exfamily == 8) {
+	if (model == 1 || model == 8) {
+	  if(support_avx())
+	    return &gotoblas_ZEN;
+	  else{
+	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
+	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
+	  }
+	}
+      } else if (exfamily == 9) {
+	  if(support_avx())
+	    return &gotoblas_ZEN;
+	  else{
+	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
+	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
+        }
+      }else {
+	return &gotoblas_BARCELONA;
+      }
+    }
+  }
+
+  if (vendor == VENDOR_CENTAUR) {
+    switch (family) {
+    case 0x6:
+      return &gotoblas_NANO;
+    }
+  }
+
+  return NULL;
+}
+
+static char *corename[] = {
+    "Unknown",
+    "Katmai",
+    "Coppermine",
+    "Northwood",
+    "Prescott",
+    "Banias",
+    "Atom",
+    "Core2",
+    "Penryn",
+    "Dunnington",
+    "Nehalem",
+    "Athlon",
+    "Opteron",
+    "Opteron_SSE3",
+    "Barcelona",
+    "Nano",
+    "Sandybridge",
+    "Bobcat",
+    "Bulldozer",
+    "Piledriver",
+    "Haswell",
+    "Steamroller",
+    "Excavator",
+    "Zen",
+    "SkylakeX"	
+};
+
+char *gotoblas_corename(void) {
+
+  if (gotoblas == &gotoblas_KATMAI)       return corename[ 1];
+  if (gotoblas == &gotoblas_COPPERMINE)   return corename[ 2];
+  if (gotoblas == &gotoblas_NORTHWOOD)    return corename[ 3];
+  if (gotoblas == &gotoblas_PRESCOTT)     return corename[ 4];
+  if (gotoblas == &gotoblas_BANIAS)       return corename[ 5];
+  if (gotoblas == &gotoblas_ATOM)         return corename[ 6];
+  if (gotoblas == &gotoblas_CORE2)        return corename[ 7];
+  if (gotoblas == &gotoblas_PENRYN)       return corename[ 8];
+  if (gotoblas == &gotoblas_DUNNINGTON)   return corename[ 9];
+  if (gotoblas == &gotoblas_NEHALEM)      return corename[10];
+  if (gotoblas == &gotoblas_ATHLON)       return corename[11];
+  if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12];
+  if (gotoblas == &gotoblas_OPTERON)      return corename[13];
+  if (gotoblas == &gotoblas_BARCELONA)    return corename[14];
+  if (gotoblas == &gotoblas_NANO)         return corename[15];
+  if (gotoblas == &gotoblas_SANDYBRIDGE)  return corename[16];
+  if (gotoblas == &gotoblas_BOBCAT)       return corename[17];
+  if (gotoblas == &gotoblas_BULLDOZER)    return corename[18];
+  if (gotoblas == &gotoblas_PILEDRIVER)   return corename[19];
+  if (gotoblas == &gotoblas_HASWELL)      return corename[20];
+  if (gotoblas == &gotoblas_STEAMROLLER)  return corename[21];
+  if (gotoblas == &gotoblas_EXCAVATOR)    return corename[22];
+  if (gotoblas == &gotoblas_ZEN)          return corename[23];
+  if (gotoblas == &gotoblas_SKYLAKEX)     return corename[24];
+  return corename[0];
+}
+
+
+static gotoblas_t *force_coretype(char *coretype){
+
+	int i ;
+	int found = -1;
+	char message[128];
+	//char mname[20];
+
+	for ( i=1 ; i <= 24; i++)
+	{
+		if (!strncasecmp(coretype,corename[i],20))
+		{
+			found = i;
+			break;
+		}
+	}
+	if (found < 0)
+	{
+	        //strncpy(mname,coretype,20);
+	        snprintf(message, 128, "Core not found: %s\n",coretype);
+    		openblas_warning(1, message);
+		return(NULL);
+	}
+
+	switch (found)
+	{
+		case 24: return (&gotoblas_SKYLAKEX);	
+		case 23: return (&gotoblas_ZEN);
+		case 22: return (&gotoblas_EXCAVATOR);
+		case 21: return (&gotoblas_STEAMROLLER);
+		case 20: return (&gotoblas_HASWELL);
+		case 19: return (&gotoblas_PILEDRIVER);
+		case 18: return (&gotoblas_BULLDOZER);
+		case 17: return (&gotoblas_BOBCAT);
+		case 16: return (&gotoblas_SANDYBRIDGE);
+		case 15: return (&gotoblas_NANO);
+		case 14: return (&gotoblas_BARCELONA);
+		case 13: return (&gotoblas_OPTERON);
+		case 12: return (&gotoblas_OPTERON_SSE3);
+		case 11: return (&gotoblas_ATHLON);
+		case 10: return (&gotoblas_NEHALEM);
+		case  9: return (&gotoblas_DUNNINGTON);
+		case  8: return (&gotoblas_PENRYN);
+		case  7: return (&gotoblas_CORE2);
+		case  6: return (&gotoblas_ATOM);
+		case  5: return (&gotoblas_BANIAS);
+		case  4: return (&gotoblas_PRESCOTT);
+		case  3: return (&gotoblas_NORTHWOOD);
+		case  2: return (&gotoblas_COPPERMINE);
+		case  1: return (&gotoblas_KATMAI);
+	}
+	return(NULL);
+
+}
+
+
+
+
+void gotoblas_dynamic_init(void) {
+
+  char coremsg[128];
+  char coren[22];
+  char *p;
+
+
+  if (gotoblas) return;
+
+  p = getenv("OPENBLAS_CORETYPE");
+  if ( p )
+  {
+	gotoblas = force_coretype(p);
+  }
+  else
+  {
+  	gotoblas = get_coretype();
+  }
+
+#ifdef ARCH_X86
+  if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
+#else
+  if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
+  /* sanity check, if 64bit pointer we can't have a 32 bit cpu */
+  if (sizeof(void*) == 8) {
+      if (gotoblas == &gotoblas_KATMAI ||
+          gotoblas == &gotoblas_COPPERMINE ||
+          gotoblas == &gotoblas_NORTHWOOD ||
+          gotoblas == &gotoblas_BANIAS ||
+          gotoblas == &gotoblas_ATHLON)
+          gotoblas = &gotoblas_PRESCOTT;
+  }
+#endif
+
+  if (gotoblas && gotoblas -> init) {
+    strncpy(coren,gotoblas_corename(),20);
+    sprintf(coremsg, "Core: %s\n",coren);
+    openblas_warning(2, coremsg);
+    gotoblas -> init();
+  } else {
+    openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
+    exit(1);
+  }
+
+}
+
+void gotoblas_dynamic_quit(void) {
+
+  gotoblas = NULL;
+
+}

From 3dc6b26eff770dc3a74bb370500f5fb99ce540d1 Mon Sep 17 00:00:00 2001
From: Kavana Bhat <Kavana.bhat@in.ibm.com>
Date: Tue, 20 Aug 2019 06:51:35 -0500
Subject: [PATCH 010/210] AIX changes for Power8

---
 common_power.h                              |   29 +
 kernel/Makefile.L3                          |  238 +++-
 kernel/power/casum_microk_power8.c          |   10 +-
 kernel/power/ccopy_microk_power8.c          |   10 +-
 kernel/power/cgemm_macros_8x4_power8.S      |  768 +++++++++++
 kernel/power/cgemm_tcopy_macros_8_power8.S  |   96 ++
 kernel/power/crot.c                         |   10 +-
 kernel/power/cswap_microk_power8.c          |    6 +-
 kernel/power/ctrmm_macros_8x4_power8.S      |  768 +++++++++++
 kernel/power/dasum_microk_power8.c          |   12 +-
 kernel/power/daxpy_microk_power8.c          |   10 +-
 kernel/power/dcopy_microk_power8.c          |   10 +-
 kernel/power/ddot_microk_power8.c           |   12 +-
 kernel/power/dgemm_macros_16x4_power8.S     |  976 ++++++++++++++
 kernel/power/dgemm_ncopy_macros_4_power8.S  |  120 ++
 kernel/power/dgemm_tcopy_macros_16_power8.S |  120 ++
 kernel/power/dgemv_n_microk_power8.c        |   26 +-
 kernel/power/dgemv_t.c                      |   36 +-
 kernel/power/drot_microk_power8.c           |   14 +-
 kernel/power/dscal_microk_power8.c          |   18 +-
 kernel/power/dswap_microk_power8.c          |    6 +-
 kernel/power/dtrmm_macros_16x4_power8.S     |  960 ++++++++++++++
 kernel/power/dtrsm_macros_LT_16x4_power8.S  |  878 +++++++++----
 kernel/power/idamax.c                       |   64 +-
 kernel/power/idamin.c                       |   64 +-
 kernel/power/izamax.c                       |  136 +-
 kernel/power/izamin.c                       |  136 +-
 kernel/power/lock.c                         |    4 +-
 kernel/power/sasum_microk_power8.c          |   10 +-
 kernel/power/scopy_microk_power8.c          |   10 +-
 kernel/power/sdot_microk_power8.c           |   10 +-
 kernel/power/sgemm_macros_16x8_power8.S     | 1296 +++++++++++++++++++
 kernel/power/sgemm_tcopy_macros_16_power8.S |  120 ++
 kernel/power/sgemm_tcopy_macros_8_power8.S  |   96 ++
 kernel/power/srot_microk_power8.c           |   10 +-
 kernel/power/sscal_microk_power8.c          |   16 +-
 kernel/power/sswap_microk_power8.c          |    6 +-
 kernel/power/strmm_macros_16x8_power8.S     | 1280 ++++++++++++++++++
 kernel/power/zasum_microk_power8.c          |   12 +-
 kernel/power/zaxpy_microk_power8.c          |   46 +-
 kernel/power/zcopy_microk_power8.c          |   10 +-
 kernel/power/zdot_microk_power8.c           |   42 +-
 kernel/power/zgemm_macros_8x2_power8.S      |  832 +++++++++---
 kernel/power/zgemm_tcopy_macros_8_power8.S  |   96 ++
 kernel/power/zrot.c                         |   14 +-
 kernel/power/zscal_microk_power8.c          |   46 +-
 kernel/power/zswap_microk_power8.c          |    6 +-
 kernel/power/ztrmm_macros_8x2_power8.S      |  782 +++++++++--
 48 files changed, 9272 insertions(+), 1005 deletions(-)

diff --git a/common_power.h b/common_power.h
index 889205c75..76b9f0f32 100644
--- a/common_power.h
+++ b/common_power.h
@@ -39,6 +39,35 @@
 #ifndef COMMON_POWER
 #define COMMON_POWER
 
+#define str(x)	#x
+
+#ifdef OS_AIX
+#define XXSPLTD(T,A,z)	xxpermdi	T, A, A, 0b##z##z
+#define XXMRGHD(T,A,B)	xxpermdi	T, A, B, 0b00
+#define XXMRGLD(T,A,B)	xxpermdi	T, A, B, 0b11
+#define XXSWAPD(T,A)	xxpermdi	T, A, A, 0b10
+#define XVMOVDP(T,A)	xvcpsgndp	T, A, A
+
+#define XXSPLTD_S(T,A,z)	"xxpermdi	" str(T) ", " str(A) ", " str(A) ", 0b" str(z ## z) "	\n\t"
+#define XXMRGHD_S(T,A,B)	"xxpermdi	" str(T) ", " str(A) ", " str(B) ", 0b00	\n\t"
+#define XXMRGLD_S(T,A,B)	"xxpermdi	" str(T) ", " str(A) ", " str(B) ", 0b11	\n\t"
+#define XXSWAPD_S(T,A)	"xxpermdi	" str(T) ", " str(A) ", " str(A) ", 0b10	\n\t"
+
+#else
+#define XXSPLTD(T,A,z)	xxspltd	T, A, z
+#define XXMRGHD(T,A,B)	xxmrghd	T, A, B
+#define XXMRGLD(T,A,B)	xxmrgld	T, A, B
+#define XXSWAPD(T,A)	xxswapd	T, A
+#define XVMOVDP(T,A)	xvmovdp	T, A
+
+#define XXSPLTD_S(T,A,z)	"xxspltd	T, A, z	\n\t"
+#define XXMRGHD_S(T,A,B)	"xxmrghd	T, A, B	\n\t"
+#define XXMRGLD_S(T,A,B)	"xxmrgld	T, A, B	\n\t"
+#define XXSWAPD_S(T,A)	"xxswapd	T, A"
+
+#endif
+
+
 #if defined(POWER8) || defined(POWER9)
 #define MB		__asm__ __volatile__ ("eieio":::"memory")
 #define WMB		__asm__ __volatile__ ("eieio":::"memory")
diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index f83def47b..ed8ae406f 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -57,8 +57,6 @@ USE_TRMM = 1
 endif
 
 
-
-
 SKERNELOBJS	+= \
 	sgemm_kernel$(TSUFFIX).$(SUFFIX) \
 	$(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \
@@ -436,7 +434,10 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
 
 $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
+	$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s	
+	m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
+	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
+	rm sgemmotcopy.s sgemmotcopy_nomacros.s
 
 ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
 
@@ -444,12 +445,17 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY)
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
 
 $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-
+	$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s	
+	m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
+	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
+	rm sgemmitcopy.s sgemmitcopy_nomacros.s
 endif
 
 $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
+	$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s
+	m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
+	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
+	rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
 
 $(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY)
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
@@ -460,7 +466,10 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY)
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
 
 $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
+	$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s
+	m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
+	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
+	rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
 
 endif
 
@@ -485,10 +494,16 @@ endif
 endif
 
 $(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY)
+#	$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o cgemm_oncopy.s
+#	m4 cgemm_oncopy.s > cgemm_oncopy_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
+#	rm cgemm_oncopy.s cgemm_oncopy_nomacros.s
 
 $(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY)
+#	$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX  $< -o cgemm_otcopy.s
+#	m4 cgemm_otcopy.s > cgemm_otcopy_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
+#	rm cgemm_otcopy.s cgemm_otcopy_nomacros.s
 
 ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
 
@@ -496,7 +511,10 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY)
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
 
 $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
+	$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s
+	m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
+	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
+	rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
 
 endif
 
@@ -512,7 +530,10 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY)
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
 
 $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
+	$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s
+	m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
+	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
+	rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
 
 endif
 
@@ -537,37 +558,67 @@ endif
 endif
 
 $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
+	$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX  $< -o sgemm_kernel$(TSUFFIX).s
+	m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
+	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
+	rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
 
 $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
+	$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s
+	m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
+	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
+	rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
 
 $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND)
 	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
 
 $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
+	$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s
+	m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
+	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
+	rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
 
 $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
+	$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s
+	m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
+	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
+	rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
 
 $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
+	$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC  $< -o cgemm_kernel_r.s
+	m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
+	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
+	rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
 
 $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
+	$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s
+	m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
+	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
+	rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
 
 $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
+	$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s
+	m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
+	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
+	rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
 
 $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
+	$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s
+	m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
+	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
+	rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
 
 $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
+	$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s
+	m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
+	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
+	rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
 
 $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
-	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
+	$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s
+	m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
+	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
+	rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
 
 $(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
 	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
@@ -584,28 +635,56 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
 
 ifdef USE_TRMM
 $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s	
+	m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
+	rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
 
 $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s	
+	m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
+	rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
 
 $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s	
+	m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
+	rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
 
 $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s	
+	m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
+	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
 
 $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s
+#	$(CC) $(CFLAGS) -E $< -o dtrmm_kernel_ln.s
+	m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
+	rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
 
 $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s
+#	$(CC) $(CFLAGS) -E $< -o dtrmm_kernel_lt.s
+	m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
+	rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
 
 $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s
+#	$(CC) $(CFLAGS) -E $< -o dtrmm_kernel_rn.s
+	m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
+	rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
 
 $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s
+#	$(CC) $(CFLAGS) -E  $< -o dtrmm_kernel_rt.s
+	m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
+	rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
 
 $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@@ -620,52 +699,100 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
 
 $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN  $< -o ctrmm_kernel_ln.s
+	m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
+	rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
 
 $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s
+	m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
+	rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
 
 $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s
+	m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN  ctrmm_kernel_lr_nomacros.s -o $@
+	rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
 
 $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s
+	m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
+	rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
 
 $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s
+	m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
+	rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
 
 $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s
+	m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
+	rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
 
 $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s
+	m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
+	rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
 
 $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s
+	m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
+	rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
 
 $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s
+	m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
+	rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
 
 $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s
+	m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
+	rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
 
 $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s
+	m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
+	rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
 
 $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s
+	m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
+	rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s 
 
 $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s
+	m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
+	rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
 
 $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s
+	m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
+	rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
 
 $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s
+	m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
+	rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
 
 $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s
+	m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
+	rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
 else
 $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@@ -677,7 +804,10 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
 
 $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s	
+	m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
+	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
 
 $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@@ -801,10 +931,16 @@ $(KDIR)strsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(ST
 	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@
 
 $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND)
+#	$(CC) $(CFLAGS) -E $< -o dtrsm_kernel_ln.s
+#	m4 dtrsm_kernel_ln.s > dtrsm_kernel_ln_nomacros.s
 	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
+#	rm dtrsm_kernel_ln.s dtrsm_kernel_ln_nomacros.s
 
 $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
-	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
+	$(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s
+	m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
+	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
+	rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
 
 $(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND)
 	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
@@ -1940,7 +2076,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY)
 
 endif
 
-$(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
+$(D<GEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY)
 	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
 
 $(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY)
@@ -2044,7 +2180,10 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM
 	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
 
 $(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
-	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
+	$(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
+	m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
+	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
+	rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s 
 
 $(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
 	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
@@ -2083,7 +2222,10 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
 
 $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
-	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s	
+	m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s
+	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
+	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
 
 $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
 	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c
index 7d12c9885..91d53ffc3 100644
--- a/kernel/power/casum_microk_power8.c
+++ b/kernel/power/casum_microk_power8.c
@@ -68,10 +68,10 @@ static float casum_kernel_16 (long n, float *x)
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -16	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "xvabssp		48, 40		\n\t"
        "xvabssp		49, 41		\n\t"
@@ -108,9 +108,9 @@ static float casum_kernel_16 (long n, float *x)
        "xvaddsp		38, 38, %x5	\n\t"
        "xvaddsp		39, 39, %x6	\n\t"
 
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "xvabssp		48, 40		\n\t"
        "xvabssp		49, 41		\n\t"
diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c
index 613c4d286..6a7886e6f 100644
--- a/kernel/power/ccopy_microk_power8.c
+++ b/kernel/power/ccopy_microk_power8.c
@@ -62,10 +62,10 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -32	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "stxvd2x		32, 0, %3	\n\t"
        "stxvd2x		33, %5, %3	\n\t"
@@ -108,9 +108,9 @@ static void ccopy_kernel_32 (long n, float *x, float *y)
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -32	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "stxvd2x		32, 0, %3	\n\t"
        "stxvd2x		33, %5, %3	\n\t"
diff --git a/kernel/power/cgemm_macros_8x4_power8.S b/kernel/power/cgemm_macros_8x4_power8.S
index 9a18cb189..46108bbb4 100644
--- a/kernel/power/cgemm_macros_8x4_power8.S
+++ b/kernel/power/cgemm_macros_8x4_power8.S
@@ -83,7 +83,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=4 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x8_1', `
+#else
 .macro LOAD4x8_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 	lxvw4x		vs1,	o16,	AO		// load a2, a3
@@ -107,9 +111,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_I1', `
+#else
 .macro KERNEL4x8_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -172,9 +184,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs63,	vs3,	vs15		// a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_1', `
+#else
 .macro KERNEL4x8_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -237,9 +257,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs63,	vs3,	vs15		// a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_2', `
+#else
 .macro KERNEL4x8_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -302,9 +330,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs63,	vs7,	vs23		// a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_E2', `
+#else
 .macro KERNEL4x8_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -344,9 +380,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs63,	vs7,	vs23		// a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_SUBI1', `
+#else
 .macro KERNEL4x8_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -409,9 +453,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs63,	vs3,	vs15		// a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_SUB1', `
+#else
 .macro KERNEL4x8_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -474,9 +526,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs63,	vs3,	vs15		// a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x8', `
+#else
 .macro SAVE4x8
+#endif
 
 	mr		T1,	CO
 	xxlxor		vs24,	vs24,	vs24
@@ -1546,14 +1606,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x4_1', `
+#else
 .macro LOAD4x4_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 	lxvw4x		vs1,	o16,	AO		// load a2, a3
@@ -1575,9 +1643,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_I1', `
+#else
 .macro KERNEL4x4_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -1622,9 +1698,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs47,	vs1,	vs15		// a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_1', `
+#else
 .macro KERNEL4x4_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -1669,9 +1753,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs1,	vs15		// a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_2', `
+#else
 .macro KERNEL4x4_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -1716,9 +1808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs5,	vs23		// a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_E2', `
+#else
 .macro KERNEL4x4_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -1742,9 +1842,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs5,	vs23		// a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_SUBI1', `
+#else
 .macro KERNEL4x4_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -1789,9 +1897,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs47,	vs1,	vs15		// a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_SUB1', `
+#else
 .macro KERNEL4x4_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -1836,9 +1952,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs1,	vs15		// a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x4', `
+#else
 .macro SAVE4x4
+#endif
 
 	mr		T1,	CO
 	xxlxor		vs24,	vs24,	vs24
@@ -2388,14 +2512,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x2_1', `
+#else
 .macro LOAD4x2_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 
@@ -2416,9 +2548,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_I1', `
+#else
 .macro KERNEL4x2_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -2454,9 +2594,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs0,	vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_1', `
+#else
 .macro KERNEL4x2_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -2492,9 +2640,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs0,	vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_2', `
+#else
 .macro KERNEL4x2_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -2530,9 +2686,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs4,	vs23		// a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_E2', `
+#else
 .macro KERNEL4x2_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -2548,9 +2712,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs4,	vs23		// a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_SUBI1', `
+#else
 .macro KERNEL4x2_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -2586,9 +2758,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs0,	vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_SUB1', `
+#else
 .macro KERNEL4x2_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -2624,9 +2804,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs0,	vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x2', `
+#else
 .macro SAVE4x2
+#endif
 
 	mr		T1,	CO
 	xxlxor		vs24,	vs24,	vs24
@@ -2916,14 +3104,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x1_1', `
+#else
 .macro LOAD4x1_1
+#endif
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
 	lxsspx		vs1,	o4,	AO		// load a0_i
@@ -2945,9 +3141,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_I1', `
+#else
 .macro KERNEL4x1_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO		// load a0_r
@@ -2992,9 +3196,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs47,	vs1,	vs14		// a0_i*b3_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_1', `
+#else
 .macro KERNEL4x1_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO		// load a0_r
@@ -3039,9 +3251,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs47,	vs1,	vs14		// a0_i*b3_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_2', `
+#else
 .macro KERNEL4x1_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -3086,9 +3306,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs47,	vs5,	vs22		// a4_i*b3_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_E2', `
+#else
 .macro KERNEL4x1_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16		// a4_r*b0_r
@@ -3112,9 +3340,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs47,	vs5,	vs22		// a4_i*b3_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_SUBI1', `
+#else
 .macro KERNEL4x1_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -3159,9 +3395,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs47,	vs1,	vs14		// a0_i*b3_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_SUB1', `
+#else
 .macro KERNEL4x1_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -3206,9 +3450,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs47,	vs1,	vs14		// a0_i*b3_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x1', `
+#else
 .macro SAVE4x1
+#endif
 
 	mr		T1,	CO
 	xxlxor		vs24,	vs24,	vs24
@@ -3382,14 +3634,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x8_1', `
+#else
 .macro LOAD2x8_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 	lxvw4x		vs1,	o16,	AO		// load a2, a3
@@ -3406,9 +3666,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_I1', `
+#else
 .macro KERNEL2x8_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -3446,9 +3714,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs47,	vs3,	vs11		// a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_1', `
+#else
 .macro KERNEL2x8_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -3486,9 +3762,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs3,	vs11		// a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_2', `
+#else
 .macro KERNEL2x8_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -3526,9 +3810,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs7,	vs19		// a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_E2', `
+#else
 .macro KERNEL2x8_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -3550,9 +3842,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs7,	vs19		// a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_SUBI1', `
+#else
 .macro KERNEL2x8_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -3590,9 +3890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs47,	vs3,	vs11		// a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_SUB1', `
+#else
 .macro KERNEL2x8_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -3630,9 +3938,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs3,	vs11		// a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x8', `
+#else
 .macro SAVE2x8
+#endif
 
 	mr		T1,	CO
 	xxlxor		vs24,	vs24,	vs24
@@ -4170,14 +4486,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x4_1', `
+#else
 .macro LOAD2x4_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 	lxvw4x		vs1,	o16,	AO		// load a2, a3
@@ -4192,9 +4516,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_I1', `
+#else
 .macro KERNEL2x4_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -4222,9 +4554,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs1,	vs11		// a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_1', `
+#else
 .macro KERNEL2x4_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -4252,9 +4592,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs1,	vs11		// a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_2', `
+#else
 .macro KERNEL2x4_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -4282,9 +4630,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs5,	vs19		// a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_E2', `
+#else
 .macro KERNEL2x4_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -4298,9 +4654,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs5,	vs19		// a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_SUBI1', `
+#else
 .macro KERNEL2x4_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -4328,9 +4692,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs1,	vs11		// a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_SUB1', `
+#else
 .macro KERNEL2x4_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -4358,9 +4730,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs1,	vs11		// a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x4', `
+#else
 .macro SAVE2x4
+#endif
 
 	mr		T1,	CO
 	xxlxor		vs24,	vs24,	vs24
@@ -4638,14 +5018,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x2_1', `
+#else
 .macro LOAD2x2_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 
@@ -4659,9 +5047,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_I1', `
+#else
 .macro KERNEL2x2_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -4684,9 +5080,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs0,	vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_1', `
+#else
 .macro KERNEL2x2_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -4709,9 +5113,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs0,	vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_2', `
+#else
 .macro KERNEL2x2_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -4734,9 +5146,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs4,	vs19		// a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_E2', `
+#else
 .macro KERNEL2x2_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -4746,9 +5166,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs4,	vs19		// a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_SUBI1', `
+#else
 .macro KERNEL2x2_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -4771,9 +5199,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs0,	vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_SUB1', `
+#else
 .macro KERNEL2x2_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -4796,9 +5232,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs0,	vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x2', `
+#else
 .macro SAVE2x2
+#endif
 
 	mr		T1,	CO
 	xxlxor		vs24,	vs24,	vs24
@@ -4946,14 +5390,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x1_1', `
+#else
 .macro LOAD2x1_1
+#endif
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
 	lxsspx		vs1,	o4,	AO		// load a0_i
@@ -4968,9 +5420,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_I1', `
+#else
 .macro KERNEL2x1_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO		// load a0_r
@@ -4998,9 +5458,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs39,	vs1,	vs10		// a0_i*b1_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_1', `
+#else
 .macro KERNEL2x1_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO		// load a0_r
@@ -5028,9 +5496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs1,	vs10		// a0_i*b1_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_2', `
+#else
 .macro KERNEL2x1_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -5058,9 +5534,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs5,	vs18		// a4_i*b1_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_E2', `
+#else
 .macro KERNEL2x1_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16		// a4_r*b0_r
@@ -5074,9 +5558,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs5,	vs18		// a4_i*b1_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_SUBI1', `
+#else
 .macro KERNEL2x1_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -5104,9 +5596,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs39,	vs1,	vs10		// a0_i*b1_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_SUB1', `
+#else
 .macro KERNEL2x1_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -5134,9 +5634,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs1,	vs10		// a0_i*b1_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x1', `
+#else
 .macro SAVE2x1
+#endif
 
 	mr		T1,	CO
 	xxlxor		vs24,	vs24,	vs24
@@ -5226,14 +5734,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x8_1', `
+#else
 .macro LOAD1x8_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 	lxvw4x		vs1,	o16,	AO		// load a2, a3
@@ -5247,9 +5763,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_I1', `
+#else
 .macro KERNEL1x8_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -5275,9 +5799,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs3,	vs9		// a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_1', `
+#else
 .macro KERNEL1x8_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -5303,9 +5835,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs3,	vs9		// a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_2', `
+#else
 .macro KERNEL1x8_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -5331,9 +5871,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs7,	vs17		// a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_E2', `
+#else
 .macro KERNEL1x8_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -5346,9 +5894,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs7,	vs17		// a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_SUBI1', `
+#else
 .macro KERNEL1x8_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -5374,9 +5930,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs3,	vs9		// a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_SUB1', `
+#else
 .macro KERNEL1x8_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -5402,9 +5966,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs3,	vs9		// a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x8', `
+#else
 .macro SAVE1x8
+#endif
 
 	mr		T1,	CO
 	xxlxor		vs24,	vs24,	vs24
@@ -5676,14 +6248,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x4_1', `
+#else
 .macro LOAD1x4_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 	lxvw4x		vs1,	o16,	AO		// load a2, a3
@@ -5695,9 +6275,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_I1', `
+#else
 .macro KERNEL1x4_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -5717,9 +6305,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs1,	vs9		// a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_1', `
+#else
 .macro KERNEL1x4_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -5739,9 +6335,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs1,	vs9		// a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_2', `
+#else
 .macro KERNEL1x4_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -5761,9 +6365,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs5,	vs17		// a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_E2', `
+#else
 .macro KERNEL1x4_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -5772,9 +6384,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs5,	vs17		// a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_SUBI1', `
+#else
 .macro KERNEL1x4_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -5794,9 +6414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs1,	vs9		// a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_SUB1', `
+#else
 .macro KERNEL1x4_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -5816,9 +6444,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs1,	vs9		// a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x4', `
+#else
 .macro SAVE1x4
+#endif
 
 	mr		T1,	CO
 	xxlxor		vs24,	vs24,	vs24
@@ -5960,14 +6596,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x2_1', `
+#else
 .macro LOAD1x2_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 
@@ -5978,9 +6622,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_I1', `
+#else
 .macro KERNEL1x2_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -5997,9 +6649,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs33,	vs0,	vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_1', `
+#else
 .macro KERNEL1x2_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -6016,9 +6676,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs0,	vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_2', `
+#else
 .macro KERNEL1x2_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -6035,18 +6703,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs4,	vs17		// a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_E2', `
+#else
 .macro KERNEL1x2_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
 	xvmaddasp	vs33,	vs4,	vs17		// a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_SUBI1', `
+#else
 .macro KERNEL1x2_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -6063,9 +6747,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs33,	vs0,	vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_SUB1', `
+#else
 .macro KERNEL1x2_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -6082,9 +6774,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs0,	vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x2', `
+#else
 .macro SAVE1x2
+#endif
 
 	mr		T1,	CO
 	xxlxor		vs24,	vs24,	vs24
@@ -6161,14 +6861,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x1_1', `
+#else
 .macro LOAD1x1_1
+#endif
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
 	lxsspx		vs1,	o4,	AO		// load a0_i
@@ -6180,9 +6888,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_I1', `
+#else
 .macro KERNEL1x1_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO		// load a0_r
@@ -6202,9 +6918,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs35,	vs1,	vs8		// a0_i*b0_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_1', `
+#else
 .macro KERNEL1x1_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO		// load a0_r
@@ -6224,9 +6948,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs1,	vs8		// a0_i*b0_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_2', `
+#else
 .macro KERNEL1x1_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -6246,9 +6978,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs5,	vs16		// a4_i*b0_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_E2', `
+#else
 .macro KERNEL1x1_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16		// a4_r*b0_r
@@ -6257,9 +6997,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs5,	vs16		// a4_i*b0_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_SUBI1', `
+#else
 .macro KERNEL1x1_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -6279,9 +7027,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs35,	vs1,	vs8		// a0_i*b0_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_SUB1', `
+#else
 .macro KERNEL1x1_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -6301,9 +7057,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs1,	vs8		// a0_i*b0_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x1', `
+#else
 .macro SAVE1x1
+#endif
 
 	mr		T1,	CO
 	xxlxor		vs24,	vs24,	vs24
@@ -6351,5 +7115,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
diff --git a/kernel/power/cgemm_tcopy_macros_8_power8.S b/kernel/power/cgemm_tcopy_macros_8_power8.S
index 03fda2766..64bf8dd99 100644
--- a/kernel/power/cgemm_tcopy_macros_8_power8.S
+++ b/kernel/power/cgemm_tcopy_macros_8_power8.S
@@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=4 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x8', `
+#else
 .macro COPY_4x8
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	lxvw4x		vs33,	o16,	A0
@@ -93,13 +97,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvw4x		vs46,	o32,	T1
 	stxvw4x		vs47,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=4 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x4', `
+#else
 .macro COPY_4x4
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	lxvw4x		vs33,	o16,	A0
@@ -133,13 +145,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvw4x		vs38,	o32,	T1
 	stxvw4x		vs39,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=4 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x2', `
+#else
 .macro COPY_4x2
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	addi		A0,	A0,	16
@@ -163,13 +183,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxvw4x		vs35,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=4 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x1', `
+#else
 .macro COPY_4x1
+#endif
 
 	lxsspx		vs32,	o0,	A0
 	lxsspx		vs33,	o4,	A0
@@ -207,13 +235,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxsspx		vs38,	o0,	T1
 	stxsspx		vs39,	o4,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=2 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x8', `
+#else
 .macro COPY_2x8
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	lxvw4x		vs33,	o16,	A0
@@ -241,13 +277,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvw4x		vs38,	o32,	T1
 	stxvw4x		vs39,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=2 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x4', `
+#else
 .macro COPY_2x4
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	lxvw4x		vs33,	o16,	A0
@@ -265,13 +309,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvw4x		vs34,	o32,	T1
 	stxvw4x		vs35,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=2 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x2', `
+#else
 .macro COPY_2x2
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	addi		A0,	A0,	16
@@ -285,13 +337,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxvw4x		vs33,	o16,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=2 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x1', `
+#else
 .macro COPY_2x1
+#endif
 
 	lxsspx		vs32,	o0,	A0
 	lxsspx		vs33,	o4,	A0
@@ -311,13 +371,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxsspx		vs34,	o0,	T1
 	stxsspx		vs35,	o4,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=1 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x8', `
+#else
 .macro COPY_1x8
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	lxvw4x		vs33,	o16,	A0
@@ -332,13 +400,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvw4x		vs34,	o32,	T1
 	stxvw4x		vs35,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=1 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x4', `
+#else
 .macro COPY_1x4
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	lxvw4x		vs33,	o16,	A0
@@ -349,13 +425,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvw4x		vs32,	o0,	T1
 	stxvw4x		vs33,	o16,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=1 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x2', `
+#else
 .macro COPY_1x2
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	addi		A0,	A0,	16
@@ -364,13 +448,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxvw4x		vs32,	o0,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=1 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x1', `
+#else
 .macro COPY_1x1
+#endif
 
 	lxsspx		vs32,	o0,	A0
 	lxsspx		vs33,	o4,	A0
@@ -381,5 +473,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxsspx		vs32,	o0,	T1
 	stxsspx		vs33,	o4,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
diff --git a/kernel/power/crot.c b/kernel/power/crot.c
index 959a9eda0..2a5835546 100644
--- a/kernel/power/crot.c
+++ b/kernel/power/crot.c
@@ -56,9 +56,9 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
        "addi        %[x_ptr], %[x_ptr], 64    \n\t" 
        "addi        %[y_ptr], %[y_ptr], 64    \n\t" 
        "addic.      %[temp_n], %[temp_n], -8  \n\t" 
-       "ble         2f                        \n\t" 
-       ".p2align    5                         \n\t" 
-       "1:                                    \n\t" 
+       "ble         two%=                        \n\t" 
+       ".align    5                         \n\t" 
+       "one%=:                                    \n\t" 
        "xvmulsp     40, 32, 36                \n\t" // c * x
        "xvmulsp     41, 33, 36                \n\t" 
        "xvmulsp     42, 34, 36                \n\t" 
@@ -104,8 +104,8 @@ static void crot_kernel_8 (long n, float *x, float *y, float c, float s)
        "addi        %[x_ptr], %[x_ptr], 128   \n\t" 
        "addi        %[y_ptr], %[y_ptr], 128   \n\t" 
        "addic.      %[temp_n], %[temp_n], -8  \n\t" 
-       "bgt         1b                        \n\t" 
-       "2:                                    \n\t" 
+       "bgt         one%=                        \n\t" 
+       "two%=:                                    \n\t" 
        "xvmulsp     40, 32, 36                \n\t" // c * x
        "xvmulsp     41, 33, 36                \n\t" 
        "xvmulsp     42, 34, 36                \n\t" 
diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c
index 8d7d0c0b9..829800230 100644
--- a/kernel/power/cswap_microk_power8.c
+++ b/kernel/power/cswap_microk_power8.c
@@ -39,8 +39,8 @@ static void cswap_kernel_32 (long n, float *x, float *y)
 {
   __asm__
     (
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "lxvd2x		32, 0, %4	\n\t"
        "lxvd2x		33, %5, %4	\n\t"
@@ -131,7 +131,7 @@ static void cswap_kernel_32 (long n, float *x, float *y)
        "addi		%4, %4, 128	\n\t"
 
        "addic.		%2, %2, -32	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
      "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
      :
diff --git a/kernel/power/ctrmm_macros_8x4_power8.S b/kernel/power/ctrmm_macros_8x4_power8.S
index 48a21252c..922cab57a 100644
--- a/kernel/power/ctrmm_macros_8x4_power8.S
+++ b/kernel/power/ctrmm_macros_8x4_power8.S
@@ -83,7 +83,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=4 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x8_1', `
+#else
 .macro LOAD4x8_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 
@@ -113,9 +117,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_I1', `
+#else
 .macro KERNEL4x8_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -184,9 +196,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs63,	vs3,	vs15		// a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_1', `
+#else
 .macro KERNEL4x8_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -255,9 +275,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs63,	vs3,	vs15		// a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_2', `
+#else
 .macro KERNEL4x8_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -326,9 +354,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs63,	vs7,	vs23		// a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_E2', `
+#else
 .macro KERNEL4x8_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -368,9 +404,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs63,	vs7,	vs23		// a7_r*b3_i, a7_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_SUBI1', `
+#else
 .macro KERNEL4x8_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -439,9 +483,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs63,	vs3,	vs15		// a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_SUB1', `
+#else
 .macro KERNEL4x8_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -510,9 +562,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs63,	vs3,	vs15		// a3_r*b3_i, a3_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x8', `
+#else
 .macro SAVE4x8
+#endif
 
 	mr		T1,	CO
 
@@ -1597,14 +1657,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x4_1', `
+#else
 .macro LOAD4x4_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 
@@ -1630,9 +1698,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_I1', `
+#else
 .macro KERNEL4x4_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -1681,9 +1757,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs47,	vs1,	vs15		// a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_1', `
+#else
 .macro KERNEL4x4_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -1732,9 +1816,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs1,	vs15		// a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_2', `
+#else
 .macro KERNEL4x4_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -1783,9 +1875,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs5,	vs23		// a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_E2', `
+#else
 .macro KERNEL4x4_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -1809,9 +1909,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs5,	vs23		// a5_r*b3_i, a5_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_SUBI1', `
+#else
 .macro KERNEL4x4_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -1860,9 +1968,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs47,	vs1,	vs15		// a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_SUB1', `
+#else
 .macro KERNEL4x4_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -1911,9 +2027,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs1,	vs15		// a1_r*b3_i, a1_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x4', `
+#else
 .macro SAVE4x4
+#endif
 
 	mr		T1,	CO
 
@@ -2470,14 +2594,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x2_1', `
+#else
 .macro LOAD4x2_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 
@@ -2501,9 +2633,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_I1', `
+#else
 .macro KERNEL4x2_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -2542,9 +2682,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs0,	vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_1', `
+#else
 .macro KERNEL4x2_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -2583,9 +2731,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs0,	vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_2', `
+#else
 .macro KERNEL4x2_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -2624,9 +2780,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs4,	vs23		// a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_E2', `
+#else
 .macro KERNEL4x2_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -2642,9 +2806,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs4,	vs23		// a4_r*b3_i, a4_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_SUBI1', `
+#else
 .macro KERNEL4x2_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -2683,9 +2855,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs0,	vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_SUB1', `
+#else
 .macro KERNEL4x2_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -2724,9 +2904,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs0,	vs15		// a0_r*b3_i, a0_i*b3_i, a1_r*b3_i, a1_i*b3_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x2', `
+#else
 .macro SAVE4x2
+#endif
 
 	mr		T1,	CO
 
@@ -3019,14 +3207,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x1_1', `
+#else
 .macro LOAD4x1_1
+#endif
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
 	lxsspx		vs1,	o4,	AO		// load a0_i
@@ -3055,9 +3251,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_I1', `
+#else
 .macro KERNEL4x1_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO		// load a0_r
@@ -3109,9 +3313,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs47,	vs1,	vs14		// a0_i*b3_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_1', `
+#else
 .macro KERNEL4x1_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO		// load a0_r
@@ -3163,9 +3375,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs47,	vs1,	vs14		// a0_i*b3_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_2', `
+#else
 .macro KERNEL4x1_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -3217,9 +3437,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs47,	vs5,	vs22		// a4_i*b3_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_E2', `
+#else
 .macro KERNEL4x1_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16		// a4_r*b0_r
@@ -3243,9 +3471,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs47,	vs5,	vs22		// a4_i*b3_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_SUBI1', `
+#else
 .macro KERNEL4x1_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -3297,9 +3533,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs47,	vs1,	vs14		// a0_i*b3_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_SUB1', `
+#else
 .macro KERNEL4x1_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -3351,9 +3595,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs47,	vs1,	vs14		// a0_i*b3_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x1', `
+#else
 .macro SAVE4x1
+#endif
 
 	mr		T1,	CO
 
@@ -3526,14 +3778,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x8_1', `
+#else
 .macro LOAD2x8_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 
@@ -3556,9 +3816,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_I1', `
+#else
 .macro KERNEL2x8_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -3602,9 +3870,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs47,	vs3,	vs11		// a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_1', `
+#else
 .macro KERNEL2x8_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -3648,9 +3924,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs3,	vs11		// a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_2', `
+#else
 .macro KERNEL2x8_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -3694,9 +3978,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs7,	vs19		// a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_E2', `
+#else
 .macro KERNEL2x8_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -3718,9 +4010,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs7,	vs19		// a7_r*b1_i, a7_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_SUBI1', `
+#else
 .macro KERNEL2x8_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -3764,9 +4064,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs47,	vs3,	vs11		// a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_SUB1', `
+#else
 .macro KERNEL2x8_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -3810,9 +4118,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs3,	vs11		// a3_r*b1_i, a3_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x8', `
+#else
 .macro SAVE2x8
+#endif
 
 	mr		T1,	CO
 
@@ -4357,14 +4673,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x4_1', `
+#else
 .macro LOAD2x4_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 
@@ -4383,9 +4707,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_I1', `
+#else
 .macro KERNEL2x4_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -4417,9 +4749,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs1,	vs11		// a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_1', `
+#else
 .macro KERNEL2x4_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -4451,9 +4791,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs1,	vs11		// a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_2', `
+#else
 .macro KERNEL2x4_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -4485,9 +4833,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs5,	vs19		// a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_E2', `
+#else
 .macro KERNEL2x4_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -4501,9 +4857,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs5,	vs19		// a5_r*b1_i, a5_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_SUBI1', `
+#else
 .macro KERNEL2x4_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -4535,9 +4899,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs1,	vs11		// a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_SUB1', `
+#else
 .macro KERNEL2x4_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -4569,9 +4941,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs1,	vs11		// a1_r*b1_i, a1_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x4', `
+#else
 .macro SAVE2x4
+#endif
 
 	mr		T1,	CO
 
@@ -4852,14 +5232,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x2_1', `
+#else
 .macro LOAD2x2_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 
@@ -4876,9 +5264,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_I1', `
+#else
 .macro KERNEL2x2_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -4904,9 +5300,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs0,	vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_1', `
+#else
 .macro KERNEL2x2_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -4932,9 +5336,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs0,	vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_2', `
+#else
 .macro KERNEL2x2_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -4960,9 +5372,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs4,	vs19		// a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_E2', `
+#else
 .macro KERNEL2x2_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -4972,9 +5392,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs4,	vs19		// a4_r*b1_i, a4_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_SUBI1', `
+#else
 .macro KERNEL2x2_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -5000,9 +5428,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs0,	vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_SUB1', `
+#else
 .macro KERNEL2x2_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -5028,9 +5464,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs0,	vs11		// a0_r*b1_i, a0_i*b1_i, a1_r*b1_i, a1_i*b1_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x2', `
+#else
 .macro SAVE2x2
+#endif
 
 	mr		T1,	CO
 
@@ -5179,14 +5623,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x1_1', `
+#else
 .macro LOAD2x1_1
+#endif
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
 	lxsspx		vs1,	o4,	AO		// load a0_i
@@ -5205,9 +5657,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_I1', `
+#else
 .macro KERNEL2x1_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO		// load a0_r
@@ -5239,9 +5699,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs39,	vs1,	vs10		// a0_i*b1_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_1', `
+#else
 .macro KERNEL2x1_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO		// load a0_r
@@ -5273,9 +5741,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs1,	vs10		// a0_i*b1_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_2', `
+#else
 .macro KERNEL2x1_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -5307,9 +5783,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs5,	vs18		// a4_i*b1_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_E2', `
+#else
 .macro KERNEL2x1_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16		// a4_r*b0_r
@@ -5323,9 +5807,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs5,	vs18		// a4_i*b1_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_SUBI1', `
+#else
 .macro KERNEL2x1_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -5357,9 +5849,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs39,	vs1,	vs10		// a0_i*b1_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_SUB1', `
+#else
 .macro KERNEL2x1_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -5391,9 +5891,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs1,	vs10		// a0_i*b1_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x1', `
+#else
 .macro SAVE2x1
+#endif
 
 	mr		T1,	CO
 
@@ -5482,14 +5990,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x8_1', `
+#else
 .macro LOAD1x8_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 
@@ -5514,9 +6030,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_I1', `
+#else
 .macro KERNEL1x8_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -5553,9 +6077,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs3,	vs9		// a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_1', `
+#else
 .macro KERNEL1x8_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -5592,9 +6124,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs3,	vs9		// a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_2', `
+#else
 .macro KERNEL1x8_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -5631,9 +6171,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs7,	vs17		// a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_E2', `
+#else
 .macro KERNEL1x8_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -5646,9 +6194,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs7,	vs17		// a7_r*b0_i, a7_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_SUBI1', `
+#else
 .macro KERNEL1x8_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -5685,9 +6241,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs3,	vs9		// a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_SUB1', `
+#else
 .macro KERNEL1x8_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -5724,9 +6288,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs3,	vs9		// a3_r*b0_i, a3_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x8', `
+#else
 .macro SAVE1x8
+#endif
 
 	mr		T1,	CO
 
@@ -6001,14 +6573,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x4_1', `
+#else
 .macro LOAD1x4_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 
@@ -6029,9 +6609,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_I1', `
+#else
 .macro KERNEL1x4_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -6060,9 +6648,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs1,	vs9		// a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_1', `
+#else
 .macro KERNEL1x4_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -6091,9 +6687,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs1,	vs9		// a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_2', `
+#else
 .macro KERNEL1x4_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -6122,9 +6726,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs5,	vs17		// a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_E2', `
+#else
 .macro KERNEL1x4_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
@@ -6133,9 +6745,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs5,	vs17		// a5_r*b0_i, a5_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_SUBI1', `
+#else
 .macro KERNEL1x4_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -6164,9 +6784,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs1,	vs9		// a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_SUB1', `
+#else
 .macro KERNEL1x4_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -6195,9 +6823,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs1,	vs9		// a1_r*b0_i, a1_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x4', `
+#else
 .macro SAVE1x4
+#endif
 
 	mr		T1,	CO
 
@@ -6340,14 +6976,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x2_1', `
+#else
 .macro LOAD1x2_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
 
@@ -6366,9 +7010,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_I1', `
+#else
 .macro KERNEL1x2_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -6393,9 +7045,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs33,	vs0,	vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_1', `
+#else
 .macro KERNEL1x2_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO		// load a0, a1
@@ -6420,9 +7080,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs0,	vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_2', `
+#else
 .macro KERNEL1x2_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -6447,18 +7115,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs4,	vs17		// a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_E2', `
+#else
 .macro KERNEL1x2_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16		// a4_r*b0_r, a4_i*b0_r, a1_r*b0_r, a1_i*b0_r
 	xvmaddasp	vs33,	vs4,	vs17		// a4_r*b0_i, a4_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_SUBI1', `
+#else
 .macro KERNEL1x2_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -6483,9 +7167,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs33,	vs0,	vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_SUB1', `
+#else
 .macro KERNEL1x2_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO		// load a0, a1
@@ -6510,9 +7202,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs0,	vs9		// a0_r*b0_i, a0_i*b0_i, a1_r*b0_i, a1_i*b0_i
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x2', `
+#else
 .macro SAVE1x2
+#endif
 
 	mr		T1,	CO
 
@@ -6589,14 +7289,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x1_1', `
+#else
 .macro LOAD1x1_1
+#endif
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
 	lxsspx		vs1,	o4,	AO		// load a0_i
@@ -6610,9 +7318,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_I1', `
+#else
 .macro KERNEL1x1_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO		// load a0_r
@@ -6634,9 +7350,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs35,	vs1,	vs8		// a0_i*b0_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_1', `
+#else
 .macro KERNEL1x1_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO		// load a0_r
@@ -6658,9 +7382,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs1,	vs8		// a0_i*b0_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_2', `
+#else
 .macro KERNEL1x1_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -6682,9 +7414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs5,	vs16		// a4_i*b0_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_E2', `
+#else
 .macro KERNEL1x1_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16		// a4_r*b0_r
@@ -6693,9 +7433,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs5,	vs16		// a4_i*b0_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_SUBI1', `
+#else
 .macro KERNEL1x1_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -6717,9 +7465,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs35,	vs1,	vs8		// a0_i*b0_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_SUB1', `
+#else
 .macro KERNEL1x1_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO		// load a0_r
@@ -6741,9 +7497,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs1,	vs8		// a0_i*b0_r
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x1', `
+#else
 .macro SAVE1x1
+#endif
 
 	mr		T1,	CO
 
@@ -6790,5 +7554,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
diff --git a/kernel/power/dasum_microk_power8.c b/kernel/power/dasum_microk_power8.c
index 880d7d271..4652fc57c 100644
--- a/kernel/power/dasum_microk_power8.c
+++ b/kernel/power/dasum_microk_power8.c
@@ -68,10 +68,10 @@ static double dasum_kernel_16 (long n, double *x)
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -16	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "xvabsdp		48, 40		\n\t"
        "xvabsdp		49, 41		\n\t"
@@ -108,9 +108,9 @@ static double dasum_kernel_16 (long n, double *x)
        "xvadddp		38, 38, %x5	\n\t"
        "xvadddp		39, 39, %x6	\n\t"
 
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "xvabsdp		48, 40		\n\t"
        "xvabsdp		49, 41		\n\t"
@@ -140,7 +140,7 @@ static double dasum_kernel_16 (long n, double *x)
 
        "xvadddp		32, 32, 36	\n\t"
 
-       "xxswapd		33, 32		\n\t"
+       XXSWAPD_S(33,32)
        "xsadddp		%x0, 32, 33	\n"
 
      "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
diff --git a/kernel/power/daxpy_microk_power8.c b/kernel/power/daxpy_microk_power8.c
index fb714a3f9..a92026e83 100644
--- a/kernel/power/daxpy_microk_power8.c
+++ b/kernel/power/daxpy_microk_power8.c
@@ -58,7 +58,7 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
 
   __asm__
     (
-       "xxspltd		%x4, %x22, 0	\n\t"
+       XXSPLTD_S(%x4,%x22,0)
 
        "dcbt		0, %2		\n\t"
        "dcbt		0, %3		\n\t"
@@ -90,10 +90,10 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
        "addi		%3, %3, -64	\n\t"
 
        "addic.		%1, %1, -16	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
        ".align 5			\n"
-     "1:				\n\t"
+     "one%=:				\n\t"
 
        "xvmaddadp	%x13, %x5, %x4	\n\t"
        "xvmaddadp	%x14, %x6, %x4	\n\t"
@@ -152,9 +152,9 @@ static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
        "addi		%3, %3, -64	\n\t"
 
        "addic.		%1, %1, -16	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "xvmaddadp	%x13, %x5, %x4	\n\t"
        "xvmaddadp	%x14, %x6, %x4	\n\t"
diff --git a/kernel/power/dcopy_microk_power8.c b/kernel/power/dcopy_microk_power8.c
index 261dc04de..b51a21d08 100644
--- a/kernel/power/dcopy_microk_power8.c
+++ b/kernel/power/dcopy_microk_power8.c
@@ -62,10 +62,10 @@ static void dcopy_kernel_32 (long n, double *x, double *y)
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -32	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "stxvd2x		32, 0, %3	\n\t"
        "stxvd2x		33, %5, %3	\n\t"
@@ -108,9 +108,9 @@ static void dcopy_kernel_32 (long n, double *x, double *y)
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -32	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "stxvd2x		32, 0, %3	\n\t"
        "stxvd2x		33, %5, %3	\n\t"
diff --git a/kernel/power/ddot_microk_power8.c b/kernel/power/ddot_microk_power8.c
index 4e6bc29c9..d2518ef7e 100644
--- a/kernel/power/ddot_microk_power8.c
+++ b/kernel/power/ddot_microk_power8.c
@@ -78,10 +78,10 @@ static double ddot_kernel_8 (long n, double *x, double *y)
        "addi		%3, %3, 128	\n\t"
 
        "addic.		%1, %1, -16	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "xvmaddadp	32, 40, 48	\n\t"
        "lxvd2x		40, 0, %2	\n\t"
@@ -112,9 +112,9 @@ static double ddot_kernel_8 (long n, double *x, double *y)
        "addi		%3, %3, 128	\n\t"
 
        "addic.		%1, %1, -16	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "xvmaddadp	32, 40, 48	\n\t"
        "xvmaddadp	33, 41, 49	\n\t"
@@ -135,7 +135,7 @@ static double ddot_kernel_8 (long n, double *x, double *y)
 
        "xvadddp		32, 32, 36	\n\t"
 
-       "xxswapd		33, 32		\n\t"
+       XXSWAPD_S(33,32)
 
        "xsadddp		%x0, 32, 33	\n"
 
diff --git a/kernel/power/dgemm_macros_16x4_power8.S b/kernel/power/dgemm_macros_16x4_power8.S
index 5be517f7c..782425fbd 100644
--- a/kernel/power/dgemm_macros_16x4_power8.S
+++ b/kernel/power/dgemm_macros_16x4_power8.S
@@ -37,7 +37,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=4, M=16                                               *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x16_1', `
+#else
 .macro LOAD4x16_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -58,10 +62,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 128
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`KERNEL4x16_I1', `
+#else
 .macro KERNEL4x16_I1
+#endif
 
 	xvmuldp		vs32,	vs0,	vs24
 	xvmuldp		vs33,	vs1,	vs24
@@ -125,11 +137,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		AO, AO, 128
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 
+#if defined(_AIX)
+define(`KERNEL4x16_1', `
+#else
 .macro KERNEL4x16_1
+#endif
 
 	xvmaddadp		vs32,	vs0,	vs24
 	xvmaddadp		vs33,	vs1,	vs24
@@ -194,9 +214,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 128
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_2', `
+#else
 .macro KERNEL4x16_2
+#endif
 
 	xvmaddadp		vs32,	vs8,	vs28
 	xvmaddadp		vs33,	vs9,	vs28
@@ -260,9 +288,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 128
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_L1', `
+#else
 .macro KERNEL4x16_L1
+#endif
 
 	xvmaddadp		vs32,	vs0,	vs24
 	xvmaddadp		vs33,	vs1,	vs24
@@ -326,9 +362,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		AO, AO, 128
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_L2', `
+#else
 .macro KERNEL4x16_L2
+#endif
 
 	xvmaddadp		vs32,	vs8,	vs28
 	xvmaddadp		vs33,	vs9,	vs28
@@ -392,10 +436,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs63,	vs15,	vs31
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`KERNEL4x16_E2', `
+#else
 .macro KERNEL4x16_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -434,9 +486,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs62,	vs14,	vs31
 	xvmaddadp		vs63,	vs15,	vs31
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_SUBI1', `
+#else
 .macro KERNEL4x16_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -495,9 +555,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs62,	vs6,	vs27
 	xvmuldp			vs63,	vs7,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_SUB1', `
+#else
 .macro KERNEL4x16_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -555,9 +623,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs62,	vs6,	vs27
 	xvmaddadp		vs63,	vs7,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x16', `
+#else
 .macro SAVE4x16
+#endif
 
 	add		T2,	CO,	LDC
 
@@ -680,13 +756,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs39,	o112,	T4
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=4, M=8                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x8_1', `
+#else
 .macro LOAD4x8_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -703,9 +787,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 64
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_I1', `
+#else
 .macro KERNEL4x8_I1
+#endif
 
 	xvmuldp			vs32,	vs0,	vs24
 	xvmuldp			vs33,	vs1,	vs24
@@ -744,9 +836,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 64
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_1', `
+#else
 .macro KERNEL4x8_1
+#endif
 
 	xvmaddadp		vs32,	vs0,	vs24
 	xvmaddadp		vs33,	vs1,	vs24
@@ -784,9 +884,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 64
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_2', `
+#else
 .macro KERNEL4x8_2
+#endif
 
 	xvmaddadp		vs32,	vs8,	vs28
 	xvmaddadp		vs33,	vs9,	vs28
@@ -824,9 +932,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 64
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_E2', `
+#else
 .macro KERNEL4x8_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -849,9 +965,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs58,	vs10,	vs31
 	xvmaddadp		vs59,	vs11,	vs31
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_SUBI1', `
+#else
 .macro KERNEL4x8_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -887,9 +1011,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs58,	vs2,	vs27
 	xvmuldp			vs59,	vs3,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_SUB1', `
+#else
 .macro KERNEL4x8_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -925,9 +1057,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs58,	vs2,	vs27
 	xvmaddadp		vs59,	vs3,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x8', `
+#else
 .macro SAVE4x8
+#endif
 
 	mr		T1,	CO
 
@@ -1035,13 +1175,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=4, M=4                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x4_1', `
+#else
 .macro LOAD4x4_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -1054,9 +1202,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 32
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_I1', `
+#else
 .macro KERNEL4x4_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -1082,9 +1238,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs56,	vs0,	vs27
 	xvmuldp			vs57,	vs1,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_1', `
+#else
 .macro KERNEL4x4_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -1110,9 +1274,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs56,	vs0,	vs27
 	xvmaddadp		vs57,	vs1,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_2', `
+#else
 .macro KERNEL4x4_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -1138,9 +1310,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs56,	vs8,	vs31
 	xvmaddadp		vs57,	vs9,	vs31
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_E2', `
+#else
 .macro KERNEL4x4_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -1155,9 +1335,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs56,	vs8,	vs31
 	xvmaddadp		vs57,	vs9,	vs31
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_SUBI1', `
+#else
 .macro KERNEL4x4_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -1183,9 +1371,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs56,	vs0,	vs27
 	xvmuldp			vs57,	vs1,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_SUB1', `
+#else
 .macro KERNEL4x4_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -1211,9 +1407,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs56,	vs0,	vs27
 	xvmaddadp		vs57,	vs1,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x4', `
+#else
 .macro SAVE4x4
+#endif
 
 	mr		T1,	CO
 
@@ -1289,13 +1493,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=4, M=2                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x2_1', `
+#else
 .macro LOAD4x2_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -1307,9 +1519,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 16
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_I1', `
+#else
 .macro KERNEL4x2_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 
@@ -1330,9 +1550,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmuldp			vs56,	vs0,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_1', `
+#else
 .macro KERNEL4x2_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 
@@ -1353,9 +1581,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs56,	vs0,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_2', `
+#else
 .macro KERNEL4x2_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -1376,9 +1612,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs56,	vs8,	vs31
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_E2', `
+#else
 .macro KERNEL4x2_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -1389,9 +1633,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs56,	vs8,	vs31
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_SUBI1', `
+#else
 .macro KERNEL4x2_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -1412,9 +1664,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmuldp			vs56,	vs0,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_SUB1', `
+#else
 .macro KERNEL4x2_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -1435,9 +1695,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs56,	vs0,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x2', `
+#else
 .macro SAVE4x2
+#endif
 
 	mr		T1,	CO
 
@@ -1497,13 +1765,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=4, M=1                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x1_1', `
+#else
 .macro LOAD4x1_1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -1515,9 +1791,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 8
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_I1', `
+#else
 .macro KERNEL4x1_I1
+#endif
 
 	lxsdx	vs8,	0,	AO
 
@@ -1538,9 +1822,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmuldp			vs56,	vs0,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_1', `
+#else
 .macro KERNEL4x1_1
+#endif
 
 	lxsdx	vs8,	0,	AO
 
@@ -1561,9 +1853,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs56,	vs0,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_2', `
+#else
 .macro KERNEL4x1_2
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -1584,9 +1884,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs56,	vs8,	vs31
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_E2', `
+#else
 .macro KERNEL4x1_E2
+#endif
 
 
 	xsmaddadp		vs32,	vs8,	vs28
@@ -1597,9 +1905,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs56,	vs8,	vs31
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_SUBI1', `
+#else
 .macro KERNEL4x1_SUBI1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -1620,9 +1936,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmuldp			vs56,	vs0,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_SUB1', `
+#else
 .macro KERNEL4x1_SUB1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -1643,9 +1967,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs56,	vs0,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x1', `
+#else
 .macro SAVE4x1
+#endif
 
 	mr		T1,	CO
 
@@ -1705,13 +2037,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=2, M=16                                               *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x16_1', `
+#else
 .macro LOAD2x16_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -1731,9 +2071,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		AO, AO, 64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_I1', `
+#else
 .macro KERNEL2x16_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -1772,9 +2120,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs46,	vs6,	vs25
 	xvmuldp			vs47,	vs7,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_1', `
+#else
 .macro KERNEL2x16_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -1813,9 +2169,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs46,	vs6,	vs25
 	xvmaddadp		vs47,	vs7,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_2', `
+#else
 .macro KERNEL2x16_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -1854,9 +2218,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs46,	vs14,	vs29
 	xvmaddadp		vs47,	vs15,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_E2', `
+#else
 .macro KERNEL2x16_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -1877,9 +2249,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs46,	vs14,	vs29
 	xvmaddadp		vs47,	vs15,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_SUBI1', `
+#else
 .macro KERNEL2x16_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -1918,9 +2298,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs46,	vs6,	vs25
 	xvmuldp			vs47,	vs7,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_SUB1', `
+#else
 .macro KERNEL2x16_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -1959,9 +2347,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs46,	vs6,	vs25
 	xvmaddadp		vs47,	vs7,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x16', `
+#else
 .macro SAVE2x16
+#endif
 
 	mr		T1,	CO
 	addi		T2,	T1,	64
@@ -2055,13 +2451,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	128
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=4, M=8                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x8_1', `
+#else
 .macro LOAD2x8_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2074,9 +2478,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 64
 	addi		BO, BO, 16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_I1', `
+#else
 .macro KERNEL2x8_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -2100,9 +2512,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs42,	vs2,	vs25
 	xvmuldp			vs43,	vs3,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_1', `
+#else
 .macro KERNEL2x8_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -2126,9 +2546,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs42,	vs2,	vs25
 	xvmaddadp		vs43,	vs3,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_2', `
+#else
 .macro KERNEL2x8_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2152,9 +2580,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs42,	vs10,	vs29
 	xvmaddadp		vs43,	vs11,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_E2', `
+#else
 .macro KERNEL2x8_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -2167,9 +2603,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs42,	vs10,	vs29
 	xvmaddadp		vs43,	vs11,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_SUBI1', `
+#else
 .macro KERNEL2x8_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2193,9 +2637,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs42,	vs2,	vs25
 	xvmuldp			vs43,	vs3,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_SUB1', `
+#else
 .macro KERNEL2x8_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2219,9 +2671,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs42,	vs2,	vs25
 	xvmaddadp		vs43,	vs3,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x8', `
+#else
 .macro SAVE2x8
+#endif
 
 	mr		T1,	CO
 
@@ -2277,13 +2737,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=2, M=4                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x4_1', `
+#else
 .macro LOAD2x4_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2294,9 +2762,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 32
 	addi		BO, BO, 16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_I1', `
+#else
 .macro KERNEL2x4_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -2314,9 +2790,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs40,	vs0,	vs25
 	xvmuldp			vs41,	vs1,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_1', `
+#else
 .macro KERNEL2x4_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -2334,9 +2818,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs40,	vs0,	vs25
 	xvmaddadp		vs41,	vs1,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_2', `
+#else
 .macro KERNEL2x4_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2354,9 +2846,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs40,	vs8,	vs29
 	xvmaddadp		vs41,	vs9,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_E2', `
+#else
 .macro KERNEL2x4_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -2365,9 +2865,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs40,	vs8,	vs29
 	xvmaddadp		vs41,	vs9,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_SUBI1', `
+#else
 .macro KERNEL2x4_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2385,9 +2893,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs40,	vs0,	vs25
 	xvmuldp			vs41,	vs1,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_SUB1', `
+#else
 .macro KERNEL2x4_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2405,9 +2921,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs40,	vs0,	vs25
 	xvmaddadp		vs41,	vs1,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x4', `
+#else
 .macro SAVE2x4
+#endif
 
 	mr		T1,	CO
 
@@ -2447,13 +2971,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=2, M=2                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x2_1', `
+#else
 .macro LOAD2x2_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -2463,9 +2995,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 16
 	addi		BO, BO, 16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_I1', `
+#else
 .macro KERNEL2x2_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 
@@ -2480,9 +3020,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmuldp			vs40,	vs0,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_1', `
+#else
 .macro KERNEL2x2_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 
@@ -2497,9 +3045,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs40,	vs0,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_2', `
+#else
 .macro KERNEL2x2_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -2514,18 +3070,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs40,	vs8,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_E2', `
+#else
 .macro KERNEL2x2_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
 
 	xvmaddadp		vs40,	vs8,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_SUBI1', `
+#else
 .macro KERNEL2x2_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -2540,9 +3112,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmuldp			vs40,	vs0,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_SUB1', `
+#else
 .macro KERNEL2x2_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -2557,9 +3137,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs40,	vs0,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x2', `
+#else
 .macro SAVE2x2
+#endif
 
 	mr		T1,	CO
 
@@ -2591,13 +3179,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=2, M=1                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x1_1', `
+#else
 .macro LOAD2x1_1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -2607,9 +3203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 8
 	addi		BO, BO, 16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_I1', `
+#else
 .macro KERNEL2x1_I1
+#endif
 
 	lxsdx	vs8,	0,	AO
 
@@ -2624,9 +3228,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmuldp			vs40,	vs0,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_1', `
+#else
 .macro KERNEL2x1_1
+#endif
 
 	lxsdx	vs8,	0,	AO
 
@@ -2641,9 +3253,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs40,	vs0,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_2', `
+#else
 .macro KERNEL2x1_2
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -2658,18 +3278,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs40,	vs8,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_E2', `
+#else
 .macro KERNEL2x1_E2
+#endif
 
 
 	xsmaddadp		vs32,	vs8,	vs28
 
 	xsmaddadp		vs40,	vs8,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_SUBI1', `
+#else
 .macro KERNEL2x1_SUBI1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -2684,9 +3320,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmuldp			vs40,	vs0,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_SUB1', `
+#else
 .macro KERNEL2x1_SUB1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -2701,9 +3345,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs40,	vs0,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x1', `
+#else
 .macro SAVE2x1
+#endif
 
 	mr		T1,	CO
 
@@ -2735,13 +3387,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=1, M=16                                               *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x16_1', `
+#else
 .macro LOAD1x16_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2760,9 +3420,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		AO, AO, 64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_I1', `
+#else
 .macro KERNEL1x16_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -2791,9 +3459,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs38,	vs6,	vs24
 	xvmuldp			vs39,	vs7,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_1', `
+#else
 .macro KERNEL1x16_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -2822,9 +3498,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs38,	vs6,	vs24
 	xvmaddadp		vs39,	vs7,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_2', `
+#else
 .macro KERNEL1x16_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2853,9 +3537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs38,	vs14,	vs28
 	xvmaddadp		vs39,	vs15,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_E2', `
+#else
 .macro KERNEL1x16_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -2867,9 +3559,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs38,	vs14,	vs28
 	xvmaddadp		vs39,	vs15,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_SUBI1', `
+#else
 .macro KERNEL1x16_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2898,9 +3598,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs38,	vs6,	vs24
 	xvmuldp			vs39,	vs7,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_SUB1', `
+#else
 .macro KERNEL1x16_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2929,9 +3637,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs38,	vs6,	vs24
 	xvmaddadp		vs39,	vs7,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x16', `
+#else
 .macro SAVE1x16
+#endif
 
 	mr		T1,	CO
 	addi		T2,	T1,	64
@@ -2980,13 +3696,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	128
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=4, M=8                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x8_1', `
+#else
 .macro LOAD1x8_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2998,9 +3722,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 64
 	addi		BO, BO, 8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_I1', `
+#else
 .macro KERNEL1x8_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -3018,9 +3750,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs34,	vs2,	vs24
 	xvmuldp			vs35,	vs3,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_1', `
+#else
 .macro KERNEL1x8_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -3038,9 +3778,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs34,	vs2,	vs24
 	xvmaddadp		vs35,	vs3,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_2', `
+#else
 .macro KERNEL1x8_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -3058,9 +3806,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs34,	vs10,	vs28
 	xvmaddadp		vs35,	vs11,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_E2', `
+#else
 .macro KERNEL1x8_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -3068,9 +3824,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs34,	vs10,	vs28
 	xvmaddadp		vs35,	vs11,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_SUBI1', `
+#else
 .macro KERNEL1x8_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -3088,9 +3852,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs34,	vs2,	vs24
 	xvmuldp			vs35,	vs3,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_SUB1', `
+#else
 .macro KERNEL1x8_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -3108,9 +3880,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs34,	vs2,	vs24
 	xvmaddadp		vs35,	vs3,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x8', `
+#else
 .macro SAVE1x8
+#endif
 
 	mr		T1,	CO
 
@@ -3140,13 +3920,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=1, M=4                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x4_1', `
+#else
 .macro LOAD1x4_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -3156,9 +3944,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 32
 	addi		BO, BO, 8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_I1', `
+#else
 .macro KERNEL1x4_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -3172,9 +3968,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs32,	vs0,	vs24
 	xvmuldp			vs33,	vs1,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_1', `
+#else
 .macro KERNEL1x4_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -3188,9 +3992,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs32,	vs0,	vs24
 	xvmaddadp		vs33,	vs1,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_2', `
+#else
 .macro KERNEL1x4_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -3204,17 +4016,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs32,	vs8,	vs28
 	xvmaddadp		vs33,	vs9,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_E2', `
+#else
 .macro KERNEL1x4_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
 	xvmaddadp		vs33,	vs9,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_SUBI1', `
+#else
 .macro KERNEL1x4_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -3228,9 +4056,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs32,	vs0,	vs24
 	xvmuldp			vs33,	vs1,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_SUB1', `
+#else
 .macro KERNEL1x4_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -3244,9 +4080,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs32,	vs0,	vs24
 	xvmaddadp		vs33,	vs1,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x4', `
+#else
 .macro SAVE1x4
+#endif
 
 	mr		T1,	CO
 
@@ -3268,13 +4112,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=1, M=2                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x2_1', `
+#else
 .macro LOAD1x2_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -3283,9 +4135,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 16
 	addi		BO, BO, 8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_I1', `
+#else
 .macro KERNEL1x2_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 
@@ -3297,9 +4157,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmuldp			vs32,	vs0,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_1', `
+#else
 .macro KERNEL1x2_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 
@@ -3311,9 +4179,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs32,	vs0,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_2', `
+#else
 .macro KERNEL1x2_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -3325,16 +4201,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs32,	vs8,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_E2', `
+#else
 .macro KERNEL1x2_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_SUBI1', `
+#else
 .macro KERNEL1x2_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -3346,9 +4238,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmuldp			vs32,	vs0,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_SUB1', `
+#else
 .macro KERNEL1x2_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -3360,9 +4260,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs32,	vs0,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x2', `
+#else
 .macro SAVE1x2
+#endif
 
 	mr		T1,	CO
 
@@ -3380,13 +4288,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=1, M=1                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x1_1', `
+#else
 .macro LOAD1x1_1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -3395,9 +4311,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 8
 	addi		BO, BO, 8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_I1', `
+#else
 .macro KERNEL1x1_I1
+#endif
 
 	lxsdx	vs8,	0,	AO
 
@@ -3409,9 +4333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmuldp			vs32,	vs0,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_1', `
+#else
 .macro KERNEL1x1_1
+#endif
 
 	lxsdx	vs8,	0,	AO
 
@@ -3423,9 +4355,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs32,	vs0,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_2', `
+#else
 .macro KERNEL1x1_2
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -3437,16 +4377,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs32,	vs8,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_E2', `
+#else
 .macro KERNEL1x1_E2
+#endif
 
 
 	xsmaddadp		vs32,	vs8,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_SUBI1', `
+#else
 .macro KERNEL1x1_SUBI1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -3458,9 +4414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmuldp			vs32,	vs0,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_SUB1', `
+#else
 .macro KERNEL1x1_SUB1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -3472,9 +4436,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs32,	vs0,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x1', `
+#else
 .macro SAVE1x1
+#endif
 
 	mr		T1,	CO
 
@@ -3492,5 +4464,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
diff --git a/kernel/power/dgemm_ncopy_macros_4_power8.S b/kernel/power/dgemm_ncopy_macros_4_power8.S
index 8d6744b91..33d02c77d 100644
--- a/kernel/power/dgemm_ncopy_macros_4_power8.S
+++ b/kernel/power/dgemm_ncopy_macros_4_power8.S
@@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=4 and M=16
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x16', `
+#else
 .macro COPY_4x16
+#endif
 
 	lxvd2x		vs0,	o0,	A0
 	lxvd2x		vs1,	o0,	A1
@@ -180,14 +184,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	128
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x8', `
+#else
 .macro COPY_4x8
+#endif
 
 	lxvd2x		vs0,	o0,	A0
 	lxvd2x		vs1,	o16,	A0
@@ -259,14 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	128
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x4', `
+#else
 .macro COPY_4x4
+#endif
 
 	lxvd2x		vs0,	o0,	A0
 	lxvd2x		vs1,	o16,	A0
@@ -310,14 +330,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	128
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x2', `
+#else
 .macro COPY_4x2
+#endif
 
 	lxvd2x		vs0,	o0,	A0
 	addi		A0,	A0,	16
@@ -348,14 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x1', `
+#else
 .macro COPY_4x1
+#endif
 
 	lxsdx		vs0,	o0,	A0
 	addi		A0,	A0,	8
@@ -382,14 +418,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	32
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=16
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x16', `
+#else
 .macro COPY_2x16
+#endif
 
 	lxvd2x		vs0,	o0,	A0
 	lxvd2x		vs1,	o16,	A0
@@ -459,14 +503,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	128
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x8', `
+#else
 .macro COPY_2x8
+#endif
 
 	lxvd2x		vs0,	o0,	A0
 	lxvd2x		vs1,	o16,	A0
@@ -506,14 +558,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	128
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x4', `
+#else
 .macro COPY_2x4
+#endif
 
 	lxvd2x		vs0,	o0,	A0
 	lxvd2x		vs1,	o16,	A0
@@ -539,14 +599,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x2', `
+#else
 .macro COPY_2x2
+#endif
 
 	lxvd2x		vs0,	o0,	A0
 	addi		A0,	A0,	16
@@ -565,14 +633,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	32
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x1', `
+#else
 .macro COPY_2x1
+#endif
 
 	lxsdx		vs0,	o0,	A0
 	addi		A0,	A0,	8
@@ -589,14 +665,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=16
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x16', `
+#else
 .macro COPY_1x16
+#endif
 
 	lxvd2x		vs0,	o0,	A0
 	lxvd2x		vs1,	o16,	A0
@@ -622,14 +706,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x8', `
+#else
 .macro COPY_1x8
+#endif
 
 	lxvd2x		vs0,	o0,	A0
 	lxvd2x		vs1,	o16,	A0
@@ -645,14 +737,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x4', `
+#else
 .macro COPY_1x4
+#endif
 
 	lxvd2x		vs0,	o0,	A0
 	lxvd2x		vs1,	o16,	A0
@@ -664,14 +764,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	32
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x2', `
+#else
 .macro COPY_1x2
+#endif
 
 	lxvd2x		vs0,	o0,	A0
 	addi		A0,	A0,	16
@@ -681,14 +789,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x1', `
+#else
 .macro COPY_1x1
+#endif
 
 	lxsdx		vs0,	o0,	A0
 	addi		A0,	A0,	8
@@ -698,5 +814,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		BO,	BO,	8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
diff --git a/kernel/power/dgemm_tcopy_macros_16_power8.S b/kernel/power/dgemm_tcopy_macros_16_power8.S
index 68e53bcf2..6c5b8ed62 100644
--- a/kernel/power/dgemm_tcopy_macros_16_power8.S
+++ b/kernel/power/dgemm_tcopy_macros_16_power8.S
@@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=4 and M=16
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x16', `
+#else
 .macro COPY_4x16
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -140,14 +144,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs10,	o32,	T1
 	stxvd2x		vs11,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x8', `
+#else
 .macro COPY_4x8
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -205,14 +217,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs46,	o32,	T1
 	stxvd2x		vs47,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x4', `
+#else
 .macro COPY_4x4
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -250,14 +270,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs38,	o32,	T1
 	stxvd2x		vs39,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x2', `
+#else
 .macro COPY_4x2
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	addi		A0,	A0,	16
@@ -285,14 +313,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxvd2x		vs35,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x1', `
+#else
 .macro COPY_4x1
+#endif
 
 	lxsdx		vs32,	o0,	A0
 	addi		A0,	A0,	8
@@ -322,14 +358,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxsdx		vs35,	o8,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=16
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x16', `
+#else
 .macro COPY_2x16
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -383,14 +427,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs46,	o32,	T1
 	stxvd2x		vs47,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x8', `
+#else
 .macro COPY_2x8
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -420,14 +472,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs38,	o32,	T1
 	stxvd2x		vs39,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x4', `
+#else
 .macro COPY_2x4
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -447,14 +507,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs34,	o32,	T1
 	stxvd2x		vs35,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x2', `
+#else
 .macro COPY_2x2
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	addi		A0,	A0,	16
@@ -470,14 +538,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxvd2x		vs33,	o16,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x1', `
+#else
 .macro COPY_2x1
+#endif
 
 	lxsdx		vs32,	o0,	A0
 	addi		A0,	A0,	8
@@ -493,14 +569,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxsdx		vs33,	o8,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=16
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x16', `
+#else
 .macro COPY_1x16
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -528,14 +612,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs38,	o32,	T1
 	stxvd2x		vs39,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x8', `
+#else
 .macro COPY_1x8
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -551,14 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs34,	o32,	T1
 	stxvd2x		vs35,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x4', `
+#else
 .macro COPY_1x4
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -570,14 +670,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs32,	o0,	T1
 	stxvd2x		vs33,	o16,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x2', `
+#else
 .macro COPY_1x2
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	addi		A0,	A0,	16
@@ -587,14 +695,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxvd2x		vs32,	o0,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x1', `
+#else
 .macro COPY_1x1
+#endif
 
 	lxsdx		vs32,	o0,	A0
 	addi		A0,	A0,	8
@@ -604,5 +720,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxsdx		vs32,	o0,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
diff --git a/kernel/power/dgemv_n_microk_power8.c b/kernel/power/dgemv_n_microk_power8.c
index ae4fe9009..c2eb3968c 100644
--- a/kernel/power/dgemv_n_microk_power8.c
+++ b/kernel/power/dgemv_n_microk_power8.c
@@ -46,7 +46,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
     (
        "lxvd2x		34, 0, %10	\n\t"	// x0, x1
        "lxvd2x		35, %11, %10	\n\t"	// x2, x3
-       "xxspltd		32, %x9, 0	\n\t"	// alpha, alpha
+       XXSPLTD_S(32,%x9,0)	// alpha, alpha
 
        "sldi		%6, %13, 3	\n\t"	// lda * sizeof (double)
 
@@ -56,10 +56,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
        "add		%4, %3, %6	\n\t"	// a0 = ap, a1 = a0 + lda
        "add		%6, %6, %6	\n\t"	// 2 * lda
 
-       "xxspltd		32, 34, 0	\n\t"	// x0 * alpha, x0 * alpha
-       "xxspltd		33, 34, 1	\n\t"	// x1 * alpha, x1 * alpha
-       "xxspltd		34, 35, 0	\n\t"	// x2 * alpha, x2 * alpha
-       "xxspltd		35, 35, 1	\n\t"	// x3 * alpha, x3 * alpha
+       XXSPLTD_S(32,34,0)	// x0 * alpha, x0 * alpha
+       XXSPLTD_S(33,34,1)	// x1 * alpha, x1 * alpha
+       XXSPLTD_S(34,35,0)	// x2 * alpha, x2 * alpha
+       XXSPLTD_S(35,35,1)	// x3 * alpha, x3 * alpha
 
        "add		%5, %3, %6	\n\t"	// a2 = a0 + 2 * lda
        "add		%6, %4, %6	\n\t"	// a3 = a1 + 2 * lda
@@ -89,10 +89,10 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
        "addi		%6, %6, 32	\n\t"
 
        "addic.		%1, %1, -4	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "lxvd2x		36, 0, %2	\n\t"	// y0, y1
        "lxvd2x		37, %11, %2	\n\t"	// y2, y3
@@ -131,7 +131,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
        "addi		%2, %2, 32	\n\t"
 
        "addic.		%1, %1, -4	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
 
        "lxvd2x		36, 0, %2	\n\t"	// y0, y1
@@ -171,7 +171,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
        "addi		%2, %2, 32	\n\t"
 
        "addic.		%1, %1, -4	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
 
        "lxvd2x		36, 0, %2	\n\t"	// y0, y1
@@ -211,7 +211,7 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
        "addi		%2, %2, 32	\n\t"
 
        "addic.		%1, %1, -4	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
 
        "lxvd2x		36, 0, %2	\n\t"	// y0, y1
@@ -251,9 +251,9 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
        "addi		%2, %2, 32	\n\t"
 
        "addic.		%1, %1, -4	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "lxvd2x		36, 0, %2	\n\t"	// y0, y1
        "lxvd2x		37, %11, %2	\n\t"	// y2, y3
diff --git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c
index b8589a131..ffe469d4d 100644
--- a/kernel/power/dgemv_t.c
+++ b/kernel/power/dgemv_t.c
@@ -93,11 +93,11 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
             "li       %[off],32 \n\t" 
 
 
-            "ble-     2f      \n\t"
+            "ble-     two%=      \n\t"
 
             //--------------------------------------------------           
-            ".p2align   5           \n\t"
-            "1:                     \n\t"
+            ".align   5           \n\t"
+            "one%=:                     \n\t"
             "xvmaddadp   34,36,32  \n\t"
             "xvmaddadp   35,38,32  \n\t"
             "addi   %[off2],  %[off2],32 \n\t"
@@ -137,7 +137,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
             "lxvd2x     49, %[a6], %[off2]  \n\t"
             "lxvd2x     51, %[a7], %[off2]  \n\t" 
             "lxvd2x     33,  %[x], %[off2]  \n\t"            
-            "ble- 2f  \n\t"
+            "ble- two%=  \n\t"
             "xvmaddadp   34,36,32  \n\t"
             "xvmaddadp   35,38,32  \n\t"
             "addi   %[off2],  %[off2],32 \n\t" 
@@ -177,7 +177,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
             "lxvd2x     49, %[a6], %[off2]  \n\t"
             "lxvd2x     51, %[a7], %[off2]  \n\t" 
             "lxvd2x     33,  %[x], %[off2]  \n\t"            
-            "ble- 2f  \n\t"
+            "ble- two%=  \n\t"
             "xvmaddadp   34,36,32  \n\t"
             "xvmaddadp   35,38,32  \n\t"
 #if defined(PREFETCH)            
@@ -229,7 +229,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
             
             "lxvd2x     33,  %[x], %[off2]  \n\t"            
             "addic. %[n],%[n],-4 \n\t"                        
-            "ble- 2f  \n\t"            
+            "ble- two%=  \n\t"            
  
             "addi   %[off2],  %[off2],32 \n\t" 
 #if defined(PREFETCH)
@@ -288,9 +288,9 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
 #if defined(PREFETCH)
             "dcbt   %[temp],%[x]  \n\t" 
 #endif            
-            "bgt+ 1b  \n\t"
-            ".p2align   5           \n\t"
-            "2: \n\t"
+            "bgt+ one%=  \n\t"
+            ".align   5           \n\t"
+            "two%=: \n\t"
             //--------------------------------------------
 
             "xvmaddadp   34,36,32  \n\t"
@@ -301,7 +301,7 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
             "xvmaddadp   7,46,32  \n\t" 
             "xvmaddadp   8,48,32  \n\t"
             "xvmaddadp   9,50,32  \n\t" 
-            "xxspltd 36, %x[alpha], 0 \n\t"
+            XXSPLTD_S(36,%x[alpha],0)
             "xvmaddadp  34,37,33  \n\t"
             "xvmaddadp  35,39,33  \n\t"            
             "xvmaddadp  4,41,33  \n\t"
@@ -322,21 +322,21 @@ static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, do
 
  
 
-            "xxmrgld 42,34,35 \n\t"
-            "xxmrghd 43,34,35 \n\t"
+            XXMRGLD_S(42,34,35)
+            XXMRGHD_S(43,34,35)
 
-            "xxmrgld 44,4,5 \n\t"
-            "xxmrghd 45,4,5 \n\t"
+            XXMRGLD_S(44,4,5)
+            XXMRGHD_S(45,4,5)
 
             "xvadddp 42,42,43 \n\t"
 
-            "xxmrgld 46,6,7 \n\t"
-            "xxmrghd 47,6,7 \n\t"
+            XXMRGLD_S(46,6,7)
+            XXMRGHD_S(47,6,7)
 
             "xvadddp 44,44,45 \n\t"
 
-            "xxmrgld 48,8,9 \n\t"
-            "xxmrghd 49,8,9 \n\t"
+            XXMRGLD_S(48,8,9)
+            XXMRGHD_S(49,8,9)
 
             "xvadddp 46,46,47 \n\t"
             
diff --git a/kernel/power/drot_microk_power8.c b/kernel/power/drot_microk_power8.c
index 016b7764d..259c08187 100644
--- a/kernel/power/drot_microk_power8.c
+++ b/kernel/power/drot_microk_power8.c
@@ -51,8 +51,8 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
 
   __asm__
     (
-       "xxspltd		36, %x13, 0	\n\t"	// load c to both dwords
-       "xxspltd		37, %x14, 0	\n\t"	// load s to both dwords
+       XXSPLTD_S(36,%x13,0)	// load c to both dwords
+       XXSPLTD_S(37,%x14,0)	// load s to both dwords
 
        "lxvd2x		32, 0, %3	\n\t"	// load x
        "lxvd2x		33, %15, %3	\n\t"
@@ -68,10 +68,10 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
        "addi		%4, %4, 64	\n\t"
 
        "addic.		%2, %2, -8	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "xvmuldp		40, 32, 36	\n\t"	// c * x
        "xvmuldp		41, 33, 36	\n\t"
@@ -135,9 +135,9 @@ static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
        "addi		%4, %4, 128	\n\t"
 
        "addic.		%2, %2, -8	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "xvmuldp		40, 32, 36	\n\t"	// c * x
        "xvmuldp		41, 33, 36	\n\t"
diff --git a/kernel/power/dscal_microk_power8.c b/kernel/power/dscal_microk_power8.c
index 04898eb3d..e9bacd05a 100644
--- a/kernel/power/dscal_microk_power8.c
+++ b/kernel/power/dscal_microk_power8.c
@@ -41,7 +41,7 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
     (
        "dcbt		0, %2		\n\t"
 
-       "xxspltd		%x3, %x3, 0	\n\t"
+       XXSPLTD_S(%x3,%x3,0)
 
        "lxvd2x		32, 0, %2	\n\t"
        "lxvd2x		33, %4, %2	\n\t"
@@ -55,10 +55,10 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -16	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "xvmuldp		40, 32, %x3	\n\t"
        "xvmuldp		41, 33, %x3	\n\t"
@@ -91,9 +91,9 @@ static void dscal_kernel_8 (long n, double *x, double alpha)
        "addi		%2, %2, 256	\n\t"
 
        "addic.		%1, %1, -16	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "xvmuldp		40, 32, %x3	\n\t"
        "xvmuldp		41, 33, %x3	\n\t"
@@ -146,8 +146,8 @@ static void dscal_kernel_8_zero (long n, double *x)
     (
        "xxlxor		%x3, %x3, %x3	\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "stxvd2x		%x3, 0, %2	\n\t"
        "stxvd2x		%x3, %4, %2	\n\t"
@@ -161,7 +161,7 @@ static void dscal_kernel_8_zero (long n, double *x)
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -16	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
      "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
      :
diff --git a/kernel/power/dswap_microk_power8.c b/kernel/power/dswap_microk_power8.c
index 31eff3449..ecfd5c9f9 100644
--- a/kernel/power/dswap_microk_power8.c
+++ b/kernel/power/dswap_microk_power8.c
@@ -39,8 +39,8 @@ static void dswap_kernel_32 (long n, double *x, double *y)
 {
   __asm__
     (
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "lxvd2x		32, 0, %4	\n\t"
        "lxvd2x		33, %5, %4	\n\t"
@@ -131,7 +131,7 @@ static void dswap_kernel_32 (long n, double *x, double *y)
        "addi		%4, %4, 128	\n\t"
 
        "addic.		%2, %2, -32	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
      "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
      :
diff --git a/kernel/power/dtrmm_macros_16x4_power8.S b/kernel/power/dtrmm_macros_16x4_power8.S
index 079144a90..efb034594 100644
--- a/kernel/power/dtrmm_macros_16x4_power8.S
+++ b/kernel/power/dtrmm_macros_16x4_power8.S
@@ -37,7 +37,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=4, M=16                                               *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x16_1', `
+#else
 .macro LOAD4x16_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -60,9 +64,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 64
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_I1', `
+#else
 .macro KERNEL4x16_I1
+#endif
 
 	xvmuldp			vs32,	vs0,	vs24
 	xvmuldp			vs33,	vs1,	vs24
@@ -127,9 +139,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 64
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_1', `
+#else
 .macro KERNEL4x16_1
+#endif
 
 	xvmaddadp		vs32,	vs0,	vs24
 	xvmaddadp		vs33,	vs1,	vs24
@@ -195,9 +215,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 64
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_2', `
+#else
 .macro KERNEL4x16_2
+#endif
 
 	xvmaddadp		vs32,	vs8,	vs28
 	xvmaddadp		vs33,	vs9,	vs28
@@ -262,9 +290,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 64
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_E2', `
+#else
 .macro KERNEL4x16_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -303,9 +339,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs62,	vs14,	vs31
 	xvmaddadp		vs63,	vs15,	vs31
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_SUBI1', `
+#else
 .macro KERNEL4x16_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -364,9 +408,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs62,	vs6,	vs27
 	xvmuldp			vs63,	vs7,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_SUB1', `
+#else
 .macro KERNEL4x16_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -425,9 +477,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs62,	vs6,	vs27
 	xvmaddadp		vs63,	vs7,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x16', `
+#else
 .macro SAVE4x16
+#endif
 
 	mr		T1,	CO
 	addi		T2,	T1,	64
@@ -615,13 +675,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	128
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=4, M=8                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x8_1', `
+#else
 .macro LOAD4x8_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -638,9 +706,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 64
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_I1', `
+#else
 .macro KERNEL4x8_I1
+#endif
 
 	xvmuldp			vs32,	vs0,	vs24
 	xvmuldp			vs33,	vs1,	vs24
@@ -679,9 +755,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 64
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_1', `
+#else
 .macro KERNEL4x8_1
+#endif
 
 	xvmaddadp		vs32,	vs0,	vs24
 	xvmaddadp		vs33,	vs1,	vs24
@@ -719,9 +803,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 64
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_2', `
+#else
 .macro KERNEL4x8_2
+#endif
 
 	xvmaddadp		vs32,	vs8,	vs28
 	xvmaddadp		vs33,	vs9,	vs28
@@ -759,9 +851,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 64
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_E2', `
+#else
 .macro KERNEL4x8_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -784,9 +884,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs58,	vs10,	vs31
 	xvmaddadp		vs59,	vs11,	vs31
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_SUBI1', `
+#else
 .macro KERNEL4x8_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -822,9 +930,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs58,	vs2,	vs27
 	xvmuldp			vs59,	vs3,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_SUB1', `
+#else
 .macro KERNEL4x8_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -860,9 +976,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs58,	vs2,	vs27
 	xvmaddadp		vs59,	vs3,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x8', `
+#else
 .macro SAVE4x8
+#endif
 
 	mr		T1,	CO
 
@@ -970,13 +1094,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=4, M=4                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x4_1', `
+#else
 .macro LOAD4x4_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -989,9 +1121,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 32
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_I1', `
+#else
 .macro KERNEL4x4_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -1017,9 +1157,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs56,	vs0,	vs27
 	xvmuldp			vs57,	vs1,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_1', `
+#else
 .macro KERNEL4x4_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -1045,9 +1193,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs56,	vs0,	vs27
 	xvmaddadp		vs57,	vs1,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_2', `
+#else
 .macro KERNEL4x4_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -1073,9 +1229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs56,	vs8,	vs31
 	xvmaddadp		vs57,	vs9,	vs31
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_E2', `
+#else
 .macro KERNEL4x4_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -1090,9 +1254,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs56,	vs8,	vs31
 	xvmaddadp		vs57,	vs9,	vs31
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_SUBI1', `
+#else
 .macro KERNEL4x4_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -1118,9 +1290,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs56,	vs0,	vs27
 	xvmuldp			vs57,	vs1,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_SUB1', `
+#else
 .macro KERNEL4x4_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -1146,9 +1326,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs56,	vs0,	vs27
 	xvmaddadp		vs57,	vs1,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x4', `
+#else
 .macro SAVE4x4
+#endif
 
 	mr		T1,	CO
 
@@ -1224,13 +1412,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=4, M=2                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x2_1', `
+#else
 .macro LOAD4x2_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -1242,9 +1438,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 16
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_I1', `
+#else
 .macro KERNEL4x2_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 
@@ -1265,9 +1469,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmuldp			vs56,	vs0,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_1', `
+#else
 .macro KERNEL4x2_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 
@@ -1288,9 +1500,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs56,	vs0,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_2', `
+#else
 .macro KERNEL4x2_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -1311,9 +1531,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs56,	vs8,	vs31
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_E2', `
+#else
 .macro KERNEL4x2_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -1324,9 +1552,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs56,	vs8,	vs31
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_SUBI1', `
+#else
 .macro KERNEL4x2_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -1347,9 +1583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmuldp			vs56,	vs0,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_SUB1', `
+#else
 .macro KERNEL4x2_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -1370,9 +1614,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs56,	vs0,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x2', `
+#else
 .macro SAVE4x2
+#endif
 
 	mr		T1,	CO
 
@@ -1432,13 +1684,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=4, M=1                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x1_1', `
+#else
 .macro LOAD4x1_1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -1450,9 +1710,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 8
 	addi		BO, BO, 32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_I1', `
+#else
 .macro KERNEL4x1_I1
+#endif
 
 	lxsdx	vs8,	0,	AO
 
@@ -1473,9 +1741,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmuldp			vs56,	vs0,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_1', `
+#else
 .macro KERNEL4x1_1
+#endif
 
 	lxsdx	vs8,	0,	AO
 
@@ -1496,9 +1772,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs56,	vs0,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_2', `
+#else
 .macro KERNEL4x1_2
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -1519,9 +1803,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs56,	vs8,	vs31
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_E2', `
+#else
 .macro KERNEL4x1_E2
+#endif
 
 
 	xsmaddadp		vs32,	vs8,	vs28
@@ -1532,9 +1824,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs56,	vs8,	vs31
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_SUBI1', `
+#else
 .macro KERNEL4x1_SUBI1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -1555,9 +1855,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmuldp			vs56,	vs0,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_SUB1', `
+#else
 .macro KERNEL4x1_SUB1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -1578,9 +1886,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs56,	vs0,	vs27
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x1', `
+#else
 .macro SAVE4x1
+#endif
 
 	mr		T1,	CO
 
@@ -1640,13 +1956,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=2, M=16                                               *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x16_1', `
+#else
 .macro LOAD2x16_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -1666,9 +1990,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		AO, AO, 64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_I1', `
+#else
 .macro KERNEL2x16_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -1707,9 +2039,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs46,	vs6,	vs25
 	xvmuldp			vs47,	vs7,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_1', `
+#else
 .macro KERNEL2x16_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -1748,9 +2088,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs46,	vs6,	vs25
 	xvmaddadp		vs47,	vs7,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_2', `
+#else
 .macro KERNEL2x16_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -1789,9 +2137,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs46,	vs14,	vs29
 	xvmaddadp		vs47,	vs15,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_E2', `
+#else
 .macro KERNEL2x16_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -1812,9 +2168,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs46,	vs14,	vs29
 	xvmaddadp		vs47,	vs15,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_SUBI1', `
+#else
 .macro KERNEL2x16_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -1853,9 +2217,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs46,	vs6,	vs25
 	xvmuldp			vs47,	vs7,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_SUB1', `
+#else
 .macro KERNEL2x16_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -1894,9 +2266,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs46,	vs6,	vs25
 	xvmaddadp		vs47,	vs7,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x16', `
+#else
 .macro SAVE2x16
+#endif
 
 	mr		T1,	CO
 	addi		T2,	T1,	64
@@ -1990,13 +2370,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	128
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=4, M=8                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x8_1', `
+#else
 .macro LOAD2x8_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2009,9 +2397,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 64
 	addi		BO, BO, 16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_I1', `
+#else
 .macro KERNEL2x8_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -2035,9 +2431,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs42,	vs2,	vs25
 	xvmuldp			vs43,	vs3,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_1', `
+#else
 .macro KERNEL2x8_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -2061,9 +2465,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs42,	vs2,	vs25
 	xvmaddadp		vs43,	vs3,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_2', `
+#else
 .macro KERNEL2x8_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2087,9 +2499,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs42,	vs10,	vs29
 	xvmaddadp		vs43,	vs11,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_E2', `
+#else
 .macro KERNEL2x8_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -2102,9 +2522,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs42,	vs10,	vs29
 	xvmaddadp		vs43,	vs11,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_SUBI1', `
+#else
 .macro KERNEL2x8_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2128,9 +2556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs42,	vs2,	vs25
 	xvmuldp			vs43,	vs3,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_SUB1', `
+#else
 .macro KERNEL2x8_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2154,9 +2590,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs42,	vs2,	vs25
 	xvmaddadp		vs43,	vs3,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x8', `
+#else
 .macro SAVE2x8
+#endif
 
 	mr		T1,	CO
 
@@ -2212,13 +2656,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=2, M=4                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x4_1', `
+#else
 .macro LOAD2x4_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2229,9 +2681,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 32
 	addi		BO, BO, 16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_I1', `
+#else
 .macro KERNEL2x4_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -2249,9 +2709,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs40,	vs0,	vs25
 	xvmuldp			vs41,	vs1,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_1', `
+#else
 .macro KERNEL2x4_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -2269,9 +2737,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs40,	vs0,	vs25
 	xvmaddadp		vs41,	vs1,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_2', `
+#else
 .macro KERNEL2x4_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2289,9 +2765,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs40,	vs8,	vs29
 	xvmaddadp		vs41,	vs9,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_E2', `
+#else
 .macro KERNEL2x4_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -2300,9 +2784,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs40,	vs8,	vs29
 	xvmaddadp		vs41,	vs9,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_SUBI1', `
+#else
 .macro KERNEL2x4_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2320,9 +2812,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs40,	vs0,	vs25
 	xvmuldp			vs41,	vs1,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_SUB1', `
+#else
 .macro KERNEL2x4_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2340,9 +2840,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs40,	vs0,	vs25
 	xvmaddadp		vs41,	vs1,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x4', `
+#else
 .macro SAVE2x4
+#endif
 
 	mr		T1,	CO
 
@@ -2382,13 +2890,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=2, M=2                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x2_1', `
+#else
 .macro LOAD2x2_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -2398,9 +2914,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 16
 	addi		BO, BO, 16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_I1', `
+#else
 .macro KERNEL2x2_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 
@@ -2415,9 +2939,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmuldp			vs40,	vs0,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_1', `
+#else
 .macro KERNEL2x2_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 
@@ -2432,9 +2964,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs40,	vs0,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_2', `
+#else
 .macro KERNEL2x2_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -2449,18 +2989,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs40,	vs8,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_E2', `
+#else
 .macro KERNEL2x2_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
 
 	xvmaddadp		vs40,	vs8,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_SUBI1', `
+#else
 .macro KERNEL2x2_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -2475,9 +3031,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmuldp			vs40,	vs0,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_SUB1', `
+#else
 .macro KERNEL2x2_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -2492,9 +3056,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs40,	vs0,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x2', `
+#else
 .macro SAVE2x2
+#endif
 
 	mr		T1,	CO
 
@@ -2526,13 +3098,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=2, M=1                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x1_1', `
+#else
 .macro LOAD2x1_1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -2542,9 +3122,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 8
 	addi		BO, BO, 16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_I1', `
+#else
 .macro KERNEL2x1_I1
+#endif
 
 	lxsdx	vs8,	0,	AO
 
@@ -2559,9 +3147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmuldp			vs40,	vs0,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_1', `
+#else
 .macro KERNEL2x1_1
+#endif
 
 	lxsdx	vs8,	0,	AO
 
@@ -2576,9 +3172,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs40,	vs0,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_2', `
+#else
 .macro KERNEL2x1_2
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -2593,18 +3197,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs40,	vs8,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_E2', `
+#else
 .macro KERNEL2x1_E2
+#endif
 
 
 	xsmaddadp		vs32,	vs8,	vs28
 
 	xsmaddadp		vs40,	vs8,	vs29
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_SUBI1', `
+#else
 .macro KERNEL2x1_SUBI1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -2619,9 +3239,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmuldp			vs40,	vs0,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_SUB1', `
+#else
 .macro KERNEL2x1_SUB1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -2636,9 +3264,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs40,	vs0,	vs25
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x1', `
+#else
 .macro SAVE2x1
+#endif
 
 	mr		T1,	CO
 
@@ -2670,13 +3306,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=1, M=16                                               *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x16_1', `
+#else
 .macro LOAD1x16_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2695,9 +3339,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		AO, AO, 64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_I1', `
+#else
 .macro KERNEL1x16_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -2726,9 +3378,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs38,	vs6,	vs24
 	xvmuldp			vs39,	vs7,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_1', `
+#else
 .macro KERNEL1x16_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -2757,9 +3417,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs38,	vs6,	vs24
 	xvmaddadp		vs39,	vs7,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_2', `
+#else
 .macro KERNEL1x16_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2788,9 +3456,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs38,	vs14,	vs28
 	xvmaddadp		vs39,	vs15,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_E2', `
+#else
 .macro KERNEL1x16_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -2802,9 +3478,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs38,	vs14,	vs28
 	xvmaddadp		vs39,	vs15,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_SUBI1', `
+#else
 .macro KERNEL1x16_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2833,9 +3517,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs38,	vs6,	vs24
 	xvmuldp			vs39,	vs7,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_SUB1', `
+#else
 .macro KERNEL1x16_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2864,9 +3556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs38,	vs6,	vs24
 	xvmaddadp		vs39,	vs7,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x16', `
+#else
 .macro SAVE1x16
+#endif
 
 	mr		T1,	CO
 	addi		T2,	T1,	64
@@ -2915,13 +3615,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	128
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=4, M=8                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x8_1', `
+#else
 .macro LOAD1x8_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2933,9 +3641,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 64
 	addi		BO, BO, 8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_I1', `
+#else
 .macro KERNEL1x8_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -2953,9 +3669,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs34,	vs2,	vs24
 	xvmuldp			vs35,	vs3,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_1', `
+#else
 .macro KERNEL1x8_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -2973,9 +3697,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs34,	vs2,	vs24
 	xvmaddadp		vs35,	vs3,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_2', `
+#else
 .macro KERNEL1x8_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -2993,9 +3725,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs34,	vs10,	vs28
 	xvmaddadp		vs35,	vs11,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_E2', `
+#else
 .macro KERNEL1x8_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
@@ -3003,9 +3743,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs34,	vs10,	vs28
 	xvmaddadp		vs35,	vs11,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_SUBI1', `
+#else
 .macro KERNEL1x8_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -3023,9 +3771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs34,	vs2,	vs24
 	xvmuldp			vs35,	vs3,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_SUB1', `
+#else
 .macro KERNEL1x8_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -3043,9 +3799,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs34,	vs2,	vs24
 	xvmaddadp		vs35,	vs3,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x8', `
+#else
 .macro SAVE1x8
+#endif
 
 	mr		T1,	CO
 
@@ -3075,13 +3839,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=1, M=4                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x4_1', `
+#else
 .macro LOAD1x4_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -3091,9 +3863,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 32
 	addi		BO, BO, 8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_I1', `
+#else
 .macro KERNEL1x4_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -3107,9 +3887,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs32,	vs0,	vs24
 	xvmuldp			vs33,	vs1,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_1', `
+#else
 .macro KERNEL1x4_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 	lxvd2x	vs9,	o16,	AO
@@ -3123,9 +3911,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs32,	vs0,	vs24
 	xvmaddadp		vs33,	vs1,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_2', `
+#else
 .macro KERNEL1x4_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -3139,17 +3935,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs32,	vs8,	vs28
 	xvmaddadp		vs33,	vs9,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_E2', `
+#else
 .macro KERNEL1x4_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
 	xvmaddadp		vs33,	vs9,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_SUBI1', `
+#else
 .macro KERNEL1x4_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -3163,9 +3975,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp			vs32,	vs0,	vs24
 	xvmuldp			vs33,	vs1,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_SUB1', `
+#else
 .macro KERNEL1x4_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 	lxvd2x	vs1,	o16,	AO
@@ -3179,9 +3999,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp		vs32,	vs0,	vs24
 	xvmaddadp		vs33,	vs1,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x4', `
+#else
 .macro SAVE1x4
+#endif
 
 	mr		T1,	CO
 
@@ -3203,13 +4031,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=1, M=2                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x2_1', `
+#else
 .macro LOAD1x2_1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -3218,9 +4054,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 16
 	addi		BO, BO, 8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_I1', `
+#else
 .macro KERNEL1x2_I1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 
@@ -3232,9 +4076,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmuldp			vs32,	vs0,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_1', `
+#else
 .macro KERNEL1x2_1
+#endif
 
 	lxvd2x	vs8,	0,	AO
 
@@ -3246,9 +4098,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs32,	vs0,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_2', `
+#else
 .macro KERNEL1x2_2
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -3260,16 +4120,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs32,	vs8,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_E2', `
+#else
 .macro KERNEL1x2_E2
+#endif
 
 
 	xvmaddadp		vs32,	vs8,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_SUBI1', `
+#else
 .macro KERNEL1x2_SUBI1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -3281,9 +4157,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmuldp			vs32,	vs0,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_SUB1', `
+#else
 .macro KERNEL1x2_SUB1
+#endif
 
 	lxvd2x	vs0,	0,	AO
 
@@ -3295,9 +4179,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xvmaddadp		vs32,	vs0,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x2', `
+#else
 .macro SAVE1x2
+#endif
 
 	mr		T1,	CO
 
@@ -3315,13 +4207,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /*********************************************************************
 * Macros for N=1, M=1                                                *
 *********************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x1_1', `
+#else
 .macro LOAD1x1_1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -3330,9 +4230,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO, AO, 8
 	addi		BO, BO, 8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_I1', `
+#else
 .macro KERNEL1x1_I1
+#endif
 
 	lxsdx	vs8,	0,	AO
 
@@ -3344,9 +4252,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmuldp			vs32,	vs0,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_1', `
+#else
 .macro KERNEL1x1_1
+#endif
 
 	lxsdx	vs8,	0,	AO
 
@@ -3358,9 +4274,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs32,	vs0,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_2', `
+#else
 .macro KERNEL1x1_2
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -3372,16 +4296,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs32,	vs8,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_E2', `
+#else
 .macro KERNEL1x1_E2
+#endif
 
 
 	xsmaddadp		vs32,	vs8,	vs28
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_SUBI1', `
+#else
 .macro KERNEL1x1_SUBI1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -3393,9 +4333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmuldp			vs32,	vs0,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_SUB1', `
+#else
 .macro KERNEL1x1_SUB1
+#endif
 
 	lxsdx	vs0,	0,	AO
 
@@ -3407,9 +4355,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xsmaddadp		vs32,	vs0,	vs24
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x1', `
+#else
 .macro SAVE1x1
+#endif
 
 	mr		T1,	CO
 
@@ -3427,5 +4383,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
diff --git a/kernel/power/dtrsm_macros_LT_16x4_power8.S b/kernel/power/dtrsm_macros_LT_16x4_power8.S
index dc47daa3a..5a5c4037c 100644
--- a/kernel/power/dtrsm_macros_LT_16x4_power8.S
+++ b/kernel/power/dtrsm_macros_LT_16x4_power8.S
@@ -1,46 +1,58 @@
 
+#if defined(_AIX)
+define(`INIT_16x4', `
+#else
 .macro INIT_16x4
+#endif
 
 
 	xxlxor		vs0,	vs0,	vs0
 
-	xvmovdp		vs32,	vs0
-	xvmovdp		vs33,	vs0
-	xvmovdp		vs34,	vs0
-	xvmovdp		vs35,	vs0
-	xvmovdp		vs36,	vs0
-	xvmovdp		vs37,	vs0
-	xvmovdp		vs38,	vs0
-	xvmovdp		vs39,	vs0
-	xvmovdp		vs40,	vs0
-	xvmovdp		vs41,	vs0
-	xvmovdp		vs42,	vs0
-	xvmovdp		vs43,	vs0
-	xvmovdp		vs44,	vs0
-	xvmovdp		vs45,	vs0
-	xvmovdp		vs46,	vs0
-	xvmovdp		vs47,	vs0
-	xvmovdp		vs48,	vs0
-	xvmovdp		vs49,	vs0
-	xvmovdp		vs50,	vs0
-	xvmovdp		vs51,	vs0
-	xvmovdp		vs52,	vs0
-	xvmovdp		vs53,	vs0
-	xvmovdp		vs54,	vs0
-	xvmovdp		vs55,	vs0
-	xvmovdp		vs56,	vs0
-	xvmovdp		vs57,	vs0
-	xvmovdp		vs58,	vs0
-	xvmovdp		vs59,	vs0
-	xvmovdp		vs60,	vs0
-	xvmovdp		vs61,	vs0
-	xvmovdp		vs62,	vs0
-	xvmovdp		vs63,	vs0
-
+	XVMOVDP(vs32,vs0)
+	XVMOVDP(vs33,vs0)
+	XVMOVDP(vs34,vs0)
+	XVMOVDP(vs35,vs0)
+	XVMOVDP(vs36,vs0)
+	XVMOVDP(vs37,vs0)
+	XVMOVDP(vs38,vs0)
+	XVMOVDP(vs39,vs0)
+	XVMOVDP(vs40,vs0)
+	XVMOVDP(vs41,vs0)
+	XVMOVDP(vs42,vs0)
+	XVMOVDP(vs43,vs0)
+	XVMOVDP(vs44,vs0)
+	XVMOVDP(vs45,vs0)
+	XVMOVDP(vs46,vs0)
+	XVMOVDP(vs47,vs0)
+	XVMOVDP(vs48,vs0)
+	XVMOVDP(vs49,vs0)
+	XVMOVDP(vs50,vs0)
+	XVMOVDP(vs51,vs0)
+	XVMOVDP(vs52,vs0)
+	XVMOVDP(vs53,vs0)
+	XVMOVDP(vs54,vs0)
+	XVMOVDP(vs55,vs0)
+	XVMOVDP(vs56,vs0)
+	XVMOVDP(vs57,vs0)
+	XVMOVDP(vs58,vs0)
+	XVMOVDP(vs59,vs0)
+	XVMOVDP(vs60,vs0)
+	XVMOVDP(vs61,vs0)
+	XVMOVDP(vs62,vs0)
+	XVMOVDP(vs63,vs0)
+
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`KERNEL_16x4', `
+#else
 .macro KERNEL_16x4
+#endif
 
 
 	lxvd2x		vs0,	o0,	AO
@@ -98,35 +110,51 @@
 	xvmaddadp	vs63,	vs7,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`INIT_8x4', `
+#else
 .macro INIT_8x4
+#endif
 
 
 	xxlxor		vs0,	vs0,	vs0
 
-	xvmovdp		vs32,	vs0
-	xvmovdp		vs33,	vs0
-	xvmovdp		vs34,	vs0
-	xvmovdp		vs35,	vs0
-	xvmovdp		vs36,	vs0
-	xvmovdp		vs37,	vs0
-	xvmovdp		vs38,	vs0
-	xvmovdp		vs39,	vs0
-	xvmovdp		vs40,	vs0
-	xvmovdp		vs41,	vs0
-	xvmovdp		vs42,	vs0
-	xvmovdp		vs43,	vs0
-	xvmovdp		vs44,	vs0
-	xvmovdp		vs45,	vs0
-	xvmovdp		vs46,	vs0
-	xvmovdp		vs47,	vs0
-
+	XVMOVDP(vs32,vs0)
+	XVMOVDP(vs33,vs0)
+	XVMOVDP(vs34,vs0)
+	XVMOVDP(vs35,vs0)
+	XVMOVDP(vs36,vs0)
+	XVMOVDP(vs37,vs0)
+	XVMOVDP(vs38,vs0)
+	XVMOVDP(vs39,vs0)
+	XVMOVDP(vs40,vs0)
+	XVMOVDP(vs41,vs0)
+	XVMOVDP(vs42,vs0)
+	XVMOVDP(vs43,vs0)
+	XVMOVDP(vs44,vs0)
+	XVMOVDP(vs45,vs0)
+	XVMOVDP(vs46,vs0)
+	XVMOVDP(vs47,vs0)
+
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`KERNEL_8x4', `
+#else
 .macro KERNEL_8x4
+#endif
 
 
 	lxvd2x		vs0,	o0,	AO
@@ -161,27 +189,43 @@
 	xvmaddadp	vs47,	vs3,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`INIT_4x4', `
+#else
 .macro INIT_4x4
+#endif
 
 
 	xxlxor		vs0,	vs0,	vs0
 
-	xvmovdp		vs32,	vs0
-	xvmovdp		vs33,	vs0
-	xvmovdp		vs34,	vs0
-	xvmovdp		vs35,	vs0
-	xvmovdp		vs36,	vs0
-	xvmovdp		vs37,	vs0
-	xvmovdp		vs38,	vs0
-	xvmovdp		vs39,	vs0
-
+	XVMOVDP(vs32,vs0)
+	XVMOVDP(vs33,vs0)
+	XVMOVDP(vs34,vs0)
+	XVMOVDP(vs35,vs0)
+	XVMOVDP(vs36,vs0)
+	XVMOVDP(vs37,vs0)
+	XVMOVDP(vs38,vs0)
+	XVMOVDP(vs39,vs0)
+
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`KERNEL_4x4', `
+#else
 .macro KERNEL_4x4
+#endif
 
 
 	lxvd2x		vs0,	o0,	AO
@@ -206,23 +250,39 @@
 	xvmaddadp	vs39,	vs1,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`INIT_2x4', `
+#else
 .macro INIT_2x4
+#endif
 
 
 	xxlxor		vs0,	vs0,	vs0
 
-	xvmovdp		vs32,	vs0
-	xvmovdp		vs33,	vs0
-	xvmovdp		vs34,	vs0
-	xvmovdp		vs35,	vs0
+	XVMOVDP(vs32,vs0)
+	XVMOVDP(vs33,vs0)
+	XVMOVDP(vs34,vs0)
+	XVMOVDP(vs35,vs0)
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`KERNEL_2x4', `
+#else
 .macro KERNEL_2x4
+#endif
 
 
 	lxvd2x		vs0,	o0,	AO
@@ -242,23 +302,39 @@
 	xvmaddadp	vs35,	vs0,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`INIT_1x4', `
+#else
 .macro INIT_1x4
+#endif
 
 
 	xxlxor		vs0,	vs0,	vs0
 
-	xvmovdp		vs32,	vs0
-	xvmovdp		vs33,	vs0
-	xvmovdp		vs34,	vs0
-	xvmovdp		vs35,	vs0
+	XVMOVDP(vs32,vs0)
+	XVMOVDP(vs33,vs0)
+	XVMOVDP(vs34,vs0)
+	XVMOVDP(vs35,vs0)
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`KERNEL_1x4', `
+#else
 .macro KERNEL_1x4
+#endif
 
 
 	lxvdsx		vs0,	o0,	AO
@@ -278,14 +354,22 @@
 	xvmaddadp	vs35,	vs0,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /*##########################################################################################
 			 SOLVE_LT 16x4
 ##########################################################################################*/
 
+#if defined(_AIX)
+define(`SOLVE_LT_16x4', `
+#else
 .macro SOLVE_LT_16x4
+#endif
 
 //############### LOAD B #######################
 
@@ -1149,46 +1233,46 @@
 
 
 	stxsdx		vs32,	o0,	T1
-	xxswapd		vs32,	vs32
+	XXSWAPD(vs32,vs32)
 	stxsdx		vs34,	o8,	T1
-	xxswapd		vs34,	vs34
+	XXSWAPD(vs34,vs34)
 	stxsdx		vs36,	o16,	T1
-	xxswapd		vs36,	vs36
+	XXSWAPD(vs36,vs36)
 	stxsdx		vs38,	o24,	T1
-	xxswapd		vs38,	vs38
+	XXSWAPD(vs38,vs38)
 
 	addi		T1,	T1,	32
 
 	stxsdx		vs40,	o0,	T1
-	xxswapd		vs40,	vs40
+	XXSWAPD(vs40,vs40)
 	stxsdx		vs42,	o8,	T1
-	xxswapd		vs42,	vs42
+	XXSWAPD(vs42,vs42)
 	stxsdx		vs44,	o16,	T1
-	xxswapd		vs44,	vs44
+	XXSWAPD(vs44,vs44)
 	stxsdx		vs46,	o24,	T1
-	xxswapd		vs46,	vs46
+	XXSWAPD(vs46,vs46)
 
 	addi		T1,	T1,	32
 
 	stxsdx		vs48,	o0,	T1
-	xxswapd		vs48,	vs48
+	XXSWAPD(vs48,vs48)
 	stxsdx		vs50,	o8,	T1
-	xxswapd		vs50,	vs50
+	XXSWAPD(vs50,vs50)
 	stxsdx		vs52,	o16,	T1
-	xxswapd		vs52,	vs52
+	XXSWAPD(vs52,vs52)
 	stxsdx		vs54,	o24,	T1
-	xxswapd		vs54,	vs54
+	XXSWAPD(vs54,vs54)
 
 	addi		T1,	T1,	32
 
 	stxsdx		vs56,	o0,	T1
-	xxswapd		vs56,	vs56
+	XXSWAPD(vs56,vs56)
 	stxsdx		vs58,	o8,	T1
-	xxswapd		vs58,	vs58
+	XXSWAPD(vs58,vs58)
 	stxsdx		vs60,	o16,	T1
-	xxswapd		vs60,	vs60
+	XXSWAPD(vs60,vs60)
 	stxsdx		vs62,	o24,	T1
-	xxswapd		vs62,	vs62
+	XXSWAPD(vs62,vs62)
 
 	stxsdx		vs32,	o0,	T2
 	stxsdx		vs34,	o8,	T2
@@ -1225,46 +1309,46 @@
 
 
 	stxsdx		vs33,	o0,	T1
-	xxswapd		vs33,	vs33
+	XXSWAPD(vs33,vs33)
 	stxsdx		vs35,	o8,	T1
-	xxswapd		vs35,	vs35
+	XXSWAPD(vs35,vs35)
 	stxsdx		vs37,	o16,	T1
-	xxswapd		vs37,	vs37
+	XXSWAPD(vs37,vs37)
 	stxsdx		vs39,	o24,	T1
-	xxswapd		vs39,	vs39
+	XXSWAPD(vs39,vs39)
 
 	addi		T1,	T1,	32
 
 	stxsdx		vs41,	o0,	T1
-	xxswapd		vs41,	vs41
+	XXSWAPD(vs41,vs41)
 	stxsdx		vs43,	o8,	T1
-	xxswapd		vs43,	vs43
+	XXSWAPD(vs43,vs43)
 	stxsdx		vs45,	o16,	T1
-	xxswapd		vs45,	vs45
+	XXSWAPD(vs45,vs45)
 	stxsdx		vs47,	o24,	T1
-	xxswapd		vs47,	vs47
+	XXSWAPD(vs47,vs47)
 
 	addi		T1,	T1,	32
 
 	stxsdx		vs49,	o0,	T1
-	xxswapd		vs49,	vs49
+	XXSWAPD(vs49,vs49)
 	stxsdx		vs51,	o8,	T1
-	xxswapd		vs51,	vs51
+	XXSWAPD(vs51,vs51)
 	stxsdx		vs53,	o16,	T1
-	xxswapd		vs53,	vs53
+	XXSWAPD(vs53,vs53)
 	stxsdx		vs55,	o24,	T1
-	xxswapd		vs55,	vs55
+	XXSWAPD(vs55,vs55)
 
 	addi		T1,	T1,	32
 
 	stxsdx		vs57,	o0,	T1
-	xxswapd		vs57,	vs57
+	XXSWAPD(vs57,vs57)
 	stxsdx		vs59,	o8,	T1
-	xxswapd		vs59,	vs59
+	XXSWAPD(vs59,vs59)
 	stxsdx		vs61,	o16,	T1
-	xxswapd		vs61,	vs61
+	XXSWAPD(vs61,vs61)
 	stxsdx		vs63,	o24,	T1
-	xxswapd		vs63,	vs63
+	XXSWAPD(vs63,vs63)
 
 	stxsdx		vs33,	o0,	T2
 	stxsdx		vs35,	o8,	T2
@@ -1292,14 +1376,22 @@
 	stxsdx		vs61,	o16,	T2
 	stxsdx		vs63,	o24,	T2
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /*##########################################################################################
 			 SOLVE_LT 8x4
 ##########################################################################################*/
 
+#if defined(_AIX)
+define(`SOLVE_LT_8x4', `
+#else
 .macro SOLVE_LT_8x4
+#endif
 
 	xxpermdi	vs0,	vs32,	vs33,	0
 	xxpermdi	vs1,	vs34,	vs35,	0
@@ -1603,24 +1695,24 @@
 
 
 	stxsdx		vs32,	o0,	T1
-	xxswapd		vs32,	vs32
+	XXSWAPD(vs32,vs32)
 	stxsdx		vs34,	o8,	T1
-	xxswapd		vs34,	vs34
+	XXSWAPD(vs34,vs34)
 	stxsdx		vs36,	o16,	T1
-	xxswapd		vs36,	vs36
+	XXSWAPD(vs36,vs36)
 	stxsdx		vs38,	o24,	T1
-	xxswapd		vs38,	vs38
+	XXSWAPD(vs38,vs38)
 
 	addi		T1,	T1,	32
 
 	stxsdx		vs40,	o0,	T1
-	xxswapd		vs40,	vs40
+	XXSWAPD(vs40,vs40)
 	stxsdx		vs42,	o8,	T1
-	xxswapd		vs42,	vs42
+	XXSWAPD(vs42,vs42)
 	stxsdx		vs44,	o16,	T1
-	xxswapd		vs44,	vs44
+	XXSWAPD(vs44,vs44)
 	stxsdx		vs46,	o24,	T1
-	xxswapd		vs46,	vs46
+	XXSWAPD(vs46,vs46)
 
 	stxsdx		vs32,	o0,	T2
 	stxsdx		vs34,	o8,	T2
@@ -1643,24 +1735,24 @@
 
 
 	stxsdx		vs33,	o0,	T1
-	xxswapd		vs33,	vs33
+	XXSWAPD(vs33,vs33)
 	stxsdx		vs35,	o8,	T1
-	xxswapd		vs35,	vs35
+	XXSWAPD(vs35,vs35)
 	stxsdx		vs37,	o16,	T1
-	xxswapd		vs37,	vs37
+	XXSWAPD(vs37,vs37)
 	stxsdx		vs39,	o24,	T1
-	xxswapd		vs39,	vs39
+	XXSWAPD(vs39,vs39)
 
 	addi		T1,	T1,	32
 
 	stxsdx		vs41,	o0,	T1
-	xxswapd		vs41,	vs41
+	XXSWAPD(vs41,vs41)
 	stxsdx		vs43,	o8,	T1
-	xxswapd		vs43,	vs43
+	XXSWAPD(vs43,vs43)
 	stxsdx		vs45,	o16,	T1
-	xxswapd		vs45,	vs45
+	XXSWAPD(vs45,vs45)
 	stxsdx		vs47,	o24,	T1
-	xxswapd		vs47,	vs47
+	XXSWAPD(vs47,vs47)
 
 	stxsdx		vs33,	o0,	T2
 	stxsdx		vs35,	o8,	T2
@@ -1674,14 +1766,22 @@
 	stxsdx		vs45,	o16,	T2
 	stxsdx		vs47,	o24,	T2
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /*##########################################################################################
 			 SOLVE_LT 4x4
 ##########################################################################################*/
 
+#if defined(_AIX)
+define(`SOLVE_LT_4x4', `
+#else
 .macro SOLVE_LT_4x4
+#endif
 
 	xxpermdi	vs0,	vs32,	vs33,	0
 	xxpermdi	vs1,	vs34,	vs35,	0
@@ -1813,13 +1913,13 @@
 
 
 	stxsdx		vs32,	o0,	T1
-	xxswapd		vs32,	vs32
+	XXSWAPD(vs32,vs32)
 	stxsdx		vs34,	o8,	T1
-	xxswapd		vs34,	vs34
+	XXSWAPD(vs34,vs34)
 	stxsdx		vs36,	o16,	T1
-	xxswapd		vs36,	vs36
+	XXSWAPD(vs36,vs36)
 	stxsdx		vs38,	o24,	T1
-	xxswapd		vs38,	vs38
+	XXSWAPD(vs38,vs38)
 
 	stxsdx		vs32,	o0,	T2
 	stxsdx		vs34,	o8,	T2
@@ -1835,27 +1935,35 @@
 
 
 	stxsdx		vs33,	o0,	T1
-	xxswapd		vs33,	vs33
+	XXSWAPD(vs33,vs33)
 	stxsdx		vs35,	o8,	T1
-	xxswapd		vs35,	vs35
+	XXSWAPD(vs35,vs35)
 	stxsdx		vs37,	o16,	T1
-	xxswapd		vs37,	vs37
+	XXSWAPD(vs37,vs37)
 	stxsdx		vs39,	o24,	T1
-	xxswapd		vs39,	vs39
+	XXSWAPD(vs39,vs39)
 
 	stxsdx		vs33,	o0,	T2
 	stxsdx		vs35,	o8,	T2
 	stxsdx		vs37,	o16,	T2
 	stxsdx		vs39,	o24,	T2
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /*##########################################################################################
 			 SOLVE_LT 2x4
 ##########################################################################################*/
 
+#if defined(_AIX)
+define(`SOLVE_LT_2x4', `
+#else
 .macro SOLVE_LT_2x4
+#endif
 
 	xxpermdi	vs0,	vs32,	vs33,	0
 	xxpermdi	vs1,	vs34,	vs35,	0
@@ -1925,9 +2033,9 @@
 
 
 	stxsdx		vs32,	o0,	T1
-	xxswapd		vs32,	vs32
+	XXSWAPD(vs32,vs32)
 	stxsdx		vs34,	o8,	T1
-	xxswapd		vs34,	vs34
+	XXSWAPD(vs34,vs34)
 
 	stxsdx		vs32,	o0,	T2
 	stxsdx		vs34,	o8,	T2
@@ -1941,21 +2049,29 @@
 
 
 	stxsdx		vs33,	o0,	T1
-	xxswapd		vs33,	vs33
+	XXSWAPD(vs33,vs33)
 	stxsdx		vs35,	o8,	T1
-	xxswapd		vs35,	vs35
+	XXSWAPD(vs35,vs35)
 
 	stxsdx		vs33,	o0,	T2
 	stxsdx		vs35,	o8,	T2
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /*##########################################################################################
 			 SOLVE_LT 1x4
 ##########################################################################################*/
 
+#if defined(_AIX)
+define(`SOLVE_LT_1x4', `
+#else
 .macro SOLVE_LT_1x4
+#endif
 
 	xxpermdi	vs0,	vs32,	vs33,	0
 	xxpermdi	vs1,	vs34,	vs35,	0
@@ -2001,7 +2117,7 @@
 
 
 	stxsdx		vs32,	o0,	T1
-	xxswapd		vs32,	vs32
+	XXSWAPD(vs32,vs32)
 
 	stxsdx		vs32,	o0,	T2
 
@@ -2014,39 +2130,55 @@
 
 
 	stxsdx		vs33,	o0,	T1
-	xxswapd		vs33,	vs33
+	XXSWAPD(vs33,vs33)
 
 	stxsdx		vs33,	o0,	T2
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`INIT_16x2', `
+#else
 .macro INIT_16x2
+#endif
 
 
 	xxlxor		vs0,	vs0,	vs0
 
-	xvmovdp		vs32,	vs0
-	xvmovdp		vs33,	vs0
-	xvmovdp		vs34,	vs0
-	xvmovdp		vs35,	vs0
-	xvmovdp		vs36,	vs0
-	xvmovdp		vs37,	vs0
-	xvmovdp		vs38,	vs0
-	xvmovdp		vs39,	vs0
-	xvmovdp		vs40,	vs0
-	xvmovdp		vs41,	vs0
-	xvmovdp		vs42,	vs0
-	xvmovdp		vs43,	vs0
-	xvmovdp		vs44,	vs0
-	xvmovdp		vs45,	vs0
-	xvmovdp		vs46,	vs0
-	xvmovdp		vs47,	vs0
-
+	XVMOVDP(vs32,vs0)
+	XVMOVDP(vs33,vs0)
+	XVMOVDP(vs34,vs0)
+	XVMOVDP(vs35,vs0)
+	XVMOVDP(vs36,vs0)
+	XVMOVDP(vs37,vs0)
+	XVMOVDP(vs38,vs0)
+	XVMOVDP(vs39,vs0)
+	XVMOVDP(vs40,vs0)
+	XVMOVDP(vs41,vs0)
+	XVMOVDP(vs42,vs0)
+	XVMOVDP(vs43,vs0)
+	XVMOVDP(vs44,vs0)
+	XVMOVDP(vs45,vs0)
+	XVMOVDP(vs46,vs0)
+	XVMOVDP(vs47,vs0)
+
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`KERNEL_16x2', `
+#else
 .macro KERNEL_16x2
+#endif
 
 
 	lxvd2x		vs0,	o0,	AO
@@ -2086,27 +2218,43 @@
 	xvmaddadp	vs47,	vs7,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`INIT_8x2', `
+#else
 .macro INIT_8x2
+#endif
 
 
 	xxlxor		vs0,	vs0,	vs0
 
-	xvmovdp		vs32,	vs0
-	xvmovdp		vs33,	vs0
-	xvmovdp		vs34,	vs0
-	xvmovdp		vs35,	vs0
-	xvmovdp		vs36,	vs0
-	xvmovdp		vs37,	vs0
-	xvmovdp		vs38,	vs0
-	xvmovdp		vs39,	vs0
-
+	XVMOVDP(vs32,vs0)
+	XVMOVDP(vs33,vs0)
+	XVMOVDP(vs34,vs0)
+	XVMOVDP(vs35,vs0)
+	XVMOVDP(vs36,vs0)
+	XVMOVDP(vs37,vs0)
+	XVMOVDP(vs38,vs0)
+	XVMOVDP(vs39,vs0)
+
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`KERNEL_8x2', `
+#else
 .macro KERNEL_8x2
+#endif
 
 
 	lxvd2x		vs0,	o0,	AO
@@ -2131,23 +2279,39 @@
 	xvmaddadp	vs39,	vs3,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`INIT_4x2', `
+#else
 .macro INIT_4x2
+#endif
 
 
 	xxlxor		vs0,	vs0,	vs0
 
-	xvmovdp		vs32,	vs0
-	xvmovdp		vs33,	vs0
-	xvmovdp		vs34,	vs0
-	xvmovdp		vs35,	vs0
+	XVMOVDP(vs32,vs0)
+	XVMOVDP(vs33,vs0)
+	XVMOVDP(vs34,vs0)
+	XVMOVDP(vs35,vs0)
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`KERNEL_4x2', `
+#else
 .macro KERNEL_4x2
+#endif
 
 
 	lxvd2x		vs0,	o0,	AO
@@ -2166,21 +2330,37 @@
 	xvmaddadp	vs35,	vs1,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`INIT_2x2', `
+#else
 .macro INIT_2x2
+#endif
 
 
 	xxlxor		vs0,	vs0,	vs0
 
-	xvmovdp		vs32,	vs0
-	xvmovdp		vs33,	vs0
+	XVMOVDP(vs32,vs0)
+	XVMOVDP(vs33,vs0)
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`KERNEL_2x2', `
+#else
 .macro KERNEL_2x2
+#endif
 
 
 	lxvd2x		vs0,	o0,	AO
@@ -2196,21 +2376,37 @@
 	xvmaddadp	vs33,	vs0,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`INIT_1x2', `
+#else
 .macro INIT_1x2
+#endif
 
 
 	xxlxor		vs0,	vs0,	vs0
 
-	xvmovdp		vs32,	vs0
-	xvmovdp		vs33,	vs0
+	XVMOVDP(vs32,vs0)
+	XVMOVDP(vs33,vs0)
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`KERNEL_1x2', `
+#else
 .macro KERNEL_1x2
+#endif
 
 
 	lxvdsx		vs0,	o0,	AO
@@ -2226,14 +2422,22 @@
 	xvmaddadp	vs33,	vs0,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /*##########################################################################################
 			 SOLVE_LT 16x2
 ##########################################################################################*/
 
+#if defined(_AIX)
+define(`SOLVE_LT_16x2', `
+#else
 .macro SOLVE_LT_16x2
+#endif
 
 	xxpermdi	vs0,	vs32,	vs33,	0
 	xxpermdi	vs1,	vs32,	vs33,	3
@@ -2821,46 +3025,46 @@
 
 
 	stxsdx		vs32,	o0,	T1
-	xxswapd		vs32,	vs32
+	XXSWAPD(vs32,vs32)
 	stxsdx		vs33,	o8,	T1
-	xxswapd		vs33,	vs33
+	XXSWAPD(vs33,vs33)
 	stxsdx		vs34,	o16,	T1
-	xxswapd		vs34,	vs34
+	XXSWAPD(vs34,vs34)
 	stxsdx		vs35,	o24,	T1
-	xxswapd		vs35,	vs35
+	XXSWAPD(vs35,vs35)
 
 	addi		T1,	T1,	32
 
 	stxsdx		vs36,	o0,	T1
-	xxswapd		vs36,	vs36
+	XXSWAPD(vs36,vs36)
 	stxsdx		vs37,	o8,	T1
-	xxswapd		vs37,	vs37
+	XXSWAPD(vs37,vs37)
 	stxsdx		vs38,	o16,	T1
-	xxswapd		vs38,	vs38
+	XXSWAPD(vs38,vs38)
 	stxsdx		vs39,	o24,	T1
-	xxswapd		vs39,	vs39
+	XXSWAPD(vs39,vs39)
 
 	addi		T1,	T1,	32
 
 	stxsdx		vs40,	o0,	T1
-	xxswapd		vs40,	vs40
+	XXSWAPD(vs40,vs40)
 	stxsdx		vs41,	o8,	T1
-	xxswapd		vs41,	vs41
+	XXSWAPD(vs41,vs41)
 	stxsdx		vs42,	o16,	T1
-	xxswapd		vs42,	vs42
+	XXSWAPD(vs42,vs42)
 	stxsdx		vs43,	o24,	T1
-	xxswapd		vs43,	vs43
+	XXSWAPD(vs43,vs43)
 
 	addi		T1,	T1,	32
 
 	stxsdx		vs44,	o0,	T1
-	xxswapd		vs44,	vs44
+	XXSWAPD(vs44,vs44)
 	stxsdx		vs45,	o8,	T1
-	xxswapd		vs45,	vs45
+	XXSWAPD(vs45,vs45)
 	stxsdx		vs46,	o16,	T1
-	xxswapd		vs46,	vs46
+	XXSWAPD(vs46,vs46)
 	stxsdx		vs47,	o24,	T1
-	xxswapd		vs47,	vs47
+	XXSWAPD(vs47,vs47)
 
 	stxsdx		vs32,	o0,	T2
 	stxsdx		vs33,	o8,	T2
@@ -2888,14 +3092,22 @@
 	stxsdx		vs46,	o16,	T2
 	stxsdx		vs47,	o24,	T2
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /*##########################################################################################
 			 SOLVE_LT 8x2
 ##########################################################################################*/
 
+#if defined(_AIX)
+define(`SOLVE_LT_8x2', `
+#else
 .macro SOLVE_LT_8x2
+#endif
 
 	xxpermdi	vs0,	vs32,	vs33,	0
 	xxpermdi	vs1,	vs32,	vs33,	3
@@ -3111,24 +3323,24 @@
 
 
 	stxsdx		vs32,	o0,	T1
-	xxswapd		vs32,	vs32
+	XXSWAPD(vs32,vs32)
 	stxsdx		vs33,	o8,	T1
-	xxswapd		vs33,	vs33
+	XXSWAPD(vs33,vs33)
 	stxsdx		vs34,	o16,	T1
-	xxswapd		vs34,	vs34
+	XXSWAPD(vs34,vs34)
 	stxsdx		vs35,	o24,	T1
-	xxswapd		vs35,	vs35
+	XXSWAPD(vs35,vs35)
 
 	addi		T1,	T1,	32
 
 	stxsdx		vs36,	o0,	T1
-	xxswapd		vs36,	vs36
+	XXSWAPD(vs36,vs36)
 	stxsdx		vs37,	o8,	T1
-	xxswapd		vs37,	vs37
+	XXSWAPD(vs37,vs37)
 	stxsdx		vs38,	o16,	T1
-	xxswapd		vs38,	vs38
+	XXSWAPD(vs38,vs38)
 	stxsdx		vs39,	o24,	T1
-	xxswapd		vs39,	vs39
+	XXSWAPD(vs39,vs39)
 
 	stxsdx		vs32,	o0,	T2
 	stxsdx		vs33,	o8,	T2
@@ -3142,14 +3354,22 @@
 	stxsdx		vs38,	o16,	T2
 	stxsdx		vs39,	o24,	T2
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /*##########################################################################################
 			 SOLVE_LT 4x2
 ##########################################################################################*/
 
+#if defined(_AIX)
+define(`SOLVE_LT_4x2', `
+#else
 .macro SOLVE_LT_4x2
+#endif
 
 	xxpermdi	vs0,	vs32,	vs33,	0
 	xxpermdi	vs1,	vs32,	vs33,	3
@@ -3245,27 +3465,35 @@
 
 
 	stxsdx		vs32,	o0,	T1
-	xxswapd		vs32,	vs32
+	XXSWAPD(vs32,vs32)
 	stxsdx		vs33,	o8,	T1
-	xxswapd		vs33,	vs33
+	XXSWAPD(vs33,vs33)
 	stxsdx		vs34,	o16,	T1
-	xxswapd		vs34,	vs34
+	XXSWAPD(vs34,vs34)
 	stxsdx		vs35,	o24,	T1
-	xxswapd		vs35,	vs35
+	XXSWAPD(vs35,vs35)
 
 	stxsdx		vs32,	o0,	T2
 	stxsdx		vs33,	o8,	T2
 	stxsdx		vs34,	o16,	T2
 	stxsdx		vs35,	o24,	T2
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /*##########################################################################################
 			 SOLVE_LT 2x2
 ##########################################################################################*/
 
+#if defined(_AIX)
+define(`SOLVE_LT_2x2', `
+#else
 .macro SOLVE_LT_2x2
+#endif
 
 	xxpermdi	vs0,	vs32,	vs33,	0
 	xxpermdi	vs1,	vs32,	vs33,	3
@@ -3322,21 +3550,29 @@
 
 
 	stxsdx		vs32,	o0,	T1
-	xxswapd		vs32,	vs32
+	XXSWAPD(vs32,vs32)
 	stxsdx		vs33,	o8,	T1
-	xxswapd		vs33,	vs33
+	XXSWAPD(vs33,vs33)
 
 	stxsdx		vs32,	o0,	T2
 	stxsdx		vs33,	o8,	T2
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /*##########################################################################################
 			 SOLVE_LT 1x2
 ##########################################################################################*/
 
+#if defined(_AIX)
+define(`SOLVE_LT_1x2', `
+#else
 .macro SOLVE_LT_1x2
+#endif
 
 	xxpermdi	vs0,	vs32,	vs33,	0
 
@@ -3376,39 +3612,55 @@
 
 
 	stxsdx		vs32,	o0,	T1
-	xxswapd		vs32,	vs32
+	XXSWAPD(vs32,vs32)
 
 	stxsdx		vs32,	o0,	T2
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`INIT_16x1', `
+#else
 .macro INIT_16x1
+#endif
 
 
 	xxlxor		vs0,	vs0,	vs0
 
-	xvmovdp		vs32,	vs0
-	xvmovdp		vs33,	vs0
-	xvmovdp		vs34,	vs0
-	xvmovdp		vs35,	vs0
-	xvmovdp		vs36,	vs0
-	xvmovdp		vs37,	vs0
-	xvmovdp		vs38,	vs0
-	xvmovdp		vs39,	vs0
-	xvmovdp		vs40,	vs0
-	xvmovdp		vs41,	vs0
-	xvmovdp		vs42,	vs0
-	xvmovdp		vs43,	vs0
-	xvmovdp		vs44,	vs0
-	xvmovdp		vs45,	vs0
-	xvmovdp		vs46,	vs0
-	xvmovdp		vs47,	vs0
-
+	XVMOVDP(vs32,vs0)
+	XVMOVDP(vs33,vs0)
+	XVMOVDP(vs34,vs0)
+	XVMOVDP(vs35,vs0)
+	XVMOVDP(vs36,vs0)
+	XVMOVDP(vs37,vs0)
+	XVMOVDP(vs38,vs0)
+	XVMOVDP(vs39,vs0)
+	XVMOVDP(vs40,vs0)
+	XVMOVDP(vs41,vs0)
+	XVMOVDP(vs42,vs0)
+	XVMOVDP(vs43,vs0)
+	XVMOVDP(vs44,vs0)
+	XVMOVDP(vs45,vs0)
+	XVMOVDP(vs46,vs0)
+	XVMOVDP(vs47,vs0)
+
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`KERNEL_16x1', `
+#else
 .macro KERNEL_16x1
+#endif
 
 
 	lxvdsx		vs0,	o0,	AO
@@ -3461,27 +3713,43 @@
 	xvmaddadp	vs47,	vs15,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`INIT_8x1', `
+#else
 .macro INIT_8x1
+#endif
 
 
 	xxlxor		vs0,	vs0,	vs0
 
-	xvmovdp		vs32,	vs0
-	xvmovdp		vs33,	vs0
-	xvmovdp		vs34,	vs0
-	xvmovdp		vs35,	vs0
-	xvmovdp		vs36,	vs0
-	xvmovdp		vs37,	vs0
-	xvmovdp		vs38,	vs0
-	xvmovdp		vs39,	vs0
-
+	XVMOVDP(vs32,vs0)
+	XVMOVDP(vs33,vs0)
+	XVMOVDP(vs34,vs0)
+	XVMOVDP(vs35,vs0)
+	XVMOVDP(vs36,vs0)
+	XVMOVDP(vs37,vs0)
+	XVMOVDP(vs38,vs0)
+	XVMOVDP(vs39,vs0)
+
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`KERNEL_8x1', `
+#else
 .macro KERNEL_8x1
+#endif
 
 
 	lxvdsx		vs0,	o0,	AO
@@ -3512,23 +3780,39 @@
 	xvmaddadp	vs39,	vs7,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`INIT_4x1', `
+#else
 .macro INIT_4x1
+#endif
 
 
 	xxlxor		vs0,	vs0,	vs0
 
-	xvmovdp		vs32,	vs0
-	xvmovdp		vs33,	vs0
-	xvmovdp		vs34,	vs0
-	xvmovdp		vs35,	vs0
+	XVMOVDP(vs32,vs0)
+	XVMOVDP(vs33,vs0)
+	XVMOVDP(vs34,vs0)
+	XVMOVDP(vs35,vs0)
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`KERNEL_4x1', `
+#else
 .macro KERNEL_4x1
+#endif
 
 
 	lxvdsx		vs0,	o0,	AO
@@ -3548,21 +3832,37 @@
 	xvmaddadp	vs35,	vs3,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`INIT_2x1', `
+#else
 .macro INIT_2x1
+#endif
 
 
 	xxlxor		vs0,	vs0,	vs0
 
-	xvmovdp		vs32,	vs0
-	xvmovdp		vs33,	vs0
+	XVMOVDP(vs32,vs0)
+	XVMOVDP(vs33,vs0)
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`KERNEL_2x1', `
+#else
 .macro KERNEL_2x1
+#endif
 
 
 	lxvdsx		vs0,	o0,	AO
@@ -3578,20 +3878,36 @@
 	xvmaddadp	vs33,	vs1,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`INIT_1x1', `
+#else
 .macro INIT_1x1
+#endif
 
 
 	xxlxor		vs0,	vs0,	vs0
 
-	xvmovdp		vs32,	vs0
+	XVMOVDP(vs32,vs0)
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`KERNEL_1x1', `
+#else
 .macro KERNEL_1x1
+#endif
 
 
 	lxvdsx		vs0,	o0,	AO
@@ -3605,31 +3921,39 @@
 	xvmaddadp	vs32,	vs0,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /*##########################################################################################
 			 SOLVE_LT 16x1
 ##########################################################################################*/
 
+#if defined(_AIX)
+define(`SOLVE_LT_16x1', `
+#else
 .macro SOLVE_LT_16x1
-
-	xxswapd		vs0,	vs32
-	xxswapd		vs1,	vs33
-	xxswapd		vs2,	vs34
-	xxswapd		vs3,	vs35
-	xxswapd		vs4,	vs36
-	xxswapd		vs5,	vs37
-	xxswapd		vs6,	vs38
-	xxswapd		vs7,	vs39
-	xxswapd		vs8,	vs40
-	xxswapd		vs9,	vs41
-	xxswapd		vs10,	vs42
-	xxswapd		vs11,	vs43
-	xxswapd		vs12,	vs44
-	xxswapd		vs13,	vs45
-	xxswapd		vs14,	vs46
-	xxswapd		vs15,	vs47
+#endif
+
+	XXSWAPD(vs0,vs32)
+	XXSWAPD(vs1,vs33)
+	XXSWAPD(vs2,vs34)
+	XXSWAPD(vs3,vs35)
+	XXSWAPD(vs4,vs36)
+	XXSWAPD(vs5,vs37)
+	XXSWAPD(vs6,vs38)
+	XXSWAPD(vs7,vs39)
+	XXSWAPD(vs8,vs40)
+	XXSWAPD(vs9,vs41)
+	XXSWAPD(vs10,vs42)
+	XXSWAPD(vs11,vs43)
+	XXSWAPD(vs12,vs44)
+	XXSWAPD(vs13,vs45)
+	XXSWAPD(vs14,vs46)
+	XXSWAPD(vs15,vs47)
 
 //############### LOAD B #######################
 
@@ -4215,23 +4539,31 @@
 	stxsdx		vs46,	o16,	T1
 	stxsdx		vs47,	o24,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /*##########################################################################################
 			 SOLVE_LT 8x1
 ##########################################################################################*/
 
+#if defined(_AIX)
+define(`SOLVE_LT_8x1', `
+#else
 .macro SOLVE_LT_8x1
+#endif
 
-	xxswapd		vs0,	vs32
-	xxswapd		vs1,	vs33
-	xxswapd		vs2,	vs34
-	xxswapd		vs3,	vs35
-	xxswapd		vs4,	vs36
-	xxswapd		vs5,	vs37
-	xxswapd		vs6,	vs38
-	xxswapd		vs7,	vs39
+	XXSWAPD(vs0,vs32)
+	XXSWAPD(vs1,vs33)
+	XXSWAPD(vs2,vs34)
+	XXSWAPD(vs3,vs35)
+	XXSWAPD(vs4,vs36)
+	XXSWAPD(vs5,vs37)
+	XXSWAPD(vs6,vs38)
+	XXSWAPD(vs7,vs39)
 
 //############### LOAD B #######################
 
@@ -4443,19 +4775,27 @@
 	stxsdx		vs38,	o16,	T1
 	stxsdx		vs39,	o24,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /*##########################################################################################
 			 SOLVE_LT 4x1
 ##########################################################################################*/
 
+#if defined(_AIX)
+define(`SOLVE_LT_4x1', `
+#else
 .macro SOLVE_LT_4x1
+#endif
 
-	xxswapd		vs0,	vs32
-	xxswapd		vs1,	vs33
-	xxswapd		vs2,	vs34
-	xxswapd		vs3,	vs35
+	XXSWAPD(vs0,vs32)
+	XXSWAPD(vs1,vs33)
+	XXSWAPD(vs2,vs34)
+	XXSWAPD(vs3,vs35)
 
 //############### LOAD B #######################
 
@@ -4546,17 +4886,25 @@
 	stxsdx		vs34,	o16,	T1
 	stxsdx		vs35,	o24,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /*##########################################################################################
 			 SOLVE_LT 2x1
 ##########################################################################################*/
 
+#if defined(_AIX)
+define(`SOLVE_LT_2x1', `
+#else
 .macro SOLVE_LT_2x1
+#endif
 
-	xxswapd		vs0,	vs32
-	xxswapd		vs1,	vs33
+	XXSWAPD(vs0,vs32)
+	XXSWAPD(vs1,vs33)
 
 //############### LOAD B #######################
 
@@ -4609,16 +4957,24 @@
 	stxsdx		vs32,	o0,	T1
 	stxsdx		vs33,	o8,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /*##########################################################################################
 			 SOLVE_LT 1x1
 ##########################################################################################*/
 
+#if defined(_AIX)
+define(`SOLVE_LT_1x1', `
+#else
 .macro SOLVE_LT_1x1
+#endif
 
-	xxswapd		vs0,	vs32
+	XXSWAPD(vs0,vs32)
 
 //############### LOAD B #######################
 
@@ -4655,5 +5011,9 @@
 
 	stxsdx		vs32,	o0,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c
index 5bdc0a13c..623ac9fb0 100644
--- a/kernel/power/idamax.c
+++ b/kernel/power/idamax.c
@@ -58,8 +58,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
             "lxvd2x  47, %[i48],%[ptr_tmp] \n\t" 
             "lxvd2x  48, %[i64],%[ptr_tmp] \n\t"
             "lxvd2x  49, %[i80],%[ptr_tmp] \n\t"
-            "lxvd2x  50, %[i96],%[ptr_tmp] \n\t"
-            "lxvd2x  51,%[i112],%[ptr_tmp] \n\t"
+            "lxvd2x  6, %[i96],%[ptr_tmp] \n\t"
+            "lxvd2x  7,%[i112],%[ptr_tmp] \n\t"
 
             "xxlor  40,%x[start],%x[start]  \n\t" //{ 1,0} vs40 | v8 
             "vaddudm  9,8,%[adder]   \n\t" //{3,2} vs41 
@@ -69,7 +69,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
             "vaddudm  11,10,%[adder] \n\t" //{7,6} vs43
             "xxlxor  39,39,39        \n\t" //   vs39 vec_max_value
             "vaddudm 4,11,  %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
-            "xxspltd   36,36,0       \n\t"
+            XXSPLTD_S(36,36,0)
     
             "xvabsdp  44, 44 \n\t"
             "xvabsdp  45, 45 \n\t"
@@ -77,21 +77,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
             "xvabsdp  47, 47 \n\t"                         
             "xvabsdp  48, 48 \n\t"
             "xvabsdp  49, 49 \n\t"
-            "xvabsdp  50, 50 \n\t"
-            "xvabsdp  51, 51 \n\t"
+            "xvabsdp  6, 6 \n\t"
+            "xvabsdp  7, 7 \n\t"
   
             //jump first half forward 
-            "b 2f  \n\t"
+            "b two%=  \n\t"
 
 //===================================================================
 
-            ".p2align   5            \n\t"
+            ".align   5            \n\t"
 
-            "1: \n\t"
+            "one%=: \n\t"
             "xvcmpgtdp  2,45,44  \n\t "
             "xvcmpgtdp  3,47,46  \n\t "
             "xvcmpgtdp  4,49,48  \n\t "
-            "xvcmpgtdp  5,51,50  \n\t"
+            "xvcmpgtdp  5,7,6  \n\t"
 
             "xxsel    32,40,41,2 \n\t"
             "xxsel     0,44,45,2 \n\t" 
@@ -100,7 +100,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
             "xxsel    34,40,41,4 \n\t"
             "xxsel    45,48,49,4 \n\t"
             "xxsel    35,42,43,5 \n\t"
-            "xxsel    47,50,51,5 \n\t"
+            "xxsel    47,6,7,5 \n\t"
 
             "xvcmpgtdp 2, 1,0     \n\t"
             "xvcmpgtdp 3,47, 45   \n\t"
@@ -134,8 +134,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
 
             "vaddudm  1,1,5  \n\t"  //  get real index for first bigger  
 
-            "lxvd2x  50, %[i96],%[ptr_tmp] \n\t"
-            "lxvd2x  51,%[i112],%[ptr_tmp] \n\t"
+            "lxvd2x  6, %[i96],%[ptr_tmp] \n\t"
+            "lxvd2x  7,%[i112],%[ptr_tmp] \n\t"
 
             //compare with previous to get vec_max_index(v6 | vs38 ) and vec_max_value (vs39)   
             "xvcmpgtdp 2, 3,39    \n\t"
@@ -155,16 +155,16 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
 
             "xvabsdp  48, 48 \n\t"
             "xvabsdp  49, 49 \n\t"
-            "xvabsdp  50, 50 \n\t"
-            "xvabsdp  51, 51 \n\t"
+            "xvabsdp  6, 6 \n\t"
+            "xvabsdp  7, 7 \n\t"
 
 //<-----------jump here from first load
-             "2:                  \n\t"
+             "two%=:                  \n\t"
     
             "xvcmpgtdp  2,45,44  \n\t "
             "xvcmpgtdp  3,47,46  \n\t "
             "xvcmpgtdp  4,49,48  \n\t "
-            "xvcmpgtdp  5,51,50  \n\t"
+            "xvcmpgtdp  5,7,6  \n\t"
 
             "xxsel    32,40,41,2 \n\t"
             "xxsel     0,44,45,2 \n\t" 
@@ -173,7 +173,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
             "xxsel    34,40,41,4 \n\t"
             "xxsel    45,48,49,4 \n\t"
             "xxsel    35,42,43,5 \n\t"
-            "xxsel    47,50,51,5 \n\t"
+            "xxsel    47,6,7,5 \n\t"
 
             "xvcmpgtdp 2, 1,0     \n\t"
             "xvcmpgtdp 3,47, 45   \n\t"
@@ -203,8 +203,8 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
 
             "vaddudm  1,1,5  \n\t"  //  get real index for first bigger  
 
-            "lxvd2x  50, %[i96],%[ptr_tmp] \n\t"
-            "lxvd2x  51,%[i112],%[ptr_tmp] \n\t"
+            "lxvd2x  6, %[i96],%[ptr_tmp] \n\t"
+            "lxvd2x  7,%[i112],%[ptr_tmp] \n\t"
 
  
 
@@ -226,21 +226,21 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
 
             "xvabsdp  48, 48 \n\t"
             "xvabsdp  49, 49 \n\t"
-            "xvabsdp  50, 50 \n\t"
-            "xvabsdp  51, 51 \n\t"
+            "xvabsdp  6, 6 \n\t"
+            "xvabsdp  7, 7 \n\t"
 
             //decrement n
             "addic.    %[n], %[n], -32 \n\t"
            
             //Loop back if >0
-            "bgt+ 1b  \n\t"
+            "bgt+ one%=  \n\t"
 
 //==============================================================================
 
            "xvcmpgtdp  2,45,44  \n\t "
             "xvcmpgtdp  3,47,46  \n\t "
             "xvcmpgtdp  4,49,48  \n\t "
-            "xvcmpgtdp  5,51,50  \n\t"
+            "xvcmpgtdp  5,7,6  \n\t"
 
             "xxsel    32,40,41,2 \n\t"
             "xxsel     0,44,45,2 \n\t" 
@@ -249,7 +249,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
             "xxsel    34,40,41,4 \n\t"
             "xxsel    45,48,49,4 \n\t"
             "xxsel    35,42,43,5 \n\t"
-            "xxsel    47,50,51,5 \n\t"
+            "xxsel    47,6,7,5 \n\t"
 
             "xvcmpgtdp 2, 1,0     \n\t"
             "xvcmpgtdp 3,47, 45   \n\t"
@@ -276,28 +276,28 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
 
             ///////extract max value and max index from vector
 
-            "xxspltd   32,38,1     \n\t"
-            "xxspltd   40,39,1     \n\t"
+            XXSPLTD_S(32,38,1)
+            XXSPLTD_S(40,39,1)
             "xvcmpeqdp.  2, 40,39  \n\t"
     
             //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
              //0b001110=14
-            "bc 14,24, 3f  \n\t" 
+            "bc 14,24, three%=  \n\t" 
             "xvcmpgtdp  4, 40,39  \n\t"
             "xxsel    0,39,40,4           \n\t"
             "xxsel    1,38,32,4  \n\t"
             "stxsdx    0,0,%[ptr_maxf]     \n\t" 
-            "b 4f    \n\t"
+            "b four%=    \n\t"
 
-            "3:      \n\t" 
+            "three%=:      \n\t" 
                 //if elements value are equal then choose minimum index
-            "xxspltd  0,40,0          \n\t"
+            XXSPLTD_S(0,40,0)
             "vminud   0,0,6    \n\t"  //vs32 vs38
             "xxlor 1,32,32     \n\t"
             "stxsdx   0,0,%[ptr_maxf]  \n\t"
           
 
-            "4:      \n\t"
+            "four%=:      \n\t"
             "mfvsrd   %[index],1 \n\t"
 
             : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
@@ -306,7 +306,7 @@ static BLASLONG diamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) {
             [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
             [start] "v"(start),  [adder] "v"(temp_add_index)
             : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
-            "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
+            "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
             );
 
  
diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c
index 7fe0f8a33..b2705f2fa 100644
--- a/kernel/power/idamin.c
+++ b/kernel/power/idamin.c
@@ -58,8 +58,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
             "lxvd2x  47, %[i48],%[ptr_tmp] \n\t" 
             "lxvd2x  48, %[i64],%[ptr_tmp] \n\t"
             "lxvd2x  49, %[i80],%[ptr_tmp] \n\t"
-            "lxvd2x  50, %[i96],%[ptr_tmp] \n\t"
-            "lxvd2x  51,%[i112],%[ptr_tmp] \n\t"
+            "lxvd2x  6, %[i96],%[ptr_tmp] \n\t"
+            "lxvd2x  7,%[i112],%[ptr_tmp] \n\t"
 
             "xxlor  40,%x[start],%x[start]  \n\t" //{ 1,0} vs40 | v8 
             "vaddudm  9,8, %[adder]   \n\t" //{3,2} vs41 
@@ -69,7 +69,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
             "vaddudm  11,10,%[adder]  \n\t" //{7,6} vs43
             "lxvdsx  39,0,%[ptr_minf] \n\t" //   vs39 vec_min_value
             "vaddudm  4,11, %[adder]  \n\t" // {9,8} -{8;8} vs36 | v4
-            "xxspltd   36,36,0        \n\t"
+            XXSPLTD_S(36,36,0)
             "xvabsdp  39, 39 \n\t"             
     
             "xvabsdp  44, 44 \n\t"
@@ -78,21 +78,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
             "xvabsdp  47, 47 \n\t"                         
             "xvabsdp  48, 48 \n\t"
             "xvabsdp  49, 49 \n\t"
-            "xvabsdp  50, 50 \n\t"
-            "xvabsdp  51, 51 \n\t"
+            "xvabsdp  6, 6 \n\t"
+            "xvabsdp  7, 7 \n\t"
   
             //jump first half forward 
-            "b 2f  \n\t"
+            "b two%=  \n\t"
 
 //===================================================================
 
-            ".p2align   5            \n\t"
+            ".align   5            \n\t"
 
-            "1: \n\t"
+            "one%=: \n\t"
             "xvcmpgtdp  2,44,45  \n\t "
             "xvcmpgtdp  3,46,47  \n\t "
             "xvcmpgtdp  4,48,49  \n\t "
-            "xvcmpgtdp  5,50,51  \n\t"
+            "xvcmpgtdp  5,6,7  \n\t"
 
             "xxsel    32,40,41,2 \n\t"
             "xxsel     0,44,45,2 \n\t" 
@@ -101,7 +101,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
             "xxsel    34,40,41,4 \n\t"
             "xxsel    45,48,49,4 \n\t"
             "xxsel    35,42,43,5 \n\t"
-            "xxsel    47,50,51,5 \n\t"
+            "xxsel    47,6,7,5 \n\t"
 
             "xvcmpgtdp 2,0, 1     \n\t"
             "xvcmpgtdp 3, 45,47   \n\t"
@@ -135,8 +135,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
 
             "vaddudm  1,1,5  \n\t"  //  get real index for first smaller  
 
-            "lxvd2x  50, %[i96],%[ptr_tmp] \n\t"
-            "lxvd2x  51,%[i112],%[ptr_tmp] \n\t"
+            "lxvd2x  6, %[i96],%[ptr_tmp] \n\t"
+            "lxvd2x  7,%[i112],%[ptr_tmp] \n\t"
 
             //compare with previous to get vec_min_index(v6 | vs38 ) and vec_min_value (vs39)   
             "xvcmpgtdp 2,39, 3    \n\t"
@@ -156,16 +156,16 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
 
             "xvabsdp  48, 48 \n\t"
             "xvabsdp  49, 49 \n\t"
-            "xvabsdp  50, 50 \n\t"
-            "xvabsdp  51, 51 \n\t"
+            "xvabsdp  6, 6 \n\t"
+            "xvabsdp  7, 7 \n\t"
 
 //<-----------jump here from first load
-             "2:                  \n\t"
+             "two%=:                  \n\t"
     
             "xvcmpgtdp  2,44,45  \n\t "
             "xvcmpgtdp  3,46,47  \n\t "
             "xvcmpgtdp  4,48,49  \n\t "
-            "xvcmpgtdp  5,50,51  \n\t"
+            "xvcmpgtdp  5,6,7  \n\t"
 
             "xxsel    32,40,41,2 \n\t"
             "xxsel     0,44,45,2 \n\t" 
@@ -174,7 +174,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
             "xxsel    34,40,41,4 \n\t"
             "xxsel    45,48,49,4 \n\t"
             "xxsel    35,42,43,5 \n\t"
-            "xxsel    47,50,51,5 \n\t"
+            "xxsel    47,6,7,5 \n\t"
 
             "xvcmpgtdp 2,0, 1     \n\t"
             "xvcmpgtdp 3, 45,47   \n\t"
@@ -204,8 +204,8 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
 
             "vaddudm  1,1,5  \n\t"  //  get real index for first smaller  
 
-            "lxvd2x  50, %[i96],%[ptr_tmp] \n\t"
-            "lxvd2x  51,%[i112],%[ptr_tmp] \n\t"
+            "lxvd2x  6, %[i96],%[ptr_tmp] \n\t"
+            "lxvd2x  7,%[i112],%[ptr_tmp] \n\t"
 
  
 
@@ -227,21 +227,21 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
 
             "xvabsdp  48, 48 \n\t"
             "xvabsdp  49, 49 \n\t"
-            "xvabsdp  50, 50 \n\t"
-            "xvabsdp  51, 51 \n\t"
+            "xvabsdp  6, 6 \n\t"
+            "xvabsdp  7, 7 \n\t"
 
             //decrement n
             "addic.    %[n], %[n], -32 \n\t"
            
             //Loop back if >0
-            "bgt+ 1b  \n\t"
+            "bgt+ one%=  \n\t"
 
 //==============================================================================
 
             "xvcmpgtdp  2,44,45  \n\t "
             "xvcmpgtdp  3,46,47  \n\t "
             "xvcmpgtdp  4,48,49  \n\t "
-            "xvcmpgtdp  5,50,51  \n\t"
+            "xvcmpgtdp  5,6,7  \n\t"
 
             "xxsel    32,40,41,2 \n\t"
             "xxsel     0,44,45,2 \n\t" 
@@ -250,7 +250,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
             "xxsel    34,40,41,4 \n\t"
             "xxsel    45,48,49,4 \n\t"
             "xxsel    35,42,43,5 \n\t"
-            "xxsel    47,50,51,5 \n\t"
+            "xxsel    47,6,7,5 \n\t"
 
             "xvcmpgtdp 2,0, 1     \n\t"
             "xvcmpgtdp 3, 45,47   \n\t"
@@ -277,28 +277,28 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
 
             ///////extract min value and min index from vector
 
-            "xxspltd   32,38,1     \n\t"
-            "xxspltd   40,39,1     \n\t"
+            XXSPLTD_S(32,38,1)
+            XXSPLTD_S(40,39,1)
             "xvcmpeqdp.  2, 40,39  \n\t"
     
             //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
              //0b001110=14
-            "bc 14,24, 3f  \n\t" 
+            "bc 14,24, three%=  \n\t" 
             "xvcmpgtdp  4,39, 40  \n\t"
             "xxsel    0,39,40,4           \n\t"
             "xxsel    1,38,32,4  \n\t"
             "stxsdx    0,0,%[ptr_minf]     \n\t" 
-            "b 4f    \n\t"
+            "b four%=    \n\t"
 
-            "3:      \n\t" 
+            "three%=:      \n\t" 
                 //if elements value are equal then choose minimum index
-            "xxspltd  0,40,0          \n\t"
+            XXSPLTD_S(0,40,0)
             "vminud   0,0,6    \n\t"  //vs32 vs38
             "xxlor 1,32,32     \n\t"
             "stxsdx   0,0,%[ptr_minf]  \n\t"
           
 
-            "4:      \n\t"
+            "four%=:      \n\t"
             "mfvsrd   %[index],1 \n\t"
 
             : [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
@@ -307,7 +307,7 @@ static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) {
             [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
             [start] "v"(start),  [adder] "v"(temp_add_index)
             : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
-            "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
+            "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
             );
  
     return index;
diff --git a/kernel/power/izamax.c b/kernel/power/izamax.c
index cfe78c8c0..339c3ccde 100644
--- a/kernel/power/izamax.c
+++ b/kernel/power/izamax.c
@@ -56,8 +56,8 @@ static BLASLONG   ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
             "lxvd2x  47, %[i48],%[ptr_tmp] \n\t"
             "lxvd2x  48, %[i64],%[ptr_tmp] \n\t"
             "lxvd2x  49, %[i80],%[ptr_tmp] \n\t"  
-            "lxvd2x  50, %[i96],%[ptr_tmp] \n\t"
-            "lxvd2x  51,%[i112],%[ptr_tmp] \n\t"      
+            "lxvd2x  6, %[i96],%[ptr_tmp] \n\t"
+            "lxvd2x  7,%[i112],%[ptr_tmp] \n\t"      
 
             "xxlor  40,%x[start],%x[start]  \n\t" //{ 1,0} vs40 | v8 
             "vaddudm  9,8,%[adder]   \n\t" //{3,2} vs41 
@@ -67,7 +67,7 @@ static BLASLONG   ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
             "vaddudm  11,10,%[adder] \n\t" //{7,6} vs43
             "xxlxor  39,39,39        \n\t" //   vs39 vec_max_value is zero
             "vaddudm 4,11,  %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
-            "xxspltd   36,36,0       \n\t"
+            XXSPLTD_S(36,36,0)
 
 
 
@@ -77,24 +77,24 @@ static BLASLONG   ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
             "xvabsdp  47, 47 \n\t"   
             "xvabsdp  48, 48 \n\t"
             "xvabsdp  49, 49 \n\t"   
-            "xvabsdp  50, 50 \n\t"
-            "xvabsdp  51, 51 \n\t"
+            "xvabsdp  6, 6 \n\t"
+            "xvabsdp  7, 7 \n\t"
 
             //jump first half forward 
-            "b 2f  \n\t"
+            "b two%=  \n\t"
 
-            ".p2align   5            \n\t"
-            "1: \n\t"
+            ".align   5            \n\t"
+            "one%=: \n\t"
 
  
-            "xxmrghd    0,44,45  \n\t"
-            "xxmrgld    1,44,45  \n\t"
-            "xxmrghd    2,46,47  \n\t"
-            "xxmrgld    3,46,47  \n\t"
-            "xxmrghd    4,48,49  \n\t"
-            "xxmrgld    5,48,49  \n\t"
-            "xxmrghd    44,50,51  \n\t"
-            "xxmrgld    45,50,51  \n\t"
+            XXMRGHD_S(0,44,45)
+            XXMRGLD_S(1,44,45)
+            XXMRGHD_S(2,46,47)
+            XXMRGLD_S(3,46,47)
+            XXMRGHD_S(4,48,49)
+            XXMRGLD_S(5,48,49)
+            XXMRGHD_S(44,6,7)
+            XXMRGLD_S(45,6,7)
 
             "xvadddp    46,  0,1 \n\t"
             "xvadddp    47,  2,3 \n\t" 
@@ -103,15 +103,15 @@ static BLASLONG   ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
 
 
 
-            "xvcmpgtdp  50,47,46  \n\t "
-            "xvcmpgtdp  51,49,48  \n\t "
+            "xvcmpgtdp  6,47,46  \n\t "
+            "xvcmpgtdp  7,49,48  \n\t "
 
             "addi     %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"   
 
-            "xxsel    32,40,41,50 \n\t"
-            "xxsel     0,46,47,50 \n\t" 
-            "xxsel    33,42,43,51 \n\t"
-            "xxsel     1,48,49,51 \n\t"  
+            "xxsel    32,40,41,6 \n\t"
+            "xxsel     0,46,47,6 \n\t" 
+            "xxsel    33,42,43,7 \n\t"
+            "xxsel     1,48,49,7 \n\t"  
 
             "lxvd2x  44,      0,%[ptr_tmp] \n\t"
             "lxvd2x  45, %[i16],%[ptr_tmp] \n\t"
@@ -133,8 +133,8 @@ static BLASLONG   ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
 
             "lxvd2x  48, %[i64],%[ptr_tmp] \n\t"
             "lxvd2x  49, %[i80],%[ptr_tmp] \n\t"  
-            "lxvd2x  50, %[i96],%[ptr_tmp] \n\t"
-            "lxvd2x  51,%[i112],%[ptr_tmp] \n\t"        
+            "lxvd2x  6, %[i96],%[ptr_tmp] \n\t"
+            "lxvd2x  7,%[i112],%[ptr_tmp] \n\t"        
              //select with previous 
             "xxsel     38,38,32,4 \n\t" 
             "xxsel    39,39,3,4    \n\t" 
@@ -148,35 +148,35 @@ static BLASLONG   ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
             "xvabsdp  47, 47 \n\t"   
             "xvabsdp  48, 48 \n\t"
             "xvabsdp  49, 49 \n\t"   
-            "xvabsdp  50, 50 \n\t"
-            "xvabsdp  51, 51 \n\t"
+            "xvabsdp  6, 6 \n\t"
+            "xvabsdp  7, 7 \n\t"
 
 
 //>>///////////////////////////////  half start
-            "2:   \n\t"
-            "xxmrghd    0,44,45  \n\t"
-            "xxmrgld    1,44,45  \n\t"
-            "xxmrghd    2,46,47  \n\t"
-            "xxmrgld    3,46,47  \n\t"
-            "xxmrghd    4,48,49  \n\t"
-            "xxmrgld    5,48,49  \n\t"
-            "xxmrghd    44,50,51  \n\t"
-            "xxmrgld    45,50,51  \n\t"
+            "two%=:   \n\t"
+            XXMRGHD_S(0,44,45)
+            XXMRGLD_S(1,44,45)
+            XXMRGHD_S(2,46,47)
+            XXMRGLD_S(3,46,47)
+            XXMRGHD_S(4,48,49)
+            XXMRGLD_S(5,48,49)
+            XXMRGHD_S(44,6,7)
+            XXMRGLD_S(45,6,7)
 
             "xvadddp    46,  0,1 \n\t"
             "xvadddp    47,  2,3 \n\t" 
             "xvadddp    48,  4,5 \n\t"
             "xvadddp    49,  44,45 \n\t"
 
-            "xvcmpgtdp  50,47,46  \n\t "
-            "xvcmpgtdp  51,49,48  \n\t "
+            "xvcmpgtdp  6,47,46  \n\t "
+            "xvcmpgtdp  7,49,48  \n\t "
 
             "addi     %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"   
 
-            "xxsel    32,40,41,50 \n\t"
-            "xxsel     0,46,47,50 \n\t" 
-            "xxsel    33,42,43,51 \n\t"
-            "xxsel     1,48,49,51 \n\t"  
+            "xxsel    32,40,41,6 \n\t"
+            "xxsel     0,46,47,6 \n\t" 
+            "xxsel    33,42,43,7 \n\t"
+            "xxsel     1,48,49,7 \n\t"  
 
             "lxvd2x  44,      0,%[ptr_tmp] \n\t"
             "lxvd2x  45, %[i16],%[ptr_tmp] \n\t"
@@ -198,8 +198,8 @@ static BLASLONG   ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
 
             "lxvd2x  48, %[i64],%[ptr_tmp] \n\t"
             "lxvd2x  49, %[i80],%[ptr_tmp] \n\t"  
-            "lxvd2x  50, %[i96],%[ptr_tmp] \n\t"
-            "lxvd2x  51,%[i112],%[ptr_tmp] \n\t"        
+            "lxvd2x  6, %[i96],%[ptr_tmp] \n\t"
+            "lxvd2x  7,%[i112],%[ptr_tmp] \n\t"        
              //select with previous 
             "xxsel     38,38,32,4 \n\t" 
             "xxsel    39,39,3,4    \n\t" 
@@ -211,24 +211,24 @@ static BLASLONG   ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
             "xvabsdp  47, 47 \n\t"   
             "xvabsdp  48, 48 \n\t"
             "xvabsdp  49, 49 \n\t"   
-            "xvabsdp  50, 50 \n\t"
-            "xvabsdp  51, 51 \n\t"
+            "xvabsdp  6, 6 \n\t"
+            "xvabsdp  7, 7 \n\t"
 
 
             //decrement n
             "addic.    %[n], %[n], -16 \n\t"
             //Loop back if >0
-            "bgt+ 1b  \n\t"
+            "bgt+ one%=  \n\t"
 
 
-            "xxmrghd    0,44,45  \n\t"
-            "xxmrgld    1,44,45  \n\t"
-            "xxmrghd    2,46,47  \n\t"
-            "xxmrgld    3,46,47  \n\t"
-            "xxmrghd    4,48,49  \n\t"
-            "xxmrgld    5,48,49  \n\t"
-            "xxmrghd    44,50,51  \n\t"
-            "xxmrgld    45,50,51  \n\t"
+            XXMRGHD_S(0,44,45)
+            XXMRGLD_S(1,44,45)
+            XXMRGHD_S(2,46,47)
+            XXMRGLD_S(3,46,47)
+            XXMRGHD_S(4,48,49)
+            XXMRGLD_S(5,48,49)
+            XXMRGHD_S(44,6,7)
+            XXMRGLD_S(45,6,7)
 
             "xvadddp    46,  0,1 \n\t"
             "xvadddp    47,  2,3 \n\t" 
@@ -237,13 +237,13 @@ static BLASLONG   ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
 
 
 
-            "xvcmpgtdp  50,47,46  \n\t "
-            "xvcmpgtdp  51,49,48  \n\t "
+            "xvcmpgtdp  6,47,46  \n\t "
+            "xvcmpgtdp  7,49,48  \n\t "
 
-            "xxsel    32,40,41,50 \n\t"
-            "xxsel     0,46,47,50 \n\t" 
-            "xxsel    33,42,43,51 \n\t"
-            "xxsel     1,48,49,51 \n\t"  
+            "xxsel    32,40,41,6 \n\t"
+            "xxsel     0,46,47,6 \n\t" 
+            "xxsel    33,42,43,7 \n\t"
+            "xxsel     1,48,49,7 \n\t"  
 
             "xvcmpgtdp  2,1,0  \n\t " 
             "xxsel    32,32,33,2 \n\t" 
@@ -262,28 +262,28 @@ static BLASLONG   ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
 
             ///////extract max value and max index from vector
 
-            "xxspltd   32,38,1     \n\t"
-            "xxspltd   40,39,1     \n\t"
+            XXSPLTD_S(32,38,1)
+            XXSPLTD_S(40,39,1)
             "xvcmpeqdp.  2, 40,39  \n\t"
     
             //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
              //0b001110=14
-            "bc 14,24, 3f  \n\t" 
+            "bc 14,24, three%=  \n\t" 
             "xvcmpgtdp  4, 40,39  \n\t"
             "xxsel    0,39,40,4           \n\t"
             "xxsel    1,38,32,4  \n\t"
             "stxsdx    0,0,%[ptr_maxf]     \n\t" 
-            "b 4f    \n\t"
+            "b four%=    \n\t"
 
-            "3:      \n\t" 
+            "three%=:      \n\t" 
                 //if elements value are equal then choose minimum index
-            "xxspltd  0,40,0          \n\t"
+            XXSPLTD_S(0,40,0)
             "vminud   0,0,6    \n\t"  //vs32 vs38
             "xxlor 1,32,32     \n\t"
             "stxsdx   0,0,%[ptr_maxf]  \n\t"
           
 
-            "4:      \n\t"
+            "four%=:      \n\t"
             "mfvsrd   %[index],1 \n\t"
 
             : [maxf] "=m"(*maxf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
@@ -292,7 +292,7 @@ static BLASLONG   ziamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *maxf) {
             [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
             [start] "v"(start),  [adder] "v"(temp_add_index)
             : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
-            "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
+            "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
             );
  
     return index;
diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c
index 1ffa3ba8b..6d0d15547 100644
--- a/kernel/power/izamin.c
+++ b/kernel/power/izamin.c
@@ -54,8 +54,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
             "lxvd2x  47, %[i48],%[ptr_tmp] \n\t"
             "lxvd2x  48, %[i64],%[ptr_tmp] \n\t"
             "lxvd2x  49, %[i80],%[ptr_tmp] \n\t"  
-            "lxvd2x  50, %[i96],%[ptr_tmp] \n\t"
-            "lxvd2x  51,%[i112],%[ptr_tmp] \n\t"      
+            "lxvd2x  6, %[i96],%[ptr_tmp] \n\t"
+            "lxvd2x  7,%[i112],%[ptr_tmp] \n\t"      
 
             "xxlor  40,%x[start],%x[start]  \n\t" //{ 1,0} vs40 | v8 
             "vaddudm  9,8,%[adder]   \n\t" //{3,2} vs41 
@@ -65,7 +65,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
             "vaddudm  11,10,%[adder] \n\t" //{7,6} vs43
             "lxvdsx  39,0,%[ptr_minf] \n\t" //   vs39 vec_min_value
             "vaddudm 4,11,  %[adder] \n\t" // {9,8} -{8;8} vs36 | v4
-            "xxspltd   36,36,0       \n\t"
+            XXSPLTD_S(36,36,0)
 
 
 
@@ -75,24 +75,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
             "xvabsdp  47, 47 \n\t"   
             "xvabsdp  48, 48 \n\t"
             "xvabsdp  49, 49 \n\t"   
-            "xvabsdp  50, 50 \n\t"
-            "xvabsdp  51, 51 \n\t"
+            "xvabsdp  6, 6 \n\t"
+            "xvabsdp  7, 7 \n\t"
 
             //jump first half forward 
-            "b 2f  \n\t"
+            "b two%=  \n\t"
 
-            ".p2align   5            \n\t"
-            "1: \n\t"
+            ".align   5            \n\t"
+            "one%=: \n\t"
 
  
-            "xxmrghd    0,44,45  \n\t"
-            "xxmrgld    1,44,45  \n\t"
-            "xxmrghd    2,46,47  \n\t"
-            "xxmrgld    3,46,47  \n\t"
-            "xxmrghd    4,48,49  \n\t"
-            "xxmrgld    5,48,49  \n\t"
-            "xxmrghd    44,50,51  \n\t"
-            "xxmrgld    45,50,51  \n\t"
+            XXMRGHD_S(0,44,45)
+            XXMRGLD_S(1,44,45)
+            XXMRGHD_S(2,46,47)
+            XXMRGLD_S(3,46,47)
+            XXMRGHD_S(4,48,49)
+            XXMRGLD_S(5,48,49)
+            XXMRGHD_S(44,6,7)
+            XXMRGLD_S(45,6,7)
 
             "xvadddp    46,  0,1 \n\t"
             "xvadddp    47,  2,3 \n\t" 
@@ -101,15 +101,15 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
 
 
 
-            "xvcmpgtdp  50,46,47  \n\t "
-            "xvcmpgtdp  51,48,49  \n\t "
+            "xvcmpgtdp  6,46,47  \n\t "
+            "xvcmpgtdp  7,48,49  \n\t "
 
             "addi     %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"   
 
-            "xxsel    32,40,41,50 \n\t"
-            "xxsel     0,46,47,50 \n\t" 
-            "xxsel    33,42,43,51 \n\t"
-            "xxsel     1,48,49,51 \n\t"  
+            "xxsel    32,40,41,6 \n\t"
+            "xxsel     0,46,47,6 \n\t" 
+            "xxsel    33,42,43,7 \n\t"
+            "xxsel     1,48,49,7 \n\t"  
 
             "lxvd2x  44,      0,%[ptr_tmp] \n\t"
             "lxvd2x  45, %[i16],%[ptr_tmp] \n\t"
@@ -131,8 +131,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
 
             "lxvd2x  48, %[i64],%[ptr_tmp] \n\t"
             "lxvd2x  49, %[i80],%[ptr_tmp] \n\t"  
-            "lxvd2x  50, %[i96],%[ptr_tmp] \n\t"
-            "lxvd2x  51,%[i112],%[ptr_tmp] \n\t"        
+            "lxvd2x  6, %[i96],%[ptr_tmp] \n\t"
+            "lxvd2x  7,%[i112],%[ptr_tmp] \n\t"        
              //select with previous 
             "xxsel     38,38,32,4 \n\t" 
             "xxsel    39,39,3,4    \n\t" 
@@ -146,35 +146,35 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
             "xvabsdp  47, 47 \n\t"   
             "xvabsdp  48, 48 \n\t"
             "xvabsdp  49, 49 \n\t"   
-            "xvabsdp  50, 50 \n\t"
-            "xvabsdp  51, 51 \n\t"
+            "xvabsdp  6, 6 \n\t"
+            "xvabsdp  7, 7 \n\t"
 
 
 //>>///////////////////////////////  half start
-            "2:   \n\t"
-            "xxmrghd    0,44,45  \n\t"
-            "xxmrgld    1,44,45  \n\t"
-            "xxmrghd    2,46,47  \n\t"
-            "xxmrgld    3,46,47  \n\t"
-            "xxmrghd    4,48,49  \n\t"
-            "xxmrgld    5,48,49  \n\t"
-            "xxmrghd    44,50,51  \n\t"
-            "xxmrgld    45,50,51  \n\t"
+            "two%=:   \n\t"
+            XXMRGHD_S(0,44,45)
+            XXMRGLD_S(1,44,45)
+            XXMRGHD_S(2,46,47)
+            XXMRGLD_S(3,46,47)
+            XXMRGHD_S(4,48,49)
+            XXMRGLD_S(5,48,49)
+            XXMRGHD_S(44,6,7)
+            XXMRGLD_S(45,6,7)
 
             "xvadddp    46,  0,1 \n\t"
             "xvadddp    47,  2,3 \n\t" 
             "xvadddp    48,  4,5 \n\t"
             "xvadddp    49,  44,45 \n\t"
 
-            "xvcmpgtdp  50,46,47  \n\t "
-            "xvcmpgtdp  51,48,49  \n\t "
+            "xvcmpgtdp  6,46,47  \n\t "
+            "xvcmpgtdp  7,48,49  \n\t "
 
             "addi     %[ptr_tmp] ,%[ptr_tmp] , 128 \n\t"   
 
-            "xxsel    32,40,41,50 \n\t"
-            "xxsel     0,46,47,50 \n\t" 
-            "xxsel    33,42,43,51 \n\t"
-            "xxsel     1,48,49,51 \n\t"  
+            "xxsel    32,40,41,6 \n\t"
+            "xxsel     0,46,47,6 \n\t" 
+            "xxsel    33,42,43,7 \n\t"
+            "xxsel     1,48,49,7 \n\t"  
 
             "lxvd2x  44,      0,%[ptr_tmp] \n\t"
             "lxvd2x  45, %[i16],%[ptr_tmp] \n\t"
@@ -196,8 +196,8 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
 
             "lxvd2x  48, %[i64],%[ptr_tmp] \n\t"
             "lxvd2x  49, %[i80],%[ptr_tmp] \n\t"  
-            "lxvd2x  50, %[i96],%[ptr_tmp] \n\t"
-            "lxvd2x  51,%[i112],%[ptr_tmp] \n\t"        
+            "lxvd2x  6, %[i96],%[ptr_tmp] \n\t"
+            "lxvd2x  7,%[i112],%[ptr_tmp] \n\t"        
              //select with previous 
             "xxsel     38,38,32,4 \n\t" 
             "xxsel    39,39,3,4    \n\t" 
@@ -209,24 +209,24 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
             "xvabsdp  47, 47 \n\t"   
             "xvabsdp  48, 48 \n\t"
             "xvabsdp  49, 49 \n\t"   
-            "xvabsdp  50, 50 \n\t"
-            "xvabsdp  51, 51 \n\t"
+            "xvabsdp  6, 6 \n\t"
+            "xvabsdp  7, 7 \n\t"
 
 
             //decrement n
             "addic.    %[n], %[n], -16 \n\t"
             //Loop back if >0
-            "bgt+ 1b  \n\t"
+            "bgt+ one%=  \n\t"
 
 
-            "xxmrghd    0,44,45  \n\t"
-            "xxmrgld    1,44,45  \n\t"
-            "xxmrghd    2,46,47  \n\t"
-            "xxmrgld    3,46,47  \n\t"
-            "xxmrghd    4,48,49  \n\t"
-            "xxmrgld    5,48,49  \n\t"
-            "xxmrghd    44,50,51  \n\t"
-            "xxmrgld    45,50,51  \n\t"
+            XXMRGHD_S(0,44,45)
+            XXMRGLD_S(1,44,45)
+            XXMRGHD_S(2,46,47)
+            XXMRGLD_S(3,46,47)
+            XXMRGHD_S(4,48,49)
+            XXMRGLD_S(5,48,49)
+            XXMRGHD_S(44,6,7)
+            XXMRGLD_S(45,6,7)
 
             "xvadddp    46,  0,1 \n\t"
             "xvadddp    47,  2,3 \n\t" 
@@ -235,13 +235,13 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
 
 
 
-            "xvcmpgtdp  50,46,47  \n\t "
-            "xvcmpgtdp  51,48,49  \n\t "
+            "xvcmpgtdp  6,46,47  \n\t "
+            "xvcmpgtdp  7,48,49  \n\t "
 
-            "xxsel    32,40,41,50 \n\t"
-            "xxsel     0,46,47,50 \n\t" 
-            "xxsel    33,42,43,51 \n\t"
-            "xxsel     1,48,49,51 \n\t"  
+            "xxsel    32,40,41,6 \n\t"
+            "xxsel     0,46,47,6 \n\t" 
+            "xxsel    33,42,43,7 \n\t"
+            "xxsel     1,48,49,7 \n\t"  
 
             "xvcmpgtdp  2,0,1  \n\t " 
             "xxsel    32,32,33,2 \n\t" 
@@ -260,28 +260,28 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
 
             ///////extract min value and min index from vector
 
-            "xxspltd   32,38,1     \n\t"
-            "xxspltd   40,39,1     \n\t"
+            XXSPLTD_S(32,38,1)
+            XXSPLTD_S(40,39,1)
             "xvcmpeqdp.  2, 40,39  \n\t"
     
             //cr6 0 bit set if all true, cr6=4*6+bit_ind=24,0011at CR(BI)==1, at=10 hint that it occurs rarely
              //0b001110=14
-            "bc 14,24, 3f  \n\t" 
+            "bc 14,24, three%=  \n\t" 
             "xvcmpgtdp  4,39, 40  \n\t"
             "xxsel    0,39,40,4           \n\t"
             "xxsel    1,38,32,4  \n\t"
             "stxsdx    0,0,%[ptr_minf]     \n\t" 
-            "b 4f    \n\t"
+            "b four%=    \n\t"
 
-            "3:      \n\t" 
+            "three%=:      \n\t" 
                 //if elements value are equal then choose minimum index
-            "xxspltd  0,40,0          \n\t"
+            XXSPLTD_S(0,40,0)
             "vminud   0,0,6    \n\t"  //vs32 vs38
             "xxlor 1,32,32     \n\t"
             "stxsdx   0,0,%[ptr_minf]  \n\t"
           
 
-            "4:      \n\t"
+            "four%=:      \n\t"
             "mfvsrd   %[index],1 \n\t"
 
             : [minf] "=m"(*minf),[ptr_tmp] "+&b"(x),[index] "=r"(index), [n] "+&r"(n)
@@ -290,7 +290,7 @@ static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) {
             [i64] "b"(64), [i80] "b"(80), [i96] "b"(96), [i112] "b"(112),
             [start] "v"(start),  [adder] "v"(temp_add_index)
             : "cc", "vs0", "vs1","vs2","vs3", "vs4","vs5","vs32", "vs33", "vs34", "vs35", "vs36",
-            "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
+            "vs37", "vs38", "vs39", "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs6", "vs7"
             );
  
     return index; 
diff --git a/kernel/power/lock.c b/kernel/power/lock.c
index 51348d63c..1c1b006b0 100644
--- a/kernel/power/lock.c
+++ b/kernel/power/lock.c
@@ -46,10 +46,10 @@ static void __inline blas_lock(volatile BLASULONG *address){
 	   "      .machine \"any\" ;"
 	   "0:    lwarx %0,0, %1 ;"
 	   "      cmpwi  0,%0,0;"
-	   "      bne 1f;"
+	   "      bne one%=;"
 	   "      stwcx. %2,0, %1 ;"
 	   "      bne- 0b;"
-	   "1:    "
+	   "one%=:    "
 	: "=&r"(ret)
 	: "r"(address), "r" (val)
 	: "cr0", "memory");
diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c
index 4bb515de8..aa465c38e 100644
--- a/kernel/power/sasum_microk_power8.c
+++ b/kernel/power/sasum_microk_power8.c
@@ -68,10 +68,10 @@ static float sasum_kernel_32 (long n, float *x)
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -32	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "xvabssp		48, 40		\n\t"
        "xvabssp		49, 41		\n\t"
@@ -108,9 +108,9 @@ static float sasum_kernel_32 (long n, float *x)
        "xvaddsp		38, 38, %x5	\n\t"
        "xvaddsp		39, 39, %x6	\n\t"
 
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "xvabssp		48, 40		\n\t"
        "xvabssp		49, 41		\n\t"
diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c
index 7a54d5e1e..da39789b1 100644
--- a/kernel/power/scopy_microk_power8.c
+++ b/kernel/power/scopy_microk_power8.c
@@ -51,10 +51,10 @@ static void scopy_kernel_32 (long n, float *x, float *y)
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -32	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "stxvd2x		40, 0, %3	\n\t"
        "stxvd2x		41, %5, %3	\n\t"
@@ -77,9 +77,9 @@ static void scopy_kernel_32 (long n, float *x, float *y)
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -32	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "stxvd2x		40, 0, %3	\n\t"
        "stxvd2x		41, %5, %3	\n\t"
diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c
index bfe100c8b..a8db6a8d6 100644
--- a/kernel/power/sdot_microk_power8.c
+++ b/kernel/power/sdot_microk_power8.c
@@ -78,10 +78,10 @@ static float sdot_kernel_16 (long n, float *x, float *y)
        "addi		%3, %3, 128	\n\t"
 
        "addic.		%1, %1, -32	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "xvmaddasp	32, 40, 48	\n\t"
        "lxvd2x		40, 0, %2	\n\t"
@@ -112,9 +112,9 @@ static float sdot_kernel_16 (long n, float *x, float *y)
        "addi		%3, %3, 128	\n\t"
 
        "addic.		%1, %1, -32	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "xvmaddasp	32, 40, 48	\n\t"
        "xvmaddasp	33, 41, 49	\n\t"
diff --git a/kernel/power/sgemm_macros_16x8_power8.S b/kernel/power/sgemm_macros_16x8_power8.S
index 98414857f..9bcfca827 100644
--- a/kernel/power/sgemm_macros_16x8_power8.S
+++ b/kernel/power/sgemm_macros_16x8_power8.S
@@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=8 and M=16
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD8x16_1', `
+#else
 .macro LOAD8x16_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 	lxvw4x		vs1,	o16,	AO
@@ -63,9 +67,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	128
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x16_I1', `
+#else
 .macro KERNEL8x16_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -133,9 +145,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs63,	vs3,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x16_1', `
+#else
 .macro KERNEL8x16_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -203,9 +223,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs63,	vs3,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x16_2', `
+#else
 .macro KERNEL8x16_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -273,9 +301,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs63,	vs7,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x16_E2', `
+#else
 .macro KERNEL8x16_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -319,9 +355,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs63,	vs7,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x16_SUBI1', `
+#else
 .macro KERNEL8x16_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -389,9 +433,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs63,	vs3,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x16_SUB1', `
+#else
 .macro KERNEL8x16_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -459,9 +511,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs63,	vs3,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE8x16', `
+#else
 .macro SAVE8x16
+#endif
 
 	mr		T1,	CO
 
@@ -698,14 +758,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=8 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD8x8_1', `
+#else
 .macro LOAD8x8_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 	lxvw4x		vs1,	o16,	AO
@@ -728,9 +796,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	128
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x8_I1', `
+#else
 .macro KERNEL8x8_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -780,9 +856,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs47,	vs1,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x8_1', `
+#else
 .macro KERNEL8x8_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -832,9 +916,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs1,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x8_2', `
+#else
 .macro KERNEL8x8_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -884,9 +976,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs5,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x8_E2', `
+#else
 .macro KERNEL8x8_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -914,9 +1014,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs5,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x8_SUBI1', `
+#else
 .macro KERNEL8x8_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -966,9 +1074,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs47,	vs1,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x8_SUB1', `
+#else
 .macro KERNEL8x8_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -1018,9 +1134,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs1,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE8x8', `
+#else
 .macro SAVE8x8
+#endif
 
 	mr		T1,	CO
 
@@ -1193,14 +1317,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=8 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD8x4_1', `
+#else
 .macro LOAD8x4_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 
@@ -1222,9 +1354,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	128
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x4_I1', `
+#else
 .macro KERNEL8x4_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -1265,9 +1405,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs0,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x4_1', `
+#else
 .macro KERNEL8x4_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -1308,9 +1456,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs0,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x4_2', `
+#else
 .macro KERNEL8x4_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -1351,9 +1507,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs4,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x4_E2', `
+#else
 .macro KERNEL8x4_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -1373,9 +1537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs4,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x4_SUBI1', `
+#else
 .macro KERNEL8x4_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -1416,9 +1588,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs0,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x4_SUB1', `
+#else
 .macro KERNEL8x4_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -1459,9 +1639,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs0,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE8x4', `
+#else
 .macro SAVE8x4
+#endif
 
 	mr		T1,	CO
 
@@ -1602,14 +1790,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=8 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD8x2_1', `
+#else
 .macro LOAD8x2_1
+#endif
 
 	lxsspx		vs0,	o0,	AO
 	lxsspx		vs1,	o4,	AO
@@ -1633,9 +1829,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	128
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x2_I1', `
+#else
 .macro KERNEL8x2_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -1686,9 +1890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs47,	vs1,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x2_1', `
+#else
 .macro KERNEL8x2_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -1739,9 +1951,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs47,	vs1,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x2_2', `
+#else
 .macro KERNEL8x2_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -1792,9 +2012,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs47,	vs5,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x2_E2', `
+#else
 .macro KERNEL8x2_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16
@@ -1822,9 +2050,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs47,	vs5,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x2_SUBI1', `
+#else
 .macro KERNEL8x2_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -1875,9 +2111,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs47,	vs1,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x2_SUB1', `
+#else
 .macro KERNEL8x2_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -1928,9 +2172,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs47,	vs1,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE8x2', `
+#else
 .macro SAVE8x2
+#endif
 
 	mr		T1,	CO
 
@@ -2103,14 +2355,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=8 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD8x1_1', `
+#else
 .macro LOAD8x1_1
+#endif
 
 	lxsspx		vs0,	o0,	AO
 
@@ -2133,9 +2393,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	128
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x1_I1', `
+#else
 .macro KERNEL8x1_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -2177,9 +2445,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs39,	vs0,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x1_1', `
+#else
 .macro KERNEL8x1_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -2221,9 +2497,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs0,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x1_2', `
+#else
 .macro KERNEL8x1_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -2265,9 +2549,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs4,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x1_E2', `
+#else
 .macro KERNEL8x1_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16
@@ -2287,9 +2579,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs4,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x1_SUBI1', `
+#else
 .macro KERNEL8x1_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -2331,9 +2631,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs39,	vs0,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x1_SUB1', `
+#else
 .macro KERNEL8x1_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -2375,9 +2683,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs0,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE8x1', `
+#else
 .macro SAVE8x1
+#endif
 
 	mr		T1,	CO
 
@@ -2518,14 +2834,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	4
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=16
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x16_1', `
+#else
 .macro LOAD4x16_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 	lxvw4x		vs1,	o16,	AO
@@ -2543,9 +2867,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_I1', `
+#else
 .macro KERNEL4x16_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -2586,9 +2918,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs47,	vs3,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_1', `
+#else
 .macro KERNEL4x16_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -2629,9 +2969,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs3,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_2', `
+#else
 .macro KERNEL4x16_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -2672,9 +3020,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs7,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_E2', `
+#else
 .macro KERNEL4x16_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -2698,9 +3054,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs7,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_SUBI1', `
+#else
 .macro KERNEL4x16_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -2741,9 +3105,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs47,	vs3,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_SUB1', `
+#else
 .macro KERNEL4x16_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -2784,9 +3156,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs3,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x16', `
+#else
 .macro SAVE4x16
+#endif
 
 	mr		T1,	CO
 
@@ -2907,14 +3287,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x8_1', `
+#else
 .macro LOAD4x8_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 	lxvw4x		vs1,	o16,	AO
@@ -2930,9 +3318,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_I1', `
+#else
 .macro KERNEL4x8_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -2963,9 +3359,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs1,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_1', `
+#else
 .macro KERNEL4x8_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -2996,9 +3400,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs1,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_2', `
+#else
 .macro KERNEL4x8_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -3029,9 +3441,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs5,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_E2', `
+#else
 .macro KERNEL4x8_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -3047,9 +3467,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs5,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_SUBI1', `
+#else
 .macro KERNEL4x8_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -3080,9 +3508,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs1,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_SUB1', `
+#else
 .macro KERNEL4x8_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -3113,9 +3549,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs1,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x8', `
+#else
 .macro SAVE4x8
+#endif
 
 	mr		T1,	CO
 
@@ -3204,14 +3648,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x4_1', `
+#else
 .macro LOAD4x4_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 
@@ -3226,9 +3678,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_I1', `
+#else
 .macro KERNEL4x4_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -3254,9 +3714,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs0,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_1', `
+#else
 .macro KERNEL4x4_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -3282,9 +3750,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs0,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_2', `
+#else
 .macro KERNEL4x4_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -3310,9 +3786,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs4,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_E2', `
+#else
 .macro KERNEL4x4_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -3324,9 +3808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs4,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_SUBI1', `
+#else
 .macro KERNEL4x4_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -3352,9 +3844,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs0,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_SUB1', `
+#else
 .macro KERNEL4x4_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -3380,9 +3880,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs0,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x4', `
+#else
 .macro SAVE4x4
+#endif
 
 	mr		T1,	CO
 
@@ -3455,14 +3963,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x2_1', `
+#else
 .macro LOAD4x2_1
+#endif
 
 	lxsspx		vs0,	o0,	AO
 	lxsspx		vs1,	o4,	AO
@@ -3479,9 +3995,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_I1', `
+#else
 .macro KERNEL4x2_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -3513,9 +4037,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs39,	vs1,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_1', `
+#else
 .macro KERNEL4x2_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -3547,9 +4079,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs1,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_2', `
+#else
 .macro KERNEL4x2_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -3581,9 +4121,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs5,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_E2', `
+#else
 .macro KERNEL4x2_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16
@@ -3599,9 +4147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs5,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_SUBI1', `
+#else
 .macro KERNEL4x2_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -3633,9 +4189,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs39,	vs1,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_SUB1', `
+#else
 .macro KERNEL4x2_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -3667,9 +4231,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs1,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x2', `
+#else
 .macro SAVE4x2
+#endif
 
 	mr		T1,	CO
 
@@ -3758,14 +4330,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x1_1', `
+#else
 .macro LOAD4x1_1
+#endif
 
 	lxsspx		vs0,	o0,	AO
 
@@ -3781,9 +4361,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_I1', `
+#else
 .macro KERNEL4x1_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -3810,9 +4398,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs35,	vs0,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_1', `
+#else
 .macro KERNEL4x1_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -3839,9 +4435,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs0,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_2', `
+#else
 .macro KERNEL4x1_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -3868,9 +4472,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs4,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_E2', `
+#else
 .macro KERNEL4x1_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16
@@ -3882,9 +4494,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs4,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_SUBI1', `
+#else
 .macro KERNEL4x1_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -3911,9 +4531,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs35,	vs0,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_SUB1', `
+#else
 .macro KERNEL4x1_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -3940,9 +4568,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs0,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x1', `
+#else
 .macro SAVE4x1
+#endif
 
 	mr		T1,	CO
 
@@ -4015,14 +4651,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	4
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=16
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x16_1', `
+#else
 .macro LOAD2x16_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 	lxvw4x		vs1,	o16,	AO
@@ -4038,9 +4682,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_I1', `
+#else
 .macro KERNEL2x16_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -4069,9 +4721,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs3,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_1', `
+#else
 .macro KERNEL2x16_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -4100,9 +4760,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs3,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_2', `
+#else
 .macro KERNEL2x16_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4131,9 +4799,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs7,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_E2', `
+#else
 .macro KERNEL2x16_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -4147,9 +4823,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs7,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_SUBI1', `
+#else
 .macro KERNEL2x16_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4178,9 +4862,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs3,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_SUB1', `
+#else
 .macro KERNEL2x16_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4209,9 +4901,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs3,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x16', `
+#else
 .macro SAVE2x16
+#endif
 
 	mr		T1,	CO
 
@@ -4274,14 +4974,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x8_1', `
+#else
 .macro LOAD2x8_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 	lxvw4x		vs1,	o16,	AO
@@ -4295,9 +5003,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_I1', `
+#else
 .macro KERNEL2x8_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -4320,9 +5036,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs1,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_1', `
+#else
 .macro KERNEL2x8_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -4345,9 +5069,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs1,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_2', `
+#else
 .macro KERNEL2x8_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4370,9 +5102,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs5,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_E2', `
+#else
 .macro KERNEL2x8_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -4382,9 +5122,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs5,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_SUBI1', `
+#else
 .macro KERNEL2x8_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4407,9 +5155,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs1,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_SUB1', `
+#else
 .macro KERNEL2x8_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4432,9 +5188,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs1,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x8', `
+#else
 .macro SAVE2x8
+#endif
 
 	mr		T1,	CO
 
@@ -4481,14 +5245,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x4_1', `
+#else
 .macro LOAD2x4_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 
@@ -4501,9 +5273,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_I1', `
+#else
 .macro KERNEL2x4_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -4523,9 +5303,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs33,	vs0,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_1', `
+#else
 .macro KERNEL2x4_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -4545,9 +5333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs0,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_2', `
+#else
 .macro KERNEL2x4_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4567,9 +5363,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs4,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_E2', `
+#else
 .macro KERNEL2x4_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -4577,9 +5381,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs4,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_SUBI1', `
+#else
 .macro KERNEL2x4_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4599,9 +5411,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs33,	vs0,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_SUB1', `
+#else
 .macro KERNEL2x4_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4621,9 +5441,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs0,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x4', `
+#else
 .macro SAVE2x4
+#endif
 
 	mr		T1,	CO
 
@@ -4662,14 +5490,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x2_1', `
+#else
 .macro LOAD2x2_1
+#endif
 
 	lxsspx		vs0,	o0,	AO
 	lxsspx		vs1,	o4,	AO
@@ -4684,9 +5520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_I1', `
+#else
 .macro KERNEL2x2_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -4710,9 +5554,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs35,	vs1,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_1', `
+#else
 .macro KERNEL2x2_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -4736,9 +5588,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs1,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_2', `
+#else
 .macro KERNEL2x2_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -4762,9 +5622,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs5,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_E2', `
+#else
 .macro KERNEL2x2_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16
@@ -4774,9 +5642,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs5,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_SUBI1', `
+#else
 .macro KERNEL2x2_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -4800,9 +5676,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs35,	vs1,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_SUB1', `
+#else
 .macro KERNEL2x2_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -4826,9 +5710,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs1,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x2', `
+#else
 .macro SAVE2x2
+#endif
 
 	mr		T1,	CO
 
@@ -4875,14 +5767,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x1_1', `
+#else
 .macro LOAD2x1_1
+#endif
 
 	lxsspx		vs0,	o0,	AO
 
@@ -4896,9 +5796,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_I1', `
+#else
 .macro KERNEL2x1_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -4919,9 +5827,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs33,	vs0,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_1', `
+#else
 .macro KERNEL2x1_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -4942,9 +5858,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs33,	vs0,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_2', `
+#else
 .macro KERNEL2x1_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -4965,9 +5889,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs33,	vs4,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_E2', `
+#else
 .macro KERNEL2x1_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16
@@ -4975,9 +5907,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs33,	vs4,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_SUBI1', `
+#else
 .macro KERNEL2x1_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -4998,9 +5938,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs33,	vs0,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_SUB1', `
+#else
 .macro KERNEL2x1_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -5021,9 +5969,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs33,	vs0,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x1', `
+#else
 .macro SAVE2x1
+#endif
 
 	mr		T1,	CO
 
@@ -5062,14 +6018,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	4
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=16
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x16_1', `
+#else
 .macro LOAD1x16_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 	lxvw4x		vs1,	o16,	AO
@@ -5084,9 +6048,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_I1', `
+#else
 .macro KERNEL1x16_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -5109,9 +6081,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs3,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_1', `
+#else
 .macro KERNEL1x16_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -5134,9 +6114,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs3,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_2', `
+#else
 .macro KERNEL1x16_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5159,9 +6147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs7,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_E2', `
+#else
 .macro KERNEL1x16_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -5170,9 +6166,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs7,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_SUBI1', `
+#else
 .macro KERNEL1x16_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5195,9 +6199,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs3,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_SUB1', `
+#else
 .macro KERNEL1x16_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5220,9 +6232,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs3,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x16', `
+#else
 .macro SAVE1x16
+#endif
 
 	mr		T1,	CO
 
@@ -5256,14 +6276,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x8_1', `
+#else
 .macro LOAD1x8_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 	lxvw4x		vs1,	o16,	AO
@@ -5276,9 +6304,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_I1', `
+#else
 .macro KERNEL1x8_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -5297,9 +6333,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs33,	vs1,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_1', `
+#else
 .macro KERNEL1x8_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -5318,9 +6362,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs1,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_2', `
+#else
 .macro KERNEL1x8_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5339,18 +6391,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs5,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_E2', `
+#else
 .macro KERNEL1x8_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
 	xvmaddasp	vs33,	vs5,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_SUBI1', `
+#else
 .macro KERNEL1x8_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5369,9 +6437,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs33,	vs1,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_SUB1', `
+#else
 .macro KERNEL1x8_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5390,9 +6466,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs1,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x8', `
+#else
 .macro SAVE1x8
+#endif
 
 	mr		T1,	CO
 
@@ -5418,14 +6502,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x4_1', `
+#else
 .macro LOAD1x4_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 
@@ -5437,9 +6529,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_I1', `
+#else
 .macro KERNEL1x4_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -5456,9 +6556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs32,	vs0,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_1', `
+#else
 .macro KERNEL1x4_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -5475,9 +6583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs32,	vs0,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_2', `
+#else
 .macro KERNEL1x4_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5494,17 +6610,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs32,	vs4,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_E2', `
+#else
 .macro KERNEL1x4_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_SUBI1', `
+#else
 .macro KERNEL1x4_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5521,9 +6653,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs32,	vs0,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_SUB1', `
+#else
 .macro KERNEL1x4_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5540,9 +6680,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs32,	vs0,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x4', `
+#else
 .macro SAVE1x4
+#endif
 
 	mr		T1,	CO
 
@@ -5564,14 +6712,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x2_1', `
+#else
 .macro LOAD1x2_1
+#endif
 
 	lxsspx		vs0,	o0,	AO
 	lxsspx		vs1,	o4,	AO
@@ -5585,9 +6741,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_I1', `
+#else
 .macro KERNEL1x2_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -5607,9 +6771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs33,	vs1,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_1', `
+#else
 .macro KERNEL1x2_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -5629,9 +6801,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs33,	vs1,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_2', `
+#else
 .macro KERNEL1x2_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -5651,18 +6831,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs33,	vs5,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_E2', `
+#else
 .macro KERNEL1x2_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16
 	xsmaddadp	vs33,	vs5,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_SUBI1', `
+#else
 .macro KERNEL1x2_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -5682,9 +6878,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs33,	vs1,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_SUB1', `
+#else
 .macro KERNEL1x2_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -5704,9 +6908,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs33,	vs1,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x2', `
+#else
 .macro SAVE1x2
+#endif
 
 	mr		T1,	CO
 
@@ -5732,14 +6944,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x1_1', `
+#else
 .macro LOAD1x1_1
+#endif
 
 	lxsspx		vs0,	o0,	AO
 
@@ -5752,9 +6972,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_I1', `
+#else
 .macro KERNEL1x1_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -5772,9 +7000,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs32,	vs0,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_1', `
+#else
 .macro KERNEL1x1_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -5792,9 +7028,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs32,	vs0,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_2', `
+#else
 .macro KERNEL1x1_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -5812,17 +7056,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs32,	vs4,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_E2', `
+#else
 .macro KERNEL1x1_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_SUBI1', `
+#else
 .macro KERNEL1x1_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -5840,9 +7100,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs32,	vs0,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_SUB1', `
+#else
 .macro KERNEL1x1_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -5860,9 +7128,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs32,	vs0,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x1', `
+#else
 .macro SAVE1x1
+#endif
 
 	mr		T1,	CO
 
@@ -5884,13 +7160,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	4
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 
 
 
+#if defined(_AIX)
+define(`COPYB_4x8', `
+#else
 .macro COPYB_4x8
+#endif
 
 
 	lxvw4x		vs5,	o0,	BO
@@ -5993,10 +7277,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvw4x		vs54,	o48,	BBO
 
 	addi		BBO,	BBO,	64
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`COPYB_1x8', `
+#else
 .macro COPYB_1x8
+#endif
 
 
 	lxvw4x		vs5,	o0,	BO
@@ -6026,5 +7318,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvw4x		vs14,	o48,	BBO
 
 	addi		BBO,	BBO,	64
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
diff --git a/kernel/power/sgemm_tcopy_macros_16_power8.S b/kernel/power/sgemm_tcopy_macros_16_power8.S
index 53f9c8b82..ed592a604 100644
--- a/kernel/power/sgemm_tcopy_macros_16_power8.S
+++ b/kernel/power/sgemm_tcopy_macros_16_power8.S
@@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=4 and M=16
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x16', `
+#else
 .macro COPY_4x16
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	lxvw4x		vs33,	o16,	A0
@@ -88,13 +92,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvw4x		vs46,	o32,	T1
 	stxvw4x		vs47,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=4 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x8', `
+#else
 .macro COPY_4x8
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	lxvw4x		vs33,	o16,	A0
@@ -124,13 +136,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvw4x		vs38,	o32,	T1
 	stxvw4x		vs39,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=4 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x4', `
+#else
 .macro COPY_4x4
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 
@@ -150,13 +170,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxvw4x		vs35,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=4 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x2', `
+#else
 .macro COPY_4x2
+#endif
 
 	lxsspx		vs32,	o0,	A0
 	lxsspx		vs33,	o4,	A0
@@ -190,13 +218,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxsspx		vs38,	o0,	T1
 	stxsspx		vs39,	o4,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=4 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x1', `
+#else
 .macro COPY_4x1
+#endif
 
 	lxsspx		vs32,	o0,	A0
 
@@ -218,13 +254,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxsspx		vs35,	o4,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=2 and M=16
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x16', `
+#else
 .macro COPY_2x16
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	lxvw4x		vs33,	o16,	A0
@@ -250,13 +294,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvw4x		vs38,	o32,	T1
 	stxvw4x		vs39,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=2 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x8', `
+#else
 .macro COPY_2x8
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	lxvw4x		vs33,	o16,	A0
@@ -272,13 +324,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvw4x		vs34,	o32,	T1
 	stxvw4x		vs35,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=2 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x4', `
+#else
 .macro COPY_2x4
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 
@@ -290,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxvw4x		vs33,	o16,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=2 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x2', `
+#else
 .macro COPY_2x2
+#endif
 
 	lxsspx		vs32,	o0,	A0
 	lxsspx		vs33,	o4,	A0
@@ -314,13 +382,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxsspx		vs34,	o0,	T1
 	stxsspx		vs35,	o4,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=2 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x1', `
+#else
 .macro COPY_2x1
+#endif
 
 	lxsspx		vs32,	o0,	A0
 
@@ -332,13 +408,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxsspx		vs33,	o4,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=1 and M=16
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x16', `
+#else
 .macro COPY_1x16
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	lxvw4x		vs33,	o16,	A0
@@ -352,13 +436,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvw4x		vs34,	o32,	T1
 	stxvw4x		vs35,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=1 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x8', `
+#else
 .macro COPY_1x8
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	lxvw4x		vs33,	o16,	A0
@@ -368,13 +460,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvw4x		vs32,	o0,	T1
 	stxvw4x		vs33,	o16,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=1 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x4', `
+#else
 .macro COPY_1x4
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 
@@ -382,13 +482,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxvw4x		vs32,	o0,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=1 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x2', `
+#else
 .macro COPY_1x2
+#endif
 
 	lxsspx		vs32,	o0,	A0
 	lxsspx		vs33,	o4,	A0
@@ -398,13 +506,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxsspx		vs32,	o0,	T1
 	stxsspx		vs33,	o4,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=1 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x1', `
+#else
 .macro COPY_1x1
+#endif
 
 	lxsspx		vs32,	o0,	A0
 
@@ -412,5 +528,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxsspx		vs32,	o0,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
diff --git a/kernel/power/sgemm_tcopy_macros_8_power8.S b/kernel/power/sgemm_tcopy_macros_8_power8.S
index 1b71d5bb3..f80f095dc 100644
--- a/kernel/power/sgemm_tcopy_macros_8_power8.S
+++ b/kernel/power/sgemm_tcopy_macros_8_power8.S
@@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=4 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x8', `
+#else
 .macro COPY_4x8
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	lxvw4x		vs33,	o16,	A0
@@ -68,13 +72,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvw4x		vs38,	o32,	T1
 	stxvw4x		vs39,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=4 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x4', `
+#else
 .macro COPY_4x4
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 
@@ -94,13 +106,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxvw4x		vs35,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=4 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x2', `
+#else
 .macro COPY_4x2
+#endif
 
 	lxsspx		vs32,	o0,	A0
 	lxsspx		vs33,	o4,	A0
@@ -134,13 +154,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxsspx		vs38,	o0,	T1
 	stxsspx		vs39,	o4,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=4 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x1', `
+#else
 .macro COPY_4x1
+#endif
 
 	lxsspx		vs32,	o0,	A0
 
@@ -162,13 +190,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxsspx		vs35,	o4,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=2 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x8', `
+#else
 .macro COPY_2x8
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	lxvw4x		vs33,	o16,	A0
@@ -184,13 +220,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvw4x		vs34,	o32,	T1
 	stxvw4x		vs35,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=2 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x4', `
+#else
 .macro COPY_2x4
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 
@@ -202,13 +246,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxvw4x		vs33,	o16,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=2 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x2', `
+#else
 .macro COPY_2x2
+#endif
 
 	lxsspx		vs32,	o0,	A0
 	lxsspx		vs33,	o4,	A0
@@ -226,13 +278,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxsspx		vs34,	o0,	T1
 	stxsspx		vs35,	o4,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=2 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x1', `
+#else
 .macro COPY_2x1
+#endif
 
 	lxsspx		vs32,	o0,	A0
 
@@ -244,13 +304,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxsspx		vs33,	o4,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=1 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x8', `
+#else
 .macro COPY_1x8
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 	lxvw4x		vs33,	o16,	A0
@@ -260,13 +328,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvw4x		vs32,	o0,	T1
 	stxvw4x		vs33,	o16,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=1 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x4', `
+#else
 .macro COPY_1x4
+#endif
 
 	lxvw4x		vs32,	o0,	A0
 
@@ -274,13 +350,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxvw4x		vs32,	o0,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=1 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x2', `
+#else
 .macro COPY_1x2
+#endif
 
 	lxsspx		vs32,	o0,	A0
 	lxsspx		vs33,	o4,	A0
@@ -290,13 +374,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxsspx		vs32,	o0,	T1
 	stxsspx		vs33,	o4,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 /**********************************************************************************************
 * Macros for N=1 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x1', `
+#else
 .macro COPY_1x1
+#endif
 
 	lxsspx		vs32,	o0,	A0
 
@@ -304,5 +396,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxsspx		vs32,	o0,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c
index 6eecb60a1..329a8cd06 100644
--- a/kernel/power/srot_microk_power8.c
+++ b/kernel/power/srot_microk_power8.c
@@ -71,10 +71,10 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
        "addi		%4, %4, 64	\n\t"
 
        "addic.		%2, %2, -16	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "xvmulsp		40, 32, 36	\n\t"	// c * x
        "xvmulsp		41, 33, 36	\n\t"
@@ -138,9 +138,9 @@ static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
        "addi		%4, %4, 128	\n\t"
 
        "addic.		%2, %2, -16	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "xvmulsp		40, 32, 36	\n\t"	// c * x
        "xvmulsp		41, 33, 36	\n\t"
diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c
index 058ff3399..88fba3166 100644
--- a/kernel/power/sscal_microk_power8.c
+++ b/kernel/power/sscal_microk_power8.c
@@ -56,10 +56,10 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -32	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "xvmulsp		40, 32, %x3	\n\t"
        "xvmulsp		41, 33, %x3	\n\t"
@@ -92,9 +92,9 @@ static void sscal_kernel_16 (long n, float *x, float alpha)
        "addi		%2, %2, 256	\n\t"
 
        "addic.		%1, %1, -32	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "xvmulsp		40, 32, %x3	\n\t"
        "xvmulsp		41, 33, %x3	\n\t"
@@ -147,8 +147,8 @@ static void sscal_kernel_16_zero (long n, float *x)
     (
        "xxlxor		%x3, %x3, %x3	\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "stxvd2x		%x3, 0, %2	\n\t"
        "stxvd2x		%x3, %4, %2	\n\t"
@@ -162,7 +162,7 @@ static void sscal_kernel_16_zero (long n, float *x)
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -32	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
      "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
      :
diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c
index cfefdd6ef..a407018a8 100644
--- a/kernel/power/sswap_microk_power8.c
+++ b/kernel/power/sswap_microk_power8.c
@@ -39,8 +39,8 @@ static void sswap_kernel_32 (long n, float *x, float *y)
 {
   __asm__
     (
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "lxvd2x		32, 0, %4	\n\t"
        "lxvd2x		33, %5, %4	\n\t"
@@ -83,7 +83,7 @@ static void sswap_kernel_32 (long n, float *x, float *y)
        "addi		%4, %4, 128	\n\t"
 
        "addic.		%2, %2, -32	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
      "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
      :
diff --git a/kernel/power/strmm_macros_16x8_power8.S b/kernel/power/strmm_macros_16x8_power8.S
index 27bc1e89c..6c016d6fa 100644
--- a/kernel/power/strmm_macros_16x8_power8.S
+++ b/kernel/power/strmm_macros_16x8_power8.S
@@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=8 and M=16
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD8x16_1', `
+#else
 .macro LOAD8x16_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 	lxvw4x		vs1,	o16,	AO
@@ -63,9 +67,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x16_I1', `
+#else
 .macro KERNEL8x16_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -133,9 +145,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs63,	vs3,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x16_1', `
+#else
 .macro KERNEL8x16_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -203,9 +223,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs63,	vs3,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x16_2', `
+#else
 .macro KERNEL8x16_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -273,9 +301,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs63,	vs7,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x16_E2', `
+#else
 .macro KERNEL8x16_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -319,9 +355,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs63,	vs7,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x16_SUBI1', `
+#else
 .macro KERNEL8x16_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -389,9 +433,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs63,	vs3,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x16_SUB1', `
+#else
 .macro KERNEL8x16_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -459,9 +511,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs63,	vs3,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE8x16', `
+#else
 .macro SAVE8x16
+#endif
 
 	mr		T1,	CO
 
@@ -698,14 +758,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=8 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD8x8_1', `
+#else
 .macro LOAD8x8_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 	lxvw4x		vs1,	o16,	AO
@@ -728,9 +796,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x8_I1', `
+#else
 .macro KERNEL8x8_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -780,9 +856,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs47,	vs1,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x8_1', `
+#else
 .macro KERNEL8x8_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -832,9 +916,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs1,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x8_2', `
+#else
 .macro KERNEL8x8_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -884,9 +976,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs5,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x8_E2', `
+#else
 .macro KERNEL8x8_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -914,9 +1014,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs5,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x8_SUBI1', `
+#else
 .macro KERNEL8x8_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -966,9 +1074,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs47,	vs1,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x8_SUB1', `
+#else
 .macro KERNEL8x8_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -1018,9 +1134,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs1,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE8x8', `
+#else
 .macro SAVE8x8
+#endif
 
 	mr		T1,	CO
 
@@ -1193,14 +1317,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=8 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD8x4_1', `
+#else
 .macro LOAD8x4_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 
@@ -1222,9 +1354,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x4_I1', `
+#else
 .macro KERNEL8x4_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -1265,9 +1405,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs0,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x4_1', `
+#else
 .macro KERNEL8x4_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -1308,9 +1456,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs0,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x4_2', `
+#else
 .macro KERNEL8x4_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -1351,9 +1507,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs4,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x4_E2', `
+#else
 .macro KERNEL8x4_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -1373,9 +1537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs4,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x4_SUBI1', `
+#else
 .macro KERNEL8x4_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -1416,9 +1588,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs0,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x4_SUB1', `
+#else
 .macro KERNEL8x4_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -1459,9 +1639,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs0,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE8x4', `
+#else
 .macro SAVE8x4
+#endif
 
 	mr		T1,	CO
 
@@ -1602,14 +1790,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=8 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD8x2_1', `
+#else
 .macro LOAD8x2_1
+#endif
 
 	lxsspx		vs0,	o0,	AO
 	lxsspx		vs1,	o4,	AO
@@ -1632,9 +1828,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x2_I1', `
+#else
 .macro KERNEL8x2_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -1684,9 +1888,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs47,	vs1,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x2_1', `
+#else
 .macro KERNEL8x2_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -1736,9 +1948,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs47,	vs1,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x2_2', `
+#else
 .macro KERNEL8x2_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -1788,9 +2008,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs47,	vs5,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x2_E2', `
+#else
 .macro KERNEL8x2_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16
@@ -1818,9 +2046,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs47,	vs5,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x2_SUBI1', `
+#else
 .macro KERNEL8x2_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -1870,9 +2106,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs47,	vs1,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x2_SUB1', `
+#else
 .macro KERNEL8x2_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -1922,9 +2166,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs47,	vs1,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE8x2', `
+#else
 .macro SAVE8x2
+#endif
 
 	mr		T1,	CO
 
@@ -2097,14 +2349,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=8 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD8x1_1', `
+#else
 .macro LOAD8x1_1
+#endif
 
 	lxsspx		vs0,	o0,	AO
 
@@ -2126,9 +2386,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x1_I1', `
+#else
 .macro KERNEL8x1_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -2169,9 +2437,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs39,	vs0,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x1_1', `
+#else
 .macro KERNEL8x1_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -2212,9 +2488,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs0,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x1_2', `
+#else
 .macro KERNEL8x1_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -2255,9 +2539,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs4,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x1_E2', `
+#else
 .macro KERNEL8x1_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16
@@ -2277,9 +2569,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs4,	vs23
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x1_SUBI1', `
+#else
 .macro KERNEL8x1_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -2320,9 +2620,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs39,	vs0,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL8x1_SUB1', `
+#else
 .macro KERNEL8x1_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -2363,9 +2671,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs0,	vs15
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE8x1', `
+#else
 .macro SAVE8x1
+#endif
 
 	mr		T1,	CO
 
@@ -2506,14 +2822,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	4
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=16
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x16_1', `
+#else
 .macro LOAD4x16_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 	lxvw4x		vs1,	o16,	AO
@@ -2531,9 +2855,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_I1', `
+#else
 .macro KERNEL4x16_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -2574,9 +2906,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs47,	vs3,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_1', `
+#else
 .macro KERNEL4x16_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -2617,9 +2957,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs3,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_2', `
+#else
 .macro KERNEL4x16_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -2660,9 +3008,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs7,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_E2', `
+#else
 .macro KERNEL4x16_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -2686,9 +3042,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs7,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_SUBI1', `
+#else
 .macro KERNEL4x16_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -2729,9 +3093,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs47,	vs3,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x16_SUB1', `
+#else
 .macro KERNEL4x16_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -2772,9 +3144,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs47,	vs3,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x16', `
+#else
 .macro SAVE4x16
+#endif
 
 	mr		T1,	CO
 
@@ -2895,14 +3275,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x8_1', `
+#else
 .macro LOAD4x8_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 	lxvw4x		vs1,	o16,	AO
@@ -2918,9 +3306,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_I1', `
+#else
 .macro KERNEL4x8_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -2951,9 +3347,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs1,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_1', `
+#else
 .macro KERNEL4x8_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -2984,9 +3388,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs1,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_2', `
+#else
 .macro KERNEL4x8_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -3017,9 +3429,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs5,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_E2', `
+#else
 .macro KERNEL4x8_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -3035,9 +3455,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs5,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_SUBI1', `
+#else
 .macro KERNEL4x8_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -3068,9 +3496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs1,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x8_SUB1', `
+#else
 .macro KERNEL4x8_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -3101,9 +3537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs1,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x8', `
+#else
 .macro SAVE4x8
+#endif
 
 	mr		T1,	CO
 
@@ -3192,14 +3636,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x4_1', `
+#else
 .macro LOAD4x4_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 
@@ -3214,9 +3666,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_I1', `
+#else
 .macro KERNEL4x4_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -3242,9 +3702,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs0,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_1', `
+#else
 .macro KERNEL4x4_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -3270,9 +3738,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs0,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_2', `
+#else
 .macro KERNEL4x4_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -3298,9 +3774,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs4,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_E2', `
+#else
 .macro KERNEL4x4_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -3312,9 +3796,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs4,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_SUBI1', `
+#else
 .macro KERNEL4x4_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -3340,9 +3832,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs0,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x4_SUB1', `
+#else
 .macro KERNEL4x4_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -3368,9 +3868,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs0,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x4', `
+#else
 .macro SAVE4x4
+#endif
 
 	mr		T1,	CO
 
@@ -3443,14 +3951,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x2_1', `
+#else
 .macro LOAD4x2_1
+#endif
 
 	lxsspx		vs0,	o0,	AO
 	lxsspx		vs1,	o4,	AO
@@ -3466,9 +3982,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_I1', `
+#else
 .macro KERNEL4x2_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -3499,9 +4023,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs39,	vs1,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_1', `
+#else
 .macro KERNEL4x2_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -3532,9 +4064,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs1,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_2', `
+#else
 .macro KERNEL4x2_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -3565,9 +4105,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs5,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_E2', `
+#else
 .macro KERNEL4x2_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16
@@ -3583,9 +4131,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs5,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_SUBI1', `
+#else
 .macro KERNEL4x2_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -3616,9 +4172,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs39,	vs1,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x2_SUB1', `
+#else
 .macro KERNEL4x2_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -3649,9 +4213,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs39,	vs1,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x2', `
+#else
 .macro SAVE4x2
+#endif
 
 	mr		T1,	CO
 
@@ -3740,14 +4312,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD4x1_1', `
+#else
 .macro LOAD4x1_1
+#endif
 
 	lxsspx		vs0,	o0,	AO
 
@@ -3762,9 +4342,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_I1', `
+#else
 .macro KERNEL4x1_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -3790,9 +4378,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs35,	vs0,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_1', `
+#else
 .macro KERNEL4x1_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -3818,9 +4414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs0,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_2', `
+#else
 .macro KERNEL4x1_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -3846,9 +4450,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs4,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_E2', `
+#else
 .macro KERNEL4x1_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16
@@ -3860,9 +4472,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs4,	vs19
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_SUBI1', `
+#else
 .macro KERNEL4x1_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -3888,9 +4508,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs35,	vs0,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL4x1_SUB1', `
+#else
 .macro KERNEL4x1_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -3916,9 +4544,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs0,	vs11
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE4x1', `
+#else
 .macro SAVE4x1
+#endif
 
 	mr		T1,	CO
 
@@ -3991,14 +4627,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	4
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=16
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x16_1', `
+#else
 .macro LOAD2x16_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 	lxvw4x		vs1,	o16,	AO
@@ -4014,9 +4658,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_I1', `
+#else
 .macro KERNEL2x16_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -4045,9 +4697,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs3,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_1', `
+#else
 .macro KERNEL2x16_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -4076,9 +4736,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs3,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_2', `
+#else
 .macro KERNEL2x16_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4107,9 +4775,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs7,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_E2', `
+#else
 .macro KERNEL2x16_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -4123,9 +4799,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs7,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_SUBI1', `
+#else
 .macro KERNEL2x16_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4154,9 +4838,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs39,	vs3,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x16_SUB1', `
+#else
 .macro KERNEL2x16_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4185,9 +4877,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs39,	vs3,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x16', `
+#else
 .macro SAVE2x16
+#endif
 
 	mr		T1,	CO
 
@@ -4250,14 +4950,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x8_1', `
+#else
 .macro LOAD2x8_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 	lxvw4x		vs1,	o16,	AO
@@ -4271,9 +4979,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_I1', `
+#else
 .macro KERNEL2x8_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -4296,9 +5012,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs1,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_1', `
+#else
 .macro KERNEL2x8_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -4321,9 +5045,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs1,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_2', `
+#else
 .macro KERNEL2x8_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4346,9 +5078,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs5,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_E2', `
+#else
 .macro KERNEL2x8_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -4358,9 +5098,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs5,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_SUBI1', `
+#else
 .macro KERNEL2x8_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4383,9 +5131,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs1,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_SUB1', `
+#else
 .macro KERNEL2x8_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4408,9 +5164,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs1,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x8', `
+#else
 .macro SAVE2x8
+#endif
 
 	mr		T1,	CO
 
@@ -4457,14 +5221,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x4_1', `
+#else
 .macro LOAD2x4_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 
@@ -4477,9 +5249,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_I1', `
+#else
 .macro KERNEL2x4_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -4499,9 +5279,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs33,	vs0,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_1', `
+#else
 .macro KERNEL2x4_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -4521,9 +5309,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs0,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_2', `
+#else
 .macro KERNEL2x4_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4543,9 +5339,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs4,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_E2', `
+#else
 .macro KERNEL2x4_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -4553,9 +5357,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs4,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_SUBI1', `
+#else
 .macro KERNEL2x4_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4575,9 +5387,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs33,	vs0,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_SUB1', `
+#else
 .macro KERNEL2x4_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -4597,9 +5417,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs0,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x4', `
+#else
 .macro SAVE2x4
+#endif
 
 	mr		T1,	CO
 
@@ -4638,14 +5466,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x2_1', `
+#else
 .macro LOAD2x2_1
+#endif
 
 	lxsspx		vs0,	o0,	AO
 	lxsspx		vs1,	o4,	AO
@@ -4659,9 +5495,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_I1', `
+#else
 .macro KERNEL2x2_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -4684,9 +5528,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs35,	vs1,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_1', `
+#else
 .macro KERNEL2x2_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -4709,9 +5561,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs1,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_2', `
+#else
 .macro KERNEL2x2_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -4734,9 +5594,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs5,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_E2', `
+#else
 .macro KERNEL2x2_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16
@@ -4746,9 +5614,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs5,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_SUBI1', `
+#else
 .macro KERNEL2x2_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -4771,9 +5647,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs35,	vs1,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_SUB1', `
+#else
 .macro KERNEL2x2_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -4796,9 +5680,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs35,	vs1,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x2', `
+#else
 .macro SAVE2x2
+#endif
 
 	mr		T1,	CO
 
@@ -4845,14 +5737,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x1_1', `
+#else
 .macro LOAD2x1_1
+#endif
 
 	lxsspx		vs0,	o0,	AO
 
@@ -4865,9 +5765,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_I1', `
+#else
 .macro KERNEL2x1_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -4887,9 +5795,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs33,	vs0,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_1', `
+#else
 .macro KERNEL2x1_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -4909,9 +5825,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs33,	vs0,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_2', `
+#else
 .macro KERNEL2x1_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -4931,9 +5855,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs33,	vs4,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_E2', `
+#else
 .macro KERNEL2x1_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16
@@ -4941,9 +5873,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs33,	vs4,	vs17
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_SUBI1', `
+#else
 .macro KERNEL2x1_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -4963,9 +5903,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs33,	vs0,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_SUB1', `
+#else
 .macro KERNEL2x1_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -4985,9 +5933,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs33,	vs0,	vs9
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x1', `
+#else
 .macro SAVE2x1
+#endif
 
 	mr		T1,	CO
 
@@ -5026,14 +5982,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	4
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=16
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x16_1', `
+#else
 .macro LOAD1x16_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 	lxvw4x		vs1,	o16,	AO
@@ -5048,9 +6012,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	4
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_I1', `
+#else
 .macro KERNEL1x16_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -5073,9 +6045,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs3,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_1', `
+#else
 .macro KERNEL1x16_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -5098,9 +6078,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs3,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_2', `
+#else
 .macro KERNEL1x16_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5123,9 +6111,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs7,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_E2', `
+#else
 .macro KERNEL1x16_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
@@ -5134,9 +6130,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs7,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_SUBI1', `
+#else
 .macro KERNEL1x16_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5159,9 +6163,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs35,	vs3,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x16_SUB1', `
+#else
 .macro KERNEL1x16_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5184,9 +6196,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs35,	vs3,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x16', `
+#else
 .macro SAVE1x16
+#endif
 
 	mr		T1,	CO
 
@@ -5220,14 +6240,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x8_1', `
+#else
 .macro LOAD1x8_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 	lxvw4x		vs1,	o16,	AO
@@ -5240,9 +6268,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	4
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_I1', `
+#else
 .macro KERNEL1x8_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -5261,9 +6297,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs33,	vs1,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_1', `
+#else
 .macro KERNEL1x8_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -5282,9 +6326,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs1,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_2', `
+#else
 .macro KERNEL1x8_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5303,18 +6355,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs5,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_E2', `
+#else
 .macro KERNEL1x8_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
 	xvmaddasp	vs33,	vs5,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_SUBI1', `
+#else
 .macro KERNEL1x8_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5333,9 +6401,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs33,	vs1,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_SUB1', `
+#else
 .macro KERNEL1x8_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5354,9 +6430,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs33,	vs1,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x8', `
+#else
 .macro SAVE1x8
+#endif
 
 	mr		T1,	CO
 
@@ -5382,14 +6466,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x4_1', `
+#else
 .macro LOAD1x4_1
+#endif
 
 	lxvw4x		vs0,	o0,	AO
 
@@ -5401,9 +6493,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	4
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_I1', `
+#else
 .macro KERNEL1x4_I1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -5420,9 +6520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs32,	vs0,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_1', `
+#else
 .macro KERNEL1x4_1
+#endif
 
 
 	lxvw4x		vs4,	o0,	AO
@@ -5439,9 +6547,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs32,	vs0,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_2', `
+#else
 .macro KERNEL1x4_2
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5458,17 +6574,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs32,	vs4,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_E2', `
+#else
 .macro KERNEL1x4_E2
+#endif
 
 
 	xvmaddasp	vs32,	vs4,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_SUBI1', `
+#else
 .macro KERNEL1x4_SUBI1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5485,9 +6617,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmulsp		vs32,	vs0,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_SUB1', `
+#else
 .macro KERNEL1x4_SUB1
+#endif
 
 
 	lxvw4x		vs0,	o0,	AO
@@ -5504,9 +6644,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddasp	vs32,	vs0,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x4', `
+#else
 .macro SAVE1x4
+#endif
 
 	mr		T1,	CO
 
@@ -5528,14 +6676,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x2_1', `
+#else
 .macro LOAD1x2_1
+#endif
 
 	lxsspx		vs0,	o0,	AO
 	lxsspx		vs1,	o4,	AO
@@ -5548,9 +6704,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	4
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_I1', `
+#else
 .macro KERNEL1x2_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -5569,9 +6733,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs33,	vs1,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_1', `
+#else
 .macro KERNEL1x2_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -5590,9 +6762,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs33,	vs1,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_2', `
+#else
 .macro KERNEL1x2_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -5611,18 +6791,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs33,	vs5,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_E2', `
+#else
 .macro KERNEL1x2_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16
 	xsmaddadp	vs33,	vs5,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_SUBI1', `
+#else
 .macro KERNEL1x2_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -5641,9 +6837,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs33,	vs1,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_SUB1', `
+#else
 .macro KERNEL1x2_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -5662,9 +6866,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs33,	vs1,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x2', `
+#else
 .macro SAVE1x2
+#endif
 
 	mr		T1,	CO
 
@@ -5690,14 +6902,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	8
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x1_1', `
+#else
 .macro LOAD1x1_1
+#endif
 
 	lxsspx		vs0,	o0,	AO
 
@@ -5709,9 +6929,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		BO,	BO,	4
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_I1', `
+#else
 .macro KERNEL1x1_I1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -5728,9 +6956,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs32,	vs0,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_1', `
+#else
 .macro KERNEL1x1_1
+#endif
 
 
 	lxsspx		vs4,	o0,	AO
@@ -5747,9 +6983,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs32,	vs0,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_2', `
+#else
 .macro KERNEL1x1_2
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -5766,17 +7010,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs32,	vs4,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_E2', `
+#else
 .macro KERNEL1x1_E2
+#endif
 
 
 	xsmaddadp	vs32,	vs4,	vs16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_SUBI1', `
+#else
 .macro KERNEL1x1_SUBI1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -5793,9 +7053,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmuldp		vs32,	vs0,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_SUB1', `
+#else
 .macro KERNEL1x1_SUB1
+#endif
 
 
 	lxsspx		vs0,	o0,	AO
@@ -5812,9 +7080,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xsmaddadp	vs32,	vs0,	vs8
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x1', `
+#else
 .macro SAVE1x1
+#endif
 
 	mr		T1,	CO
 
@@ -5836,5 +7112,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	addi		CO,	CO,	4
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
diff --git a/kernel/power/zasum_microk_power8.c b/kernel/power/zasum_microk_power8.c
index 82366902d..3f0af4232 100644
--- a/kernel/power/zasum_microk_power8.c
+++ b/kernel/power/zasum_microk_power8.c
@@ -68,10 +68,10 @@ static double zasum_kernel_8 (long n, double *x)
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -8	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "xvabsdp		48, 40		\n\t"
        "xvabsdp		49, 41		\n\t"
@@ -108,9 +108,9 @@ static double zasum_kernel_8 (long n, double *x)
        "xvadddp		38, 38, %x5	\n\t"
        "xvadddp		39, 39, %x6	\n\t"
 
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "xvabsdp		48, 40		\n\t"
        "xvabsdp		49, 41		\n\t"
@@ -140,7 +140,7 @@ static double zasum_kernel_8 (long n, double *x)
 
        "xvadddp		32, 32, 36	\n\t"
 
-       "xxswapd		33, 32		\n\t"
+       XXSWAPD_S(33,32)
        "xsadddp		%x0, 32, 33	\n"
 
      "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
diff --git a/kernel/power/zaxpy_microk_power8.c b/kernel/power/zaxpy_microk_power8.c
index 124614f62..959050e5f 100644
--- a/kernel/power/zaxpy_microk_power8.c
+++ b/kernel/power/zaxpy_microk_power8.c
@@ -61,8 +61,8 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
 
   __asm__
     (
-       "xxspltd		32, %x19, 0	\n\t"	// alpha_r
-       "xxspltd		33, %x20, 0	\n\t"	// alpha_i
+       XXSPLTD_S(32,%x19,0)	// alpha_r
+       XXSPLTD_S(33,%x20,0)	// alpha_i
 
        "lxvd2x		36, 0, %21	\n\t"	// mvec
 
@@ -87,10 +87,10 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
        "lxvd2x		50, %23, %3	\n\t"	// y2
        "lxvd2x		51, %24, %3	\n\t"	// y3
 
-       "xxswapd		%x8, 40		\n\t"	// exchange real and imag part
-       "xxswapd		%x9, 41		\n\t"	// exchange real and imag part
-       "xxswapd		%x10, 42	\n\t"	// exchange real and imag part
-       "xxswapd		%x11, 43	\n\t"	// exchange real and imag part
+       XXSWAPD_S(%x8,40)	// exchange real and imag part
+       XXSWAPD_S(%x9,41)	// exchange real and imag part
+       XXSWAPD_S(%x10,42)	// exchange real and imag part
+       XXSWAPD_S(%x11,43)	// exchange real and imag part
 
        "addi		%2, %2, 64	\n\t"
        "addi		%3, %3, 64	\n\t"
@@ -105,19 +105,19 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
        "lxvd2x		%x6, %23, %3	\n\t"	// y6
        "lxvd2x		%x7, %24, %3	\n\t"	// y7
 
-       "xxswapd		%x12, 44	\n\t"	// exchange real and imag part
-       "xxswapd		%x13, 45	\n\t"	// exchange real and imag part
-       "xxswapd		%x14, 46	\n\t"	// exchange real and imag part
-       "xxswapd		%x15, 47	\n\t"	// exchange real and imag part
+       XXSWAPD_S(%x12,44)	// exchange real and imag part
+       XXSWAPD_S(%x13,45)	// exchange real and imag part
+       XXSWAPD_S(%x14,46)	// exchange real and imag part
+       XXSWAPD_S(%x15,47)	// exchange real and imag part
 
        "addi		%2, %2, 64	\n\t"
        "addi		%3, %3, 64	\n\t"
 
        "addic.		%1, %1, -8	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-       "1:				\n\t"
+       ".align	5		\n"
+       "one%=:				\n\t"
 
        "xvmaddadp	48, 40, 32	\n\t"	// alpha_r * x0_r , alpha_r * x0_i
        "xvmaddadp	49, 41, 32	\n\t"
@@ -163,31 +163,31 @@ static void zaxpy_kernel_4 (long n, double *x, double *y,
 
        "addi		%16, %16, 64	\n\t"
 
-       "xxswapd		%x8, 40		\n\t"	// exchange real and imag part
-       "xxswapd		%x9, 41		\n\t"	// exchange real and imag part
+       XXSWAPD_S(%x8,40)	// exchange real and imag part
+       XXSWAPD_S(%x9,41)	// exchange real and imag part
        "lxvd2x		48, 0, %3	\n\t"	// y0
        "lxvd2x		49, %22, %3	\n\t"	// y1
-       "xxswapd		%x10, 42	\n\t"	// exchange real and imag part
-       "xxswapd		%x11, 43	\n\t"	// exchange real and imag part
+       XXSWAPD_S(%x10,42)	// exchange real and imag part
+       XXSWAPD_S(%x11,43)	// exchange real and imag part
        "lxvd2x		50, %23, %3	\n\t"	// y2
        "lxvd2x		51, %24, %3	\n\t"	// y3
 
-       "xxswapd		%x12, 44	\n\t"	// exchange real and imag part
+       XXSWAPD_S(%x12,44)	// exchange real and imag part
        "addi		%3, %3, 64	\n\t"
-       "xxswapd		%x13, 45	\n\t"	// exchange real and imag part
+       XXSWAPD_S(%x13,45)	// exchange real and imag part
        "lxvd2x		%x4, 0, %3	\n\t"	// y4
        "lxvd2x		%x5, %22, %3	\n\t"	// y5
-       "xxswapd		%x14, 46	\n\t"	// exchange real and imag part
-       "xxswapd		%x15, 47	\n\t"	// exchange real and imag part
+       XXSWAPD_S(%x14,46)	// exchange real and imag part
+       XXSWAPD_S(%x15,47)	// exchange real and imag part
        "lxvd2x		%x6, %23, %3	\n\t"	// y6
        "lxvd2x		%x7, %24, %3	\n\t"	// y7
 
        "addi		%3, %3, 64	\n\t"
 
        "addic.		%1, %1, -8	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-       "2:				\n\t"
+       "two%=:				\n\t"
 
        "xvmaddadp	48, 40, 32	\n\t"	// alpha_r * x0_r , alpha_r * x0_i
        "xvmaddadp	49, 41, 32	\n\t"
diff --git a/kernel/power/zcopy_microk_power8.c b/kernel/power/zcopy_microk_power8.c
index 5ca34b633..e29547047 100644
--- a/kernel/power/zcopy_microk_power8.c
+++ b/kernel/power/zcopy_microk_power8.c
@@ -62,10 +62,10 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y)
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -16	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "stxvd2x		32, 0, %3	\n\t"
        "stxvd2x		33, %5, %3	\n\t"
@@ -108,9 +108,9 @@ static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y)
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -16	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "stxvd2x		32, 0, %3	\n\t"
        "stxvd2x		33, %5, %3	\n\t"
diff --git a/kernel/power/zdot_microk_power8.c b/kernel/power/zdot_microk_power8.c
index 71078b66c..dcde82433 100644
--- a/kernel/power/zdot_microk_power8.c
+++ b/kernel/power/zdot_microk_power8.c
@@ -60,10 +60,10 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
        "lxvd2x		43, %9, %2	\n\t"	// x3_r, x3_i
        "lxvd2x		51, %9, %3	\n\t"	// y3_r, y3_i
 
-       "xxswapd		0, 48		\n\t"	// y0_i, y0_r
-       "xxswapd		1, 49		\n\t"	// y1_i, y1_r
-       "xxswapd		2, 50		\n\t"	// y2_i, y2_r
-       "xxswapd		3, 51		\n\t"	// y3_i, y3_r
+       XXSWAPD_S(0,48)	// y0_i, y0_r
+       XXSWAPD_S(1,49)	// y1_i, y1_r
+       XXSWAPD_S(2,50)	// y2_i, y2_r
+       XXSWAPD_S(3,51)	// y3_i, y3_r
 
        "addi		%2, %2, 64	\n\t"
        "addi		%3, %3, 64	\n\t"
@@ -77,19 +77,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
        "lxvd2x		47, %9, %2	\n\t"	// x3_r, x3_i
        "lxvd2x		7, %9, %3	\n\t"	// y3_r, y3_i
 
-       "xxswapd		8, 4		\n\t"	// y0_i, y0_r
-       "xxswapd		9, 5		\n\t"	// y1_i, y1_r
-       "xxswapd		10, 6		\n\t"	// y2_i, y2_r
-       "xxswapd		11, 7		\n\t"	// y3_i, y3_r
+       XXSWAPD_S(8,4)	// y0_i, y0_r
+       XXSWAPD_S(9,5)	// y1_i, y1_r
+       XXSWAPD_S(10,6)	// y2_i, y2_r
+       XXSWAPD_S(11,7)	// y3_i, y3_r
 
        "addi		%2, %2, 64	\n\t"
        "addi		%3, %3, 64	\n\t"
 
        "addic.		%1, %1, -8	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "xvmaddadp	32, 40, 48	\n\t"	// x0_r * y0_r , x0_i * y0_i
        "lxvd2x		48, 0, %3	\n\t"	// y0_r, y0_i
@@ -111,14 +111,14 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
        "xvmaddadp	39, 43, 3	\n\t"	// x3_r * y3_i , x3_i * y3_r
        "lxvd2x		43, %9, %2	\n\t"	// x3_r, x3_i
 
-       "xxswapd		0,48		\n\t"	// y0_i, y0_r
-       "xxswapd		1,49		\n\t"	// y1_i, y1_r
+       XXSWAPD_S(0,48)	// y0_i, y0_r
+       XXSWAPD_S(1,49)	// y1_i, y1_r
 
        "addi		%2, %2, 64	\n\t"
        "addi		%3, %3, 64	\n\t"
 
-       "xxswapd		2,50		\n\t"	// y2_i, y2_r
-       "xxswapd		3,51		\n\t"	// y3_i, y3_r
+       XXSWAPD_S(2,50)	// y2_i, y2_r
+       XXSWAPD_S(3,51)	// y3_i, y3_r
 
        "xvmaddadp	32, 44, 4	\n\t"	// x0_r * y0_r , x0_i * y0_i
        "lxvd2x		4, 0, %3	\n\t"	// y0_r, y0_i
@@ -138,19 +138,19 @@ static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
        "xvmaddadp	39, 47, 11	\n\t"	// x3_r * y3_i , x3_i * y3_r
        "lxvd2x		47, %9, %2	\n\t"	// x3_r, x3_i
 
-       "xxswapd		8,4		\n\t"	// y0_i, y0_r
-       "xxswapd		9,5		\n\t"	// y1_i, y1_r
+       XXSWAPD_S(8,4)	// y0_i, y0_r
+       XXSWAPD_S(9,5)	// y1_i, y1_r
 
        "addi		%2, %2, 64	\n\t"
        "addi		%3, %3, 64	\n\t"
 
-       "xxswapd		10,6		\n\t"	// y2_i, y2_r
-       "xxswapd		11,7		\n\t"	// y3_i, y3_r
+       XXSWAPD_S(10,6)	// y2_i, y2_r
+       XXSWAPD_S(11,7)	// y3_i, y3_r
 
        "addic.		%1, %1, -8	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "xvmaddadp	32, 40, 48	\n\t"	// x0_r * y0_r , x0_i * y0_i
        "xvmaddadp	34, 41, 49	\n\t"	// x1_r * y1_r , x1_i * y1_i
diff --git a/kernel/power/zgemm_macros_8x2_power8.S b/kernel/power/zgemm_macros_8x2_power8.S
index c43a115b2..24a36470c 100644
--- a/kernel/power/zgemm_macros_8x2_power8.S
+++ b/kernel/power/zgemm_macros_8x2_power8.S
@@ -67,7 +67,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=2 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x8_1', `
+#else
 .macro LOAD2x8_1
+#endif
 
 	lxvd2x		vs16,	o0,	BO		// load real part from B
 	lxvd2x		vs17,	o16,	BO		// load imag part from B
@@ -91,9 +95,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_I1', `
+#else
 .macro KERNEL2x8_I1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -151,9 +163,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs63,	vs7,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_1', `
+#else
 .macro KERNEL2x8_1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -211,9 +231,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs63,	vs7,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_2', `
+#else
 .macro KERNEL2x8_2
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -271,9 +299,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs63,	vs15,	vs23		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_E2', `
+#else
 .macro KERNEL2x8_E2
+#endif
 
 
 	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
@@ -311,9 +347,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs63,	vs15,	vs23		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_SUBI1', `
+#else
 .macro KERNEL2x8_SUBI1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -371,9 +415,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs63,	vs7,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_SUB1', `
+#else
 .macro KERNEL2x8_SUB1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -431,9 +483,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs63,	vs7,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x8', `
+#else
 .macro SAVE2x8
+#endif
 
 
 	mr		T1,	CO
@@ -455,13 +515,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs33,	vs33			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
 
-	xxswapd		vs32,	vs32			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs33,	vs33			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
@@ -479,13 +539,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs35,	vs35			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB
 
-	xxswapd		vs34,	vs34			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs35,	vs35			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB
@@ -503,13 +563,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs37,	vs37			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB
 
-	xxswapd		vs36,	vs36			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs37,	vs37			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB
@@ -527,13 +587,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs39,	vs39			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB
 
-	xxswapd		vs38,	vs38			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs39,	vs39			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB
@@ -551,13 +611,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs41,	vs41			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs41,vs41)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs40		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs41		// imagA*imagB
 
-	xxswapd		vs40,	vs40			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs41,	vs41			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs40,vs40)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs41,vs41)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs40		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs41		// imagA*realB
@@ -575,13 +635,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs43,	vs43			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs43,vs43)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs42		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs43		// imagA*imagB
 
-	xxswapd		vs42,	vs42			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs43,	vs43			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs42,vs42)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs43,vs43)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs42		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs43		// imagA*realB
@@ -599,13 +659,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs45,	vs45			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs45,vs45)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs44		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs45		// imagA*imagB
 
-	xxswapd		vs44,	vs44			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs45,	vs45			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs44,vs44)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs45,vs45)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs44		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs45		// imagA*realB
@@ -623,13 +683,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs47,	vs47			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs47,vs47)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs46		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs47		// imagA*imagB
 
-	xxswapd		vs46,	vs46			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs47,	vs47			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs46,vs46)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs47,vs47)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs46		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs47		// imagA*realB
@@ -685,13 +745,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs49,	vs49			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs49,vs49)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs48		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs49		// imagA*imagB
 
-	xxswapd		vs48,	vs48			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs49,	vs49			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs48,vs48)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs49,vs49)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs48		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs49		// imagA*realB
@@ -709,13 +769,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs51,	vs51			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs51,vs51)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs50		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs51		// imagA*imagB
 
-	xxswapd		vs50,	vs50			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs51,	vs51			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs50,vs50)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs51,vs51)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs50		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs51		// imagA*realB
@@ -733,13 +793,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs53,	vs53			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs53,vs53)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs52		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs53		// imagA*imagB
 
-	xxswapd		vs52,	vs52			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs53,	vs53			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs52,vs52)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs53,vs53)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs52		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs53		// imagA*realB
@@ -757,13 +817,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs55,	vs55			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs55,vs55)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs54		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs55		// imagA*imagB
 
-	xxswapd		vs54,	vs54			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs55,	vs55			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs54,vs54)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs55,vs55)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs54		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs55		// imagA*realB
@@ -781,13 +841,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs57,	vs57			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs57,vs57)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs56		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs57		// imagA*imagB
 
-	xxswapd		vs56,	vs56			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs57,	vs57			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs56,vs56)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs57,vs57)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs56		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs57		// imagA*realB
@@ -805,13 +865,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs59,	vs59			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs59,vs59)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs58		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs59		// imagA*imagB
 
-	xxswapd		vs58,	vs58			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs59,	vs59			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs58,vs58)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs59,vs59)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs58		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs59		// imagA*realB
@@ -829,13 +889,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs61,	vs61			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs61,vs61)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs60		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs61		// imagA*imagB
 
-	xxswapd		vs60,	vs60			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs61,	vs61			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs60,vs60)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs61,vs61)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs60		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs61		// imagA*realB
@@ -853,13 +913,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs63,	vs63			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs63,vs63)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs62		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs63		// imagA*imagB
 
-	xxswapd		vs62,	vs62			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs63,	vs63			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs62,vs62)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs63,vs63)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs62		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs63		// imagA*realB
@@ -900,14 +960,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add		T2,	T2,	LDC
 	addi		CO,	CO,	128
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x4_1', `
+#else
 .macro LOAD2x4_1
+#endif
 
 	lxvd2x		vs16,	o0,	BO		// load real part from B
 	lxvd2x		vs17,	o16,	BO		// load imag part from B
@@ -924,9 +992,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_I1', `
+#else
 .macro KERNEL2x4_I1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -961,9 +1037,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs47,	vs3,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_1', `
+#else
 .macro KERNEL2x4_1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -998,9 +1082,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs47,	vs3,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_2', `
+#else
 .macro KERNEL2x4_2
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -1035,9 +1127,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs47,	vs11,	vs23		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_E2', `
+#else
 .macro KERNEL2x4_E2
+#endif
 
 
 	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
@@ -1059,9 +1159,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs47,	vs11,	vs23		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_SUBI1', `
+#else
 .macro KERNEL2x4_SUBI1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -1096,9 +1204,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs47,	vs3,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_SUB1', `
+#else
 .macro KERNEL2x4_SUB1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -1133,9 +1249,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs47,	vs3,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x4', `
+#else
 .macro SAVE2x4
+#endif
 
 
 	mr		T1,	CO
@@ -1152,13 +1276,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs33,	vs33			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
 
-	xxswapd		vs32,	vs32			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs33,	vs33			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
@@ -1176,13 +1300,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs35,	vs35			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB
 
-	xxswapd		vs34,	vs34			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs35,	vs35			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB
@@ -1200,13 +1324,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs37,	vs37			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB
 
-	xxswapd		vs36,	vs36			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs37,	vs37			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB
@@ -1224,13 +1348,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs39,	vs39			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB
 
-	xxswapd		vs38,	vs38			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs39,	vs39			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB
@@ -1273,13 +1397,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs41,	vs41			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs41,vs41)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs40		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs41		// imagA*imagB
 
-	xxswapd		vs40,	vs40			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs41,	vs41			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs40,vs40)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs41,vs41)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs40		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs41		// imagA*realB
@@ -1297,13 +1421,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs43,	vs43			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs43,vs43)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs42		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs43		// imagA*imagB
 
-	xxswapd		vs42,	vs42			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs43,	vs43			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs42,vs42)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs43,vs43)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs42		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs43		// imagA*realB
@@ -1321,13 +1445,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs45,	vs45			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs45,vs45)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs44		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs45		// imagA*imagB
 
-	xxswapd		vs44,	vs44			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs45,	vs45			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs44,vs44)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs45,vs45)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs44		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs45		// imagA*realB
@@ -1345,13 +1469,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs47,	vs47			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs47,vs47)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs46		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs47		// imagA*imagB
 
-	xxswapd		vs46,	vs46			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs47,	vs47			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs46,vs46)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs47,vs47)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs46		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs47		// imagA*realB
@@ -1383,14 +1507,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add		T1,	T1,	LDC
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x2_1', `
+#else
 .macro LOAD2x2_1
+#endif
 
 	lxvd2x		vs16,	o0,	BO		// load real part from B
 	lxvd2x		vs17,	o16,	BO		// load imag part from B
@@ -1405,9 +1537,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	32
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_I1', `
+#else
 .macro KERNEL2x2_I1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -1432,9 +1572,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs39,	vs1,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_1', `
+#else
 .macro KERNEL2x2_1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -1459,9 +1607,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs39,	vs1,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_2', `
+#else
 .macro KERNEL2x2_2
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -1486,9 +1642,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs39,	vs9,	vs23		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_E2', `
+#else
 .macro KERNEL2x2_E2
+#endif
 
 
 	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
@@ -1502,9 +1666,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs39,	vs9,	vs23		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_SUBI1', `
+#else
 .macro KERNEL2x2_SUBI1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -1529,9 +1701,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs39,	vs1,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_SUB1', `
+#else
 .macro KERNEL2x2_SUB1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -1556,9 +1736,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs39,	vs1,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x2', `
+#else
 .macro SAVE2x2
+#endif
 
 
 	mr		T1,	CO
@@ -1573,13 +1761,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs33,	vs33			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
 
-	xxswapd		vs32,	vs32			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs33,	vs33			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
@@ -1597,13 +1785,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs35,	vs35			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB
 
-	xxswapd		vs34,	vs34			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs35,	vs35			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB
@@ -1640,13 +1828,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs37,	vs37			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB
 
-	xxswapd		vs36,	vs36			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs37,	vs37			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB
@@ -1664,13 +1852,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs39,	vs39			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB
 
-	xxswapd		vs38,	vs38			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs39,	vs39			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB
@@ -1698,14 +1886,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add		T1,	T1,	LDC
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x1_1', `
+#else
 .macro LOAD2x1_1
+#endif
 
 	lxvd2x		vs16,	o0,	BO		// load real part from B
 	lxvd2x		vs17,	o16,	BO		// load imag part from B
@@ -1719,9 +1915,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_I1', `
+#else
 .macro KERNEL2x1_I1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 
@@ -1741,9 +1945,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs35,	vs0,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_1', `
+#else
 .macro KERNEL2x1_1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 
@@ -1763,9 +1975,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs35,	vs0,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_2', `
+#else
 .macro KERNEL2x1_2
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 
@@ -1785,9 +2005,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs35,	vs8,	vs23		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_E2', `
+#else
 .macro KERNEL2x1_E2
+#endif
 
 
 	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
@@ -1797,9 +2025,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs35,	vs8,	vs23		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_SUBI1', `
+#else
 .macro KERNEL2x1_SUBI1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 
@@ -1819,9 +2055,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs35,	vs0,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_SUB1', `
+#else
 .macro KERNEL2x1_SUB1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 
@@ -1841,9 +2085,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs35,	vs0,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x1', `
+#else
 .macro SAVE2x1
+#endif
 
 
 	mr		T1,	CO
@@ -1857,13 +2109,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs33,	vs33			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
 
-	xxswapd		vs32,	vs32			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs33,	vs33			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
@@ -1897,13 +2149,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs35,	vs35			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB
 
-	xxswapd		vs34,	vs34			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs35,	vs35			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB
@@ -1929,14 +2181,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add		T1,	T1,	LDC
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x8_1', `
+#else
 .macro LOAD1x8_1
+#endif
 
 	lxvd2x		vs16,	o0,	BO		// load real part from B
 	lxvd2x		vs17,	o16,	BO		// load imag part from B
@@ -1958,9 +2218,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_I1', `
+#else
 .macro KERNEL1x8_I1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -1999,9 +2267,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs47,	vs7,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_1', `
+#else
 .macro KERNEL1x8_1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -2040,9 +2316,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs47,	vs7,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_2', `
+#else
 .macro KERNEL1x8_2
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2081,9 +2365,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs47,	vs15,	vs21		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_E2', `
+#else
 .macro KERNEL1x8_E2
+#endif
 
 
 	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
@@ -2104,9 +2396,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs47,	vs15,	vs21		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_SUBI1', `
+#else
 .macro KERNEL1x8_SUBI1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2145,9 +2445,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs47,	vs7,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_SUB1', `
+#else
 .macro KERNEL1x8_SUB1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2186,9 +2494,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs47,	vs7,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x8', `
+#else
 .macro SAVE1x8
+#endif
 
 
 	mr		T1,	CO
@@ -2210,13 +2526,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs33,	vs33			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
 
-	xxswapd		vs32,	vs32			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs33,	vs33			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
@@ -2234,13 +2550,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs35,	vs35			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB
 
-	xxswapd		vs34,	vs34			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs35,	vs35			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB
@@ -2258,13 +2574,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs37,	vs37			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB
 
-	xxswapd		vs36,	vs36			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs37,	vs37			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB
@@ -2282,13 +2598,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs39,	vs39			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB
 
-	xxswapd		vs38,	vs38			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs39,	vs39			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB
@@ -2306,13 +2622,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs41,	vs41			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs41,vs41)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs40		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs41		// imagA*imagB
 
-	xxswapd		vs40,	vs40			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs41,	vs41			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs40,vs40)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs41,vs41)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs40		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs41		// imagA*realB
@@ -2330,13 +2646,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs43,	vs43			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs43,vs43)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs42		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs43		// imagA*imagB
 
-	xxswapd		vs42,	vs42			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs43,	vs43			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs42,vs42)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs43,vs43)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs42		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs43		// imagA*realB
@@ -2354,13 +2670,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs45,	vs45			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs45,vs45)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs44		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs45		// imagA*imagB
 
-	xxswapd		vs44,	vs44			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs45,	vs45			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs44,vs44)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs45,vs45)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs44		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs45		// imagA*realB
@@ -2378,13 +2694,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs47,	vs47			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs47,vs47)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs46		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs47		// imagA*imagB
 
-	xxswapd		vs46,	vs46			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs47,	vs47			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs46,vs46)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs47,vs47)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs46		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs47		// imagA*realB
@@ -2425,14 +2741,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add		T2,	T2,	LDC
 	addi		CO,	CO,	128
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x4_1', `
+#else
 .macro LOAD1x4_1
+#endif
 
 	lxvd2x		vs16,	o0,	BO		// load real part from B
 	lxvd2x		vs17,	o16,	BO		// load imag part from B
@@ -2447,9 +2771,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_I1', `
+#else
 .macro KERNEL1x4_I1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -2473,9 +2805,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_1', `
+#else
 .macro KERNEL1x4_1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -2499,9 +2839,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_2', `
+#else
 .macro KERNEL1x4_2
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2525,9 +2873,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_E2', `
+#else
 .macro KERNEL1x4_E2
+#endif
 
 
 	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
@@ -2540,9 +2896,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_SUBI1', `
+#else
 .macro KERNEL1x4_SUBI1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2566,9 +2930,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_SUB1', `
+#else
 .macro KERNEL1x4_SUB1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2592,9 +2964,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x4', `
+#else
 .macro SAVE1x4
+#endif
 
 
 	mr		T1,	CO
@@ -2611,13 +2991,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs33,	vs33			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
 
-	xxswapd		vs32,	vs32			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs33,	vs33			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
@@ -2635,13 +3015,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs35,	vs35			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB
 
-	xxswapd		vs34,	vs34			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs35,	vs35			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB
@@ -2659,13 +3039,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs37,	vs37			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB
 
-	xxswapd		vs36,	vs36			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs37,	vs37			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB
@@ -2683,13 +3063,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs39,	vs39			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB
 
-	xxswapd		vs38,	vs38			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs39,	vs39			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB
@@ -2721,14 +3101,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add		T1,	T1,	LDC
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x2_1', `
+#else
 .macro LOAD1x2_1
+#endif
 
 	lxvd2x		vs16,	o0,	BO		// load real part from B
 	lxvd2x		vs17,	o16,	BO		// load imag part from B
@@ -2741,9 +3129,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	32
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_I1', `
+#else
 .macro KERNEL1x2_I1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -2761,9 +3157,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_1', `
+#else
 .macro KERNEL1x2_1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -2781,9 +3185,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_2', `
+#else
 .macro KERNEL1x2_2
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2801,9 +3213,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_E2', `
+#else
 .macro KERNEL1x2_E2
+#endif
 
 
 	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
@@ -2812,9 +3232,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_SUBI1', `
+#else
 .macro KERNEL1x2_SUBI1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2832,9 +3260,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_SUB1', `
+#else
 .macro KERNEL1x2_SUB1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2852,9 +3288,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x2', `
+#else
 .macro SAVE1x2
+#endif
 
 
 	mr		T1,	CO
@@ -2869,13 +3313,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs33,	vs33			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
 
-	xxswapd		vs32,	vs32			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs33,	vs33			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
@@ -2893,13 +3337,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs35,	vs35			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB
 
-	xxswapd		vs34,	vs34			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs35,	vs35			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB
@@ -2927,14 +3371,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add		T1,	T1,	LDC
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x1_1', `
+#else
 .macro LOAD1x1_1
+#endif
 
 	lxvd2x		vs16,	o0,	BO		// load real part from B
 	lxvd2x		vs17,	o16,	BO		// load imag part from B
@@ -2946,9 +3398,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_I1', `
+#else
 .macro KERNEL1x1_I1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 
@@ -2963,9 +3423,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_1', `
+#else
 .macro KERNEL1x1_1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 
@@ -2980,9 +3448,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_2', `
+#else
 .macro KERNEL1x1_2
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 
@@ -2997,18 +3473,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_E2', `
+#else
 .macro KERNEL1x1_E2
+#endif
 
 
 	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
 	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_SUBI1', `
+#else
 .macro KERNEL1x1_SUBI1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 
@@ -3023,9 +3515,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_SUB1', `
+#else
 .macro KERNEL1x1_SUB1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 
@@ -3040,9 +3540,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x1', `
+#else
 .macro SAVE1x1
+#endif
 
 
 	mr		T1,	CO
@@ -3056,13 +3564,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs33,	vs33			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
 
-	xxswapd		vs32,	vs32			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs33,	vs33			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
@@ -3088,11 +3596,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add		T1,	T1,	LDC
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 
+#if defined(_AIX)
+define(`ZCOPYB_1x1', `
+#else
 .macro ZCOPYB_1x1
+#endif
 
         lxvdsx          vs4,    o0,     BO              // b0_r
         lxvdsx          vs5,    o8,     BO              // b0_i
@@ -3101,10 +3617,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         stxvd2x         vs5,    o16,    BBO
         addi            BBO,    BBO,    32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
+#if defined(_AIX)
+define(`ZCOPYB_8x1', `
+#else
 .macro ZCOPYB_8x1
+#endif
 
         lxvd2x          vs32,   o0,     BO
         lxvd2x          vs33,  o16,     BO
@@ -3118,23 +3642,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         lxvd2x          vs39,  o48,     BO
         addi            BO,     BO,     64
 
-	xxspltd		vs40, vs32, 0
-	xxspltd		vs41, vs32, 1
-	xxspltd		vs42, vs33, 0
-	xxspltd		vs43, vs33, 1
-	xxspltd		vs44, vs34, 0
-	xxspltd		vs45, vs34, 1
-	xxspltd		vs46, vs35, 0
-	xxspltd		vs47, vs35, 1
-
-	xxspltd		vs48, vs36, 0
-	xxspltd		vs49, vs36, 1
-	xxspltd		vs50, vs37, 0
-	xxspltd		vs51, vs37, 1
-	xxspltd		vs52, vs38, 0
-	xxspltd		vs53, vs38, 1
-	xxspltd		vs54, vs39, 0
-	xxspltd		vs55, vs39, 1
+	XXSPLTD(vs40,vs32,0)
+	XXSPLTD(vs41,vs32,1)
+	XXSPLTD(vs42,vs33,0)
+	XXSPLTD(vs43,vs33,1)
+	XXSPLTD(vs44,vs34,0)
+	XXSPLTD(vs45,vs34,1)
+	XXSPLTD(vs46,vs35,0)
+	XXSPLTD(vs47,vs35,1)
+
+	XXSPLTD(vs48,vs36,0)
+	XXSPLTD(vs49,vs36,1)
+	XXSPLTD(vs50,vs37,0)
+	XXSPLTD(vs51,vs37,1)
+	XXSPLTD(vs52,vs38,0)
+	XXSPLTD(vs53,vs38,1)
+	XXSPLTD(vs54,vs39,0)
+	XXSPLTD(vs55,vs39,1)
 
         stxvd2x         vs40,    o0,     BBO
         stxvd2x         vs41,   o16,     BBO
@@ -3160,6 +3684,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
         stxvd2x         vs55,   o48,     BBO
         addi            BBO,    BBO,    64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
diff --git a/kernel/power/zgemm_tcopy_macros_8_power8.S b/kernel/power/zgemm_tcopy_macros_8_power8.S
index 3f5a5ed03..654332375 100644
--- a/kernel/power/zgemm_tcopy_macros_8_power8.S
+++ b/kernel/power/zgemm_tcopy_macros_8_power8.S
@@ -38,7 +38,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=4 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x8', `
+#else
 .macro COPY_4x8
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -144,14 +148,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs12,	o32,	T1
 	stxvd2x		vs13,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x4', `
+#else
 .macro COPY_4x4
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -209,14 +221,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs46,	o32,	T1
 	stxvd2x		vs47,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x2', `
+#else
 .macro COPY_4x2
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -254,14 +274,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs38,	o32,	T1
 	stxvd2x		vs39,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=4 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_4x1', `
+#else
 .macro COPY_4x1
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	addi		A0,	A0,	16
@@ -289,14 +317,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxvd2x		vs35,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x8', `
+#else
 .macro COPY_2x8
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -350,14 +386,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs46,	o32,	T1
 	stxvd2x		vs47,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x4', `
+#else
 .macro COPY_2x4
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -387,14 +431,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs38,	o32,	T1
 	stxvd2x		vs39,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x2', `
+#else
 .macro COPY_2x2
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -414,14 +466,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs34,	o32,	T1
 	stxvd2x		vs35,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_2x1', `
+#else
 .macro COPY_2x1
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	addi		A0,	A0,	16
@@ -437,14 +497,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxvd2x		vs33,	o16,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x8', `
+#else
 .macro COPY_1x8
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -472,14 +540,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs38,	o32,	T1
 	stxvd2x		vs39,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x4', `
+#else
 .macro COPY_1x4
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -495,14 +571,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs34,	o32,	T1
 	stxvd2x		vs35,	o48,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x2', `
+#else
 .macro COPY_1x2
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	lxvd2x		vs33,	o16,	A0
@@ -514,14 +598,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	stxvd2x		vs32,	o0,	T1
 	stxvd2x		vs33,	o16,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`COPY_1x1', `
+#else
 .macro COPY_1x1
+#endif
 
 	lxvd2x		vs32,	o0,	A0
 	addi		A0,	A0,	16
@@ -531,5 +623,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	stxvd2x		vs32,	o0,	T1
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
diff --git a/kernel/power/zrot.c b/kernel/power/zrot.c
index d45468fd5..c6d666178 100644
--- a/kernel/power/zrot.c
+++ b/kernel/power/zrot.c
@@ -40,8 +40,8 @@ static void   zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
 
  __asm__
     (
-       "xxspltd     36, %x[cos], 0 \n\t"   // load c to both dwords
-       "xxspltd     37, %x[sin], 0 \n\t"   // load s to both dwords 
+       XXSPLTD_S(36,%x[cos],0)   // load c to both dwords
+       XXSPLTD_S(37,%x[sin],0)   // load s to both dwords 
 
        "lxvd2x      32,  0,     %[x_ptr] \n\t"   // load x
        "lxvd2x      33, %[i16], %[x_ptr] \n\t"
@@ -57,10 +57,10 @@ static void   zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
        "addi        %[y_ptr], %[y_ptr], 64  \n\t"
 
        "addic.          %[temp_n], %[temp_n], -4    \n\t"
-       "ble       2f          \n\t"
+       "ble       two%=          \n\t"
 
-       ".p2align  5           \n"
-     "1:                      \n\t"
+       ".align  5           \n"
+     "one%=:                      \n\t"
 
        "xvmuldp         40, 32, 36  \n\t" // c * x
        "xvmuldp         41, 33, 36  \n\t"
@@ -124,9 +124,9 @@ static void   zrot_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT si
        "addi            %[y_ptr], %[y_ptr], 128 \n\t"
 
        "addic.          %[temp_n], %[temp_n], -4      \n\t"
-       "bgt+            1b          \n"
+       "bgt+            one%=          \n"
 
-     "2:                      \n\t"
+     "two%=:                      \n\t"
 
        "xvmuldp         40, 32, 36  \n\t" // c * x
        "xvmuldp         41, 33, 36  \n\t"
diff --git a/kernel/power/zscal_microk_power8.c b/kernel/power/zscal_microk_power8.c
index aba9029a0..567331775 100644
--- a/kernel/power/zscal_microk_power8.c
+++ b/kernel/power/zscal_microk_power8.c
@@ -58,8 +58,8 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
        "dcbt		0, %2		\n\t"
 
        "xsnegdp		33, %x16	\n\t"	// -alpha_i
-       "xxspltd		32, %x15, 0	\n\t"	// alpha_r , alpha_r
-       "xxmrghd		33, 33, %x16	\n\t"	// -alpha_i , alpha_i
+       XXSPLTD_S(32,%x15,0)	// alpha_r , alpha_r
+       XXMRGHD_S(33,33,%x16)	// -alpha_i , alpha_i
 
        "lxvd2x		40, 0, %2	\n\t"	// x0_r, x0_i
        "lxvd2x		41, %17, %2	\n\t"
@@ -73,10 +73,10 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
        "addi		%2, %2, 128	\n\t"
 
        "addic.		%1, %1, -8	\n\t"
-       "ble		2f		\n\t"
+       "ble		two%=		\n\t"
 
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
 
        "xvmuldp		48, 40, 32	\n\t"	// x0_r * alpha_r, x0_i * alpha_r
        "xvmuldp		49, 41, 32	\n\t"
@@ -87,14 +87,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
        "xvmuldp		%x5, 46, 32	\n\t"
        "xvmuldp		%x6, 47, 32	\n\t"
 
-       "xxswapd		%x7, 40		\n\t"
-       "xxswapd		%x8, 41		\n\t"
-       "xxswapd		%x9, 42		\n\t"
-       "xxswapd		%x10, 43		\n\t"
-       "xxswapd		%x11, 44		\n\t"
-       "xxswapd		%x12, 45	\n\t"
-       "xxswapd		%x13, 46	\n\t"
-       "xxswapd		%x14, 47	\n\t"
+       XXSWAPD_S(%x7,40)
+       XXSWAPD_S(%x8,41)
+       XXSWAPD_S(%x9,42)
+       XXSWAPD_S(%x10,43)
+       XXSWAPD_S(%x11,44)
+       XXSWAPD_S(%x12,45)
+       XXSWAPD_S(%x13,46)
+       XXSWAPD_S(%x14,47)
 
        "xvmuldp		%x7, %x7, 33	\n\t"	// x0_i * -alpha_i, x0_r * alpha_i
        "xvmuldp		%x8, %x8, 33	\n\t"
@@ -147,9 +147,9 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
        "addi		%2, %2, 256	\n\t"
 
        "addic.		%1, %1, -8	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
-     "2:				\n\t"
+     "two%=:				\n\t"
 
        "xvmuldp		48, 40, 32	\n\t"	// x0_r * alpha_r, x0_i * alpha_r
        "xvmuldp		49, 41, 32	\n\t"
@@ -160,14 +160,14 @@ static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
        "xvmuldp		%x5, 46, 32	\n\t"
        "xvmuldp		%x6, 47, 32	\n\t"
 
-       "xxswapd		%x7, 40		\n\t"
-       "xxswapd		%x8, 41		\n\t"
-       "xxswapd		%x9, 42		\n\t"
-       "xxswapd		%x10, 43		\n\t"
-       "xxswapd		%x11, 44		\n\t"
-       "xxswapd		%x12, 45	\n\t"
-       "xxswapd		%x13, 46	\n\t"
-       "xxswapd		%x14, 47	\n\t"
+       XXSWAPD_S(%x7,40)
+       XXSWAPD_S(%x8,41)
+       XXSWAPD_S(%x9,42)
+       XXSWAPD_S(%x10,43)
+       XXSWAPD_S(%x11,44)
+       XXSWAPD_S(%x12,45)
+       XXSWAPD_S(%x13,46)
+       XXSWAPD_S(%x14,47)
 
        "addi		%2, %2, -128	\n\t"
 
diff --git a/kernel/power/zswap_microk_power8.c b/kernel/power/zswap_microk_power8.c
index 54391ba5d..1e9fbe2cf 100644
--- a/kernel/power/zswap_microk_power8.c
+++ b/kernel/power/zswap_microk_power8.c
@@ -40,8 +40,8 @@ zswap_kernel_16 (long n, double *x, double *y)
 {
   __asm__
     (
-       ".p2align	5		\n"
-     "1:				\n\t"
+       ".align	5		\n"
+     "one%=:				\n\t"
        "lxvd2x		32, 0, %4	\n\t"
        "lxvd2x		33, %5, %4	\n\t"
        "lxvd2x		34, %6, %4	\n\t"
@@ -130,7 +130,7 @@ zswap_kernel_16 (long n, double *x, double *y)
 
        "addi		%4, %4, 128	\n\t"
        "addic.		%2, %2, -16	\n\t"
-       "bgt		1b		\n"
+       "bgt		one%=		\n"
 
      "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
      :
diff --git a/kernel/power/ztrmm_macros_8x2_power8.S b/kernel/power/ztrmm_macros_8x2_power8.S
index 701ec65c8..b3fbcd220 100644
--- a/kernel/power/ztrmm_macros_8x2_power8.S
+++ b/kernel/power/ztrmm_macros_8x2_power8.S
@@ -68,7 +68,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * Macros for N=2 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x8_1', `
+#else
 .macro LOAD2x8_1
+#endif
 
 	lxvdsx		vs16,	o0,	BO		// load real part from B
 	lxvdsx		vs17,	o8,	BO		// load imag part from B
@@ -92,9 +96,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_I1', `
+#else
 .macro KERNEL2x8_I1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -152,9 +164,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs63,	vs7,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_1', `
+#else
 .macro KERNEL2x8_1
+#endif
 
 
 	xvmaddadp	vs32,	vs0,	vs16		// real*real, imag*real
@@ -221,9 +241,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	64
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_2', `
+#else
 .macro KERNEL2x8_2
+#endif
 
 
 	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
@@ -289,9 +317,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	64
 	addi		BO,	BO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_E2', `
+#else
 .macro KERNEL2x8_E2
+#endif
 
 
 	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
@@ -329,9 +365,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs63,	vs15,	vs23		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_SUBI1', `
+#else
 .macro KERNEL2x8_SUBI1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -389,9 +433,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs63,	vs7,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x8_SUB1', `
+#else
 .macro KERNEL2x8_SUB1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -449,9 +501,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs63,	vs7,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x8', `
+#else
 .macro SAVE2x8
+#endif
 
 
 	mr		T1,	CO
@@ -473,13 +533,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs33,	vs33			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
 
-	xxswapd		vs32,	vs32			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs33,	vs33			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
@@ -497,13 +557,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs35,	vs35			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB
 
-	xxswapd		vs34,	vs34			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs35,	vs35			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB
@@ -521,13 +581,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs37,	vs37			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB
 
-	xxswapd		vs36,	vs36			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs37,	vs37			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB
@@ -545,13 +605,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs39,	vs39			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB
 
-	xxswapd		vs38,	vs38			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs39,	vs39			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB
@@ -569,13 +629,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs41,	vs41			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs41,vs41)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs40		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs41		// imagA*imagB
 
-	xxswapd		vs40,	vs40			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs41,	vs41			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs40,vs40)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs41,vs41)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs40		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs41		// imagA*realB
@@ -593,13 +653,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs43,	vs43			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs43,vs43)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs42		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs43		// imagA*imagB
 
-	xxswapd		vs42,	vs42			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs43,	vs43			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs42,vs42)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs43,vs43)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs42		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs43		// imagA*realB
@@ -617,13 +677,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs45,	vs45			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs45,vs45)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs44		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs45		// imagA*imagB
 
-	xxswapd		vs44,	vs44			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs45,	vs45			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs44,vs44)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs45,vs45)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs44		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs45		// imagA*realB
@@ -641,13 +701,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs47,	vs47			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs47,vs47)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs46		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs47		// imagA*imagB
 
-	xxswapd		vs46,	vs46			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs47,	vs47			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs46,vs46)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs47,vs47)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs46		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs47		// imagA*realB
@@ -703,13 +763,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs49,	vs49			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs49,vs49)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs48		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs49		// imagA*imagB
 
-	xxswapd		vs48,	vs48			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs49,	vs49			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs48,vs48)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs49,vs49)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs48		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs49		// imagA*realB
@@ -727,13 +787,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs51,	vs51			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs51,vs51)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs50		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs51		// imagA*imagB
 
-	xxswapd		vs50,	vs50			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs51,	vs51			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs50,vs50)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs51,vs51)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs50		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs51		// imagA*realB
@@ -751,13 +811,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs53,	vs53			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs53,vs53)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs52		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs53		// imagA*imagB
 
-	xxswapd		vs52,	vs52			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs53,	vs53			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs52,vs52)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs53,vs53)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs52		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs53		// imagA*realB
@@ -775,13 +835,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs55,	vs55			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs55,vs55)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs54		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs55		// imagA*imagB
 
-	xxswapd		vs54,	vs54			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs55,	vs55			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs54,vs54)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs55,vs55)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs54		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs55		// imagA*realB
@@ -799,13 +859,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs57,	vs57			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs57,vs57)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs56		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs57		// imagA*imagB
 
-	xxswapd		vs56,	vs56			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs57,	vs57			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs56,vs56)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs57,vs57)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs56		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs57		// imagA*realB
@@ -823,13 +883,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs59,	vs59			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs59,vs59)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs58		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs59		// imagA*imagB
 
-	xxswapd		vs58,	vs58			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs59,	vs59			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs58,vs58)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs59,vs59)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs58		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs59		// imagA*realB
@@ -847,13 +907,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs61,	vs61			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs61,vs61)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs60		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs61		// imagA*imagB
 
-	xxswapd		vs60,	vs60			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs61,	vs61			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs60,vs60)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs61,vs61)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs60		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs61		// imagA*realB
@@ -871,13 +931,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs63,	vs63			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs63,vs63)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs62		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs63		// imagA*imagB
 
-	xxswapd		vs62,	vs62			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs63,	vs63			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs62,vs62)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs63,vs63)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs62		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs63		// imagA*realB
@@ -918,14 +978,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add		T2,	T2,	LDC
 	addi		CO,	CO,	128
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x4_1', `
+#else
 .macro LOAD2x4_1
+#endif
 
 	lxvdsx		vs16,	o0,	BO		// load real part from B
 	lxvdsx		vs17,	o8,	BO		// load imag part from B
@@ -942,9 +1010,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_I1', `
+#else
 .macro KERNEL2x4_I1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -979,9 +1055,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs47,	vs3,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_1', `
+#else
 .macro KERNEL2x4_1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -1016,9 +1100,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs47,	vs3,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_2', `
+#else
 .macro KERNEL2x4_2
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -1053,9 +1145,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs47,	vs11,	vs23		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_E2', `
+#else
 .macro KERNEL2x4_E2
+#endif
 
 
 	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
@@ -1077,9 +1177,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs47,	vs11,	vs23		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_SUBI1', `
+#else
 .macro KERNEL2x4_SUBI1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -1114,9 +1222,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs47,	vs3,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x4_SUB1', `
+#else
 .macro KERNEL2x4_SUB1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -1151,9 +1267,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs47,	vs3,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x4', `
+#else
 .macro SAVE2x4
+#endif
 
 
 	mr		T1,	CO
@@ -1170,13 +1294,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs33,	vs33			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
 
-	xxswapd		vs32,	vs32			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs33,	vs33			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
@@ -1194,13 +1318,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs35,	vs35			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB
 
-	xxswapd		vs34,	vs34			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs35,	vs35			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB
@@ -1218,13 +1342,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs37,	vs37			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB
 
-	xxswapd		vs36,	vs36			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs37,	vs37			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB
@@ -1242,13 +1366,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs39,	vs39			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB
 
-	xxswapd		vs38,	vs38			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs39,	vs39			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB
@@ -1291,13 +1415,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs41,	vs41			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs41,vs41)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs40		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs41		// imagA*imagB
 
-	xxswapd		vs40,	vs40			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs41,	vs41			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs40,vs40)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs41,vs41)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs40		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs41		// imagA*realB
@@ -1315,13 +1439,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs43,	vs43			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs43,vs43)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs42		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs43		// imagA*imagB
 
-	xxswapd		vs42,	vs42			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs43,	vs43			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs42,vs42)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs43,vs43)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs42		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs43		// imagA*realB
@@ -1339,13 +1463,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs45,	vs45			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs45,vs45)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs44		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs45		// imagA*imagB
 
-	xxswapd		vs44,	vs44			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs45,	vs45			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs44,vs44)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs45,vs45)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs44		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs45		// imagA*realB
@@ -1363,13 +1487,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs47,	vs47			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs47,vs47)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs46		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs47		// imagA*imagB
 
-	xxswapd		vs46,	vs46			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs47,	vs47			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs46,vs46)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs47,vs47)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs46		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs47		// imagA*realB
@@ -1401,14 +1525,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add		T1,	T1,	LDC
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x2_1', `
+#else
 .macro LOAD2x2_1
+#endif
 
 	lxvdsx		vs16,	o0,	BO		// load real part from B
 	lxvdsx		vs17,	o8,	BO		// load imag part from B
@@ -1423,9 +1555,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	32
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_I1', `
+#else
 .macro KERNEL2x2_I1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -1450,9 +1590,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs39,	vs1,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_1', `
+#else
 .macro KERNEL2x2_1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -1477,9 +1625,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs39,	vs1,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_2', `
+#else
 .macro KERNEL2x2_2
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -1504,9 +1660,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs39,	vs9,	vs23		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_E2', `
+#else
 .macro KERNEL2x2_E2
+#endif
 
 
 	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
@@ -1520,9 +1684,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs39,	vs9,	vs23		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_SUBI1', `
+#else
 .macro KERNEL2x2_SUBI1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -1547,9 +1719,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs39,	vs1,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x2_SUB1', `
+#else
 .macro KERNEL2x2_SUB1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -1574,9 +1754,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs39,	vs1,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x2', `
+#else
 .macro SAVE2x2
+#endif
 
 
 	mr		T1,	CO
@@ -1591,13 +1779,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs33,	vs33			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
 
-	xxswapd		vs32,	vs32			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs33,	vs33			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
@@ -1615,13 +1803,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs35,	vs35			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB
 
-	xxswapd		vs34,	vs34			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs35,	vs35			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB
@@ -1658,13 +1846,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs37,	vs37			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB
 
-	xxswapd		vs36,	vs36			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs37,	vs37			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB
@@ -1682,13 +1870,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs39,	vs39			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB
 
-	xxswapd		vs38,	vs38			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs39,	vs39			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB
@@ -1716,14 +1904,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add		T1,	T1,	LDC
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=2 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD2x1_1', `
+#else
 .macro LOAD2x1_1
+#endif
 
 	lxvdsx		vs16,	o0,	BO		// load real part from B
 	lxvdsx		vs17,	o8,	BO		// load imag part from B
@@ -1737,9 +1933,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_I1', `
+#else
 .macro KERNEL2x1_I1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 
@@ -1759,9 +1963,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs35,	vs0,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_1', `
+#else
 .macro KERNEL2x1_1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 
@@ -1781,9 +1993,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs35,	vs0,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_2', `
+#else
 .macro KERNEL2x1_2
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 
@@ -1803,9 +2023,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs35,	vs8,	vs23		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_E2', `
+#else
 .macro KERNEL2x1_E2
+#endif
 
 
 	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
@@ -1815,9 +2043,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs35,	vs8,	vs23		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_SUBI1', `
+#else
 .macro KERNEL2x1_SUBI1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 
@@ -1837,9 +2073,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs35,	vs0,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL2x1_SUB1', `
+#else
 .macro KERNEL2x1_SUB1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 
@@ -1859,9 +2103,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs35,	vs0,	vs19		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE2x1', `
+#else
 .macro SAVE2x1
+#endif
 
 
 	mr		T1,	CO
@@ -1875,13 +2127,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs33,	vs33			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
 
-	xxswapd		vs32,	vs32			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs33,	vs33			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
@@ -1915,13 +2167,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs35,	vs35			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB
 
-	xxswapd		vs34,	vs34			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs35,	vs35			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB
@@ -1947,14 +2199,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add		T1,	T1,	LDC
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=8
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x8_1', `
+#else
 .macro LOAD1x8_1
+#endif
 
 	lxvdsx		vs16,	o0,	BO		// load real part from B
 	lxvdsx		vs17,	o8,	BO		// load imag part from B
@@ -1976,9 +2236,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_I1', `
+#else
 .macro KERNEL1x8_I1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -2017,9 +2285,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs47,	vs7,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_1', `
+#else
 .macro KERNEL1x8_1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -2058,9 +2334,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs47,	vs7,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_2', `
+#else
 .macro KERNEL1x8_2
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2099,9 +2383,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs47,	vs15,	vs21		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_E2', `
+#else
 .macro KERNEL1x8_E2
+#endif
 
 
 	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
@@ -2122,9 +2414,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs47,	vs15,	vs21		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_SUBI1', `
+#else
 .macro KERNEL1x8_SUBI1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2163,9 +2463,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs47,	vs7,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x8_SUB1', `
+#else
 .macro KERNEL1x8_SUB1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2204,9 +2512,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs47,	vs7,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x8', `
+#else
 .macro SAVE1x8
+#endif
 
 
 	mr		T1,	CO
@@ -2228,13 +2544,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs33,	vs33			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
 
-	xxswapd		vs32,	vs32			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs33,	vs33			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
@@ -2252,13 +2568,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs35,	vs35			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB
 
-	xxswapd		vs34,	vs34			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs35,	vs35			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB
@@ -2276,13 +2592,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs37,	vs37			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB
 
-	xxswapd		vs36,	vs36			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs37,	vs37			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB
@@ -2300,13 +2616,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs39,	vs39			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB
 
-	xxswapd		vs38,	vs38			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs39,	vs39			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB
@@ -2324,13 +2640,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs41,	vs41			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs41,vs41)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs40		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs41		// imagA*imagB
 
-	xxswapd		vs40,	vs40			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs41,	vs41			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs40,vs40)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs41,vs41)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs40		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs41		// imagA*realB
@@ -2348,13 +2664,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs43,	vs43			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs43,vs43)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs42		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs43		// imagA*imagB
 
-	xxswapd		vs42,	vs42			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs43,	vs43			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs42,vs42)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs43,vs43)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs42		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs43		// imagA*realB
@@ -2372,13 +2688,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs45,	vs45			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs45,vs45)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs44		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs45		// imagA*imagB
 
-	xxswapd		vs44,	vs44			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs45,	vs45			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs44,vs44)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs45,vs45)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs44		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs45		// imagA*realB
@@ -2396,13 +2712,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs47,	vs47			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs47,vs47)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs46		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs47		// imagA*imagB
 
-	xxswapd		vs46,	vs46			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs47,	vs47			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs46,vs46)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs47,vs47)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs46		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs47		// imagA*realB
@@ -2443,14 +2759,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add		T2,	T2,	LDC
 	addi		CO,	CO,	128
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=4
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x4_1', `
+#else
 .macro LOAD1x4_1
+#endif
 
 	lxvdsx		vs16,	o0,	BO		// load real part from B
 	lxvdsx		vs17,	o8,	BO		// load imag part from B
@@ -2465,9 +2789,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	64
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_I1', `
+#else
 .macro KERNEL1x4_I1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -2491,9 +2823,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_1', `
+#else
 .macro KERNEL1x4_1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -2517,9 +2857,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_2', `
+#else
 .macro KERNEL1x4_2
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2543,9 +2891,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_E2', `
+#else
 .macro KERNEL1x4_E2
+#endif
 
 
 	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
@@ -2558,9 +2914,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs39,	vs11,	vs21		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_SUBI1', `
+#else
 .macro KERNEL1x4_SUBI1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2584,9 +2948,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs39,	vs3,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x4_SUB1', `
+#else
 .macro KERNEL1x4_SUB1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2610,9 +2982,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs39,	vs3,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x4', `
+#else
 .macro SAVE1x4
+#endif
 
 
 	mr		T1,	CO
@@ -2629,13 +3009,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs33,	vs33			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
 
-	xxswapd		vs32,	vs32			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs33,	vs33			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
@@ -2653,13 +3033,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs35,	vs35			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB
 
-	xxswapd		vs34,	vs34			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs35,	vs35			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB
@@ -2677,13 +3057,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs37,	vs37			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs37,vs37)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs36		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs37		// imagA*imagB
 
-	xxswapd		vs36,	vs36			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs37,	vs37			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs36,vs36)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs37,vs37)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs36		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs37		// imagA*realB
@@ -2701,13 +3081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs39,	vs39			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs39,vs39)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs38		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs39		// imagA*imagB
 
-	xxswapd		vs38,	vs38			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs39,	vs39			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs38,vs38)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs39,vs39)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs38		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs39		// imagA*realB
@@ -2739,14 +3119,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add		T1,	T1,	LDC
 	addi		CO,	CO,	64
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=2
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x2_1', `
+#else
 .macro LOAD1x2_1
+#endif
 
 	lxvdsx		vs16,	o0,	BO		// load real part from B
 	lxvdsx		vs17,	o8,	BO		// load imag part from B
@@ -2759,9 +3147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	32
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_I1', `
+#else
 .macro KERNEL1x2_I1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -2779,9 +3175,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_1', `
+#else
 .macro KERNEL1x2_1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 	lxvd2x		vs9,	o16,	AO		// load real,imag from A
@@ -2799,9 +3203,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_2', `
+#else
 .macro KERNEL1x2_2
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2819,9 +3231,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_E2', `
+#else
 .macro KERNEL1x2_E2
+#endif
 
 
 	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
@@ -2830,9 +3250,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs35,	vs9,	vs21		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_SUBI1', `
+#else
 .macro KERNEL1x2_SUBI1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2850,9 +3278,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs35,	vs1,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x2_SUB1', `
+#else
 .macro KERNEL1x2_SUB1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 	lxvd2x		vs1,	o16,	AO		// load real,imag from A
@@ -2870,9 +3306,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs35,	vs1,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x2', `
+#else
 .macro SAVE1x2
+#endif
 
 
 	mr		T1,	CO
@@ -2887,13 +3331,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs33,	vs33			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
 
-	xxswapd		vs32,	vs32			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs33,	vs33			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
@@ -2911,13 +3355,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs35,	vs35			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs35,vs35)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs34		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs35		// imagA*imagB
 
-	xxswapd		vs34,	vs34			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs35,	vs35			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs34,vs34)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs35,vs35)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs34		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs35		// imagA*realB
@@ -2945,14 +3389,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add		T1,	T1,	LDC
 	addi		CO,	CO,	32
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
 
 /**********************************************************************************************
 * Macros for N=1 and M=1
 **********************************************************************************************/
 
+#if defined(_AIX)
+define(`LOAD1x1_1', `
+#else
 .macro LOAD1x1_1
+#endif
 
 	lxvdsx		vs16,	o0,	BO		// load real part from B
 	lxvdsx		vs17,	o8,	BO		// load imag part from B
@@ -2964,9 +3416,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	addi		AO,	AO,	16
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_I1', `
+#else
 .macro KERNEL1x1_I1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 
@@ -2981,9 +3441,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_1', `
+#else
 .macro KERNEL1x1_1
+#endif
 
 	lxvd2x		vs8,	o0,	AO		// load real,imag from A
 
@@ -2998,9 +3466,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_2', `
+#else
 .macro KERNEL1x1_2
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 
@@ -3015,18 +3491,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_E2', `
+#else
 .macro KERNEL1x1_E2
+#endif
 
 
 	xvmaddadp	vs32,	vs8,	vs20		// real*real, imag*real
 	xvmaddadp	vs33,	vs8,	vs21		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_SUBI1', `
+#else
 .macro KERNEL1x1_SUBI1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 
@@ -3041,9 +3533,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmuldp		vs33,	vs0,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`KERNEL1x1_SUB1', `
+#else
 .macro KERNEL1x1_SUB1
+#endif
 
 	lxvd2x		vs0,	o0,	AO		// load real,imag from A
 
@@ -3058,9 +3558,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	xvmaddadp	vs33,	vs0,	vs17		// real*imag, imag*imag
 
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 
+#if defined(_AIX)
+define(`SAVE1x1', `
+#else
 .macro SAVE1x1
+#endif
 
 
 	mr		T1,	CO
@@ -3074,13 +3582,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 	xxlxor		vs0,	vs0,	vs0
 	xxlxor		vs1,	vs1,	vs1
-	xxswapd		vs33,	vs33			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
+	XXSWAPD(vs33,vs33)			// realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
 
 	XSFADD_R1	vs0,	vs0,	vs32		// realA*realB
 	XSFADD_R2	vs0,	vs0,	vs33		// imagA*imagB
 
-	xxswapd		vs32,	vs32			// realA*realB, imagA*realB -> imagA*realB, realA*realB
-	xxswapd		vs33,	vs33			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
+	XXSWAPD(vs32,vs32)			// realA*realB, imagA*realB -> imagA*realB, realA*realB
+	XXSWAPD(vs33,vs33)			// imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
 
 	XSFADD_I1	vs1,	vs1,	vs32		// realA*imagB
 	XSFADD_I2	vs1,	vs1,	vs33		// imagA*realB
@@ -3106,5 +3614,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add		T1,	T1,	LDC
 	addi		CO,	CO,	16
 
+#if defined(_AIX)
+')
+#else
 .endm
+#endif
 

From 3635fdbf2bfbb3bd56a7fb3e0c1a1e21ef4d0b72 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 27 Aug 2019 22:52:17 +0200
Subject: [PATCH 011/210] Do not abuse the global ARCH variable as a local
 temporary

Setting it with a simple "uname -m" just to be able to decide whether to compile getarch.c with -march=native
may actually keep getarch from doing a proper probe. Fixes #2231, a regression caused by #2110
---
 Makefile.system | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/Makefile.system b/Makefile.system
index a54282f6c..fe6be79db 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -9,9 +9,11 @@ ifndef TOPDIR
 TOPDIR = .
 endif
 
-# If ARCH is not set, we use the host system's architecture.
+# If ARCH is not set, we use the host system's architecture for getarch compile options.
 ifndef ARCH
-ARCH := $(shell uname -m)
+HOSTARCH := $(shell uname -m)
+else
+HOSTARCH = $(ARCH)
 endif
 
 # Catch conflicting usage of ARCH in some BSD environments
@@ -143,7 +145,7 @@ endif
 
 
 # On x86_64 build getarch with march=native unless the compiler is PGI. This is required to detect AVX512 support in getarch.
-ifeq ($(ARCH), x86_64)
+ifeq ($(HOSTARCH), x86_64)
 ifeq ($(findstring pgcc,$(HOSTCC)),)
 GETARCH_FLAGS += -march=native
 endif

From 7d380f7d79abe1a8d7ed6efd56efe677135c2415 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 28 Aug 2019 11:31:20 +0200
Subject: [PATCH 012/210] Fix PGI build options (again)

for #2237
---
 Makefile.system | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile.system b/Makefile.system
index fe6be79db..2cf1322a9 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -699,7 +699,7 @@ endif
 
 ifeq ($(C_COMPILER), PGI)
 ifdef BINARY64
-CCOMMON_OPT += -tp p7-64
+CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm
 else
 CCOMMON_OPT += -tp p7
 endif

From 3a55dca2dceafef421c6198d7dd1876f4bcc5663 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 28 Aug 2019 11:35:31 +0200
Subject: [PATCH 013/210] Make x86_64 zdot compile with PGI and Sun C again

broken by #2222 as CREAL,CIMAG do not expand to a valid lvalue with these compilers
---
 kernel/x86_64/zdot.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c
index 48f855b0e..d11cb764f 100644
--- a/kernel/x86_64/zdot.c
+++ b/kernel/x86_64/zdot.c
@@ -181,10 +181,10 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
 #if defined(SMP)
 	int nthreads;
 	FLOAT dummy_alpha;
+	FLOAT zdotr=0., zdoti=0.;
 #endif
 	OPENBLAS_COMPLEX_FLOAT zdot;
-       CREAL(zdot) = 0.0;
-       CIMAG(zdot) = 0.0;
+        zdot=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
 
 #if defined(SMP)
 	if (inc_x == 0 || inc_y == 0 || n <= 10000)
@@ -211,15 +211,17 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
 
 		ptr = (OPENBLAS_COMPLEX_FLOAT *)result;
 		for (i = 0; i < nthreads; i++) {
-			CREAL(zdot) = CREAL(zdot) + CREAL(*ptr);
-			CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr);
+			zdotr += CREAL(*ptr);
+			zdoti += CIMAG(*ptr);
+//			CREAL(zdot) = CREAL(zdot) + CREAL(*ptr);
+//			CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr);
 			ptr = (void *)(((char *)ptr) + sizeof(double) * 2);
 		}
+	zdot = OPENBLAS_MAKE_COMPLEX_FLOAT(zdotr,zdoti);
 	}
 #else
 	zdot_compute(n, x, inc_x, y, inc_y, &zdot);
 #endif
-
 	return zdot;
 }
 

From bf0d92a3105ae0ed67117dcf0977a164ac9e2e7a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 28 Aug 2019 17:35:56 +0200
Subject: [PATCH 014/210] Add arch data for cross-compiling to CORE2

for #2235
---
 cmake/prebuild.cmake | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index e508a46c2..c2600bd0d 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -106,7 +106,25 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
   file(APPEND ${TARGET_CONF_TEMP}
     "#define ${TCORE}\n"
     "#define CHAR_CORENAME \"${TCORE}\"\n")
-  if ("${TCORE}" STREQUAL "ARMV7")
+  if ("${TCORE}" STREQUAL "CORE2")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define L1_DATA_SIZE\t32768\n"
+      "#define L1_DATA_LINESIZE\t64\n"
+      "#define L2_SIZE\t1048576\n"
+      "#define L2_LINESIZE\t64\n"
+      "#define DTB_DEFAULT_ENTRIES\t256\n"
+      "#define DTB_SIZE\t4096\n"
+      "#define HAVE_CMOV\n"
+      "#define HAVE_MMX\n"
+      "#define HAVE_SSE\n"
+      "#define HAVE_SSE2\n"
+      "#define HAVE_SSE3\n"
+      "#define HAVE_SSSE3\n")
+      set(SGEMM_UNROLL_M 8)
+      set(SGEMM_UNROLL_N 4)
+      set(DGEMM_UNROLL_M 4)
+      set(DGEMM_UNROLL_N 4)
+  elseif ("${TCORE}" STREQUAL "ARMV7")
     file(APPEND ${TARGET_CONF_TEMP}
       "#define L1_DATA_SIZE\t65536\n"
       "#define L1_DATA_LINESIZE\t32\n"

From 11c59acfb1a061f35ec88f11c2176f60b4916e93 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 28 Aug 2019 18:07:44 +0200
Subject: [PATCH 015/210] Keep both PGI/SUN and default code paths to avoid
 breaking Clang/WIndows

---
 kernel/x86_64/zdot.c | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/kernel/x86_64/zdot.c b/kernel/x86_64/zdot.c
index d11cb764f..01169e8e6 100644
--- a/kernel/x86_64/zdot.c
+++ b/kernel/x86_64/zdot.c
@@ -181,11 +181,19 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
 #if defined(SMP)
 	int nthreads;
 	FLOAT dummy_alpha;
+#if defined(C_PGI) || defined(C_SUN)	
 	FLOAT zdotr=0., zdoti=0.;
+#endif	
 #endif
+	
 	OPENBLAS_COMPLEX_FLOAT zdot;
+#if defined(C_PGI) || defined(C_SUN)	
         zdot=OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0);
-
+#else
+	CREAL(zdot) = 0.0;
+	CIMAG(zdot) = 0.0;
+#endif
+	
 #if defined(SMP)
 	if (inc_x == 0 || inc_y == 0 || n <= 10000)
 		nthreads = 1;
@@ -211,17 +219,23 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
 
 		ptr = (OPENBLAS_COMPLEX_FLOAT *)result;
 		for (i = 0; i < nthreads; i++) {
+#if defined(C_PGI) || defined(C_SUN)			
 			zdotr += CREAL(*ptr);
 			zdoti += CIMAG(*ptr);
-//			CREAL(zdot) = CREAL(zdot) + CREAL(*ptr);
-//			CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr);
+#else			
+			CREAL(zdot) = CREAL(zdot) + CREAL(*ptr);
+			CIMAG(zdot) = CIMAG(zdot) + CIMAG(*ptr);
+#endif
 			ptr = (void *)(((char *)ptr) + sizeof(double) * 2);
 		}
+#if defined(C_PGI) || defined(C_SUN)		
 	zdot = OPENBLAS_MAKE_COMPLEX_FLOAT(zdotr,zdoti);
+#endif
 	}
 #else
 	zdot_compute(n, x, inc_x, y, inc_y, &zdot);
 #endif
+	
 	return zdot;
 }
 

From be09551cdf2efdddb3d671c1355c46560e4610f2 Mon Sep 17 00:00:00 2001
From: AbdelRauf <quickwritereader@gmail.com>
Date: Thu, 29 Aug 2019 23:22:23 +0000
Subject: [PATCH 016/210] aligned

---
 kernel/power/caxpy.c     | 7 +++----
 kernel/power/cdot.c      | 6 +++---
 kernel/power/cgemv_n.c   | 6 +++---
 kernel/power/cgemv_t.c   | 6 +++---
 kernel/power/dgemv_n.c   | 2 +-
 kernel/power/dgemv_t.c   | 6 +++---
 kernel/power/sgemv_n.c   | 3 ++-
 kernel/power/sgemv_n_8.c | 3 ++-
 kernel/power/sgemv_t.c   | 7 +++----
 kernel/power/sgemv_t_8.c | 4 ++--
 kernel/power/zgemv_n_4.c | 4 ++--
 kernel/power/zgemv_t_4.c | 4 ++--
 12 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/kernel/power/caxpy.c b/kernel/power/caxpy.c
index 4bdf13c34..00f2ec5e0 100644
--- a/kernel/power/caxpy.c
+++ b/kernel/power/caxpy.c
@@ -24,12 +24,11 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-
 #include "common.h"
- 
-
 #ifndef HAVE_ASM_KERNEL
 #include <altivec.h> 
+static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
+
 static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i)
 {
 
@@ -43,7 +42,7 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT
     register __vector float valpha_i = {alpha_i, alpha_i,alpha_i, alpha_i};
 #endif
 
-    __vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
+    __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
     register __vector float *vy = (__vector float *) y;
     register __vector float *vx = (__vector float *) x;
     BLASLONG i=0;
diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c
index f86a33f22..51d341ada 100644
--- a/kernel/power/cdot.c
+++ b/kernel/power/cdot.c
@@ -25,12 +25,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *****************************************************************************/
 
 #include "common.h"
-
 #ifndef HAVE_KERNEL_8
 #include <altivec.h> 
+static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
 static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
 {
-    __vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
+    __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
     register __vector float *vy = (__vector float *) y;
     register __vector float *vx = (__vector float *) x;
     BLASLONG i = 0;
@@ -96,7 +96,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA
     BLASLONG i = 0;
     BLASLONG ix=0, iy=0;
     OPENBLAS_COMPLEX_FLOAT result;
-    FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
+    FLOAT dot[4] __attribute__((aligned(16))) = {0.0, 0.0, 0.0, 0.0};
 
     if (n <= 0) {
         CREAL(result) = 0.0;
diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c
index cb01e196e..6a195d6d1 100644
--- a/kernel/power/cgemv_n.c
+++ b/kernel/power/cgemv_n.c
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define NBMAX 1024
 
 
-static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
+static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
 
  
 static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
@@ -247,8 +247,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
     BLASLONG m2;
     BLASLONG m3;
     BLASLONG n2;
-
-    FLOAT xbuffer[8], *ybuffer;
+    FLOAT xbuffer[8] __attribute__((aligned(16)));
+    FLOAT *ybuffer;
 
     if (m < 1) return (0);
     if (n < 1) return (0);
diff --git a/kernel/power/cgemv_t.c b/kernel/power/cgemv_t.c
index c646618cf..68bbdd60a 100644
--- a/kernel/power/cgemv_t.c
+++ b/kernel/power/cgemv_t.c
@@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define NBMAX 1024 
 #include <altivec.h> 
-static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
+static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
 
 static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
     BLASLONG i;
@@ -260,8 +260,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
     BLASLONG m2;
     BLASLONG m3;
     BLASLONG n2;
-
-    FLOAT ybuffer[8], *xbuffer;
+    FLOAT ybuffer[8] __attribute__((aligned(16)));
+    FLOAT *xbuffer;
 
     if (m < 1) return (0);
     if (n < 1) return (0);
diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c
index b458e11fc..1a3d7669c 100644
--- a/kernel/power/dgemv_n.c
+++ b/kernel/power/dgemv_n.c
@@ -145,7 +145,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 	BLASLONG m3;
 	BLASLONG n2;
 	BLASLONG lda4 =  lda << 2;
-	FLOAT xbuffer[8] __attribute__ ((aligned (16)));;
+	FLOAT xbuffer[8] __attribute__ ((aligned (16)));
 	FLOAT *ybuffer;
 
         if ( m < 1 ) return(0);
diff --git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c
index b8589a131..d05d7b7d3 100644
--- a/kernel/power/dgemv_t.c
+++ b/kernel/power/dgemv_t.c
@@ -581,9 +581,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
     BLASLONG m1;
     BLASLONG m2;
     BLASLONG m3;
-    BLASLONG n2;
-
-    FLOAT ybuffer[8], *xbuffer;
+    BLASLONG n2; 
+    FLOAT ybuffer[8] __attribute__((aligned(16)));
+    FLOAT *xbuffer;
 
     if (m < 1) return (0);
     if (n < 1) return (0);
diff --git a/kernel/power/sgemv_n.c b/kernel/power/sgemv_n.c
index 9704757fe..81ac031a3 100644
--- a/kernel/power/sgemv_n.c
+++ b/kernel/power/sgemv_n.c
@@ -174,7 +174,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 	BLASLONG n2;
 	BLASLONG lda4 =  lda << 2;
 	BLASLONG lda8 =  lda << 3;
-	FLOAT xbuffer[8],*ybuffer;
+	FLOAT xbuffer[8] __attribute__((aligned(16)));
+	FLOAT *ybuffer;
 
         if ( m < 1 ) return(0);
         if ( n < 1 ) return(0);
diff --git a/kernel/power/sgemv_n_8.c b/kernel/power/sgemv_n_8.c
index 9bc93ced6..64696236a 100644
--- a/kernel/power/sgemv_n_8.c
+++ b/kernel/power/sgemv_n_8.c
@@ -213,7 +213,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
 	BLASLONG n2;
 	BLASLONG lda4 =  lda << 2;
 	BLASLONG lda8 =  lda << 3;
-	FLOAT xbuffer[8],*ybuffer;
+	FLOAT xbuffer[8] __attribute__((aligned(16)));
+	FLOAT *ybuffer;
 
         if ( m < 1 ) return(0);
         if ( n < 1 ) return(0);
diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c
index 96434a13f..3d8a442dc 100644
--- a/kernel/power/sgemv_t.c
+++ b/kernel/power/sgemv_t.c
@@ -177,10 +177,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
     BLASLONG m1;
     BLASLONG m2;
     BLASLONG m3;
-    BLASLONG n2;
-
-    FLOAT ybuffer[8], *xbuffer;
-
+    BLASLONG n2; 
+    FLOAT ybuffer[8] __attribute__((aligned(16)));
+    FLOAT *xbuffer; 
     if (m < 1) return (0);
     if (n < 1) return (0);
 
diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c
index 5e9cd63ac..b90512162 100644
--- a/kernel/power/sgemv_t_8.c
+++ b/kernel/power/sgemv_t_8.c
@@ -204,8 +204,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
     BLASLONG m3;
     BLASLONG n2;
 
-    FLOAT ybuffer[8], *xbuffer;
-
+    FLOAT ybuffer[8] __attribute__((aligned(16)));
+    FLOAT *xbuffer; 
     if (m < 1) return (0);
     if (n < 1) return (0);
 
diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c
index 167b0a158..ba019d6a5 100644
--- a/kernel/power/zgemv_n_4.c
+++ b/kernel/power/zgemv_n_4.c
@@ -614,8 +614,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
     BLASLONG m2;
     BLASLONG m3;
     BLASLONG n2;
-
-    FLOAT xbuffer[8], *ybuffer;
+    FLOAT xbuffer[8] __attribute__((aligned(16)));
+    FLOAT *ybuffer;
 
     if (m < 1) return (0);
     if (n < 1) return (0);
diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c
index 20a0812dd..b34199af6 100644
--- a/kernel/power/zgemv_t_4.c
+++ b/kernel/power/zgemv_t_4.c
@@ -532,8 +532,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
     BLASLONG m2;
     BLASLONG m3;
     BLASLONG n2;
-
-    FLOAT ybuffer[8], *xbuffer;
+    FLOAT ybuffer[8] __attribute__((aligned(16)));
+    FLOAT *xbuffer;
 
     if (m < 1) return (0);
     if (n < 1) return (0);

From e79712d96941c099d2f5e4b11544b2a20d97fbdf Mon Sep 17 00:00:00 2001
From: AbdelRauf <quickwritereader@gmail.com>
Date: Fri, 30 Aug 2019 02:52:04 +0000
Subject: [PATCH 017/210] cgemv using vec_vsx_ld instead of letting gcc to
 decide

---
 kernel/power/cgemv_n.c | 97 +++++++++++++++++++++++-------------------
 kernel/power/cgemv_t.c | 97 +++++++++++++++++++++++++++---------------
 2 files changed, 115 insertions(+), 79 deletions(-)

diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c
index 6a195d6d1..e85517ffa 100644
--- a/kernel/power/cgemv_n.c
+++ b/kernel/power/cgemv_n.c
@@ -62,23 +62,24 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
     register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]};
     register __vector float vx3_i = {x[7], x[7],x[7], x[7]};
 #endif
-    register __vector float *vy = (__vector float *) y;
+    register __vector float *vptr_y = (__vector float *) y;
     register __vector float *vptr_a0 = (__vector float *) a0;
     register __vector float *vptr_a1 = (__vector float *) a1;
     register __vector float *vptr_a2 = (__vector float *) a2;
     register __vector float *vptr_a3 = (__vector float *) a3; 
     BLASLONG  i = 0; 
-    for (;i< n / 2; i+=2) {
-        register __vector float vy_0  = vy[i];
-        register __vector float vy_1  = vy[i + 1];
-        register __vector float va0   = vptr_a0[i];
-        register __vector float va1   = vptr_a1[i];
-        register __vector float va2   = vptr_a2[i];
-        register __vector float va3   = vptr_a3[i];
-        register __vector float va0_1 = vptr_a0[i + 1];
-        register __vector float va1_1 = vptr_a1[i + 1];
-        register __vector float va2_1 = vptr_a2[i + 1];
-        register __vector float va3_1 = vptr_a3[i + 1];
+    BLASLONG i2=16;
+    for (;i< n * 8; i+=32,i2+=32) {
+        register __vector float vy_0  = vec_vsx_ld(i,vptr_y);
+        register __vector float vy_1  = vec_vsx_ld(i2,vptr_y);
+        register __vector float va0   = vec_vsx_ld(i,vptr_a0);
+        register __vector float va1   = vec_vsx_ld(i, vptr_a1);
+        register __vector float va2   = vec_vsx_ld(i ,vptr_a2);
+        register __vector float va3   = vec_vsx_ld(i ,vptr_a3);
+        register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
+        register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1);
+        register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2);
+        register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3);
 
         vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r;
         vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r;
@@ -93,8 +94,8 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
         vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i;
         vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i;
 
-        vy[i] = vy_0;
-        vy[i + 1] = vy_1;
+        vec_vsx_st(vy_0 ,i,  vptr_y);
+        vec_vsx_st(vy_1,i2,vptr_y);
     }
 
 }	
@@ -118,17 +119,19 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
     register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]};
     register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; 
 #endif
-    register __vector float *vy = (__vector float *) y;
+    register __vector float *vptr_y = (__vector float *) y;
     register __vector float *vptr_a0 = (__vector float *) a0;
     register __vector float *vptr_a1 = (__vector float *) a1; 
-    BLASLONG  i = 0; 
-    for (;i< n / 2; i+=2) {
-        register __vector float vy_0  = vy[i];
-        register __vector float vy_1  = vy[i + 1];
-        register __vector float va0   = vptr_a0[i];
-        register __vector float va1   = vptr_a1[i]; 
-        register __vector float va0_1 = vptr_a0[i + 1];
-        register __vector float va1_1 = vptr_a1[i + 1]; 
+    BLASLONG  i = 0;
+    BLASLONG  i2 = 16;  
+    for (;i< n * 8; i+=32, i2+=32) { 
+        register __vector float vy_0  = vec_vsx_ld(i,vptr_y);
+        register __vector float vy_1  = vec_vsx_ld(i2,vptr_y);
+        register __vector float va0   = vec_vsx_ld(i,vptr_a0);
+        register __vector float va1   = vec_vsx_ld(i, vptr_a1); 
+        register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
+        register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); 
+
         register __vector float va0x   = vec_perm(va0, va0,swap_mask);
         register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask);
         register __vector float va1x   = vec_perm(va1, va1,swap_mask);
@@ -136,8 +139,8 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
         vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i;
         vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; 
 
-        vy[i] = vy_0;
-        vy[i + 1] = vy_1;
+        vec_vsx_st(vy_0 ,i,  vptr_y);
+        vec_vsx_st(vy_1,i2,vptr_y);
     }
 
 }
@@ -154,21 +157,23 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
     register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]};
     register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; 
 #endif
-    register __vector float *vy = (__vector float *) y;
+    register __vector float *vptr_y = (__vector float *) y;
     register __vector float *vptr_a0 = (__vector float *) ap; 
     BLASLONG  i = 0; 
-    for (;i< n / 2; i+=2) {
-        register __vector float vy_0  = vy[i];
-        register __vector float vy_1  = vy[i + 1];
-        register __vector float va0   = vptr_a0[i];
-        register __vector float va0_1 = vptr_a0[i + 1]; 
+    BLASLONG  i2 = 16;  
+    for (;i< n * 8; i+=32, i2+=32) { 
+        register __vector float vy_0  = vec_vsx_ld(i,vptr_y);
+        register __vector float vy_1  = vec_vsx_ld(i2,vptr_y);
+        register __vector float va0   = vec_vsx_ld(i,vptr_a0); 
+        register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0); 
+
         register __vector float va0x   = vec_perm(va0, va0,swap_mask);
         register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask);
         vy_0 += va0*vx0_r + va0x*vx0_i;
         vy_1 += va0_1*vx0_r + va0x_1*vx0_i; 
 
-        vy[i] = vy_0;
-        vy[i + 1] = vy_1;
+        vec_vsx_st(vy_0 ,i,  vptr_y);
+        vec_vsx_st(vy_1,i2,vptr_y);
     }
 }
 
@@ -213,20 +218,24 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
 
         register __vector float *vptr_src = (__vector float *) src;
         register __vector float *vptr_y = (__vector float *) dest; 
-        for (i = 0; i < n/2; i += 2 ){
 
-            register __vector float vy_0 = vptr_y[i];
-            register __vector float vy_1 = vptr_y[i +1]; 
+    BLASLONG  i2 = 16;  
+    for (;i< n * 8; i+=32, i2+=32) { 
+        register __vector float vy_0  = vec_vsx_ld(i,vptr_y);
+        register __vector float vy_1  = vec_vsx_ld(i2,vptr_y);
+
+
+        register __vector float vsrc  = vec_vsx_ld(i,vptr_src);
+        register __vector float vsrc_1  = vec_vsx_ld(i2,vptr_src);
+
+        register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask);
+        register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask);
 
-            register __vector float vsrc = vptr_src[i];
-            register __vector float vsrc_1 = vptr_src[i + 1]; 
-            register __vector float vsrcx = vec_perm(vsrc, vsrc, swap_mask);
-            register __vector float vsrcx_1 = vec_perm(vsrc_1, vsrc_1, swap_mask);
+        vy_0 += vsrc*valpha_r + vsrcx*valpha_i;
+        vy_1 += vsrc_1*valpha_r +  vsrcx_1*valpha_i;  
 
-            vy_0 += vsrc*valpha_r + vsrcx*valpha_i;
-            vy_1 += vsrc_1*valpha_r +  vsrcx_1*valpha_i;  
-            vptr_y[i] = vy_0;
-            vptr_y[i+1 ] = vy_1;  
+        vec_vsx_st(vy_0 ,i,  vptr_y);
+        vec_vsx_st(vy_1,i2,vptr_y);
 
         }
  
diff --git a/kernel/power/cgemv_t.c b/kernel/power/cgemv_t.c
index 68bbdd60a..57eb066b0 100644
--- a/kernel/power/cgemv_t.c
+++ b/kernel/power/cgemv_t.c
@@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
 
 static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
-    BLASLONG i;
+
     FLOAT *a0, *a1, *a2, *a3;
     a0 = ap;
     a1 = ap + lda;
@@ -48,26 +48,39 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
     register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0};
     register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0};
     register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0};
-    __vector float* va0 = (__vector float*) a0;
-    __vector float* va1 = (__vector float*) a1;
-    __vector float* va2 = (__vector float*) a2;
-    __vector float* va3 = (__vector float*) a3;
+    __vector float* vptr_a0 = (__vector float*) a0;
+    __vector float* vptr_a1 = (__vector float*) a1;
+    __vector float* vptr_a2 = (__vector float*) a2;
+    __vector float* vptr_a3 = (__vector float*) a3;
     __vector float* v_x = (__vector float*) x;
 
-    for (i = 0; i < n / 2; i+=2) {
-        register __vector float vx_0  = v_x[i]; 
-        register __vector float vx_1  = v_x[i+1];         
+    BLASLONG  i = 0;
+    BLASLONG  i2 = 16;  
+    for (;i< n * 8; i+=32, i2+=32) { 
+        register __vector float vx_0  = vec_vsx_ld( i,v_x) ; 
+        register __vector float vx_1  = vec_vsx_ld(i2, v_x); 
+                
         register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
         register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
 
-        vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
-        vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; 
-        vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1];
-        vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1]; 
-        vtemp2_p += vx_0*va2[i] + vx_1*va2[i+1];
-        vtemp2_r += vxr_0*va2[i] + vxr_1*va2[i+1]; 
-        vtemp3_p += vx_0*va3[i] + vx_1*va3[i+1];
-        vtemp3_r += vxr_0*va3[i] + vxr_1*va3[i+1]; 
+        register __vector float va0   = vec_vsx_ld(i,vptr_a0);
+        register __vector float va1   = vec_vsx_ld(i, vptr_a1);
+        register __vector float va2   = vec_vsx_ld(i ,vptr_a2);
+        register __vector float va3   = vec_vsx_ld(i ,vptr_a3);
+        register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
+        register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1);
+        register __vector float va2_1 = vec_vsx_ld(i2 ,vptr_a2);
+        register __vector float va3_1 = vec_vsx_ld(i2 ,vptr_a3);
+
+
+        vtemp0_p += vx_0*va0 + vx_1*va0_1 ;
+        vtemp0_r += vxr_0*va0 + vxr_1*va0_1; 
+        vtemp1_p += vx_0*va1 + vx_1*va1_1;
+        vtemp1_r += vxr_0*va1 + vxr_1*va1_1; 
+        vtemp2_p += vx_0*va2 + vx_1*va2_1;
+        vtemp2_r += vxr_0*va2 + vxr_1*va2_1; 
+        vtemp3_p += vx_0*va3 + vx_1*va3_1;
+        vtemp3_r += vxr_0*va3 + vxr_1*va3_1; 
 
     }
 
@@ -128,7 +141,7 @@ static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
  
 
 static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
-    BLASLONG i;
+
     FLOAT *a0, *a1;
     a0 = ap;
     a1 = ap + lda; 
@@ -138,23 +151,33 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
     register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0};
     register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0};
     register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; 
-    __vector float* va0 = (__vector float*) a0;
-    __vector float* va1 = (__vector float*) a1; 
+
+
+    __vector float* vptr_a0 = (__vector float*) a0;
+    __vector float* vptr_a1 = (__vector float*) a1; 
     __vector float* v_x = (__vector float*) x;
 
-    for (i = 0; i < n / 2; i+=2) {
-        register __vector float vx_0  = v_x[i]; 
-        register __vector float vx_1  = v_x[i+1];         
+    BLASLONG  i = 0;
+    BLASLONG  i2 = 16;  
+    for (;i< n * 8; i+=32, i2+=32) { 
+        register __vector float vx_0  = vec_vsx_ld( i,v_x) ; 
+        register __vector float vx_1  = vec_vsx_ld(i2, v_x); 
+                
         register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
         register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
 
-        vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
-        vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; 
-        vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1];
-        vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1];  
+        register __vector float va0   = vec_vsx_ld(i,vptr_a0);
+        register __vector float va1   = vec_vsx_ld(i, vptr_a1); 
+        register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);
+        register __vector float va1_1 = vec_vsx_ld(i2 ,vptr_a1); 
 
-    }
 
+        vtemp0_p += vx_0*va0 + vx_1*va0_1 ;
+        vtemp0_r += vxr_0*va0 + vxr_1*va0_1; 
+        vtemp1_p += vx_0*va1 + vx_1*va1_1;
+        vtemp1_r += vxr_0*va1 + vxr_1*va1_1;  
+
+    }
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
 
     register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3];
@@ -193,23 +216,27 @@ static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
  
 
 static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
-   BLASLONG i;  
+ 
     __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
     //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real)
     register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0};
     register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; 
-    __vector float* va0 = (__vector float*) ap; 
+    __vector float* vptr_a0 = (__vector float*) ap; 
     __vector float* v_x = (__vector float*) x;
-
-    for (i = 0; i < n / 2; i+=2) {
-        register __vector float vx_0  = v_x[i]; 
-        register __vector float vx_1  = v_x[i+1];         
+    BLASLONG  i = 0;
+    BLASLONG  i2 = 16;  
+    for (;i< n * 8; i+=32, i2+=32) { 
+        register __vector float vx_0  = vec_vsx_ld( i,v_x) ; 
+        register __vector float vx_1  = vec_vsx_ld(i2, v_x); 
+                
         register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask);
         register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask);
 
-        vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ;
-        vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1];  
+        register __vector float va0   = vec_vsx_ld(i,vptr_a0); 
+        register __vector float va0_1 = vec_vsx_ld(i2 ,vptr_a0);  
 
+        vtemp0_p += vx_0*va0 + vx_1*va0_1 ;
+        vtemp0_r += vxr_0*va0 + vxr_1*va0_1;  
     }
 
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )

From 4c22828812a9d5f0962c836d4c8bf486fde4d9cb Mon Sep 17 00:00:00 2001
From: AbdelRauf <quickwritereader@gmail.com>
Date: Fri, 30 Aug 2019 04:09:15 +0000
Subject: [PATCH 018/210] caxpy and cdot are using vec_vsx_ld

---
 kernel/power/caxpy.c | 67 ++++++++++++++++++++++++++------------------
 kernel/power/cdot.c  | 52 +++++++++++++++++++---------------
 2 files changed, 69 insertions(+), 50 deletions(-)

diff --git a/kernel/power/caxpy.c b/kernel/power/caxpy.c
index 00f2ec5e0..0545766b1 100644
--- a/kernel/power/caxpy.c
+++ b/kernel/power/caxpy.c
@@ -27,6 +27,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #ifndef HAVE_ASM_KERNEL
 #include <altivec.h> 
+
+#define  offset_0 0
+#define  offset_1 16
+#define  offset_2 32
+#define  offset_3 48
+#define  offset_4 64
+#define  offset_5 80
+#define  offset_6 96
+#define  offset_7 112
+
 static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
 
 static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i)
@@ -43,27 +53,28 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT
 #endif
 
     __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
-    register __vector float *vy = (__vector float *) y;
-    register __vector float *vx = (__vector float *) x;
+    register __vector float *vptr_y = (__vector float *) y;
+    register __vector float *vptr_x = (__vector float *) x; 
     BLASLONG i=0;
-    for (; i < n/2; i += 8) {
+    for(;i<n/2;i+=8){ 
+
+        register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ;
+        register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ;
+        register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ;
+        register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ;
+        register __vector float vy_4 = vec_vsx_ld( offset_4 ,vptr_y ) ;
+        register __vector float vy_5 = vec_vsx_ld( offset_5 ,vptr_y ) ;
+        register __vector float vy_6 = vec_vsx_ld( offset_6 ,vptr_y ) ;
+        register __vector float vy_7 = vec_vsx_ld( offset_7 ,vptr_y ) ;
 
-        register __vector float vy_0 = vy[i];
-        register __vector float vy_1 = vy[i + 1];
-        register __vector float vy_2 = vy[i + 2];
-        register __vector float vy_3 = vy[i + 3];
-        register __vector float vy_4 = vy[i + 4];
-        register __vector float vy_5 = vy[i + 5];
-        register __vector float vy_6 = vy[i + 6];
-        register __vector float vy_7 = vy[i + 7];
-        register __vector float vx_0 = vx[i];
-        register __vector float vx_1 = vx[i + 1];
-        register __vector float vx_2 = vx[i + 2];
-        register __vector float vx_3 = vx[i + 3];
-        register __vector float vx_4 = vx[i + 4];
-        register __vector float vx_5 = vx[i + 5];
-        register __vector float vx_6 = vx[i + 6];
-        register __vector float vx_7 = vx[i + 7];
+        register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ;
+        register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ;
+        register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ;
+        register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ;
+        register __vector float vx_4 = vec_vsx_ld( offset_4 ,vptr_x ) ;
+        register __vector float vx_5 = vec_vsx_ld( offset_5 ,vptr_x ) ;
+        register __vector float vx_6 = vec_vsx_ld( offset_6 ,vptr_x ) ;
+        register __vector float vx_7 = vec_vsx_ld( offset_7 ,vptr_x ) ;
         vy_0 += vx_0*valpha_r;
         vy_1 += vx_1*valpha_r;
         vy_2 += vx_2*valpha_r;
@@ -88,15 +99,17 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT
         vy_5 += vx_5*valpha_i;
         vy_6 += vx_6*valpha_i;
         vy_7 += vx_7*valpha_i;
-        vy[i] = vy_0;
-        vy[i + 1] = vy_1;
-        vy[i + 2] = vy_2;
-        vy[i + 3] = vy_3;
-        vy[i + 4] = vy_4;
-        vy[i + 5] = vy_5 ;
-        vy[i + 6] = vy_6 ;
-        vy[i + 7] = vy_7 ;        
+        vec_vsx_st( vy_0, offset_0 ,vptr_y ) ;
+        vec_vsx_st( vy_1, offset_1 ,vptr_y ) ;
+        vec_vsx_st( vy_2, offset_2 ,vptr_y ) ;
+        vec_vsx_st( vy_3, offset_3 ,vptr_y ) ;
+        vec_vsx_st( vy_4, offset_4 ,vptr_y ) ;
+        vec_vsx_st( vy_5, offset_5 ,vptr_y ) ;
+        vec_vsx_st( vy_6, offset_6 ,vptr_y ) ;
+        vec_vsx_st( vy_7, offset_7 ,vptr_y ) ;   
 
+        vptr_x+=8;
+        vptr_y+=8;   
     }
 }
 #endif
diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c
index 51d341ada..d5b18729a 100644
--- a/kernel/power/cdot.c
+++ b/kernel/power/cdot.c
@@ -27,13 +27,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #ifndef HAVE_KERNEL_8
 #include <altivec.h> 
+
+#define  offset_0 0
+#define  offset_1 16
+#define  offset_2 32
+#define  offset_3 48
+
+
+
 static const unsigned char __attribute__((aligned(16))) swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11};
 static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
 {
     __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr);
-    register __vector float *vy = (__vector float *) y;
-    register __vector float *vx = (__vector float *) x;
-    BLASLONG i = 0;
+    register __vector float *vptr_y = (__vector float *) y;
+    register __vector float *vptr_x = (__vector float *) x;
     register __vector float vd_0  = { 0 };
     register __vector float vd_1  = { 0 };
     register __vector float vd_2  = { 0 };
@@ -41,26 +48,23 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
     register __vector float vdd_0 = { 0 };
     register __vector float vdd_1 = { 0 };
     register __vector float vdd_2 = { 0 };
-    register __vector float vdd_3 = { 0 };
-    for (; i < n/2; i += 4) {
-
-        register __vector float vyy_0 ;
-        register __vector float vyy_1 ;
-        register __vector float vyy_2 ;
-        register __vector float vyy_3 ;
-
-        register __vector float vy_0 = vy[i];
-        register __vector float vy_1 = vy[i + 1];
-        register __vector float vy_2 = vy[i + 2];
-        register __vector float vy_3 = vy[i + 3]; 
-        register __vector float vx_0= vx[i];
-        register __vector float vx_1 = vx[i + 1];
-        register __vector float vx_2 = vx[i + 2];
-        register __vector float vx_3 = vx[i + 3]; 
-        vyy_0 = vec_perm(vy_0, vy_0, swap_mask);
-        vyy_1 = vec_perm(vy_1, vy_1, swap_mask);
-        vyy_2 = vec_perm(vy_2, vy_2, swap_mask);
-        vyy_3 = vec_perm(vy_3, vy_3, swap_mask);  
+    register __vector float vdd_3 = { 0 };     
+    BLASLONG i=0;
+    for(;i<n/2;i+=4){ 
+
+        register __vector float vy_0 = vec_vsx_ld( offset_0 ,vptr_y ) ;
+        register __vector float vy_1 = vec_vsx_ld( offset_1 ,vptr_y ) ;
+        register __vector float vy_2 = vec_vsx_ld( offset_2 ,vptr_y ) ;
+        register __vector float vy_3 = vec_vsx_ld( offset_3 ,vptr_y ) ; 
+
+        register __vector float vx_0 = vec_vsx_ld( offset_0 ,vptr_x ) ;
+        register __vector float vx_1 = vec_vsx_ld( offset_1 ,vptr_x ) ;
+        register __vector float vyy_0 = vec_perm(vy_0, vy_0, swap_mask);
+        register __vector float vyy_1 = vec_perm(vy_1, vy_1, swap_mask);
+        register __vector float vx_2 = vec_vsx_ld( offset_2 ,vptr_x ) ;
+        register __vector float vx_3 = vec_vsx_ld( offset_3 ,vptr_x ) ;  
+        register __vector float vyy_2 = vec_perm(vy_2, vy_2, swap_mask);
+        register __vector float vyy_3 = vec_perm(vy_3, vy_3, swap_mask);  
 
         vd_0 += vx_0 * vy_0;
         vd_1 += vx_1 * vy_1;
@@ -72,6 +76,8 @@ static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot)
         vdd_2 += vx_2 * vyy_2;
         vdd_3 += vx_3 * vyy_3;       
        
+       vptr_x+=4;
+       vptr_y+=4;
 
     }
     //aggregate

From 847c20c9b79ecf37fb714341066ecd27893b1c9d Mon Sep 17 00:00:00 2001
From: AbdelRauf <quickwritereader@gmail.com>
Date: Fri, 30 Aug 2019 11:14:55 +0000
Subject: [PATCH 019/210]  fix uninitialized variables i

---
 kernel/power/cgemv_n.c | 4 ++--
 kernel/power/cgemv_t.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c
index e85517ffa..eec3fa37c 100644
--- a/kernel/power/cgemv_n.c
+++ b/kernel/power/cgemv_n.c
@@ -181,7 +181,7 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
 
 
 static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
-    BLASLONG i;
+    BLASLONG i=0;
 
 
     if (inc_dest != 2) {
@@ -246,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT
 
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
-    BLASLONG i;
+    BLASLONG i=0;
     FLOAT *a_ptr;
     FLOAT *x_ptr;
     FLOAT *y_ptr;
diff --git a/kernel/power/cgemv_t.c b/kernel/power/cgemv_t.c
index 57eb066b0..691f7a3d3 100644
--- a/kernel/power/cgemv_t.c
+++ b/kernel/power/cgemv_t.c
@@ -276,8 +276,8 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) {
 }
 
 int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
-    BLASLONG i;
-    BLASLONG j;
+    BLASLONG i=0;
+    BLASLONG j=0;
     FLOAT *a_ptr;
     FLOAT *x_ptr;
     FLOAT *y_ptr;

From b5af7b9c7808f06d6c10c2b2db90a054ab746970 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 31 Aug 2019 18:06:12 +0200
Subject: [PATCH 020/210] Disable ppc64le test environment on Travis CI

as this semi-official beta option has suddenly reverted to a standard x86_64 environment causing spurious failures
---
 .travis.yml | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index a92bb0687..27ecba6c8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -25,14 +25,14 @@ matrix:
         - TARGET_BOX=LINUX64
         - BTYPE="BINARY=64"
 
-    - <<: *test-ubuntu
-      os: linux-ppc64le
-      before_script:
-        - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
-      env:
-        # for matrix annotation only
-        - TARGET_BOX=PPC64LE_LINUX
-        - BTYPE="BINARY=64 USE_OPENMP=1"
+        #    - <<: *test-ubuntu
+        #      os: linux-ppc64le
+        #      before_script:
+        #        - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
+        #      env:
+        #        # for matrix annotation only
+        #        - TARGET_BOX=PPC64LE_LINUX
+        #        - BTYPE="BINARY=64 USE_OPENMP=1"
 
     - <<: *test-ubuntu
       env:

From 1fec0570f6b0561a52d72e5d37bbae5fb8d467cb Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Mon, 2 Sep 2019 15:03:45 +0200
Subject: [PATCH 021/210] Add cgemm and zgemm unroll factors for core2

---
 cmake/prebuild.cmake | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index c2600bd0d..2fe168a1c 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -124,6 +124,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
       set(SGEMM_UNROLL_N 4)
       set(DGEMM_UNROLL_M 4)
       set(DGEMM_UNROLL_N 4)
+      set(CGEMM_DEFAULT_UNROLL_M 4)
+      set(CGEMM_DEFAULT_UNROLL_N 2)
+      set(ZGEMM_DEFAULT_UNROLL_M 2)
+      set(ZGEMM_DEFAULT_UNROLL_N 2)      
   elseif ("${TCORE}" STREQUAL "ARMV7")
     file(APPEND ${TARGET_CONF_TEMP}
       "#define L1_DATA_SIZE\t65536\n"

From fde8a8e6a02b1186a178f8bbe3b3e5d84c8786e1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 3 Sep 2019 22:41:17 +0200
Subject: [PATCH 022/210] Improve cmake build behaviour with non-host cpu
 targets (#2246)

1. Supply appropriate values for C/Z GEMM unroll when cross-compiling for CORE2 or ARMV7
2. Add the required xLOCAL_BUFFER_SIZE parameters for cross-compiling CORE2
3. Add -DFORCE_<target> option to getarch when building with -DTARGET=target
for #2245
---
 cmake/prebuild.cmake | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index 2fe168a1c..da185db5a 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -105,6 +105,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
   # Perhaps this should be inside a different file as it grows larger
   file(APPEND ${TARGET_CONF_TEMP}
     "#define ${TCORE}\n"
+    "#define CORE_${TCORE}\n"
     "#define CHAR_CORENAME \"${TCORE}\"\n")
   if ("${TCORE}" STREQUAL "CORE2")
     file(APPEND ${TARGET_CONF_TEMP}
@@ -119,15 +120,19 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
       "#define HAVE_SSE\n"
       "#define HAVE_SSE2\n"
       "#define HAVE_SSE3\n"
-      "#define HAVE_SSSE3\n")
+      "#define HAVE_SSSE3\n"
+      "#define SLOCAL_BUFFER_SIZE\t16384\n"
+      "#define DLOCAL_BUFFER_SIZE\t16384\n"
+      "#define CLOCAL_BUFFER_SIZE\t16384\n"
+      "#define ZLOCAL_BUFFER_SIZE\t16384\n")
       set(SGEMM_UNROLL_M 8)
       set(SGEMM_UNROLL_N 4)
       set(DGEMM_UNROLL_M 4)
       set(DGEMM_UNROLL_N 4)
-      set(CGEMM_DEFAULT_UNROLL_M 4)
-      set(CGEMM_DEFAULT_UNROLL_N 2)
-      set(ZGEMM_DEFAULT_UNROLL_M 2)
-      set(ZGEMM_DEFAULT_UNROLL_N 2)      
+      set(CGEMM_UNROLL_M 4)
+      set(CGEMM_UNROLL_N 2)
+      set(ZGEMM_UNROLL_M 2)
+      set(ZGEMM_UNROLL_N 2)
   elseif ("${TCORE}" STREQUAL "ARMV7")
     file(APPEND ${TARGET_CONF_TEMP}
       "#define L1_DATA_SIZE\t65536\n"
@@ -143,6 +148,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
     set(SGEMM_UNROLL_N 4)
     set(DGEMM_UNROLL_M 4)
     set(DGEMM_UNROLL_N 4)
+    set(CGEMM_UNROLL_M 2)
+    set(CGEMM_UNROLL_N 2)
+    set(ZGEMM_UNROLL_M 2)
+    set(ZGEMM_UNROLL_N 2)
   elseif ("${TCORE}" STREQUAL "ARMV8")
     file(APPEND ${TARGET_CONF_TEMP}
       "#define L1_DATA_SIZE\t32768\n"
@@ -331,6 +340,9 @@ else(NOT CMAKE_CROSSCOMPILING)
     set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_GENERIC)
   else()
     list(APPEND GETARCH_SRC ${PROJECT_SOURCE_DIR}/cpuid.S)
+    if (DEFINED TARGET_CORE)
+    set(GETARCH_FLAGS ${GETARCH_FLAGS} -DFORCE_${TARGET_CORE})
+  endif ()
   endif ()
 
   if ("${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore")

From 4de545aa7da84c6bbb5d2d843d91a4900ad9a3e1 Mon Sep 17 00:00:00 2001
From: Andrew <16061801+brada4@users.noreply.github.com>
Date: Sat, 7 Sep 2019 10:21:08 +0300
Subject: [PATCH 024/210] address minor warnings from gcc7

---
 driver/others/openblas_get_config.c | 8 ++++----
 lapack/trtri/trtri_L_parallel.c     | 6 +++---
 lapack/trtri/trtri_U_parallel.c     | 6 +++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c
index 81648fb7c..7fefee33d 100644
--- a/driver/others/openblas_get_config.c
+++ b/driver/others/openblas_get_config.c
@@ -78,10 +78,10 @@ char tmpstr[20];
 #ifdef DYNAMIC_ARCH
   strcat(tmp_config_str, gotoblas_corename());
 #endif
-if (openblas_get_parallel() == 0)
-  sprintf(tmpstr, " SINGLE_THREADED");
-else 
-  snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
+  if (openblas_get_parallel() == 0)
+    sprintf(tmpstr, " SINGLE_THREADED");
+  else 
+    snprintf(tmpstr,19," MAX_THREADS=%d",MAX_CPU_NUMBER);
   strcat(tmp_config_str, tmpstr);
   return tmp_config_str;
 }
diff --git a/lapack/trtri/trtri_L_parallel.c b/lapack/trtri/trtri_L_parallel.c
index 5dc60b862..fb8c8fc77 100644
--- a/lapack/trtri/trtri_L_parallel.c
+++ b/lapack/trtri/trtri_L_parallel.c
@@ -54,7 +54,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
   BLASLONG n, info;
   BLASLONG bk, i, blocking, start_i;
   int mode;
-  BLASLONG lda, range_N[2];
+  BLASLONG lda; // , range_N[2];
   blas_arg_t newarg;
   FLOAT *a;
   FLOAT alpha[2] = { ONE, ZERO};
@@ -100,8 +100,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
     bk = n - i;
     if (bk > blocking) bk = blocking;
 
-    range_N[0] = i;
-    range_N[1] = i + bk;
+    /* range_N[0] = i;
+    range_N[1] = i + bk; */
 
     newarg.lda = lda;
     newarg.ldb = lda;
diff --git a/lapack/trtri/trtri_U_parallel.c b/lapack/trtri/trtri_U_parallel.c
index fc48a33f1..5287421d6 100644
--- a/lapack/trtri/trtri_U_parallel.c
+++ b/lapack/trtri/trtri_U_parallel.c
@@ -54,7 +54,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
   BLASLONG n, info;
   BLASLONG bk, i, blocking;
   int mode;
-  BLASLONG lda, range_N[2];
+  BLASLONG lda; //, range_N[2];
   blas_arg_t newarg;
   FLOAT *a;
   FLOAT alpha[2] = { ONE, ZERO};
@@ -96,8 +96,8 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa,
     bk = n - i;
     if (bk > blocking) bk = blocking;
 
-    range_N[0] = i;
-    range_N[1] = i + bk;
+    /* range_N[0] = i;
+    range_N[1] = i + bk; */
 
     newarg.lda = lda;
     newarg.ldb = lda;

From ea747cf933a3ab6b82fbb726a51c70d34b3b91dc Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Fri, 30 Aug 2019 15:06:38 -0400
Subject: [PATCH 025/210] start working on ?trtrs

---
 common_macro.h           | 117 ++++++++++++++++++++++++++-
 interface/lapack/trtrs.c | 171 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 284 insertions(+), 4 deletions(-)
 create mode 100644 interface/lapack/trtrs.c

diff --git a/common_macro.h b/common_macro.h
index d2503aa65..e8a4a66ed 100644
--- a/common_macro.h
+++ b/common_macro.h
@@ -641,7 +641,7 @@
 #define IMATCOPY_K_CT		DIMATCOPY_K_CT
 #define IMATCOPY_K_RT		DIMATCOPY_K_RT
 
-#define GEADD_K                 DGEADD_K 
+#define GEADD_K                 DGEADD_K
 #else
 
 #define	AMAX_K			SAMAX_K
@@ -944,7 +944,7 @@
 #define IMATCOPY_K_CT		SIMATCOPY_K_CT
 #define IMATCOPY_K_RT		SIMATCOPY_K_RT
 
-#define GEADD_K 		SGEADD_K 
+#define GEADD_K 		SGEADD_K
 #endif
 #else
 #ifdef XDOUBLE
@@ -1770,7 +1770,7 @@
 #define IMATCOPY_K_CTC		ZIMATCOPY_K_CTC
 #define IMATCOPY_K_RTC		ZIMATCOPY_K_RTC
 
-#define GEADD_K                 ZGEADD_K 
+#define GEADD_K                 ZGEADD_K
 
 #else
 
@@ -2193,7 +2193,7 @@
 #define IMATCOPY_K_CTC		CIMATCOPY_K_CTC
 #define IMATCOPY_K_RTC		CIMATCOPY_K_RTC
 
-#define GEADD_K                 CGEADD_K 
+#define GEADD_K                 CGEADD_K
 
 #endif
 #endif
@@ -2806,3 +2806,112 @@ typedef struct {
 #endif
 
 #endif
+
+#ifndef COMPLEX
+#ifdef XDOUBLE
+#define  TRTRS_UNU_SINGLE  qtrtrs_UNU_single
+#define  TRTRS_UNN_SINGLE  qtrtrs_UNN_single
+#define  TRTRS_UTU_SINGLE  qtrtrs_UTU_single
+#define  TRTRS_UTN_SINGLE  qtrtrs_UTN_single
+#define  TRTRS_LNU_SINGLE  qtrtrs_LNU_single
+#define  TRTRS_LNN_SINGLE  qtrtrs_LNN_single
+#define  TRTRS_LTU_SINGLE  qtrtrs_LTU_single
+#define  TRTRS_LTN_SINGLE  qtrtrs_LTN_single
+#define  TRTRS_UNU_PARALLEL  qtrtrs_UNU_parallel
+#define  TRTRS_UNN_PARALLEL  qtrtrs_UNN_parallel
+#define  TRTRS_UTU_PARALLEL  qtrtrs_UTU_parallel
+#define  TRTRS_UTN_PARALLEL  qtrtrs_UTN_parallel
+#define  TRTRS_LNU_PARALLEL  qtrtrs_LNU_parallel
+#define  TRTRS_LNN_PARALLEL  qtrtrs_LNN_parallel
+#define  TRTRS_LTU_PARALLEL  qtrtrs_LTU_parallel
+#define  TRTRS_LTN_PARALLEL  qtrtrs_LTN_parallel
+
+#elif defined(DOUBLE)
+#define  TRTRS_UNU_SINGLE  dtrtrs_UNU_single
+#define  TRTRS_UNN_SINGLE  dtrtrs_UNN_single
+#define  TRTRS_UTU_SINGLE  dtrtrs_UTU_single
+#define  TRTRS_UTN_SINGLE  dtrtrs_UTN_single
+#define  TRTRS_LNU_SINGLE  dtrtrs_LNU_single
+#define  TRTRS_LNN_SINGLE  dtrtrs_LNN_single
+#define  TRTRS_LTU_SINGLE  dtrtrs_LTU_single
+#define  TRTRS_LTN_SINGLE  dtrtrs_LTN_single
+#define  TRTRS_UNU_PARALLEL  dtrtrs_UNU_parallel
+#define  TRTRS_UNN_PARALLEL  dtrtrs_UNN_parallel
+#define  TRTRS_UTU_PARALLEL  dtrtrs_UTU_parallel
+#define  TRTRS_UTN_PARALLEL  dtrtrs_UTN_parallel
+#define  TRTRS_LNU_PARALLEL  dtrtrs_LNU_parallel
+#define  TRTRS_LNN_PARALLEL  dtrtrs_LNN_parallel
+#define  TRTRS_LTU_PARALLEL  dtrtrs_LTU_parallel
+#define  TRTRS_LTN_PARALLEL  dtrtrs_LTN_parallel
+#else
+#define  TRTRS_UNU_SINGLE  strtrs_UNU_single
+#define  TRTRS_UNN_SINGLE  strtrs_UNN_single
+#define  TRTRS_UTU_SINGLE  strtrs_UTU_single
+#define  TRTRS_UTN_SINGLE  strtrs_UTN_single
+#define  TRTRS_LNU_SINGLE  strtrs_LNU_single
+#define  TRTRS_LNN_SINGLE  strtrs_LNN_single
+#define  TRTRS_LTU_SINGLE  strtrs_LTU_single
+#define  TRTRS_LTN_SINGLE  strtrs_LTN_single
+#define  TRTRS_UNU_PARALLEL  strtrs_UNU_parallel
+#define  TRTRS_UNN_PARALLEL  strtrs_UNN_parallel
+#define  TRTRS_UTU_PARALLEL  strtrs_UTU_parallel
+#define  TRTRS_UTN_PARALLEL  strtrs_UTN_parallel
+#define  TRTRS_LNU_PARALLEL  strtrs_LNU_parallel
+#define  TRTRS_LNN_PARALLEL  strtrs_LNN_parallel
+#define  TRTRS_LTU_PARALLEL  strtrs_LTU_parallel
+#define  TRTRS_LTN_PARALLEL  strtrs_LTN_parallel
+#endif
+#else
+#ifdef XDOUBLE
+#define  TRTRS_UNU_SINGLE  xtrtrs_UNU_single
+#define  TRTRS_UNN_SINGLE  xtrtrs_UNN_single
+#define  TRTRS_UTU_SINGLE  xtrtrs_UTU_single
+#define  TRTRS_UTN_SINGLE  xtrtrs_UTN_single
+#define  TRTRS_LNU_SINGLE  xtrtrs_LNU_single
+#define  TRTRS_LNN_SINGLE  xtrtrs_LNN_single
+#define  TRTRS_LTU_SINGLE  xtrtrs_LTU_single
+#define  TRTRS_LTN_SINGLE  xtrtrs_LTN_single
+#define  TRTRS_UNU_PARALLEL  xtrtrs_UNU_parallel
+#define  TRTRS_UNN_PARALLEL  xtrtrs_UNN_parallel
+#define  TRTRS_UTU_PARALLEL  xtrtrs_UTU_parallel
+#define  TRTRS_UTN_PARALLEL  xtrtrs_UTN_parallel
+#define  TRTRS_LNU_PARALLEL  xtrtrs_LNU_parallel
+#define  TRTRS_LNN_PARALLEL  xtrtrs_LNN_parallel
+#define  TRTRS_LTU_PARALLEL  xtrtrs_LTU_parallel
+#define  TRTRS_LTN_PARALLEL  xtrtrs_LTN_parallel
+#elif defined(DOUBLE)
+#define  TRTRS_UNU_SINGLE  ztrtrs_UNU_single
+#define  TRTRS_UNN_SINGLE  ztrtrs_UNN_single
+#define  TRTRS_UTU_SINGLE  ztrtrs_UTU_single
+#define  TRTRS_UTN_SINGLE  ztrtrs_UTN_single
+#define  TRTRS_LNU_SINGLE  ztrtrs_LNU_single
+#define  TRTRS_LNN_SINGLE  ztrtrs_LNN_single
+#define  TRTRS_LTU_SINGLE  ztrtrs_LTU_single
+#define  TRTRS_LTN_SINGLE  ztrtrs_LTN_single
+#define  TRTRS_UNU_PARALLEL  ztrtrs_UNU_parallel
+#define  TRTRS_UNN_PARALLEL  ztrtrs_UNN_parallel
+#define  TRTRS_UTU_PARALLEL  ztrtrs_UTU_parallel
+#define  TRTRS_UTN_PARALLEL  ztrtrs_UTN_parallel
+#define  TRTRS_LNU_PARALLEL  ztrtrs_LNU_parallel
+#define  TRTRS_LNN_PARALLEL  ztrtrs_LNN_parallel
+#define  TRTRS_LTU_PARALLEL  ztrtrs_LTU_parallel
+#define  TRTRS_LTN_PARALLEL  ztrtrs_LTN_parallel
+#else
+#define  TRTRS_UNU_SINGLE  ctrtrs_UNU_single
+#define  TRTRS_UNN_SINGLE  ctrtrs_UNN_single
+#define  TRTRS_UTU_SINGLE  ctrtrs_UTU_single
+#define  TRTRS_UTN_SINGLE  ctrtrs_UTN_single
+#define  TRTRS_LNU_SINGLE  ctrtrs_LNU_single
+#define  TRTRS_LNN_SINGLE  ctrtrs_LNN_single
+#define  TRTRS_LTU_SINGLE  ctrtrs_LTU_single
+#define  TRTRS_LTN_SINGLE  ctrtrs_LTN_single
+#define  TRTRS_UNU_PARALLEL  ctrtrs_UNU_parallel
+#define  TRTRS_UNN_PARALLEL  ctrtrs_UNN_parallel
+#define  TRTRS_UTU_PARALLEL  ctrtrs_UTU_parallel
+#define  TRTRS_UTN_PARALLEL  ctrtrs_UTN_parallel
+#define  TRTRS_LNU_PARALLEL  ctrtrs_LNU_parallel
+#define  TRTRS_LNN_PARALLEL  ctrtrs_LNN_parallel
+#define  TRTRS_LTU_PARALLEL  ctrtrs_LTU_parallel
+#define  TRTRS_LTN_PARALLEL  ctrtrs_LTN_parallel
+#endif
+#endif
diff --git a/interface/lapack/trtrs.c b/interface/lapack/trtrs.c
new file mode 100644
index 000000000..261b07ec6
--- /dev/null
+++ b/interface/lapack/trtrs.c
@@ -0,0 +1,171 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#ifdef FUNCTION_PROFILE
+#include "functable.h"
+#endif
+
+#ifdef XDOUBLE
+#define ERROR_NAME "QTRTRS"
+#elif defined(DOUBLE)
+#define ERROR_NAME "DTRTRS"
+#else
+#define ERROR_NAME "STRTRS"
+#endif
+
+static blasint (*trtrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
+    TRTRS_UNU_SINGLE, TRTRS_UNN_SINGLE, TRTRS_UTU_SINGLE, TRTRS_UTN_SINGLE, TRTRS_LNU_SINGLE, TRTRS_LNN_SINGLE, TRTRS_LTU_SINGLE, TRTRS_LTN_SINGLE,
+};
+
+#ifdef SMP
+static blasint (*trtrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
+    TRTRS_UNU_PARALLEL, TRTRS_UNN_PARALLEL, TRTRS_UTU_PARALLEL, TRTRS_UTN_PARALLEL, TRTRS_LNU_PARALLEL, TRTRS_LNN_PARALLEL, TRTRS_LTU_PARALLEL, TRTRS_LTN_PARALLEL,
+};
+#endif
+
+int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
+  FLOAT *b, blasint *ldB, blasint *Info){
+
+    char uplo_arg = *UPLO;
+    char trans_arg = *TRANS;
+    char diag_arg = *DIAG;
+
+  blas_arg_t args;
+
+  blasint info;
+  int uplo, trans, diag;
+  FLOAT *buffer;
+#ifdef PPC440
+  extern
+#endif
+  FLOAT *sa, *sb;
+
+  PRINT_DEBUG_NAME;
+
+  args.m    = *N;
+  args.n    = *NRHS;
+  args.a    = (void *)a;
+  args.lda  = *ldA;
+  args.b    = (void *)b;
+  args.ldb  = *ldB;
+
+  info = 0;
+
+  TOUPPER(trans_arg);
+  trans = -1;
+  if (trans_arg == 'N') trans = 0;
+  if (trans_arg == 'T') trans = 1;
+  if (trans_arg == 'R') trans = 0;
+  if (trans_arg == 'C') trans = 1;
+
+  uplo = -1;
+  if (uplo_arg == 'U') uplo = 0;
+  if (uplo_arg == 'L') uplo = 1;
+
+  diag = -1;
+  if (diag_arg == 'U') diag = 0;
+  if (diag_arg == 'N') diag = 1;
+
+  if (args.ldb  < MAX(1, args.m)) info = 7;
+  if (args.lda  < MAX(1, args.m)) info = 9;
+  if (args.n    < 0) info = 5;
+  if (args.m    < 0) info = 4;
+  if (trans     < 0) info = 2;
+  if (uplo      < 0) info = 1;
+  if (diag      < 0) info = 3;
+
+  if (info != 0) {
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    *Info = - info;
+    return 0;
+  }
+
+  args.alpha = NULL;
+  args.beta  = NULL;
+
+  *Info = 0;
+
+  if (args.m == 0 || args.n == 0) return 0;
+
+  if (diag) {
+    if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) {
+      *Info = IAMIN_K(args.n, args.a, args.lda + 1);
+      return 0;
+    }
+  }
+
+
+  IDEBUG_START;
+
+  FUNCTION_PROFILE_START();
+
+#ifndef PPC440
+  buffer = (FLOAT *)blas_memory_alloc(1);
+
+  sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
+  sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
+#endif
+
+#ifdef SMP
+  args.common = NULL;
+  args.nthreads = num_cpu_avail(4);
+
+  if (args.nthreads == 1) {
+#endif
+
+      (trtrs_single[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
+
+#ifdef SMP
+  } else {
+    (trtrs_parallel[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
+  }
+#endif
+
+#ifndef PPC440
+  blas_memory_free(buffer);
+#endif
+
+  FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n);
+
+  IDEBUG_END;
+
+  return 0;
+
+}

From 733d97b2df64d6e5674a5a6b673254584b1d75af Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Fri, 30 Aug 2019 16:31:25 -0400
Subject: [PATCH 026/210] add files

---
 lapack/trtrs/Makefile       | 308 ++++++++++++++++++++++++++++++++++++
 lapack/trtrs/trtrs_single.c |  68 ++++++++
 2 files changed, 376 insertions(+)
 create mode 100644 lapack/trtrs/Makefile
 create mode 100644 lapack/trtrs/trtrs_single.c

diff --git a/lapack/trtrs/Makefile b/lapack/trtrs/Makefile
new file mode 100644
index 000000000..6f41a9319
--- /dev/null
+++ b/lapack/trtrs/Makefile
@@ -0,0 +1,308 @@
+TOPDIR	= ../..
+include ../../Makefile.system
+
+SBLASOBJS = strtrs_UNU_single.$(SUFFIX) strtrs_UNN_single.$(SUFFIX) strtrs_UTU_single.$(SUFFIX) strtrs_UTN_single.$(SUFFIX) strtrs_LNU_single.$(SUFFIX) strtrs_LNN_single.$(SUFFIX) strtrs_LTU_single.$(SUFFIX) strtrs_LTN_single.$(SUFFIX)
+DBLASOBJS = dtrtrs_UNU_single.$(SUFFIX) dtrtrs_UNN_single.$(SUFFIX) dtrtrs_UTU_single.$(SUFFIX) dtrtrs_UTN_single.$(SUFFIX) dtrtrs_LNU_single.$(SUFFIX) dtrtrs_LNN_single.$(SUFFIX) dtrtrs_LTU_single.$(SUFFIX) dtrtrs_LTN_single.$(SUFFIX)
+QBLASOBJS = qtrtrs_UNU_single.$(SUFFIX) qtrtrs_UNN_single.$(SUFFIX) qtrtrs_UTU_single.$(SUFFIX) qtrtrs_UTN_single.$(SUFFIX) qtrtrs_LNU_single.$(SUFFIX) qtrtrs_LNN_single.$(SUFFIX) qtrtrs_LTU_single.$(SUFFIX) qtrtrs_LTN_single.$(SUFFIX)
+CBLASOBJS = cgetrs_N_single.$(SUFFIX) cgetrs_T_single.$(SUFFIX) cgetrs_R_single.$(SUFFIX) cgetrs_C_single.$(SUFFIX)
+ZBLASOBJS = zgetrs_N_single.$(SUFFIX) zgetrs_T_single.$(SUFFIX) zgetrs_R_single.$(SUFFIX) zgetrs_C_single.$(SUFFIX)
+XBLASOBJS = xgetrs_N_single.$(SUFFIX) xgetrs_T_single.$(SUFFIX) xgetrs_R_single.$(SUFFIX) xgetrs_C_single.$(SUFFIX)
+
+ifdef SMP
+SBLASOBJS += sgetrs_N_parallel.$(SUFFIX) sgetrs_T_parallel.$(SUFFIX)
+DBLASOBJS += dgetrs_N_parallel.$(SUFFIX) dgetrs_T_parallel.$(SUFFIX)
+QBLASOBJS += qgetrs_N_parallel.$(SUFFIX) qgetrs_T_parallel.$(SUFFIX)
+CBLASOBJS += cgetrs_N_parallel.$(SUFFIX) cgetrs_T_parallel.$(SUFFIX) cgetrs_R_parallel.$(SUFFIX) cgetrs_C_parallel.$(SUFFIX)
+ZBLASOBJS += zgetrs_N_parallel.$(SUFFIX) zgetrs_T_parallel.$(SUFFIX) zgetrs_R_parallel.$(SUFFIX) zgetrs_C_parallel.$(SUFFIX)
+XBLASOBJS += xgetrs_N_parallel.$(SUFFIX) xgetrs_T_parallel.$(SUFFIX) xgetrs_R_parallel.$(SUFFIX) xgetrs_C_parallel.$(SUFFIX)
+endif
+
+strtrs_UNU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+
+strtrs_UNN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+
+strtrs_UTU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+
+strtrs_UTN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+
+strtrs_LNU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+
+strtrs_LNN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+
+strtrs_LTU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+strtrs_LTN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+strtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+
+strtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+
+strtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+
+strtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+
+strtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+
+strtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+
+strtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+strtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+dtrtrs_UNU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+
+dtrtrs_UNN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+
+dtrtrs_UTU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+
+dtrtrs_UTN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+
+dtrtrs_LNU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+
+dtrtrs_LNN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+
+dtrtrs_LTU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+dtrtrs_LTN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+dtrtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+
+dtrtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+
+dtrtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+
+dtrtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+
+dtrtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+
+dtrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+
+dtrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+dtrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_UNU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+
+qtrtrs_UNN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_UTU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+
+qtrtrs_UTN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_LNU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+
+qtrtrs_LNN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_LTU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_LTN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+
+qtrtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+
+qtrtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+
+qtrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+ctrtrs_UNU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+
+ctrtrs_UNN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+
+ctrtrs_UTU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+
+ctrtrs_UTN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+
+ctrtrs_LNU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+
+ctrtrs_LNN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+
+ctrtrs_LTU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+ctrtrs_LTN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+ctrtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+
+ctrtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+
+ctrtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+
+ctrtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+
+ctrtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+
+ctrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+
+ctrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+ctrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+ztrtrs_UNU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+
+ztrtrs_UNN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+
+ztrtrs_UTU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+
+ztrtrs_UTN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+
+ztrtrs_LNU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+
+ztrtrs_LNN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+
+ztrtrs_LTU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+ztrtrs_LTN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+ztrtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+
+ztrtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+
+ztrtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+
+ztrtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+
+ztrtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+
+ztrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+
+ztrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+ztrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_UNU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+
+qtrtrs_UNN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_UTU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+
+qtrtrs_UTN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_LNU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+
+qtrtrs_LNN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_LTU_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_LTN_single.$(SUFFIX) : trtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+
+qtrtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+
+qtrtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+
+qtrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+qtrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+
+include ../../Makefile.tail
diff --git a/lapack/trtrs/trtrs_single.c b/lapack/trtrs/trtrs_single.c
new file mode 100644
index 000000000..0dbb03869
--- /dev/null
+++ b/lapack/trtrs/trtrs_single.c
@@ -0,0 +1,68 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) {
+
+#ifndef TRANS
+  LASWP_PLUS(args -> n, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1);
+
+  if (args -> n == 1){
+    TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
+    TRSV_NUN (args -> m, args -> a, args -> lda, args -> b, 1, sb);
+  } else {
+    TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
+    TRSM_LNUN (args, range_m, range_n, sa, sb, 0);
+  }
+
+#else
+
+  if (args -> n == 1){
+    TRSV_TUN (args -> m, args -> a, args -> lda, args -> b, 1, sb);
+    TRSV_TLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
+  } else {
+    TRSM_LTUN  (args, range_m, range_n, sa, sb, 0);
+    TRSM_LTLU  (args, range_m, range_n, sa, sb, 0);
+  }
+
+  LASWP_MINUS(args -> n, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1);
+#endif
+
+  return 0;  }

From a4f17a9297c444180be4f2a73cc2940ddda7b00f Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Mon, 2 Sep 2019 21:15:20 -0400
Subject: [PATCH 027/210] add missing objects

---
 lapack/Makefile       |   2 +-
 lapack/trtrs/Makefile | 210 +++++++++++++++++++++---------------------
 2 files changed, 106 insertions(+), 106 deletions(-)

diff --git a/lapack/Makefile b/lapack/Makefile
index aff5209d5..2bbb4603f 100644
--- a/lapack/Makefile
+++ b/lapack/Makefile
@@ -2,7 +2,7 @@ TOPDIR	= ..
 include ../Makefile.system
 
 #SUBDIRS	= laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs
-SUBDIRS	= getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri
+SUBDIRS	= getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri trtrs
 
 FLAMEDIRS = laswp getf2 potf2 lauu2 trti2
 
diff --git a/lapack/trtrs/Makefile b/lapack/trtrs/Makefile
index 6f41a9319..400b8b653 100644
--- a/lapack/trtrs/Makefile
+++ b/lapack/trtrs/Makefile
@@ -4,17 +4,17 @@ include ../../Makefile.system
 SBLASOBJS = strtrs_UNU_single.$(SUFFIX) strtrs_UNN_single.$(SUFFIX) strtrs_UTU_single.$(SUFFIX) strtrs_UTN_single.$(SUFFIX) strtrs_LNU_single.$(SUFFIX) strtrs_LNN_single.$(SUFFIX) strtrs_LTU_single.$(SUFFIX) strtrs_LTN_single.$(SUFFIX)
 DBLASOBJS = dtrtrs_UNU_single.$(SUFFIX) dtrtrs_UNN_single.$(SUFFIX) dtrtrs_UTU_single.$(SUFFIX) dtrtrs_UTN_single.$(SUFFIX) dtrtrs_LNU_single.$(SUFFIX) dtrtrs_LNN_single.$(SUFFIX) dtrtrs_LTU_single.$(SUFFIX) dtrtrs_LTN_single.$(SUFFIX)
 QBLASOBJS = qtrtrs_UNU_single.$(SUFFIX) qtrtrs_UNN_single.$(SUFFIX) qtrtrs_UTU_single.$(SUFFIX) qtrtrs_UTN_single.$(SUFFIX) qtrtrs_LNU_single.$(SUFFIX) qtrtrs_LNN_single.$(SUFFIX) qtrtrs_LTU_single.$(SUFFIX) qtrtrs_LTN_single.$(SUFFIX)
-CBLASOBJS = cgetrs_N_single.$(SUFFIX) cgetrs_T_single.$(SUFFIX) cgetrs_R_single.$(SUFFIX) cgetrs_C_single.$(SUFFIX)
-ZBLASOBJS = zgetrs_N_single.$(SUFFIX) zgetrs_T_single.$(SUFFIX) zgetrs_R_single.$(SUFFIX) zgetrs_C_single.$(SUFFIX)
-XBLASOBJS = xgetrs_N_single.$(SUFFIX) xgetrs_T_single.$(SUFFIX) xgetrs_R_single.$(SUFFIX) xgetrs_C_single.$(SUFFIX)
+CBLASOBJS = ctrtrs_UNU_single.$(SUFFIX) ctrtrs_UNN_single.$(SUFFIX) ctrtrs_UTU_single.$(SUFFIX) ctrtrs_UTN_single.$(SUFFIX) ctrtrs_URU_single.$(SUFFIX) ctrtrs_URN_single.$(SUFFIX) ctrtrs_UCU_single.$(SUFFIX) ctrtrs_UCN_single.$(SUFFIX) ctrtrs_LNU_single.$(SUFFIX) ctrtrs_LNN_single.$(SUFFIX) ctrtrs_LTU_single.$(SUFFIX) ctrtrs_LTN_single.$(SUFFIX) ctrtrs_LRU_single.$(SUFFIX) ctrtrs_LRN_single.$(SUFFIX) ctrtrs_LCU_single.$(SUFFIX) ctrtrs_LCN_single.$(SUFFIX)
+ZBLASOBJS = ztrtrs_UNU_single.$(SUFFIX) ztrtrs_UNN_single.$(SUFFIX) ztrtrs_UTU_single.$(SUFFIX) ztrtrs_UTN_single.$(SUFFIX) ztrtrs_URU_single.$(SUFFIX) ztrtrs_URN_single.$(SUFFIX) ztrtrs_UCU_single.$(SUFFIX) ztrtrs_UCN_single.$(SUFFIX) ztrtrs_LNU_single.$(SUFFIX) ztrtrs_LNN_single.$(SUFFIX) ztrtrs_LTU_single.$(SUFFIX) ztrtrs_LTN_single.$(SUFFIX) ztrtrs_LRU_single.$(SUFFIX) ztrtrs_LRN_single.$(SUFFIX) ztrtrs_LCU_single.$(SUFFIX) ztrtrs_LCN_single.$(SUFFIX)
+XBLASOBJS = xtrtrs_UNU_single.$(SUFFIX) xtrtrs_UNN_single.$(SUFFIX) xtrtrs_UTU_single.$(SUFFIX) xtrtrs_UTN_single.$(SUFFIX) xtrtrs_URU_single.$(SUFFIX) xtrtrs_URN_single.$(SUFFIX) xtrtrs_UCU_single.$(SUFFIX) xtrtrs_UCN_single.$(SUFFIX) xtrtrs_LNU_single.$(SUFFIX) xtrtrs_LNN_single.$(SUFFIX) xtrtrs_LTU_single.$(SUFFIX) xtrtrs_LTN_single.$(SUFFIX) xtrtrs_LRU_single.$(SUFFIX) xtrtrs_LRN_single.$(SUFFIX) xtrtrs_LCU_single.$(SUFFIX) xtrtrs_LCN_single.$(SUFFIX)
 
 ifdef SMP
-SBLASOBJS += sgetrs_N_parallel.$(SUFFIX) sgetrs_T_parallel.$(SUFFIX)
-DBLASOBJS += dgetrs_N_parallel.$(SUFFIX) dgetrs_T_parallel.$(SUFFIX)
-QBLASOBJS += qgetrs_N_parallel.$(SUFFIX) qgetrs_T_parallel.$(SUFFIX)
-CBLASOBJS += cgetrs_N_parallel.$(SUFFIX) cgetrs_T_parallel.$(SUFFIX) cgetrs_R_parallel.$(SUFFIX) cgetrs_C_parallel.$(SUFFIX)
-ZBLASOBJS += zgetrs_N_parallel.$(SUFFIX) zgetrs_T_parallel.$(SUFFIX) zgetrs_R_parallel.$(SUFFIX) zgetrs_C_parallel.$(SUFFIX)
-XBLASOBJS += xgetrs_N_parallel.$(SUFFIX) xgetrs_T_parallel.$(SUFFIX) xgetrs_R_parallel.$(SUFFIX) xgetrs_C_parallel.$(SUFFIX)
+SBLASOBJS += strtrs_UNU_parallel.$(SUFFIX) strtrs_UNN_parallel.$(SUFFIX) strtrs_UTU_parallel.$(SUFFIX) strtrs_UTN_parallel.$(SUFFIX) strtrs_LNU_parallel.$(SUFFIX) strtrs_LNN_parallel.$(SUFFIX) strtrs_LTU_parallel.$(SUFFIX) strtrs_LTN_parallel.$(SUFFIX)
+DBLASOBJS += dtrtrs_UNU_parallel.$(SUFFIX) dtrtrs_UNN_parallel.$(SUFFIX) dtrtrs_UTU_parallel.$(SUFFIX) dtrtrs_UTN_parallel.$(SUFFIX) dtrtrs_LNU_parallel.$(SUFFIX) dtrtrs_LNN_parallel.$(SUFFIX) dtrtrs_LTU_parallel.$(SUFFIX) dtrtrs_LTN_parallel.$(SUFFIX)
+QBLASOBJS += qtrtrs_UNU_parallel.$(SUFFIX) qtrtrs_UNN_parallel.$(SUFFIX) qtrtrs_UTU_parallel.$(SUFFIX) qtrtrs_UTN_parallel.$(SUFFIX) qtrtrs_LNU_parallel.$(SUFFIX) qtrtrs_LNN_parallel.$(SUFFIX) qtrtrs_LTU_parallel.$(SUFFIX) qtrtrs_LTN_parallel.$(SUFFIX)
+CBLASOBJS += ctrtrs_UNU_parallel.$(SUFFIX) ctrtrs_UNN_parallel.$(SUFFIX) ctrtrs_UTU_parallel.$(SUFFIX) ctrtrs_UTN_parallel.$(SUFFIX) ctrtrs_URU_parallel.$(SUFFIX) ctrtrs_URN_parallel.$(SUFFIX) ctrtrs_UCU_parallel.$(SUFFIX) ctrtrs_UCN_parallel.$(SUFFIX) ctrtrs_LNU_parallel.$(SUFFIX) ctrtrs_LNN_parallel.$(SUFFIX) ctrtrs_LTU_parallel.$(SUFFIX) ctrtrs_LTN_parallel.$(SUFFIX) ctrtrs_LRU_parallel.$(SUFFIX) ctrtrs_LRN_parallel.$(SUFFIX) ctrtrs_LCU_parallel.$(SUFFIX) ctrtrs_LCN_parallel.$(SUFFIX)
+ZBLASOBJS += ztrtrs_UNU_parallel.$(SUFFIX) ztrtrs_UNN_parallel.$(SUFFIX) ztrtrs_UTU_parallel.$(SUFFIX) ztrtrs_UTN_parallel.$(SUFFIX) ztrtrs_URU_parallel.$(SUFFIX) ztrtrs_URN_parallel.$(SUFFIX) ztrtrs_UCU_parallel.$(SUFFIX) ztrtrs_UCN_parallel.$(SUFFIX) ztrtrs_LNU_parallel.$(SUFFIX) ztrtrs_LNN_parallel.$(SUFFIX) ztrtrs_LTU_parallel.$(SUFFIX) ztrtrs_LTN_parallel.$(SUFFIX) ztrtrs_LRU_parallel.$(SUFFIX) ztrtrs_LRN_parallel.$(SUFFIX) ztrtrs_LCU_parallel.$(SUFFIX) ztrtrs_LCN_parallel.$(SUFFIX)
+XBLASOBJS += xtrtrs_UNU_parallel.$(SUFFIX) xtrtrs_UNN_parallel.$(SUFFIX) xtrtrs_UTU_parallel.$(SUFFIX) xtrtrs_UTN_parallel.$(SUFFIX) xtrtrs_URU_parallel.$(SUFFIX) xtrtrs_URN_parallel.$(SUFFIX) xtrtrs_UCU_parallel.$(SUFFIX) xtrtrs_UCN_parallel.$(SUFFIX) xtrtrs_LNU_parallel.$(SUFFIX) xtrtrs_LNN_parallel.$(SUFFIX) xtrtrs_LTU_parallel.$(SUFFIX) xtrtrs_LTN_parallel.$(SUFFIX) xtrtrs_LRU_parallel.$(SUFFIX) xtrtrs_LRN_parallel.$(SUFFIX) xtrtrs_LCU_parallel.$(SUFFIX) xtrtrs_LCN_parallel.$(SUFFIX)
 endif
 
 strtrs_UNU_single.$(SUFFIX) : trtrs_single.c
@@ -161,148 +161,148 @@ qtrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c
 qtrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c
 	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
 
-ctrtrs_UNU_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+ctrtrs_UNU_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=1 -UDIAG $< -o $(@F)
 
-ctrtrs_UNN_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+ctrtrs_UNN_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=1 -DDIAG $< -o $(@F)
 
-ctrtrs_UTU_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+ctrtrs_UTU_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=2 -UDIAG $< -o $(@F)
 
-ctrtrs_UTN_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+ctrtrs_UTN_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=2 -DDIAG $< -o $(@F)
 
-ctrtrs_LNU_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+ctrtrs_URU_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=3 -UDIAG $< -o $(@F)
 
-ctrtrs_LNN_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+ctrtrs_URN_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=3 -DDIAG $< -o $(@F)
 
-ctrtrs_LTU_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+ctrtrs_UCU_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=4 -UDIAG $< -o $(@F)
 
-ctrtrs_LTN_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+ctrtrs_UCN_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=4 -DDIAG $< -o $(@F)
 
-ctrtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+ctrtrs_LNU_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=1 -UDIAG $< -o $(@F)
 
-ctrtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+ctrtrs_LNN_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=1 -DDIAG $< -o $(@F)
 
-ctrtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+ctrtrs_LTU_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=2 -UDIAG $< -o $(@F)
 
-ctrtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+ctrtrs_LTN_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=2 -DDIAG $< -o $(@F)
 
-ctrtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+ctrtrs_LRU_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=3 -UDIAG $< -o $(@F)
 
-ctrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+ctrtrs_LRN_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=3 -DDIAG $< -o $(@F)
 
-ctrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+ctrtrs_LCU_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=4 -UDIAG $< -o $(@F)
 
-ctrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+ctrtrs_LCN_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F)
 
-ztrtrs_UNU_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+ztrtrs_UNU_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=1 -UDIAG $< -o $(@F)
 
-ztrtrs_UNN_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+ztrtrs_UNN_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=1 -DDIAG $< -o $(@F)
 
-ztrtrs_UTU_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+ztrtrs_UTU_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=2 -UDIAG $< -o $(@F)
 
-ztrtrs_UTN_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+ztrtrs_UTN_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=2 -DDIAG $< -o $(@F)
 
-ztrtrs_LNU_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+ztrtrs_URU_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=3 -UDIAG $< -o $(@F)
 
-ztrtrs_LNN_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+ztrtrs_URN_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=3 -DDIAG $< -o $(@F)
 
-ztrtrs_LTU_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+ztrtrs_UCU_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=4 -UDIAG $< -o $(@F)
 
-ztrtrs_LTN_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+ztrtrs_UCN_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=4 -DDIAG $< -o $(@F)
 
-ztrtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+ztrtrs_LNU_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=1 -UDIAG $< -o $(@F)
 
-ztrtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+ztrtrs_LNN_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=1 -DDIAG $< -o $(@F)
 
-ztrtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+ztrtrs_LTU_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=2 -UDIAG $< -o $(@F)
 
-ztrtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+ztrtrs_LTN_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=2 -DDIAG $< -o $(@F)
 
-ztrtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+ztrtrs_LRU_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=3 -UDIAG $< -o $(@F)
 
-ztrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+ztrtrs_LRN_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=3 -DDIAG $< -o $(@F)
 
-ztrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+ztrtrs_LCU_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=4 -UDIAG $< -o $(@F)
 
-ztrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+ztrtrs_LCN_single.$(SUFFIX) : ztrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F)
 
-qtrtrs_UNU_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+xtrtrs_UNU_single.$(SUFFIX) : xtrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=1 -UDIAG $< -o $(@F)
 
-qtrtrs_UNN_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+xtrtrs_UNN_single.$(SUFFIX) : xtrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=1 -DDIAG $< -o $(@F)
 
-qtrtrs_UTU_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+xtrtrs_UTU_single.$(SUFFIX) : xtrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=2 -UDIAG $< -o $(@F)
 
-qtrtrs_UTN_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+xtrtrs_UTN_single.$(SUFFIX) : xtrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=2 -DDIAG $< -o $(@F)
 
-qtrtrs_LNU_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+xtrtrs_URU_single.$(SUFFIX) : xtrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=3 -UDIAG $< -o $(@F)
 
-qtrtrs_LNN_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+xtrtrs_URN_single.$(SUFFIX) : xtrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=3 -DDIAG $< -o $(@F)
 
-qtrtrs_LTU_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+xtrtrs_UCU_single.$(SUFFIX) : xtrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=4 -UDIAG $< -o $(@F)
 
-qtrtrs_LTN_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+xtrtrs_UCN_single.$(SUFFIX) : xtrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=4 -DDIAG $< -o $(@F)
 
-qtrtrs_UNU_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -UTRANS -UDIAG $< -o $(@F)
+xtrtrs_LNU_single.$(SUFFIX) : xtrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=1 -UDIAG $< -o $(@F)
 
-qtrtrs_UNN_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -UTRANS -DDIAG $< -o $(@F)
+xtrtrs_LNN_single.$(SUFFIX) : xtrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=1 -DDIAG $< -o $(@F)
 
-qtrtrs_UTU_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS -UDIAG $< -o $(@F)
+xtrtrs_LTU_single.$(SUFFIX) : xtrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=2 -UDIAG $< -o $(@F)
 
-qtrtrs_UTN_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS -DDIAG $< -o $(@F)
+xtrtrs_LTN_single.$(SUFFIX) : xtrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=2 -DDIAG $< -o $(@F)
 
-qtrtrs_LNU_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -UTRANS -UDIAG $< -o $(@F)
+xtrtrs_LRU_single.$(SUFFIX) : xtrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=3 -UDIAG $< -o $(@F)
 
-qtrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
+xtrtrs_LRN_single.$(SUFFIX) : xtrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=3 -DDIAG $< -o $(@F)
 
-qtrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+xtrtrs_LCU_single.$(SUFFIX) : xtrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -UDIAG $< -o $(@F)
 
-qtrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+xtrtrs_LCN_single.$(SUFFIX) : xtrtrs_single.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F)
 
 include ../../Makefile.tail

From 42203dafdcb8e2ab1bd9d68ede5ed78769e2b7a1 Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Mon, 2 Sep 2019 21:57:28 -0400
Subject: [PATCH 028/210] add logic

---
 lapack/trtrs/trtrs_single.c | 79 ++++++++++++++++++++++++++-----------
 1 file changed, 57 insertions(+), 22 deletions(-)

diff --git a/lapack/trtrs/trtrs_single.c b/lapack/trtrs/trtrs_single.c
index 0dbb03869..a690d4a25 100644
--- a/lapack/trtrs/trtrs_single.c
+++ b/lapack/trtrs/trtrs_single.c
@@ -41,28 +41,63 @@
 
 blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) {
 
-#ifndef TRANS
-  LASWP_PLUS(args -> n, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, 1);
-
-  if (args -> n == 1){
-    TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
-    TRSV_NUN (args -> m, args -> a, args -> lda, args -> b, 1, sb);
-  } else {
-    TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
-    TRSM_LNUN (args, range_m, range_n, sa, sb, 0);
-  }
-
+#ifndef UPLO
+#ifndef DIAG
+#ifndef DIAG
+    if (args -> n == 1){
+        TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
+    } else {
+        TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
+    }
 #else
-
-  if (args -> n == 1){
-    TRSV_TUN (args -> m, args -> a, args -> lda, args -> b, 1, sb);
-    TRSV_TLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
-  } else {
-    TRSM_LTUN  (args, range_m, range_n, sa, sb, 0);
-    TRSM_LTLU  (args, range_m, range_n, sa, sb, 0);
-  }
-
-  LASWP_MINUS(args -> n, 1, args -> m, ZERO, args -> b, args -> ldb, NULL, 0, args -> c, -1);
+    if (args -> n == 1){
+        TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
+    } else {
+        TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
+    }
+#endif
+#else
+#ifndef DIAG
+    if (args -> n == 1){
+        TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
+    } else {
+        TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
+    }
+#else
+    if (args -> n == 1){
+        TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
+    } else {
+        TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
+    }
+#endif
+#else
+#ifndef DIAG
+#ifndef DIAG
+    if (args -> n == 1){
+        TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
+    } else {
+        TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
+    }
+#else
+    if (args -> n == 1){
+        TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
+    } else {
+        TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
+    }
+#endif
+#else
+#ifndef DIAG
+    if (args -> n == 1){
+        TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
+    } else {
+        TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
+    }
+#else
+    if (args -> n == 1){
+        TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
+    } else {
+        TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
+    }
+#endif
 #endif
-
   return 0;  }

From 9f6984fe4bd5ca2c44df01c647462f54c7b84762 Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Tue, 3 Sep 2019 14:45:43 -0400
Subject: [PATCH 029/210] add missing files

---
 lapack/trtrs/Makefile          | 144 +++++++++++++++++++++++++++++++++
 lapack/trtrs/trtrs_parallel.c  | 111 +++++++++++++++++++++++++
 lapack/trtrs/trtrs_single.c    |  84 +++++++------------
 lapack/trtrs/ztrtrs_parallel.c | 118 +++++++++++++++++++++++++++
 lapack/trtrs/ztrtrs_single.c   |  98 ++++++++++++++++++++++
 5 files changed, 499 insertions(+), 56 deletions(-)
 create mode 100644 lapack/trtrs/trtrs_parallel.c
 create mode 100644 lapack/trtrs/ztrtrs_parallel.c
 create mode 100644 lapack/trtrs/ztrtrs_single.c

diff --git a/lapack/trtrs/Makefile b/lapack/trtrs/Makefile
index 400b8b653..f9faaf9b9 100644
--- a/lapack/trtrs/Makefile
+++ b/lapack/trtrs/Makefile
@@ -305,4 +305,148 @@ xtrtrs_LCU_single.$(SUFFIX) : xtrtrs_single.c
 xtrtrs_LCN_single.$(SUFFIX) : xtrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F)
 
+ctrtrs_UNU_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=1 -UDIAG $< -o $(@F)
+
+ctrtrs_UNN_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=1 -DDIAG $< -o $(@F)
+
+ctrtrs_UTU_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=2 -UDIAG $< -o $(@F)
+
+ctrtrs_UTN_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=2 -DDIAG $< -o $(@F)
+
+ctrtrs_URU_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=3 -UDIAG $< -o $(@F)
+
+ctrtrs_URN_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=3 -DDIAG $< -o $(@F)
+
+ctrtrs_UCU_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=4 -UDIAG $< -o $(@F)
+
+ctrtrs_UCN_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -UUPLO -DTRANS=4 -DDIAG $< -o $(@F)
+
+ctrtrs_LNU_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=1 -UDIAG $< -o $(@F)
+
+ctrtrs_LNN_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=1 -DDIAG $< -o $(@F)
+
+ctrtrs_LTU_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=2 -UDIAG $< -o $(@F)
+
+ctrtrs_LTN_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=2 -DDIAG $< -o $(@F)
+
+ctrtrs_LRU_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=3 -UDIAG $< -o $(@F)
+
+ctrtrs_LRN_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=3 -DDIAG $< -o $(@F)
+
+ctrtrs_LCU_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=4 -UDIAG $< -o $(@F)
+
+ctrtrs_LCN_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F)
+
+ztrtrs_UNU_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=1 -UDIAG $< -o $(@F)
+
+ztrtrs_UNN_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=1 -DDIAG $< -o $(@F)
+
+ztrtrs_UTU_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=2 -UDIAG $< -o $(@F)
+
+ztrtrs_UTN_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=2 -DDIAG $< -o $(@F)
+
+ztrtrs_URU_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=3 -UDIAG $< -o $(@F)
+
+ztrtrs_URN_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=3 -DDIAG $< -o $(@F)
+
+ztrtrs_UCU_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=4 -UDIAG $< -o $(@F)
+
+ztrtrs_UCN_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -UUPLO -DTRANS=4 -DDIAG $< -o $(@F)
+
+ztrtrs_LNU_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=1 -UDIAG $< -o $(@F)
+
+ztrtrs_LNN_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=1 -DDIAG $< -o $(@F)
+
+ztrtrs_LTU_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=2 -UDIAG $< -o $(@F)
+
+ztrtrs_LTN_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=2 -DDIAG $< -o $(@F)
+
+ztrtrs_LRU_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=3 -UDIAG $< -o $(@F)
+
+ztrtrs_LRN_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=3 -DDIAG $< -o $(@F)
+
+ztrtrs_LCU_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=4 -UDIAG $< -o $(@F)
+
+ztrtrs_LCN_parallel.$(SUFFIX) : ztrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F)
+
+xtrtrs_UNU_parallel.$(SUFFIX) : xtrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=1 -UDIAG $< -o $(@F)
+
+xtrtrs_UNN_parallel.$(SUFFIX) : xtrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=1 -DDIAG $< -o $(@F)
+
+xtrtrs_UTU_parallel.$(SUFFIX) : xtrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=2 -UDIAG $< -o $(@F)
+
+xtrtrs_UTN_parallel.$(SUFFIX) : xtrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=2 -DDIAG $< -o $(@F)
+
+xtrtrs_URU_parallel.$(SUFFIX) : xtrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=3 -UDIAG $< -o $(@F)
+
+xtrtrs_URN_parallel.$(SUFFIX) : xtrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=3 -DDIAG $< -o $(@F)
+
+xtrtrs_UCU_parallel.$(SUFFIX) : xtrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=4 -UDIAG $< -o $(@F)
+
+xtrtrs_UCN_parallel.$(SUFFIX) : xtrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=4 -DDIAG $< -o $(@F)
+
+xtrtrs_LNU_parallel.$(SUFFIX) : xtrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=1 -UDIAG $< -o $(@F)
+
+xtrtrs_LNN_parallel.$(SUFFIX) : xtrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=1 -DDIAG $< -o $(@F)
+
+xtrtrs_LTU_parallel.$(SUFFIX) : xtrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=2 -UDIAG $< -o $(@F)
+
+xtrtrs_LTN_parallel.$(SUFFIX) : xtrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=2 -DDIAG $< -o $(@F)
+
+xtrtrs_LRU_parallel.$(SUFFIX) : xtrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=3 -UDIAG $< -o $(@F)
+
+xtrtrs_LRN_parallel.$(SUFFIX) : xtrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=3 -DDIAG $< -o $(@F)
+
+xtrtrs_LCU_parallel.$(SUFFIX) : xtrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -UDIAG $< -o $(@F)
+
+xtrtrs_LCN_parallel.$(SUFFIX) : xtrtrs_parallel.c
+	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F)
+
 include ../../Makefile.tail
diff --git a/lapack/trtrs/trtrs_parallel.c b/lapack/trtrs/trtrs_parallel.c
new file mode 100644
index 000000000..52f42f693
--- /dev/null
+++ b/lapack/trtrs/trtrs_parallel.c
@@ -0,0 +1,111 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if   !defined(TRANS) && !defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LNUU
+#define TRSV TRSV_NUU
+#elif !defined(TRANS) && !defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LNUN
+#define TRSV TRSV_NUN
+#elif !defined(TRANS) && defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LNLU
+#define TRSV TRSV_NLU
+#elif !defined(TRANS) && defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LNLN
+#define TRSV TRSV_NLN
+#elif defined(TRANS) && !defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LTUU
+#define TRSV TRSV_TUU
+#elif defined(TRANS) && !defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LTUN
+#define TRSV TRSV_TUN
+#elif defined(TRANS) && defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LTLU
+#define TRSV TRSV_TLU
+#elif defined(TRANS) && defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LTLN
+#define TRSV TRSV_TLN
+#endif
+
+static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
+			 FLOAT *sa, FLOAT *sb, BLASLONG mypos) {
+
+  TRSM (args, range_m, range_n, sa, sb, 0);
+
+  return 0;
+}
+
+blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) {
+
+  int mode;
+
+#ifndef TRANS
+    if (args -> n  == 1){
+      TRSV (args -> m, args -> a, args -> lda, args -> b, 1, sb);
+    } else {
+#ifdef XDOUBLE
+      mode  =  BLAS_XDOUBLE | BLAS_REAL;
+#elif defined(DOUBLE)
+      mode  =  BLAS_DOUBLE  | BLAS_REAL;
+#else
+      mode  =  BLAS_SINGLE  | BLAS_REAL;
+#endif
+
+      gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb,  args -> nthreads);
+    }
+#else
+    if (args -> n == 1){
+      TRSV (args -> m, args -> a, args -> lda, args -> b, 1, sb);
+    } else {
+#ifdef XDOUBLE
+      mode  =  BLAS_XDOUBLE | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT);
+#elif defined(DOUBLE)
+      mode  =  BLAS_DOUBLE  | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT);
+#else
+      mode  =  BLAS_SINGLE  | BLAS_REAL | (1 << BLAS_TRANSA_SHIFT);
+#endif
+
+      gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb,  args -> nthreads);
+    }
+#endif
+
+    return 0;
+  }
diff --git a/lapack/trtrs/trtrs_single.c b/lapack/trtrs/trtrs_single.c
index a690d4a25..c82b81303 100644
--- a/lapack/trtrs/trtrs_single.c
+++ b/lapack/trtrs/trtrs_single.c
@@ -39,65 +39,37 @@
 #include <stdio.h>
 #include "common.h"
 
+#if   !defined(TRANS) && !defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LNUU
+#define TRSV TRSV_NUU
+#elif !defined(TRANS) && !defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LNUN
+#define TRSV TRSV_NUN
+#elif !defined(TRANS) && defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LNLU
+#define TRSV TRSV_NLU
+#elif !defined(TRANS) && defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LNLN
+#define TRSV TRSV_NLN
+#elif defined(TRANS) && !defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LTUU
+#define TRSV TRSV_TUU
+#elif defined(TRANS) && !defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LTUN
+#define TRSV TRSV_TUN
+#elif defined(TRANS) && defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LTLU
+#define TRSV TRSV_TLU
+#elif defined(TRANS) && defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LTLN
+#define TRSV TRSV_TLN
+#endif
+
 blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) {
 
-#ifndef UPLO
-#ifndef DIAG
-#ifndef DIAG
-    if (args -> n == 1){
-        TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
-    } else {
-        TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
-    }
-#else
-    if (args -> n == 1){
-        TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
-    } else {
-        TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
-    }
-#endif
-#else
-#ifndef DIAG
-    if (args -> n == 1){
-        TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
-    } else {
-        TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
-    }
-#else
-    if (args -> n == 1){
-        TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
-    } else {
-        TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
-    }
-#endif
-#else
-#ifndef DIAG
-#ifndef DIAG
-    if (args -> n == 1){
-        TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
-    } else {
-        TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
-    }
-#else
-    if (args -> n == 1){
-        TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
-    } else {
-        TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
-    }
-#endif
-#else
-#ifndef DIAG
     if (args -> n == 1){
-        TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
+        TRSV (args -> m, args -> a, args -> lda, args -> b, 1, sb);
     } else {
-        TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
+        TRSM (args, range_m, range_n, sa, sb, 0);
     }
-#else
-    if (args -> n == 1){
-        TRSV_NLU (args -> m, args -> a, args -> lda, args -> b, 1, sb);
-    } else {
-        TRSM_LNLU (args, range_m, range_n, sa, sb, 0);
-    }
-#endif
-#endif
   return 0;  }
diff --git a/lapack/trtrs/ztrtrs_parallel.c b/lapack/trtrs/ztrtrs_parallel.c
new file mode 100644
index 000000000..d5248f21b
--- /dev/null
+++ b/lapack/trtrs/ztrtrs_parallel.c
@@ -0,0 +1,118 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if   TRANS == 1 && !defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LNUU
+#define ZTRSV ZTRSV_NUU
+#elif TRANS == 1 && !defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LNUN
+#define ZTRSV ZTRSV_NUN
+#elif TRANS == 1 && defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LNLU
+#define ZTRSV ZTRSV_NLU
+#elif TRANS == 1 && defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LNLN
+#define ZTRSV ZTRSV_NLN
+#elif TRANS == 2 && !defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LTUU
+#define ZTRSV ZTRSV_TUU
+#elif TRANS == 2 && !defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LTUN
+#define ZTRSV ZTRSV_TUN
+#elif TRANS == 2 && defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LTLU
+#define ZTRSV ZTRSV_TLU
+#elif TRANS == 2 && defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LTLN
+#define ZTRSV ZTRSV_TLN
+#elif TRANS == 3 && !defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LRUU
+#define ZTRSV ZTRSV_RUU
+#elif TRANS == 3 && !defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LRUN
+#define ZTRSV ZTRSV_RUN
+#elif TRANS == 3 && defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LRLU
+#define ZTRSV ZTRSV_RLU
+#elif TRANS == 3 && defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LRLN
+#define ZTRSV ZTRSV_RLN
+#elif TRANS == 4 && !defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LCUU
+#define ZTRSV ZTRSV_CUU
+#elif TRANS == 4 && !defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LCUN
+#define ZTRSV ZTRSV_CUN
+#elif TRANS == 4 && defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LCLU
+#define ZTRSV ZTRSV_CLU
+#elif TRANS == 4 && defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LCLN
+#define ZTRSV ZTRSV_CLN
+#endif
+
+static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
+			 FLOAT *sa, FLOAT *sb, BLASLONG mypos) {
+
+  TRSM (args, range_m, range_n, sa, sb, 0);
+  return 0;
+}
+
+blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) {
+
+  int mode;
+
+    if (args -> n == 1){
+      ZTRSV (args -> m, args -> a, args -> lda, args -> b, 1, sb);
+    } else {
+#ifdef XDOUBLE
+      mode  =  BLAS_XDOUBLE | BLAS_COMPLEX;
+#elif defined(DOUBLE)
+      mode  =  BLAS_DOUBLE  | BLAS_COMPLEX;
+#else
+      mode  =  BLAS_SINGLE  | BLAS_COMPLEX;
+#endif
+
+      gemm_thread_n(mode, args, NULL, NULL, inner_thread, sa, sb, args -> nthreads);
+    }
+
+   return 0;
+  }
diff --git a/lapack/trtrs/ztrtrs_single.c b/lapack/trtrs/ztrtrs_single.c
new file mode 100644
index 000000000..f39d72900
--- /dev/null
+++ b/lapack/trtrs/ztrtrs_single.c
@@ -0,0 +1,98 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+
+#if   TRANS == 1 && !defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LNUU
+#define ZTRSV ZTRSV_NUU
+#elif TRANS == 1 && !defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LNUN
+#define ZTRSV ZTRSV_NUN
+#elif TRANS == 1 && defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LNLU
+#define ZTRSV ZTRSV_NLU
+#elif TRANS == 1 && defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LNLN
+#define ZTRSV ZTRSV_NLN
+#elif TRANS == 2 && !defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LTUU
+#define ZTRSV ZTRSV_TUU
+#elif TRANS == 2 && !defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LTUN
+#define ZTRSV ZTRSV_TUN
+#elif TRANS == 2 && defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LTLU
+#define ZTRSV ZTRSV_TLU
+#elif TRANS == 2 && defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LTLN
+#define ZTRSV ZTRSV_TLN
+#elif TRANS == 3 && !defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LRUU
+#define ZTRSV ZTRSV_RUU
+#elif TRANS == 3 && !defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LRUN
+#define ZTRSV ZTRSV_RUN
+#elif TRANS == 3 && defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LRLU
+#define ZTRSV ZTRSV_RLU
+#elif TRANS == 3 && defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LRLN
+#define ZTRSV ZTRSV_RLN
+#elif TRANS == 4 && !defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LCUU
+#define ZTRSV ZTRSV_CUU
+#elif TRANS == 4 && !defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LCUN
+#define ZTRSV ZTRSV_CUN
+#elif TRANS == 4 && defined(UPLO) && !defined(DIAG)
+#define TRSM TRSM_LCLU
+#define ZTRSV ZTRSV_CLU
+#elif TRANS == 4 && defined(UPLO) && defined(DIAG)
+#define TRSM TRSM_LCLN
+#define ZTRSV ZTRSV_CLN
+#endif
+
+blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos) {
+    if (args -> n == 1){
+        ZTRSV (args -> m, args -> a, args -> lda, args -> b, 1, sb);
+    } else {
+        TRSM (args, range_m, range_n, sa, sb, 0);
+    }
+  return 0;  }

From 9b2f0323d6ecc36b16038c6a806a4e3106245d5a Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Fri, 6 Sep 2019 16:01:55 -0400
Subject: [PATCH 030/210] update Makefile

---
 interface/Makefile | 33 ++++++++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 7 deletions(-)

diff --git a/interface/Makefile b/interface/Makefile
index f0577796d..2edf6387a 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -394,7 +394,7 @@ XBLASOBJS    = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS)
 SLAPACKOBJS	= \
 	sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \
 	spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX)  \
-	slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX)
+	slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX)
 
 
 #DLAPACKOBJS	= \
@@ -405,14 +405,14 @@ SLAPACKOBJS	= \
 DLAPACKOBJS	= \
 	dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \
 	dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX)  \
-	dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX)
+	dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX)
 
 
 QLAPACKOBJS	= \
 	qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \
 	qpotf2.$(SUFFIX) qpotrf.$(SUFFIX) qtrti2.$(SUFFIX) qtrtri.$(SUFFIX) \
-	qlaswp.$(SUFFIX) qgetrs.$(SUFFIX) qgesv.$(SUFFIX)  qpotri.$(SUFFIX) \
-
+	qlaswp.$(SUFFIX) qtrtrs.$(SUFFIX) qgesv.$(SUFFIX)  qpotri.$(SUFFIX) \
+	qtrtrs.$(SUFFIX)
 
 #CLAPACKOBJS	= \
 #	cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
@@ -423,7 +423,7 @@ QLAPACKOBJS	= \
 CLAPACKOBJS	= \
 	cgetrf.$(SUFFIX) cgetrs.$(SUFFIX) cpotrf.$(SUFFIX) cgetf2.$(SUFFIX) \
 	cpotf2.$(SUFFIX) claswp.$(SUFFIX) cgesv.$(SUFFIX) clauu2.$(SUFFIX) \
-	clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) 
+	clauum.$(SUFFIX) ctrti2.$(SUFFIX) ctrtri.$(SUFFIX) ctrtrs.$(SUFFIX)
 
 
 #ZLAPACKOBJS	= \
@@ -435,13 +435,14 @@ CLAPACKOBJS	= \
 ZLAPACKOBJS	= \
 	zgetrf.$(SUFFIX) zgetrs.$(SUFFIX) zpotrf.$(SUFFIX) zgetf2.$(SUFFIX) \
 	zpotf2.$(SUFFIX) zlaswp.$(SUFFIX) zgesv.$(SUFFIX)  zlauu2.$(SUFFIX) \
-	zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) 
+	zlauum.$(SUFFIX) ztrti2.$(SUFFIX) ztrtri.$(SUFFIX) ztrtrs.$(SUFFIX)
 
 
 XLAPACKOBJS	= \
 	xgetf2.$(SUFFIX) xgetrf.$(SUFFIX) xlauu2.$(SUFFIX) xlauum.$(SUFFIX) \
 	xpotf2.$(SUFFIX) xpotrf.$(SUFFIX) xtrti2.$(SUFFIX) xtrtri.$(SUFFIX) \
-	xlaswp.$(SUFFIX) xgetrs.$(SUFFIX) xgesv.$(SUFFIX)  xpotri.$(SUFFIX) \
+	xlaswp.$(SUFFIX) xtrtrs.$(SUFFIX) xgesv.$(SUFFIX)  xpotri.$(SUFFIX) \
+	xtrtrs.$(SUFFIX)
 
 ifneq ($(NO_LAPACK), 1)
 SBLASOBJS += $(SLAPACKOBJS)
@@ -2043,6 +2044,24 @@ zgetrs.$(SUFFIX) zgetrs.$(PSUFFIX) : lapack/zgetrs.c
 xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : zgetrs.c
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 
+strtrs.$(SUFFIX) strtrs.$(PSUFFIX) : lapack/trtrs.c
+	$(CC) -c $(CFLAGS) $< -o $(@F)
+
+dtrtrs.$(SUFFIX) dtrtrs.$(PSUFFIX) : lapack/trtrs.c
+	$(CC) -c $(CFLAGS) $< -o $(@F)
+
+qtrtrs.$(SUFFIX) qtrtrs.$(PSUFFIX) : trtrs.c
+	$(CC) -c $(CFLAGS) $< -o $(@F)
+
+ctrtrs.$(SUFFIX) ctrtrs.$(PSUFFIX) : lapack/ztrtrs.c
+	$(CC) -c $(CFLAGS) $< -o $(@F)
+
+ztrtrs.$(SUFFIX) ztrtrs.$(PSUFFIX) : lapack/ztrtrs.c
+	$(CC) -c $(CFLAGS) $< -o $(@F)
+
+xtrtrs.$(SUFFIX) xtrtrs.$(PSUFFIX) : ztrtrs.c
+	$(CC) -c $(CFLAGS) $< -o $(@F)
+
 sgesv.$(SUFFIX) sgesv.$(PSUFFIX) : lapack/gesv.c
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 

From c7b5a459b6191ceb5ad244b3ff2d059a9751375e Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Fri, 6 Sep 2019 16:48:18 -0400
Subject: [PATCH 031/210] add missing defines and headers

---
 common_lapack.h | 146 ++++++++++++++++++++++++++++++++++++++++++++++++
 common_macro.h  |  48 ++++++++++++++++
 2 files changed, 194 insertions(+)

diff --git a/common_lapack.h b/common_lapack.h
index f6d1956fc..f9c36646a 100644
--- a/common_lapack.h
+++ b/common_lapack.h
@@ -293,4 +293,150 @@ blasint zlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLO
 blasint xlarf_L(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
 blasint xlarf_R(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
 
+blasint strtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint strtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint strtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint strtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint strtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint strtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint strtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint strtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint dtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint dtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint dtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint dtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint dtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint dtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint dtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint dtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint qtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint qtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint qtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint qtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint qtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint qtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint qtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint qtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint ctrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ztrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint xtrtrs_UNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_UNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_UTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_UTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_URU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_URN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_UCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_UCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_LNU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_LNN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_LTU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_LTN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_LRU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_LRN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_LCU_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_LCN_single(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+
+blasint strtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint strtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint strtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint strtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint strtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint strtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint strtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint strtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint dtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint dtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint dtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint dtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint dtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint dtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint dtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint dtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint qtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint qtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint qtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint qtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint qtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint qtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint qtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint qtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint ctrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ctrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
+blasint ztrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint ztrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, double *, double *, BLASLONG);
+blasint xtrtrs_UNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_UNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_UTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_UTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_URU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_URN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_UCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_UCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_LNU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_LNN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_LTU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_LTN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_LRU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+blasint xtrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
+
 #endif
diff --git a/common_macro.h b/common_macro.h
index e8a4a66ed..13bb85794 100644
--- a/common_macro.h
+++ b/common_macro.h
@@ -2867,51 +2867,99 @@ typedef struct {
 #define  TRTRS_UNN_SINGLE  xtrtrs_UNN_single
 #define  TRTRS_UTU_SINGLE  xtrtrs_UTU_single
 #define  TRTRS_UTN_SINGLE  xtrtrs_UTN_single
+#define  TRTRS_URU_SINGLE  xtrtrs_URU_single
+#define  TRTRS_URN_SINGLE  xtrtrs_URN_single
+#define  TRTRS_UCU_SINGLE  xtrtrs_UCU_single
+#define  TRTRS_UCN_SINGLE  xtrtrs_UCN_single
 #define  TRTRS_LNU_SINGLE  xtrtrs_LNU_single
 #define  TRTRS_LNN_SINGLE  xtrtrs_LNN_single
 #define  TRTRS_LTU_SINGLE  xtrtrs_LTU_single
 #define  TRTRS_LTN_SINGLE  xtrtrs_LTN_single
+#define  TRTRS_LRU_SINGLE  xtrtrs_LRU_single
+#define  TRTRS_LRN_SINGLE  xtrtrs_LRN_single
+#define  TRTRS_LCU_SINGLE  xtrtrs_LCU_single
+#define  TRTRS_LCN_SINGLE  xtrtrs_LCN_single
 #define  TRTRS_UNU_PARALLEL  xtrtrs_UNU_parallel
 #define  TRTRS_UNN_PARALLEL  xtrtrs_UNN_parallel
 #define  TRTRS_UTU_PARALLEL  xtrtrs_UTU_parallel
 #define  TRTRS_UTN_PARALLEL  xtrtrs_UTN_parallel
+#define  TRTRS_URU_PARALLEL  xtrtrs_URU_parallel
+#define  TRTRS_URN_PARALLEL  xtrtrs_URN_parallel
+#define  TRTRS_UCU_PARALLEL  xtrtrs_UCU_parallel
+#define  TRTRS_UCN_PARALLEL  xtrtrs_UCN_parallel
 #define  TRTRS_LNU_PARALLEL  xtrtrs_LNU_parallel
 #define  TRTRS_LNN_PARALLEL  xtrtrs_LNN_parallel
 #define  TRTRS_LTU_PARALLEL  xtrtrs_LTU_parallel
 #define  TRTRS_LTN_PARALLEL  xtrtrs_LTN_parallel
+#define  TRTRS_LRU_PARALLEL  xtrtrs_LRU_parallel
+#define  TRTRS_LRN_PARALLEL  xtrtrs_LRN_parallel
+#define  TRTRS_LCU_PARALLEL  xtrtrs_LCU_parallel
+#define  TRTRS_LCN_PARALLEL  xtrtrs_LCN_parallel
 #elif defined(DOUBLE)
 #define  TRTRS_UNU_SINGLE  ztrtrs_UNU_single
 #define  TRTRS_UNN_SINGLE  ztrtrs_UNN_single
 #define  TRTRS_UTU_SINGLE  ztrtrs_UTU_single
 #define  TRTRS_UTN_SINGLE  ztrtrs_UTN_single
+#define  TRTRS_URU_SINGLE  ztrtrs_URU_single
+#define  TRTRS_URN_SINGLE  ztrtrs_URN_single
+#define  TRTRS_UCU_SINGLE  ztrtrs_UCU_single
+#define  TRTRS_UCN_SINGLE  ztrtrs_UCN_single
 #define  TRTRS_LNU_SINGLE  ztrtrs_LNU_single
 #define  TRTRS_LNN_SINGLE  ztrtrs_LNN_single
 #define  TRTRS_LTU_SINGLE  ztrtrs_LTU_single
 #define  TRTRS_LTN_SINGLE  ztrtrs_LTN_single
+#define  TRTRS_LRU_SINGLE  ztrtrs_LRU_single
+#define  TRTRS_LRN_SINGLE  ztrtrs_LRN_single
+#define  TRTRS_LCU_SINGLE  ztrtrs_LCU_single
+#define  TRTRS_LCN_SINGLE  ztrtrs_LCN_single
 #define  TRTRS_UNU_PARALLEL  ztrtrs_UNU_parallel
 #define  TRTRS_UNN_PARALLEL  ztrtrs_UNN_parallel
 #define  TRTRS_UTU_PARALLEL  ztrtrs_UTU_parallel
 #define  TRTRS_UTN_PARALLEL  ztrtrs_UTN_parallel
+#define  TRTRS_URU_PARALLEL  ztrtrs_URU_parallel
+#define  TRTRS_URN_PARALLEL  ztrtrs_URN_parallel
+#define  TRTRS_UCU_PARALLEL  ztrtrs_UCU_parallel
+#define  TRTRS_UCN_PARALLEL  ztrtrs_UCN_parallel
 #define  TRTRS_LNU_PARALLEL  ztrtrs_LNU_parallel
 #define  TRTRS_LNN_PARALLEL  ztrtrs_LNN_parallel
 #define  TRTRS_LTU_PARALLEL  ztrtrs_LTU_parallel
 #define  TRTRS_LTN_PARALLEL  ztrtrs_LTN_parallel
+#define  TRTRS_LRU_PARALLEL  ztrtrs_LRU_parallel
+#define  TRTRS_LRN_PARALLEL  ztrtrs_LRN_parallel
+#define  TRTRS_LCU_PARALLEL  ztrtrs_LCU_parallel
+#define  TRTRS_LCN_PARALLEL  ztrtrs_LCN_parallel
 #else
 #define  TRTRS_UNU_SINGLE  ctrtrs_UNU_single
 #define  TRTRS_UNN_SINGLE  ctrtrs_UNN_single
 #define  TRTRS_UTU_SINGLE  ctrtrs_UTU_single
 #define  TRTRS_UTN_SINGLE  ctrtrs_UTN_single
+#define  TRTRS_URU_SINGLE  ctrtrs_URU_single
+#define  TRTRS_URN_SINGLE  ctrtrs_URN_single
+#define  TRTRS_UCU_SINGLE  ctrtrs_UCU_single
+#define  TRTRS_UCN_SINGLE  ctrtrs_UCN_single
 #define  TRTRS_LNU_SINGLE  ctrtrs_LNU_single
 #define  TRTRS_LNN_SINGLE  ctrtrs_LNN_single
 #define  TRTRS_LTU_SINGLE  ctrtrs_LTU_single
 #define  TRTRS_LTN_SINGLE  ctrtrs_LTN_single
+#define  TRTRS_LRU_SINGLE  ctrtrs_LRU_single
+#define  TRTRS_LRN_SINGLE  ctrtrs_LRN_single
+#define  TRTRS_LCU_SINGLE  ctrtrs_LCU_single
+#define  TRTRS_LCN_SINGLE  ctrtrs_LCN_single
 #define  TRTRS_UNU_PARALLEL  ctrtrs_UNU_parallel
 #define  TRTRS_UNN_PARALLEL  ctrtrs_UNN_parallel
 #define  TRTRS_UTU_PARALLEL  ctrtrs_UTU_parallel
 #define  TRTRS_UTN_PARALLEL  ctrtrs_UTN_parallel
+#define  TRTRS_URU_PARALLEL  ctrtrs_URU_parallel
+#define  TRTRS_URN_PARALLEL  ctrtrs_URN_parallel
+#define  TRTRS_UCU_PARALLEL  ctrtrs_UCU_parallel
+#define  TRTRS_UCN_PARALLEL  ctrtrs_UCN_parallel
 #define  TRTRS_LNU_PARALLEL  ctrtrs_LNU_parallel
 #define  TRTRS_LNN_PARALLEL  ctrtrs_LNN_parallel
 #define  TRTRS_LTU_PARALLEL  ctrtrs_LTU_parallel
 #define  TRTRS_LTN_PARALLEL  ctrtrs_LTN_parallel
+#define  TRTRS_LRU_PARALLEL  ctrtrs_LRU_parallel
+#define  TRTRS_LRN_PARALLEL  ctrtrs_LRN_parallel
+#define  TRTRS_LCU_PARALLEL  ctrtrs_LCU_parallel
+#define  TRTRS_LCN_PARALLEL  ctrtrs_LCN_parallel
 #endif
 #endif

From af9ac0898af4357cf66d34215d9711df64f6e858 Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Fri, 6 Sep 2019 16:49:12 -0400
Subject: [PATCH 032/210] fix Makefile

---
 interface/Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/interface/Makefile b/interface/Makefile
index 2edf6387a..3f0dcca28 100644
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -2032,7 +2032,7 @@ sgetrs.$(SUFFIX) sgetrs.$(PSUFFIX) : lapack/getrs.c
 dgetrs.$(SUFFIX) dgetrs.$(PSUFFIX) : lapack/getrs.c
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 
-qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : getrs.c
+qgetrs.$(SUFFIX) qgetrs.$(PSUFFIX) : lapack/getrs.c
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 
 cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c
@@ -2041,7 +2041,7 @@ cgetrs.$(SUFFIX) cgetrs.$(PSUFFIX) : lapack/zgetrs.c
 zgetrs.$(SUFFIX) zgetrs.$(PSUFFIX) : lapack/zgetrs.c
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 
-xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : zgetrs.c
+xgetrs.$(SUFFIX) xgetrs.$(PSUFFIX) : lapack/zgetrs.c
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 
 strtrs.$(SUFFIX) strtrs.$(PSUFFIX) : lapack/trtrs.c
@@ -2050,7 +2050,7 @@ strtrs.$(SUFFIX) strtrs.$(PSUFFIX) : lapack/trtrs.c
 dtrtrs.$(SUFFIX) dtrtrs.$(PSUFFIX) : lapack/trtrs.c
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 
-qtrtrs.$(SUFFIX) qtrtrs.$(PSUFFIX) : trtrs.c
+qtrtrs.$(SUFFIX) qtrtrs.$(PSUFFIX) : lapack/trtrs.c
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 
 ctrtrs.$(SUFFIX) ctrtrs.$(PSUFFIX) : lapack/ztrtrs.c
@@ -2059,7 +2059,7 @@ ctrtrs.$(SUFFIX) ctrtrs.$(PSUFFIX) : lapack/ztrtrs.c
 ztrtrs.$(SUFFIX) ztrtrs.$(PSUFFIX) : lapack/ztrtrs.c
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 
-xtrtrs.$(SUFFIX) xtrtrs.$(PSUFFIX) : ztrtrs.c
+xtrtrs.$(SUFFIX) xtrtrs.$(PSUFFIX) : lapack/ztrtrs.c
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 
 sgesv.$(SUFFIX) sgesv.$(PSUFFIX) : lapack/gesv.c

From 7ec7b999a543a5480db1f76dcb413deee6c50e2d Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Fri, 6 Sep 2019 16:49:27 -0400
Subject: [PATCH 033/210] add missing file

---
 interface/lapack/ztrtrs.c | 171 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 171 insertions(+)
 create mode 100644 interface/lapack/ztrtrs.c

diff --git a/interface/lapack/ztrtrs.c b/interface/lapack/ztrtrs.c
new file mode 100644
index 000000000..4cd423069
--- /dev/null
+++ b/interface/lapack/ztrtrs.c
@@ -0,0 +1,171 @@
+/*********************************************************************/
+/* Copyright 2009, 2010 The University of Texas at Austin.           */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/* The views and conclusions contained in the software and           */
+/* documentation are those of the authors and should not be          */
+/* interpreted as representing official policies, either expressed   */
+/* or implied, of The University of Texas at Austin.                 */
+/*********************************************************************/
+
+#include <stdio.h>
+#include "common.h"
+#ifdef FUNCTION_PROFILE
+#include "functable.h"
+#endif
+
+#ifdef XDOUBLE
+#define ERROR_NAME "XTRTRS"
+#elif defined(DOUBLE)
+#define ERROR_NAME "ZTRTRS"
+#else
+#define ERROR_NAME "CTRTRS"
+#endif
+
+static blasint (*trtrs_single[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
+    TRTRS_UNU_SINGLE, TRTRS_UNN_SINGLE, TRTRS_UTU_SINGLE, TRTRS_UTN_SINGLE, TRTRS_URU_SINGLE, TRTRS_URN_SINGLE, TRTRS_UCU_SINGLE, TRTRS_UCN_SINGLE, TRTRS_LNU_SINGLE, TRTRS_LNN_SINGLE, TRTRS_LTU_SINGLE, TRTRS_LTN_SINGLE, TRTRS_LRU_SINGLE, TRTRS_LRN_SINGLE, TRTRS_LCU_SINGLE, TRTRS_LCN_SINGLE,
+};
+
+#ifdef SMP
+static blasint (*trtrs_parallel[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
+    TRTRS_UNU_PARALLEL, TRTRS_UNN_PARALLEL, TRTRS_UTU_PARALLEL, TRTRS_UTN_PARALLEL, TRTRS_URU_PARALLEL, TRTRS_URN_PARALLEL, TRTRS_UCU_PARALLEL, TRTRS_UCN_PARALLEL, TRTRS_LNU_PARALLEL, TRTRS_LNN_PARALLEL, TRTRS_LTU_PARALLEL, TRTRS_LTN_PARALLEL, TRTRS_LRU_PARALLEL, TRTRS_LRN_PARALLEL, TRTRS_LCU_PARALLEL, TRTRS_LCN_PARALLEL,
+};
+#endif
+
+int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
+  FLOAT *b, blasint *ldB, blasint *Info){
+
+    char uplo_arg = *UPLO;
+    char trans_arg = *TRANS;
+    char diag_arg = *DIAG;
+
+  blas_arg_t args;
+
+  blasint info;
+  int uplo, trans, diag;
+  FLOAT *buffer;
+#ifdef PPC440
+  extern
+#endif
+  FLOAT *sa, *sb;
+
+  PRINT_DEBUG_NAME;
+
+  args.m    = *N;
+  args.n    = *NRHS;
+  args.a    = (void *)a;
+  args.lda  = *ldA;
+  args.b    = (void *)b;
+  args.ldb  = *ldB;
+
+  info = 0;
+
+  TOUPPER(trans_arg);
+  trans = -1;
+  if (trans_arg == 'N') trans = 0;
+  if (trans_arg == 'T') trans = 1;
+  if (trans_arg == 'R') trans = 2;
+  if (trans_arg == 'C') trans = 3;
+
+  uplo = -1;
+  if (uplo_arg == 'U') uplo = 0;
+  if (uplo_arg == 'L') uplo = 1;
+
+  diag = -1;
+  if (diag_arg == 'U') diag = 0;
+  if (diag_arg == 'N') diag = 1;
+
+  if (args.ldb  < MAX(1, args.m)) info = 7;
+  if (args.lda  < MAX(1, args.m)) info = 9;
+  if (args.n    < 0) info = 5;
+  if (args.m    < 0) info = 4;
+  if (trans     < 0) info = 2;
+  if (uplo      < 0) info = 1;
+  if (diag      < 0) info = 3;
+
+  if (info != 0) {
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    *Info = - info;
+    return 0;
+  }
+
+  args.alpha = NULL;
+  args.beta  = NULL;
+
+  *Info = 0;
+
+  if (args.m == 0 || args.n == 0) return 0;
+
+  if (diag) {
+    if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) {
+      *Info = IAMIN_K(args.n, args.a, args.lda + 1);
+      return 0;
+    }
+  }
+
+
+  IDEBUG_START;
+
+  FUNCTION_PROFILE_START();
+
+#ifndef PPC440
+  buffer = (FLOAT *)blas_memory_alloc(1);
+
+  sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A);
+  sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
+#endif
+
+#ifdef SMP
+  args.common = NULL;
+  args.nthreads = num_cpu_avail(4);
+
+  if (args.nthreads == 1) {
+#endif
+
+      (trtrs_single[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
+
+#ifdef SMP
+  } else {
+    (trtrs_parallel[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
+  }
+#endif
+
+#ifndef PPC440
+  blas_memory_free(buffer);
+#endif
+
+  FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.m * args.n, 2 * args.m * args.m * args.n);
+
+  IDEBUG_END;
+
+  return 0;
+
+}

From 4b21b646ea49c9e00490bb29f978d1a91a4a1192 Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Fri, 6 Sep 2019 17:19:40 -0400
Subject: [PATCH 034/210] turn on optimized code

---
 lapack-netlib/SRC/Makefile | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile
index 87a8f51e4..1c276aff6 100644
--- a/lapack-netlib/SRC/Makefile
+++ b/lapack-netlib/SRC/Makefile
@@ -507,22 +507,22 @@ ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o
 SLAPACKOBJS     = \
         sgetrf.o sgetrs.o spotrf.o sgetf2.o \
         spotf2.o slaswp.o sgesv.o slauu2.o  \
-        slauum.o strti2.o strtri.o
+        slauum.o strti2.o strtri.o strtrs.o
 
 DLAPACKOBJS     = \
         dgetrf.o dgetrs.o dpotrf.o dgetf2.o \
         dpotf2.o dlaswp.o dgesv.o dlauu2.o  \
-        dlauum.o dtrti2.o dtrtri.o
+        dlauum.o dtrti2.o dtrtri.o dtrtrs.o
 
 CLAPACKOBJS     = \
         cgetrf.o cgetrs.o cpotrf.o cgetf2.o \
         cpotf2.o claswp.o cgesv.o clauu2.o \
-        clauum.o ctrti2.o ctrtri.o
+        clauum.o ctrti2.o ctrtri.o ctrtrs.o
 
 ZLAPACKOBJS     = \
         zgetrf.o zgetrs.o zpotrf.o zgetf2.o \
         zpotf2.o zlaswp.o zgesv.o  zlauu2.o \
-        zlauum.o ztrti2.o ztrtri.o
+        zlauum.o ztrti2.o ztrtri.o ztrtrs.o
 
 
 ALLAUX = $(filter-out $(ALL_AUX_OBJS),$(ALLAUX_O))

From 5997b6b491a7fb9b0184f4edfc1c76d16de3affa Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Sat, 7 Sep 2019 22:06:27 -0400
Subject: [PATCH 035/210] bugfix

---
 interface/lapack/ztrtrs.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/interface/lapack/ztrtrs.c b/interface/lapack/ztrtrs.c
index 4cd423069..0536fc5d3 100644
--- a/interface/lapack/ztrtrs.c
+++ b/interface/lapack/ztrtrs.c
@@ -150,11 +150,11 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *
   if (args.nthreads == 1) {
 #endif
 
-      (trtrs_single[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
+      (trtrs_single[(uplo << 3) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
 
 #ifdef SMP
   } else {
-    (trtrs_parallel[(uplo << 2) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
+    (trtrs_parallel[(uplo << 3) | (trans << 1) | diag])(&args, NULL, NULL, sa, sb, 0);
   }
 #endif
 

From f2becb777a9640225a6ef89b939fb8b0bdbbbc77 Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Mon, 9 Sep 2019 11:36:50 -0400
Subject: [PATCH 036/210] fix Makefile

---
 lapack/trtrs/Makefile | 64 +++++++++++++++++++++----------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

diff --git a/lapack/trtrs/Makefile b/lapack/trtrs/Makefile
index f9faaf9b9..587d94e3d 100644
--- a/lapack/trtrs/Makefile
+++ b/lapack/trtrs/Makefile
@@ -257,52 +257,52 @@ ztrtrs_LCU_single.$(SUFFIX) : ztrtrs_single.c
 ztrtrs_LCN_single.$(SUFFIX) : ztrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F)
 
-xtrtrs_UNU_single.$(SUFFIX) : xtrtrs_single.c
+xtrtrs_UNU_single.$(SUFFIX) : ztrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=1 -UDIAG $< -o $(@F)
 
-xtrtrs_UNN_single.$(SUFFIX) : xtrtrs_single.c
+xtrtrs_UNN_single.$(SUFFIX) : ztrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=1 -DDIAG $< -o $(@F)
 
-xtrtrs_UTU_single.$(SUFFIX) : xtrtrs_single.c
+xtrtrs_UTU_single.$(SUFFIX) : ztrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=2 -UDIAG $< -o $(@F)
 
-xtrtrs_UTN_single.$(SUFFIX) : xtrtrs_single.c
+xtrtrs_UTN_single.$(SUFFIX) : ztrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=2 -DDIAG $< -o $(@F)
 
-xtrtrs_URU_single.$(SUFFIX) : xtrtrs_single.c
+xtrtrs_URU_single.$(SUFFIX) : ztrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=3 -UDIAG $< -o $(@F)
 
-xtrtrs_URN_single.$(SUFFIX) : xtrtrs_single.c
+xtrtrs_URN_single.$(SUFFIX) : ztrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=3 -DDIAG $< -o $(@F)
 
-xtrtrs_UCU_single.$(SUFFIX) : xtrtrs_single.c
+xtrtrs_UCU_single.$(SUFFIX) : ztrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=4 -UDIAG $< -o $(@F)
 
-xtrtrs_UCN_single.$(SUFFIX) : xtrtrs_single.c
+xtrtrs_UCN_single.$(SUFFIX) : ztrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=4 -DDIAG $< -o $(@F)
 
-xtrtrs_LNU_single.$(SUFFIX) : xtrtrs_single.c
+xtrtrs_LNU_single.$(SUFFIX) : ztrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=1 -UDIAG $< -o $(@F)
 
-xtrtrs_LNN_single.$(SUFFIX) : xtrtrs_single.c
+xtrtrs_LNN_single.$(SUFFIX) : ztrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=1 -DDIAG $< -o $(@F)
 
-xtrtrs_LTU_single.$(SUFFIX) : xtrtrs_single.c
+xtrtrs_LTU_single.$(SUFFIX) : ztrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=2 -UDIAG $< -o $(@F)
 
-xtrtrs_LTN_single.$(SUFFIX) : xtrtrs_single.c
+xtrtrs_LTN_single.$(SUFFIX) : ztrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=2 -DDIAG $< -o $(@F)
 
-xtrtrs_LRU_single.$(SUFFIX) : xtrtrs_single.c
+xtrtrs_LRU_single.$(SUFFIX) : ztrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=3 -UDIAG $< -o $(@F)
 
-xtrtrs_LRN_single.$(SUFFIX) : xtrtrs_single.c
+xtrtrs_LRN_single.$(SUFFIX) : ztrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=3 -DDIAG $< -o $(@F)
 
-xtrtrs_LCU_single.$(SUFFIX) : xtrtrs_single.c
+xtrtrs_LCU_single.$(SUFFIX) : ztrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -UDIAG $< -o $(@F)
 
-xtrtrs_LCN_single.$(SUFFIX) : xtrtrs_single.c
+xtrtrs_LCN_single.$(SUFFIX) : ztrtrs_single.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F)
 
 ctrtrs_UNU_parallel.$(SUFFIX) : ztrtrs_parallel.c
@@ -401,52 +401,52 @@ ztrtrs_LCU_parallel.$(SUFFIX) : ztrtrs_parallel.c
 ztrtrs_LCN_parallel.$(SUFFIX) : ztrtrs_parallel.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F)
 
-xtrtrs_UNU_parallel.$(SUFFIX) : xtrtrs_parallel.c
+xtrtrs_UNU_parallel.$(SUFFIX) : ztrtrs_parallel.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=1 -UDIAG $< -o $(@F)
 
-xtrtrs_UNN_parallel.$(SUFFIX) : xtrtrs_parallel.c
+xtrtrs_UNN_parallel.$(SUFFIX) : ztrtrs_parallel.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=1 -DDIAG $< -o $(@F)
 
-xtrtrs_UTU_parallel.$(SUFFIX) : xtrtrs_parallel.c
+xtrtrs_UTU_parallel.$(SUFFIX) : ztrtrs_parallel.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=2 -UDIAG $< -o $(@F)
 
-xtrtrs_UTN_parallel.$(SUFFIX) : xtrtrs_parallel.c
+xtrtrs_UTN_parallel.$(SUFFIX) : ztrtrs_parallel.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=2 -DDIAG $< -o $(@F)
 
-xtrtrs_URU_parallel.$(SUFFIX) : xtrtrs_parallel.c
+xtrtrs_URU_parallel.$(SUFFIX) : ztrtrs_parallel.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=3 -UDIAG $< -o $(@F)
 
-xtrtrs_URN_parallel.$(SUFFIX) : xtrtrs_parallel.c
+xtrtrs_URN_parallel.$(SUFFIX) : ztrtrs_parallel.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=3 -DDIAG $< -o $(@F)
 
-xtrtrs_UCU_parallel.$(SUFFIX) : xtrtrs_parallel.c
+xtrtrs_UCU_parallel.$(SUFFIX) : ztrtrs_parallel.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=4 -UDIAG $< -o $(@F)
 
-xtrtrs_UCN_parallel.$(SUFFIX) : xtrtrs_parallel.c
+xtrtrs_UCN_parallel.$(SUFFIX) : ztrtrs_parallel.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -UUPLO -DTRANS=4 -DDIAG $< -o $(@F)
 
-xtrtrs_LNU_parallel.$(SUFFIX) : xtrtrs_parallel.c
+xtrtrs_LNU_parallel.$(SUFFIX) : ztrtrs_parallel.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=1 -UDIAG $< -o $(@F)
 
-xtrtrs_LNN_parallel.$(SUFFIX) : xtrtrs_parallel.c
+xtrtrs_LNN_parallel.$(SUFFIX) : ztrtrs_parallel.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=1 -DDIAG $< -o $(@F)
 
-xtrtrs_LTU_parallel.$(SUFFIX) : xtrtrs_parallel.c
+xtrtrs_LTU_parallel.$(SUFFIX) : ztrtrs_parallel.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=2 -UDIAG $< -o $(@F)
 
-xtrtrs_LTN_parallel.$(SUFFIX) : xtrtrs_parallel.c
+xtrtrs_LTN_parallel.$(SUFFIX) : ztrtrs_parallel.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=2 -DDIAG $< -o $(@F)
 
-xtrtrs_LRU_parallel.$(SUFFIX) : xtrtrs_parallel.c
+xtrtrs_LRU_parallel.$(SUFFIX) : ztrtrs_parallel.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=3 -UDIAG $< -o $(@F)
 
-xtrtrs_LRN_parallel.$(SUFFIX) : xtrtrs_parallel.c
+xtrtrs_LRN_parallel.$(SUFFIX) : ztrtrs_parallel.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=3 -DDIAG $< -o $(@F)
 
-xtrtrs_LCU_parallel.$(SUFFIX) : xtrtrs_parallel.c
+xtrtrs_LCU_parallel.$(SUFFIX) : ztrtrs_parallel.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -UDIAG $< -o $(@F)
 
-xtrtrs_LCN_parallel.$(SUFFIX) : xtrtrs_parallel.c
+xtrtrs_LCN_parallel.$(SUFFIX) : ztrtrs_parallel.c
 	$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DUPLO -DTRANS=4 -DDIAG $< -o $(@F)
 
 include ../../Makefile.tail

From eb45eb6942b9aa35970f4f58ce268130891a79c5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 10 Sep 2019 08:27:06 +0200
Subject: [PATCH 037/210] Fix C compiler handling and BINARY=32 mode in CMAKE
 builds (#2248)

* Fix compiler identification and option setting

* Handle BINARY=32 option on X86_64

* Add xGEMM3M unroll parameters for crossbuild-target CORE2

* Replace bogus mingw64/32bit CI job with actual 32bit build

mingw64 is not multilib-capable, so using an x86_64-mingw with BINARY=32 in the CI was not going to work anyway (but build passed while BINARY=32 was ignored).
---
 appveyor.yml             |  7 ++++---
 cmake/cc.cmake           | 10 +++++-----
 cmake/prebuild.cmake     |  4 ++++
 cmake/system_check.cmake | 31 +++++++++++++++++++++++++++----
 4 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/appveyor.yml b/appveyor.yml
index 2f9cc7b0b..1936059d5 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -38,7 +38,8 @@ environment:
     - COMPILER: MinGW64-gcc-7.2.0-mingw
       DYNAMIC_ARCH: OFF
       WITH_FORTRAN: ignore
-    - COMPILER: MinGW64-gcc-7.2.0
+    - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
+      COMPILER: MinGW-gcc-6.3.0-32   
     - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
       COMPILER: MinGW-gcc-5.3.0
       WITH_FORTRAN: ignore
@@ -62,10 +63,10 @@ before_build:
   - set PATH=%PATH:C:\Program Files\Git\usr\bin;=%
   - if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
   - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
-  - if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH%
+  - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] set PATH=C:\msys64\usr\bin;C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw64\bin;%PATH%
   - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
   - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 ..
-  - if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles"  -DBINARY=32 -DNOFORTRAN=1 ..
+  - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
   - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 ..
   - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
   - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
diff --git a/cmake/cc.cmake b/cmake/cc.cmake
index 98f9298f8..37da0d6ed 100644
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@@ -3,7 +3,7 @@
 ## Description: Ported from portion of OpenBLAS/Makefile.system
 ##              Sets C related variables.
 
-if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_C_COMPILER} STREQUAL "Clang")
+if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LSB" OR ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
 
   set(CCOMMON_OPT "${CCOMMON_OPT} -Wall")
   set(COMMON_PROF "${COMMON_PROF} -fno-inline")
@@ -43,7 +43,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "GNU" OR ${CMAKE_C_COMPILER} STREQUAL "LSB" OR
   endif ()
 endif ()
 
-if (${CMAKE_C_COMPILER} STREQUAL "PGI")
+if (${CMAKE_C_COMPILER_ID} STREQUAL "PGI")
   if (BINARY64)
     set(CCOMMON_OPT "${CCOMMON_OPT} -tp p7-64")
   else ()
@@ -51,7 +51,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PGI")
   endif ()
 endif ()
 
-if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
+if (${CMAKE_C_COMPILER_ID} STREQUAL "PATHSCALE")
   if (BINARY64)
     set(CCOMMON_OPT "${CCOMMON_OPT} -m64")
   else ()
@@ -59,7 +59,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "PATHSCALE")
   endif ()
 endif ()
 
-if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
+if (${CMAKE_C_COMPILER_ID} STREQUAL "OPEN64")
 
   if (MIPS64)
 
@@ -87,7 +87,7 @@ if (${CMAKE_C_COMPILER} STREQUAL "OPEN64")
   endif ()
 endif ()
 
-if (${CMAKE_C_COMPILER} STREQUAL "SUN")
+if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN")
   set(CCOMMON_OPT "${CCOMMON_OPT} -w")
   if (X86)
     set(CCOMMON_OPT "${CCOMMON_OPT} -m32")
diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index da185db5a..086df1943 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -133,6 +133,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
       set(CGEMM_UNROLL_N 2)
       set(ZGEMM_UNROLL_M 2)
       set(ZGEMM_UNROLL_N 2)
+      set(CGEMM3M_UNROLL_M 8)
+      set(CGEMM3M_UNROLL_N 4)
+      set(ZGEMM3M_UNROLL_M 4)
+      set(ZGEMM3M_UNROLL_N 4)
   elseif ("${TCORE}" STREQUAL "ARMV7")
     file(APPEND ${TARGET_CONF_TEMP}
       "#define L1_DATA_SIZE\t65536\n"
diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake
index 610f689e0..c4a553c5a 100644
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@@ -39,10 +39,18 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
   set(MIPS64 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
-  if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
-    set(X86_64 1)
+  if (NOT BINARY)
+    if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+      set(X86_64 1)
+    else()
+      set(X86 1)
+    endif()
   else()
-    set(X86 1)
+    if (${BINARY} EQUAL "64")
+       set(X86_64 1)
+    else ()
+       set(X86 1)
+    endif()
   endif()
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
   set(X86 1)
@@ -54,6 +62,22 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)")
   else()
     set(ARM 1)
   endif()
+elseif (${CMAKE_CROSSCOMPILING})
+   if (${TARGET} STREQUAL "CORE2")
+    if (NOT BINARY)
+       set(X86 1)
+    elseif (${BINARY} EQUAL "64")
+       set(X86_64 1)
+    else ()
+       set(X86 1)
+    endif()
+   elseif (${TARGET} STREQUAL "ARMV7")
+       set(ARM 1)
+   else()
+       set(ARM64 1)
+   endif ()
+else ()
+   message(WARNING "Target ARCH could not be determined, got \"${CMAKE_SYSTEM_PROCESSOR}\"")
 endif()
 
 if (X86_64)
@@ -92,4 +116,3 @@ set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
 endif()
   file(REMOVE "avx512.tmp" "avx512.o")
 endif()
-

From 459bb9291db0a9a97718cb312c77f8ea3dba7c60 Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Tue, 10 Sep 2019 17:10:33 -0400
Subject: [PATCH 038/210] fix error codes

---
 interface/lapack/trtrs.c  | 6 +++---
 interface/lapack/ztrtrs.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/interface/lapack/trtrs.c b/interface/lapack/trtrs.c
index 261b07ec6..96dde1618 100644
--- a/interface/lapack/trtrs.c
+++ b/interface/lapack/trtrs.c
@@ -103,8 +103,8 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *
   if (diag_arg == 'U') diag = 0;
   if (diag_arg == 'N') diag = 1;
 
-  if (args.ldb  < MAX(1, args.m)) info = 7;
-  if (args.lda  < MAX(1, args.m)) info = 9;
+  if (args.ldb  < MAX(1, args.m)) info = 9;
+  if (args.lda  < MAX(1, args.m)) info = 7;
   if (args.n    < 0) info = 5;
   if (args.m    < 0) info = 4;
   if (trans     < 0) info = 2;
@@ -112,7 +112,7 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *
   if (diag      < 0) info = 3;
 
   if (info != 0) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/ztrtrs.c b/interface/lapack/ztrtrs.c
index 0536fc5d3..4ee51435b 100644
--- a/interface/lapack/ztrtrs.c
+++ b/interface/lapack/ztrtrs.c
@@ -103,8 +103,8 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *
   if (diag_arg == 'U') diag = 0;
   if (diag_arg == 'N') diag = 1;
 
-  if (args.ldb  < MAX(1, args.m)) info = 7;
-  if (args.lda  < MAX(1, args.m)) info = 9;
+  if (args.ldb  < MAX(1, args.m)) info = 9;
+  if (args.lda  < MAX(1, args.m)) info = 7;
   if (args.n    < 0) info = 5;
   if (args.m    < 0) info = 4;
   if (trans     < 0) info = 2;
@@ -112,7 +112,7 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *
   if (diag      < 0) info = 3;
 
   if (info != 0) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }

From 6cb47ea3f0a6d4263cca3d2649b8512a6b53192d Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Tue, 10 Sep 2019 17:11:01 -0400
Subject: [PATCH 039/210] fix Makefile

---
 lapack/trtrs/Makefile | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/lapack/trtrs/Makefile b/lapack/trtrs/Makefile
index 587d94e3d..a3b8f4322 100644
--- a/lapack/trtrs/Makefile
+++ b/lapack/trtrs/Makefile
@@ -36,7 +36,7 @@ strtrs_LNN_single.$(SUFFIX) : trtrs_single.c
 	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
 
 strtrs_LTU_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -UDIAG $< -o $(@F)
 
 strtrs_LTN_single.$(SUFFIX) : trtrs_single.c
 	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
@@ -60,7 +60,7 @@ strtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c
 	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
 
 strtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -UDIAG $< -o $(@F)
 
 strtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c
 	$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
@@ -84,7 +84,7 @@ dtrtrs_LNN_single.$(SUFFIX) : trtrs_single.c
 	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
 
 dtrtrs_LTU_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -UDIAG $< -o $(@F)
 
 dtrtrs_LTN_single.$(SUFFIX) : trtrs_single.c
 	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
@@ -108,7 +108,7 @@ dtrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c
 	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
 
 dtrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -UDIAG $< -o $(@F)
 
 dtrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c
 	$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
@@ -132,7 +132,7 @@ qtrtrs_LNN_single.$(SUFFIX) : trtrs_single.c
 	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
 
 qtrtrs_LTU_single.$(SUFFIX) : trtrs_single.c
-	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -UDIAG $< -o $(@F)
 
 qtrtrs_LTN_single.$(SUFFIX) : trtrs_single.c
 	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
@@ -156,7 +156,7 @@ qtrtrs_LNN_parallel.$(SUFFIX) : trtrs_parallel.c
 	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -UTRANS -DDIAG $< -o $(@F)
 
 qtrtrs_LTU_parallel.$(SUFFIX) : trtrs_parallel.c
-	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)
+	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -UDIAG $< -o $(@F)
 
 qtrtrs_LTN_parallel.$(SUFFIX) : trtrs_parallel.c
 	$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DUPLO -DTRANS -DDIAG $< -o $(@F)

From 5d6525c87cfc6a7ba69e24eefb8b053d480bc97b Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Tue, 10 Sep 2019 17:30:57 -0400
Subject: [PATCH 040/210] more bugfix

---
 interface/lapack/trtrs.c  | 6 +++---
 interface/lapack/ztrtrs.c | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/interface/lapack/trtrs.c b/interface/lapack/trtrs.c
index 96dde1618..54fbe8394 100644
--- a/interface/lapack/trtrs.c
+++ b/interface/lapack/trtrs.c
@@ -122,11 +122,11 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *
 
   *Info = 0;
 
-  if (args.m == 0 || args.n == 0) return 0;
+  if (args.m == 0) return 0;
 
   if (diag) {
-    if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) {
-      *Info = IAMIN_K(args.n, args.a, args.lda + 1);
+    if (AMIN_K(args.m, args.a, args.lda + 1) == ZERO) {
+      *Info = IAMIN_K(args.m, args.a, args.lda + 1);
       return 0;
     }
   }
diff --git a/interface/lapack/ztrtrs.c b/interface/lapack/ztrtrs.c
index 4ee51435b..7f1bd9af4 100644
--- a/interface/lapack/ztrtrs.c
+++ b/interface/lapack/ztrtrs.c
@@ -122,11 +122,11 @@ int NAME(char *UPLO, char* TRANS, char* DIAG, blasint *N, blasint *NRHS, FLOAT *
 
   *Info = 0;
 
-  if (args.m == 0 || args.n == 0) return 0;
+  if (args.m == 0) return 0;
 
   if (diag) {
-    if (AMIN_K(args.n, args.a, args.lda + 1) == ZERO) {
-      *Info = IAMIN_K(args.n, args.a, args.lda + 1);
+    if (AMIN_K(args.m, args.a, args.lda + 1) == ZERO) {
+      *Info = IAMIN_K(args.m, args.a, args.lda + 1);
       return 0;
     }
   }

From 2463938879a93ff8b8207b112d03bbeb4cbabae2 Mon Sep 17 00:00:00 2001
From: Guillaume Horel <guillaume.horel@gmail.com>
Date: Wed, 11 Sep 2019 10:33:35 -0400
Subject: [PATCH 041/210] fix error message

---
 interface/lapack/gesv.c   | 14 +++++++-------
 interface/lapack/getf2.c  |  2 +-
 interface/lapack/getrf.c  |  2 +-
 interface/lapack/getrs.c  |  2 +-
 interface/lapack/lauu2.c  |  2 +-
 interface/lapack/lauum.c  |  2 +-
 interface/lapack/potf2.c  |  2 +-
 interface/lapack/potrf.c  |  2 +-
 interface/lapack/potri.c  |  2 +-
 interface/lapack/trti2.c  |  2 +-
 interface/lapack/trtri.c  |  2 +-
 interface/lapack/zgetf2.c |  2 +-
 interface/lapack/zgetrf.c |  2 +-
 interface/lapack/zgetrs.c |  2 +-
 interface/lapack/zlauu2.c |  2 +-
 interface/lapack/zpotf2.c |  2 +-
 interface/lapack/zpotrf.c |  2 +-
 interface/lapack/zpotri.c |  2 +-
 interface/lapack/ztrti2.c |  2 +-
 interface/lapack/ztrtri.c |  2 +-
 20 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/interface/lapack/gesv.c b/interface/lapack/gesv.c
index 721da970d..175350329 100644
--- a/interface/lapack/gesv.c
+++ b/interface/lapack/gesv.c
@@ -44,19 +44,19 @@
 
 #ifndef COMPLEX
 #ifdef XDOUBLE
-#define ERROR_NAME "QGESV  "
+#define ERROR_NAME "QGESV"
 #elif defined(DOUBLE)
-#define ERROR_NAME "DGESV  "
+#define ERROR_NAME "DGESV"
 #else
-#define ERROR_NAME "SGESV  "
+#define ERROR_NAME "SGESV"
 #endif
 #else
 #ifdef XDOUBLE
-#define ERROR_NAME "XGESV  "
+#define ERROR_NAME "XGESV"
 #elif defined(DOUBLE)
-#define ERROR_NAME "ZGESV  "
+#define ERROR_NAME "ZGESV"
 #else
-#define ERROR_NAME "CGESV  "
+#define ERROR_NAME "CGESV"
 #endif
 #endif
 
@@ -89,7 +89,7 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv,
   if (args.m   < 0)             info = 1;
 
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/getf2.c b/interface/lapack/getf2.c
index 3e66c0403..8506feca9 100644
--- a/interface/lapack/getf2.c
+++ b/interface/lapack/getf2.c
@@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
   if (args.n   < 0)             info = 2;
   if (args.m   < 0)             info = 1;
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/getrf.c b/interface/lapack/getrf.c
index 44a92ddc4..02bb124b3 100644
--- a/interface/lapack/getrf.c
+++ b/interface/lapack/getrf.c
@@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
   if (args.n   < 0)             info = 2;
   if (args.m   < 0)             info = 1;
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/getrs.c b/interface/lapack/getrs.c
index 1b8c83aca..c2a9eb882 100644
--- a/interface/lapack/getrs.c
+++ b/interface/lapack/getrs.c
@@ -102,7 +102,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
   if (trans     < 0) info = 1;
 
   if (info != 0) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     return 0;
   }
 
diff --git a/interface/lapack/lauu2.c b/interface/lapack/lauu2.c
index 3599a4791..e581e3c15 100644
--- a/interface/lapack/lauu2.c
+++ b/interface/lapack/lauu2.c
@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
   if (args.n   < 0)             info = 2;
   if (uplo     < 0)             info = 1;
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/lauum.c b/interface/lapack/lauum.c
index 2c49eb0b0..70f6a0ec5 100644
--- a/interface/lapack/lauum.c
+++ b/interface/lapack/lauum.c
@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
   if (args.n   < 0)             info = 2;
   if (uplo     < 0)             info = 1;
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/potf2.c b/interface/lapack/potf2.c
index 837192265..1537b6ee4 100644
--- a/interface/lapack/potf2.c
+++ b/interface/lapack/potf2.c
@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
   if (args.n   < 0)             info = 2;
   if (uplo     < 0)             info = 1;
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/potrf.c b/interface/lapack/potrf.c
index 092272225..dbd55f62f 100644
--- a/interface/lapack/potrf.c
+++ b/interface/lapack/potrf.c
@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
   if (args.n   < 0)             info = 2;
   if (uplo     < 0)             info = 1;
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/potri.c b/interface/lapack/potri.c
index d6230621f..2c0c64b6f 100644
--- a/interface/lapack/potri.c
+++ b/interface/lapack/potri.c
@@ -99,7 +99,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
   if (uplo < 0)                  info = 1;
 
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/trti2.c b/interface/lapack/trti2.c
index 42c4c4815..47f04f06f 100644
--- a/interface/lapack/trti2.c
+++ b/interface/lapack/trti2.c
@@ -96,7 +96,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
   if (diag < 0)                  info = 2;
   if (uplo < 0)                  info = 1;
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/trtri.c b/interface/lapack/trtri.c
index 6724a678a..028529389 100644
--- a/interface/lapack/trtri.c
+++ b/interface/lapack/trtri.c
@@ -99,7 +99,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
   if (diag < 0)                  info = 2;
   if (uplo < 0)                  info = 1;
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/zgetf2.c b/interface/lapack/zgetf2.c
index 59ec4874e..68b9a7e4b 100644
--- a/interface/lapack/zgetf2.c
+++ b/interface/lapack/zgetf2.c
@@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
   if (args.n   < 0)             info = 2;
   if (args.m   < 0)             info = 1;
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/zgetrf.c b/interface/lapack/zgetrf.c
index 5031f587b..7f8db94f6 100644
--- a/interface/lapack/zgetrf.c
+++ b/interface/lapack/zgetrf.c
@@ -74,7 +74,7 @@ int NAME(blasint *M, blasint *N, FLOAT *a, blasint *ldA, blasint *ipiv, blasint
   if (args.n   < 0)             info = 2;
   if (args.m   < 0)             info = 1;
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/zgetrs.c b/interface/lapack/zgetrs.c
index 54d4b0905..0add909ca 100644
--- a/interface/lapack/zgetrs.c
+++ b/interface/lapack/zgetrs.c
@@ -102,7 +102,7 @@ int NAME(char *TRANS, blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA,
   if (trans     < 0) info = 1;
 
   if (info != 0) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     return 0;
   }
 
diff --git a/interface/lapack/zlauu2.c b/interface/lapack/zlauu2.c
index b0698ef2e..ae972543c 100644
--- a/interface/lapack/zlauu2.c
+++ b/interface/lapack/zlauu2.c
@@ -91,7 +91,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
   if (args.n   < 0)             info = 2;
   if (uplo     < 0)             info = 1;
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/zpotf2.c b/interface/lapack/zpotf2.c
index 27ee0891a..c74b66728 100644
--- a/interface/lapack/zpotf2.c
+++ b/interface/lapack/zpotf2.c
@@ -91,7 +91,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
   if (args.n   < 0)             info = 2;
   if (uplo     < 0)             info = 1;
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/zpotrf.c b/interface/lapack/zpotrf.c
index 8cd3980d5..c4cd99bf6 100644
--- a/interface/lapack/zpotrf.c
+++ b/interface/lapack/zpotrf.c
@@ -90,7 +90,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
   if (args.n   < 0)             info = 2;
   if (uplo     < 0)             info = 1;
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/zpotri.c b/interface/lapack/zpotri.c
index 7c72a7e62..8da211683 100644
--- a/interface/lapack/zpotri.c
+++ b/interface/lapack/zpotri.c
@@ -99,7 +99,7 @@ int NAME(char *UPLO, blasint *N, FLOAT *a, blasint *ldA, blasint *Info){
   if (uplo < 0)                  info = 1;
 
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/ztrti2.c b/interface/lapack/ztrti2.c
index a25476677..cb9c0d557 100644
--- a/interface/lapack/ztrti2.c
+++ b/interface/lapack/ztrti2.c
@@ -96,7 +96,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
   if (diag < 0)                  info = 2;
   if (uplo < 0)                  info = 1;
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }
diff --git a/interface/lapack/ztrtri.c b/interface/lapack/ztrtri.c
index b3ce85b9f..dda4a9e4b 100644
--- a/interface/lapack/ztrtri.c
+++ b/interface/lapack/ztrtri.c
@@ -96,7 +96,7 @@ int NAME(char *UPLO, char *DIAG, blasint *N, FLOAT *a, blasint *ldA, blasint *In
   if (diag < 0)                  info = 2;
   if (uplo < 0)                  info = 1;
   if (info) {
-    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
+    BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1);
     *Info = - info;
     return 0;
   }

From e7c4d6705a41910240dd19b9e7082a422563bf15 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 17 Sep 2019 18:56:04 +0200
Subject: [PATCH 042/210] Revert #2051 and replace with a better fix (#2261)

* Revert #2051 and add a better fix for TARGET=generic with DYNAMIC_ARCH
fixes #2257 without breaking #2048 again
---
 kernel/Makefile.L3 | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index f83def47b..7998c135a 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -24,9 +24,11 @@ ifeq ($(TARGET), LOONGSON3B)
 USE_TRMM = 1
 endif
 
-ifeq ($(CORE), GENERIC)
+ifneq ($(DYNAMIC_ARCH), 1)
+ifeq ($(TARGET), GENERIC)
 USE_TRMM = 1
 endif
+endif
 
 ifeq ($(CORE), HASWELL)
 USE_TRMM = 1

From bfa2cc7d6411f16e1889bb7541159086949448c6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 20 Sep 2019 10:29:35 +0200
Subject: [PATCH 043/210] Restore ppc64 CI job and remove the travis_wait that
 caused the problem with it

---
 .travis.yml | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 27ecba6c8..2b1b99b26 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -17,7 +17,7 @@ matrix:
         - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
       script:
         - set -e
-        - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
+        - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
         - make -C test $COMMON_FLAGS $BTYPE
         - make -C ctest $COMMON_FLAGS $BTYPE
         - make -C utest $COMMON_FLAGS $BTYPE
@@ -25,14 +25,14 @@ matrix:
         - TARGET_BOX=LINUX64
         - BTYPE="BINARY=64"
 
-        #    - <<: *test-ubuntu
-        #      os: linux-ppc64le
-        #      before_script:
-        #        - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
-        #      env:
-        #        # for matrix annotation only
-        #        - TARGET_BOX=PPC64LE_LINUX
-        #        - BTYPE="BINARY=64 USE_OPENMP=1"
+    - <<: *test-ubuntu
+      os: linux-ppc64le
+      before_script:
+        - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
+      env:
+        # for matrix annotation only
+        - TARGET_BOX=PPC64LE_LINUX
+        - BTYPE="BINARY=64 USE_OPENMP=1"
 
     - <<: *test-ubuntu
       env:

From 673e5a049585f307fec09f38e8774d3ef902d239 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 22 Sep 2019 22:35:22 +0200
Subject: [PATCH 044/210] Replace several POWER8/9 C kernels with their
 gcc7-generated assembly versions (#2263)

* Add gcc7-generated assembly files for POWER8/9 isa/ica-min/max and POWER9 caxpy

To work around internal compiler errors encountered when compiling the original C source with gcc 4 and 5, and wrong code generated by gcc 8.3.0

* Use gcc-generated assembly instead of original C sources

to work around internal compiler errors encountered with gcc 4.8/5.4 and wrong code generation by gcc 8.3

* Use gcc-generated assembly instead of the original C source

to work around internal compiler errors encountered with gcc 4.8 and 5.4, and wrong code generation by gcc 8.3

* Add gcc7-generated assembler version of caxpy for power8

to work around wrong code generated by gcc 8.3

* Handle CONJ define for caxpyc

* Handle CONJ define for caxpyc

* Add gcc7-generated assembly cdot for POWER9

* Use prebuilt assembly for POWER9 cdot

created with gcc 7.3.1 to work around ICE in older gcc versions

* Exclude POWER9 from DYNAMIC_ARCH when gcc versions is lower than 6

* Update Makefile.system

* Use PROLOGUE macro to ensure correct function name for DYNAMIC_ARCH

* Disable POWER9 with old gcc versions
---
 Makefile.system               |  14 +-
 driver/others/dynamic_power.c |   8 +
 kernel/power/KERNEL.POWER8    |  10 +-
 kernel/power/KERNEL.POWER9    |  12 +-
 kernel/power/caxpy_power8.S   | 574 ++++++++++++++++++++++++++++++++++
 kernel/power/caxpy_power9.S   | 538 +++++++++++++++++++++++++++++++
 kernel/power/cdot_power9.S    | 242 ++++++++++++++
 kernel/power/icamax_power8.S  | 458 +++++++++++++++++++++++++++
 kernel/power/icamax_power9.S  | 387 +++++++++++++++++++++++
 kernel/power/icamin_power8.S  | 454 +++++++++++++++++++++++++++
 kernel/power/icamin_power9.S  | 385 +++++++++++++++++++++++
 kernel/power/isamax_power8.S  | 434 +++++++++++++++++++++++++
 kernel/power/isamax_power9.S  | 397 +++++++++++++++++++++++
 kernel/power/isamin_power8.S  | 417 ++++++++++++++++++++++++
 kernel/power/isamin_power9.S  | 382 ++++++++++++++++++++++
 15 files changed, 4699 insertions(+), 13 deletions(-)
 create mode 100644 kernel/power/caxpy_power8.S
 create mode 100644 kernel/power/caxpy_power9.S
 create mode 100644 kernel/power/cdot_power9.S
 create mode 100644 kernel/power/icamax_power8.S
 create mode 100644 kernel/power/icamax_power9.S
 create mode 100644 kernel/power/icamin_power8.S
 create mode 100644 kernel/power/icamin_power9.S
 create mode 100644 kernel/power/isamax_power8.S
 create mode 100644 kernel/power/isamax_power9.S
 create mode 100644 kernel/power/isamin_power8.S
 create mode 100644 kernel/power/isamin_power9.S

diff --git a/Makefile.system b/Makefile.system
index 2cf1322a9..8843d0ad3 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -322,12 +322,13 @@ CCOMMON_OPT	+= -DMS_ABI
 endif
 
 ifeq ($(C_COMPILER), GCC)
-#Test for supporting MS_ABI
+#Version tests for supporting specific features (MS_ABI, POWER9 intrinsics)
 GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
 GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
+GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
 GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
 ifeq ($(GCCVERSIONGT4), 1)
-# GCC Majar version > 4
+# GCC Major version > 4
 # It is compatible with MSVC ABI.
 CCOMMON_OPT	+= -DMS_ABI
 endif
@@ -554,8 +555,17 @@ endif
 ifeq ($(ARCH), power)
 DYNAMIC_CORE = POWER6
 DYNAMIC_CORE += POWER8
+ifneq ($(C_COMPILER), GCC)
 DYNAMIC_CORE += POWER9
 endif
+ifeq ($(C_COMPILER), GCC)
+ifeq ($(GCCVERSIONGT5), 1)
+DYNAMIC_CORE += POWER9
+else
+$(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.)
+endif
+endif
+endif
 
 # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty
 ifndef DYNAMIC_CORE
diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c
index 0c4a87a5e..1dec5f4b3 100644
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@@ -3,7 +3,9 @@
 
 extern gotoblas_t gotoblas_POWER6;
 extern gotoblas_t gotoblas_POWER8;
+#if (!defined C_GCC) || (GCC_VERSION >= 60000)
 extern gotoblas_t gotoblas_POWER9;
+#endif
 
 extern void openblas_warning(int verbose, const char *msg);
 
@@ -19,7 +21,9 @@ static char *corename[] = {
 char *gotoblas_corename(void) {
 	if (gotoblas == &gotoblas_POWER6)	return corename[1];
 	if (gotoblas == &gotoblas_POWER8)	return corename[2];
+#if (!defined C_GCC) || (GCC_VERSION >= 60000)
 	if (gotoblas == &gotoblas_POWER9)	return corename[3];
+#endif
 	return corename[0];
 }
 
@@ -29,8 +33,10 @@ static gotoblas_t *get_coretype(void) {
 		return &gotoblas_POWER6;
 	if (__builtin_cpu_is("power8"))
 		return &gotoblas_POWER8;
+#if (!defined C_GCC) || (GCC_VERSION >= 60000)
 	if (__builtin_cpu_is("power9"))
 		return &gotoblas_POWER9;
+#endif
 	return NULL;
 }
 
@@ -53,7 +59,9 @@ static gotoblas_t *force_coretype(char * coretype) {
 	{
 	case  1: return (&gotoblas_POWER6);
 	case  2: return (&gotoblas_POWER8);
+#if (!defined C_GCC) || (GCC_VERSION >= 60000)
 	case  3: return (&gotoblas_POWER9);
+#endif
 	default: return NULL;
 	}
 	snprintf(message, 128, "Core not found: %s\n", coretype);
diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8
index 43f004fbb..c08f3fb00 100644
--- a/kernel/power/KERNEL.POWER8
+++ b/kernel/power/KERNEL.POWER8
@@ -89,14 +89,14 @@ ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
 #SMINKERNEL   = ../arm/min.c
 #DMINKERNEL   = ../arm/min.c
 #
-ISAMAXKERNEL = isamax.c
+ISAMAXKERNEL = isamax_power8.S
 IDAMAXKERNEL = idamax.c
-ICAMAXKERNEL = icamax.c
+ICAMAXKERNEL = icamax_power8.S
 IZAMAXKERNEL = izamax.c
 #
-ISAMINKERNEL = isamin.c
+ISAMINKERNEL = isamin_power8.S
 IDAMINKERNEL = idamin.c
-ICAMINKERNEL = icamin.c
+ICAMINKERNEL = icamin_power8.S
 IZAMINKERNEL = izamin.c
 #
 #ISMAXKERNEL  = ../arm/imax.c
@@ -112,7 +112,7 @@ ZASUMKERNEL  = zasum.c
 #
 SAXPYKERNEL  = saxpy.c
 DAXPYKERNEL  = daxpy.c
-CAXPYKERNEL  = caxpy.c
+CAXPYKERNEL  = caxpy_power8.S
 ZAXPYKERNEL  = zaxpy.c
 #
 SCOPYKERNEL  = scopy.c
diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9
index a570a903a..2ed843fff 100644
--- a/kernel/power/KERNEL.POWER9
+++ b/kernel/power/KERNEL.POWER9
@@ -89,14 +89,14 @@ ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
 #SMINKERNEL   = ../arm/min.c
 #DMINKERNEL   = ../arm/min.c
 #
-ISAMAXKERNEL = isamax.c
+ISAMAXKERNEL = isamax_power9.S
 IDAMAXKERNEL = idamax.c
-ICAMAXKERNEL = icamax.c
+ICAMAXKERNEL = icamax_power9.S
 IZAMAXKERNEL = izamax.c
 #
-ISAMINKERNEL = isamin.c
+ISAMINKERNEL = isamin_power9.S
 IDAMINKERNEL = idamin.c
-ICAMINKERNEL = icamin.c
+ICAMINKERNEL = icamin_power9.S
 IZAMINKERNEL = izamin.c
 #
 #ISMAXKERNEL  = ../arm/imax.c
@@ -112,7 +112,7 @@ ZASUMKERNEL  = zasum.c
 #
 SAXPYKERNEL  = saxpy.c
 DAXPYKERNEL  = daxpy.c
-CAXPYKERNEL  = caxpy.c
+CAXPYKERNEL  = caxpy_power9.S
 ZAXPYKERNEL  = zaxpy.c
 #
 SCOPYKERNEL  = scopy.c
@@ -123,7 +123,7 @@ ZCOPYKERNEL  = zcopy.c
 SDOTKERNEL   =  sdot.c
 DDOTKERNEL   =  ddot.c
 DSDOTKERNEL  =  sdot.c
-CDOTKERNEL   =  cdot.c
+CDOTKERNEL   =  cdot_power9.S
 ZDOTKERNEL   =  zdot.c
 #
 SNRM2KERNEL  = ../arm/nrm2.c
diff --git a/kernel/power/caxpy_power8.S b/kernel/power/caxpy_power8.S
new file mode 100644
index 000000000..09a423571
--- /dev/null
+++ b/kernel/power/caxpy_power8.S
@@ -0,0 +1,574 @@
+#define ASSEMBLER
+#include "common.h"
+/*
+	.file	"caxpy.c"
+	.abiversion 2
+	.section	".text"
+	.align 2
+	.p2align 4,,15
+	.globl caxpy_k
+	.type	caxpy_k, @function
+*/
+
+	PROLOGUE
+
+caxpy_k:
+.LCF0:
+0:	addis 2,12,.TOC.-.LCF0@ha
+	addi 2,2,.TOC.-.LCF0@l
+	.localentry	caxpy_k,.-caxpy_k
+	mr. 7,3
+	ble 0,.L33
+	cmpdi 7,9,1
+	beq 7,.L41
+.L3:
+	mtctr 7
+	ld 7,96(1)
+	sldi 9,9,3
+	sldi 7,7,3
+	.p2align 4,,15
+.L14:
+	lfs 10,4(8)
+	lfs 11,0(8)
+	lfs 12,0(10)
+	lfs 0,4(10)
+	fmuls 10,2,10
+#ifdef CONJ	
+	fmsubs 11,11,1,10
+#else	
+	fmadds 11,11,1,10
+#endif
+	fadds 12,12,11
+	stfs 12,0(10)
+	lfs 11,0(8)
+	lfs 12,4(8)
+	add 8,8,9
+	fmuls 11,2,11
+#ifdef CONJ	
+	fmsubs 12,12,1,11
+	fsubs 0,0,12
+#else	
+	fmadds 12,12,1,11
+	fadds 0,0,12
+#endif
+	stfs 0,4(10)
+	add 10,10,7
+	bdnz .L14
+.L33:
+	li 3,0
+	blr
+	.p2align 4,,15
+.L41:
+	ld 6,96(1)
+	cmpdi 7,6,1
+	bne 7,.L3
+	rldicr. 4,7,0,59
+	std 31,-8(1)
+	li 11,0
+	bne 0,.L42
+.L4:
+	addi 6,11,8
+	subf 0,4,7
+	sldi 6,6,2
+	addi 9,6,-32
+	add 5,10,6
+	add 3,8,9
+	add 6,8,6
+	subfc 5,5,3
+	add 9,10,9
+	subfe 5,5,5
+	subfc 6,6,9
+	subfe 31,31,31
+	addi 6,5,1
+	addi 5,31,1
+	or 6,6,5
+	rlwinm 6,6,0,0xff
+	cmpwi 7,6,0
+	beq 7,.L7
+	sradi 6,4,63
+	srdi 5,7,63
+	subfc 31,7,4
+	adde 6,5,6
+	subfic 31,0,3
+	subfe 31,31,31
+	xori 6,6,0x1
+	neg 31,31
+	and 6,6,31
+	rlwinm 6,6,0,0xff
+	cmpwi 7,6,0
+	beq 7,.L7
+	cmpd 7,4,7
+	li 6,1
+	blt 7,.L43
+.L9:
+	addi 0,7,-1
+	subf 0,4,0
+	subfic 0,0,3
+	subfe 31,31,31
+	addi 0,31,1
+	rlwinm 0,0,0,0xff
+	cmpwi 7,0,0
+	bne 7,.L10
+	sradi 0,4,63
+	subfc 31,7,4
+	adde 5,5,0
+	rlwinm 5,5,0,0xff
+	cmpwi 7,5,0
+	bne 7,.L10
+	addi 0,6,-1
+	addis 31,2,.LC3@toc@ha
+	std 30,-16(1)
+	xscvdpspn 12,1
+	xscvdpspn 11,2
+	srdi. 30,0,2
+	addis 6,2,.LC2@toc@ha
+	addi 6,6,.LC2@toc@l
+	mtctr 30
+	addi 31,31,.LC3@toc@l
+	lxvd2x 42,0,6
+	li 5,16
+	li 6,0
+	lxvd2x 41,0,31
+	xxspltw 12,12,0
+	xxspltw 11,11,0
+	xxpermdi 42,42,42,2
+	xxpermdi 41,41,41,2
+	beq 0,.L44
+	.p2align 4,,15
+.L11:
+#ifdef CONJ
+	lxvd2x 44,3,6
+	lxvd2x 45,3,5
+	lxvd2x 33,9,6
+	lxvd2x 0,9,5
+	xxpermdi 44,44,44,2
+	xxpermdi 45,45,45,2
+	xxpermdi 32,33,33,2
+	xxpermdi 33,0,0,2
+	vperm 11,13,12,10
+	vperm 13,13,12,9
+	vperm 12,1,0,10
+	vperm 1,1,0,9
+	xvmulsp 0,11,43
+	xvmulsp 32,11,45
+	xvmsubmsp 45,12,0
+	xvmaddasp 32,12,43
+	xvaddsp 44,32,44
+	xvsubsp 32,33,45
+	vmrglw 1,0,12
+	vmrghw 0,0,12
+#else
+	lxvd2x 45,3,6
+	lxvd2x 33,3,5
+	lxvd2x 43,9,6
+	lxvd2x 0,9,5
+	xxpermdi 45,45,45,2
+	xxpermdi 33,33,33,2
+	xxpermdi 32,43,43,2
+	xxpermdi 43,0,0,2
+	vperm 12,1,13,10
+	vperm 1,1,13,9
+	vperm 13,11,0,10
+	vperm 11,11,0,9
+	xvmulsp 0,11,44
+	xvmulsp 32,11,33
+	xvmaddmsp 33,12,0
+	xvmsubasp 32,12,44
+	xvaddsp 45,32,45
+	xvaddsp 32,33,43
+	vmrglw 1,0,13
+	vmrghw 0,0,13
+#endif
+	xxpermdi 0,33,33,2
+	xxpermdi 32,32,32,2
+	stxvd2x 0,9,6
+	addi 6,6,32
+	stxvd2x 32,9,5
+	addi 5,5,32
+	bdnz .L11
+	rldicr 0,0,0,61
+	ld 30,-16(1)
+	sldi 9,0,1
+	add 4,4,0
+	add 11,11,9
+.L10:
+	sldi 6,11,2
+	addi 9,4,1
+	addi 5,6,4
+	cmpd 7,7,9
+	lfsx 12,8,6
+	lfsx 0,10,6
+	addi 9,11,2
+	lfsx 11,8,5
+	fmuls 11,2,11
+#ifdef CONJ	
+	fmadds 12,12,1,11
+#else	
+	fmsubs 12,12,1,11
+#endif	
+	fadds 0,0,12
+	stfsx 0,10,6
+	lfsx 11,8,6
+	lfsx 12,8,5
+	lfsx 0,10,5
+	fmuls 11,2,11
+#ifdef CONJ	
+	fmsubs 12,12,1,11
+	fsubs 0,0,12
+#else	
+	fmadds 12,12,1,11
+	fadds 0,0,12
+#endif
+	stfsx 0,10,5
+	ble 7,.L39
+	sldi 9,9,2
+	addi 6,4,2
+	addi 5,9,4
+	cmpd 7,7,6
+	lfsx 12,8,9
+	lfsx 0,10,9
+	addi 6,11,4
+	lfsx 11,8,5
+	fmuls 11,2,11
+#ifdef CONJ	
+	fmadds 12,1,12,11
+#else	
+	fmsubs 12,1,12,11
+#endif	
+	fadds 0,0,12
+	stfsx 0,10,9
+	lfsx 11,8,9
+	lfsx 12,8,5
+	lfsx 0,10,5
+	fmuls 11,2,11
+	fmsubs 12,1,12,11
+	fsubs 0,0,12
+	stfsx 0,10,5
+	ble 7,.L39
+	sldi 6,6,2
+	addi 4,4,3
+	addi 5,6,4
+	cmpd 7,7,4
+	lfsx 12,8,6
+	lfsx 0,10,6
+	addi 9,11,6
+	lfsx 11,8,5
+	fmuls 11,2,11
+#ifdef CONJ	
+	fmadds 12,1,12,11
+#else
+	fmsubs 12,1,12,11
+#endif
+	fadds 0,0,12
+	stfsx 0,10,6
+	lfsx 11,8,6
+	lfsx 12,8,5
+	lfsx 0,10,5
+	fmuls 11,2,11
+#ifdef CONJ	
+	fmsubs 12,1,12,11
+	fsubs 0,0,12
+#else	
+	fmadds 12,1,12,11
+	fadds 0,0,12
+#endif	
+	stfsx 0,10,5
+	ble 7,.L39
+	sldi 9,9,2
+	ld 31,-8(1)
+	addi 7,9,4
+	lfsx 12,8,9
+	lfsx 0,10,9
+	lfsx 11,8,7
+	fmuls 11,2,11
+#ifdef CONJ	
+	fmadds 12,1,12,11
+#else	
+	fmsubs 12,1,12,11
+#endif	
+	fadds 0,0,12
+	stfsx 0,10,9
+	lfsx 11,8,9
+	lfsx 12,8,7
+	lfsx 0,10,7
+	fmuls 2,2,11
+#ifdef CONJ	
+	fmsubs 1,1,12,2
+	fsubs 1,0,1
+#else	
+	fmadds 1,1,12,2
+	fadds 1,0,1
+#endif	
+	stfsx 1,10,7
+	b .L33
+.L43:
+	mr 6,0
+	b .L9
+.L7:
+	addi 10,4,1
+	cmpd 7,10,7
+	subf 10,4,7
+	mtctr 10
+	bgt 7,.L26
+	li 10,-1
+	rldicr 10,10,0,0
+	cmpd 7,7,10
+	beq 7,.L26
+	.p2align 4,,15
+.L13:
+	lfs 10,4(3)
+	lfs 11,0(3)
+	addi 9,9,8
+	addi 3,3,8
+	lfs 12,-8(9)
+	lfs 0,-4(9)
+	fmuls 10,2,10
+#ifdef CONJ	
+	fmadds 11,1,11,10
+#else	
+	fmsubs 11,1,11,10
+#endif	
+	fadds 12,12,11
+	stfs 12,-8(9)
+	lfs 11,-8(3)
+	lfs 12,-4(3)
+	fmuls 11,2,11
+#ifdef CONJ	
+	fmsubs 12,1,12,11
+	fsubs 0,0,12
+#else	
+	fmadds 12,1,12,11
+	fadds 0,0,12
+#endif	
+	stfs 0,-4(9)
+	bdnz .L13
+.L39:
+	ld 31,-8(1)
+	b .L33
+.L42:
+#ifdef CONJ
+	fneg 0,1
+	xxpermdi 32,1,1,0
+	addis 9,2,.LANCHOR0@toc@ha
+	std 28,-32(1)
+	sradi. 28,4,1
+	addi 9,9,.LANCHOR0@toc@l
+	xscvdpspn 5,2
+	xvcvdpsp 32,32
+	lxvd2x 12,0,9
+	xxpermdi 39,0,0,0
+	xxspltw 5,5,0
+	xvcvdpsp 39,39
+#else
+	fneg 0,2
+	xxpermdi 39,2,2,0
+	addis 9,2,.LANCHOR0@toc@ha
+	std 28,-32(1)
+	sradi. 28,4,1
+	addi 9,9,.LANCHOR0@toc@l
+	xscvdpspn 5,1
+	xvcvdpsp 39,39
+	lxvd2x 12,0,9
+	xxpermdi 32,0,0,0
+	xxspltw 5,5,0
+	xvcvdpsp 32,32
+#endif
+	xxpermdi 12,12,12,2
+	vmrgew 7,7,0
+	beq 0,.L5
+	xxlnor 38,12,12
+	std 29,-24(1)
+	std 30,-16(1)
+	mr 6,8
+	mr 9,10
+	li 29,0
+	li 30,16
+	li 31,32
+	li 12,48
+	li 0,64
+	li 11,80
+	li 3,96
+	li 5,112
+	.p2align 4,,15
+.L6:
+	lxvd2x 6,0,9
+	lxvd2x 40,0,6
+	addi 29,29,8
+	lxvd2x 41,6,30
+	lxvd2x 42,6,31
+	cmpd 7,28,29
+	lxvd2x 43,6,12
+	lxvd2x 44,6,0
+	lxvd2x 45,6,11
+	lxvd2x 33,6,3
+	lxvd2x 32,6,5
+	lxvd2x 7,9,30
+	addi 6,6,128
+	lxvd2x 8,9,31
+	lxvd2x 9,9,12
+	xxpermdi 40,40,40,2
+	xxpermdi 6,6,6,2
+	lxvd2x 10,9,0
+	lxvd2x 11,9,11
+	xxpermdi 41,41,41,2
+	xxpermdi 42,42,42,2
+	lxvd2x 12,9,3
+	lxvd2x 0,9,5
+	xxpermdi 43,43,43,2
+	xxpermdi 44,44,44,2
+	xxpermdi 45,45,45,2
+	xxpermdi 33,33,33,2
+	xxpermdi 32,32,32,2
+	xxpermdi 7,7,7,2
+	xxpermdi 8,8,8,2
+	xxpermdi 9,9,9,2
+	xxpermdi 10,10,10,2
+	xxpermdi 11,11,11,2
+	xxpermdi 12,12,12,2
+	xxpermdi 0,0,0,2
+#ifndef CONJ	
+	xvmaddasp 6,5,40
+	xvmaddasp 7,5,41
+	xvmaddasp 8,5,42
+	xvmaddasp 9,5,43
+	xvmaddasp 10,5,44
+	xvmaddasp 11,5,45
+	xvmaddasp 12,5,33
+	xvmaddasp 0,5,32
+	vperm 8,8,8,6
+	vperm 9,9,9,6
+	vperm 10,10,10,6
+	vperm 11,11,11,6
+	vperm 12,12,12,6
+	vperm 13,13,13,6
+	vperm 1,1,1,6
+	vperm 0,0,0,6
+#endif
+	xvmaddasp 6,39,40
+	xvmaddasp 7,39,41
+	xvmaddasp 8,39,42
+	xvmaddasp 9,39,43
+	xvmaddasp 10,39,44
+	xvmaddasp 11,39,45
+	xvmaddasp 12,39,33
+	xvmaddasp 0,39,32
+#ifdef CONJ	
+	vperm 8,8,8,6
+	vperm 9,9,9,6
+	vperm 10,10,10,6
+	vperm 11,11,11,6
+	vperm 12,12,12,6
+	vperm 13,13,13,6
+	vperm 1,1,1,6
+	vperm 0,0,0,6
+	xvmaddasp 6,5,40
+	xvmaddasp 7,5,41
+	xvmaddasp 8,5,42
+	xvmaddasp 9,5,43
+	xvmaddasp 10,5,44
+	xvmaddasp 11,5,45
+	xvmaddasp 12,5,33
+	xvmaddasp 0,5,32
+#endif
+	xxpermdi 6,6,6,2
+	xxpermdi 7,7,7,2
+	xxpermdi 8,8,8,2
+	xxpermdi 9,9,9,2
+	stxvd2x 6,0,9
+	xxpermdi 10,10,10,2
+	stxvd2x 7,9,30
+	xxpermdi 11,11,11,2
+	stxvd2x 8,9,31
+	xxpermdi 12,12,12,2
+	stxvd2x 9,9,12
+	xxpermdi 0,0,0,2
+	stxvd2x 10,9,0
+	stxvd2x 11,9,11
+	stxvd2x 12,9,3
+	stxvd2x 0,9,5
+	addi 9,9,128
+	bgt 7,.L6
+	ld 29,-24(1)
+	ld 30,-16(1)
+.L5:
+	cmpd 7,7,4
+	ble 7,.L36
+	sldi 11,4,1
+	ld 28,-32(1)
+	b .L4
+.L36:
+	ld 28,-32(1)
+	ld 31,-8(1)
+	b .L33
+.L44:
+	li 31,1
+	mtctr 31
+	b .L11
+.L26:
+	li 10,1
+	mtctr 10
+	b .L13
+	.long 0
+	.byte 0,0,0,0,0,4,0,0
+	.size	caxpy_k,.-caxpy_k
+	.section	.rodata
+	.align 4
+	.set	.LANCHOR0,. + 0
+	.type	swap_mask_arr, @object
+	.size	swap_mask_arr, 16
+swap_mask_arr:
+	.byte	4
+	.byte	5
+	.byte	6
+	.byte	7
+	.byte	0
+	.byte	1
+	.byte	2
+	.byte	3
+	.byte	12
+	.byte	13
+	.byte	14
+	.byte	15
+	.byte	8
+	.byte	9
+	.byte	10
+	.byte	11
+	.section	.rodata.cst16,"aM",@progbits,16
+	.align 4
+.LC2:
+	.byte	31
+	.byte	30
+	.byte	29
+	.byte	28
+	.byte	23
+	.byte	22
+	.byte	21
+	.byte	20
+	.byte	15
+	.byte	14
+	.byte	13
+	.byte	12
+	.byte	7
+	.byte	6
+	.byte	5
+	.byte	4
+.LC3:
+	.byte	27
+	.byte	26
+	.byte	25
+	.byte	24
+	.byte	19
+	.byte	18
+	.byte	17
+	.byte	16
+	.byte	11
+	.byte	10
+	.byte	9
+	.byte	8
+	.byte	3
+	.byte	2
+	.byte	1
+	.byte	0
+	.ident	"GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
+	.gnu_attribute 4, 1
+	.section	.note.GNU-stack,"",@progbits
diff --git a/kernel/power/caxpy_power9.S b/kernel/power/caxpy_power9.S
new file mode 100644
index 000000000..48e6e5ba3
--- /dev/null
+++ b/kernel/power/caxpy_power9.S
@@ -0,0 +1,538 @@
+#define ASSEMBLER
+#include "common.h"
+
+/*
+	.file	"caxpy.c"
+	.abiversion 2
+	.section	".text"
+	.align 2
+	.p2align 4,,15
+	.globl caxpy_k
+	.type	caxpy_k, @function
+*/
+
+	PROLOGUE
+
+caxpy_k:
+.LCF0:
+0:	addis 2,12,.TOC.-.LCF0@ha
+	addi 2,2,.TOC.-.LCF0@l
+	.localentry	caxpy_k,.-caxpy_k
+	mr. 7,3
+	ble 0,.L33
+	cmpdi 7,9,1
+	beq 7,.L37
+.L3:
+	mtctr 7
+	ld 7,96(1)
+	sldi 9,9,3
+	sldi 7,7,3
+	.p2align 4,,15
+.L14:
+	lfs 10,4(8)
+	lfs 11,0(8)
+	lfs 12,0(10)
+	lfs 0,4(10)
+	fmuls 10,2,10
+#ifdef CONJ	
+	fmadds 11,11,1,10
+#else
+	fmsubs 11,11,1,10
+#endif
+	fadds 12,12,11
+	stfs 12,0(10)
+	lfs 11,0(8)
+	lfs 12,4(8)
+	add 8,8,9
+	fmuls 11,2,11
+#ifdef CONJ
+	fmsubs 12,12,1,11
+	fsubs 0,0,12
+#else
+	fmadds 12,12,1,11
+	fadds 0,0,12
+#endif
+	stfs 0,4(10)
+	add 10,10,7
+	bdnz .L14
+.L33:
+	li 3,0
+	blr
+	.p2align 4,,15
+.L37:
+	ld 6,96(1)
+	cmpdi 7,6,1
+	bne 7,.L3
+	rldicr. 4,7,0,59
+	li 11,0
+	bne 0,.L38
+.L4:
+	addi 6,11,8
+	subf 0,4,7
+	sldi 6,6,2
+	addi 9,6,-32
+	add 5,10,6
+	add 6,8,6
+	add 3,8,9
+	add 9,10,9
+	subfc 5,5,3
+	subfe 5,5,5
+	subfc 6,6,9
+	subfe 12,12,12
+	addi 6,5,1
+	addi 5,12,1
+	or 6,6,5
+	rlwinm 6,6,0,0xff
+	cmpwi 7,6,0
+	beq 7,.L7
+	sradi 6,4,63
+	srdi 5,7,63
+	subfc 12,7,4
+	adde 6,5,6
+	subfic 12,0,4
+	subfe 12,12,12
+	xori 6,6,0x1
+	neg 12,12
+	and 6,6,12
+	rlwinm 6,6,0,0xff
+	cmpwi 7,6,0
+	beq 7,.L7
+	cmpd 7,4,7
+	li 6,1
+	blt 7,.L39
+.L9:
+	addi 0,7,-1
+	subf 0,4,0
+	subfic 0,0,3
+	subfe 12,12,12
+	addi 0,12,1
+	rlwinm 0,0,0,0xff
+	cmpwi 7,0,0
+	bne 7,.L10
+	sradi 0,4,63
+	subfc 12,7,4
+	adde 5,5,0
+	rlwinm 5,5,0,0xff
+	cmpwi 7,5,0
+	bne 7,.L10
+	xscvdpspn 0,1
+	xscvdpspn 12,2
+	addi 0,6,-1
+	std 31,-8(1)
+	addis 12,2,.LC2@toc@ha
+	addis 6,2,.LC3@toc@ha
+	li 5,16
+	srdi. 31,0,2
+	addi 6,6,.LC3@toc@l
+	addi 12,12,.LC2@toc@l
+	mtctr 31
+	lxv 41,0(6)
+	lxv 42,0(12)
+	li 6,0
+	xxspltw 0,0,0
+	xxspltw 12,12,0
+	beq 0,.L40
+	.p2align 4,,15
+.L11:
+#ifdef CONJ
+	lxvx 33,3,5
+	lxvx 44,3,6
+	lxvx 43,9,6
+	lxvx 32,9,5
+	vperm 13,1,12,10
+	vperm 12,1,12,9
+	vperm 8,0,11,10
+	vperm 0,0,11,9
+	xvmulsp 33,12,44
+	xvmulsp 11,12,45
+	xvmaddasp 33,0,45
+	xvmsubmsp 44,0,11
+	xvaddsp 33,33,40
+	xvsubsp 32,32,44
+#else
+	lxvx 33,3,6
+	lxvx 32,3,5
+	lxvx 43,9,6
+	lxvx 44,9,5
+	vperm 13,0,1,10
+	vperm 0,0,1,9
+	vperm 8,12,11,10
+	vperm 12,12,11,9
+	xvmulsp 33,12,32
+	xvmulsp 11,12,45
+	xvmsubasp 33,0,45
+	xvmaddmsp 32,0,11
+	xvaddsp 33,33,40
+	xvaddsp 32,32,44
+#endif
+	vmrglw 13,0,1
+	vmrghw 0,0,1
+	stxvx 45,9,6
+	stxvx 32,9,5
+	addi 6,6,32
+	addi 5,5,32
+	bdnz .L11
+	rldicr 0,0,0,61
+	ld 31,-8(1)
+	sldi 9,0,1
+	add 4,4,0
+	add 11,11,9
+.L10:
+	sldi 5,11,2
+	addi 6,4,1
+	addi 9,11,2
+	addi 3,5,4
+	lfsx 12,8,5
+	cmpd 7,7,6
+	lfsx 0,10,5
+	lfsx 11,8,3
+	fmuls 11,2,11
+#ifdef CONJ
+	fmadds 12,12,1,11
+#else
+	fmsubs 12,12,1,11
+#endif
+	fadds 0,0,12
+	stfsx 0,10,5
+	lfsx 11,8,5
+	lfsx 12,8,3
+	lfsx 0,10,3
+	fmuls 11,2,11
+#ifdef CONJ
+	fmsubs 12,12,1,11
+	fsubs 0,0,12
+#else
+	fmadds 12,12,1,11
+	fadds 0,0,12
+#endif
+	stfsx 0,10,3
+	ble 7,.L33
+	sldi 9,9,2
+	addi 5,4,2
+	addi 6,11,4
+	addi 3,9,4
+	lfsx 12,8,9
+	cmpd 7,7,5
+	lfsx 0,10,9
+	lfsx 11,8,3
+	fmuls 11,2,11
+#ifdef CONJ
+	fmadds 12,1,12,11
+#else
+	fmsubs 12,1,12,11
+#endif
+	fadds 0,0,12
+	stfsx 0,10,9
+	lfsx 11,8,9
+	lfsx 12,8,3
+	lfsx 0,10,3
+	fmuls 11,2,11
+#ifdef CONJ
+	fmsubs 12,1,12,11
+	fsubs 0,0,12
+#else
+	fmadds 12,1,12,11
+	fadds 0,0,12
+#endif
+	stfsx 0,10,3
+	ble 7,.L33
+	sldi 6,6,2
+	addi 4,4,3
+	addi 9,11,6
+	addi 5,6,4
+	lfsx 12,8,6
+	cmpd 7,7,4
+	lfsx 0,10,6
+	lfsx 11,8,5
+	fmuls 11,2,11
+#ifdef CONJ
+	fmadds 12,1,12,11
+#else
+	fmsubs 12,1,12,11
+#endif
+	fadds 0,0,12
+	stfsx 0,10,6
+	lfsx 11,8,6
+	lfsx 12,8,5
+	lfsx 0,10,5
+	fmuls 11,2,11
+#ifdef CONJ
+	fmsubs 12,1,12,11
+	fsubs 0,0,12
+#else
+	fmadds 12,1,12,11
+	fadds 0,0,12
+#endif
+	stfsx 0,10,5
+	ble 7,.L33
+	sldi 9,9,2
+	addi 7,9,4
+	lfsx 12,8,9
+	lfsx 0,10,9
+	lfsx 11,8,7
+	fmuls 11,2,11
+#ifdef CONJ
+	fmadds 12,1,12,11
+#else
+	fmsubs 12,1,12,11
+#endif
+	fadds 0,0,12
+	stfsx 0,10,9
+	lfsx 11,8,9
+	lfsx 12,8,7
+	lfsx 0,10,7
+	fmuls 2,2,11
+#ifdef CONJ
+	fmsubs 1,1,12,2
+	fsubs 1,0,1
+#else
+	fmadds 1,1,12,2
+	fadds 1,0,1
+#endif
+	stfsx 1,10,7
+	b .L33
+.L39:
+	mr 6,0
+	b .L9
+.L38:
+#ifdef CONJ
+	fneg 0,1
+	xxpermdi 45,1,1,0
+	xscvdpspn 12,2
+	addis 9,2,.LANCHOR0@toc@ha
+	sradi. 3,4,1
+	xxpermdi 44,0,0,0
+	addi 9,9,.LANCHOR0@toc@l
+	xvcvdpsp 45,45
+	lxv 33,0(9)
+	xvcvdpsp 32,44
+	xxspltw 12,12,0
+#else
+	fneg 12,2
+	xxpermdi 32,2,2,0
+	xscvdpspn 0,1
+	addis 9,2,.LANCHOR0@toc@ha
+	sradi. 3,4,1
+	xxpermdi 45,12,12,0
+	addi 9,9,.LANCHOR0@toc@l
+	xvcvdpsp 32,32
+	lxv 33,0(9)
+	xvcvdpsp 45,45
+	xxspltw 0,0,0
+#endif
+	vmrgew 0,0,13
+	beq 0,.L5
+	mr 6,8
+	mr 9,10
+	li 5,0
+	.p2align 4,,15
+.L6:
+	lxv 38,16(6)
+	lxv 11,16(9)
+	addi 5,5,8
+	addi 6,6,128
+	addi 9,9,128
+	lxv 39,-96(6)
+	lxv 40,-80(6)
+	lxv 41,-64(6)
+	lxv 42,-48(6)
+	cmpd 7,3,5
+	lxv 43,-32(6)
+	lxv 45,-128(6)
+	lxv 44,-16(6)
+#ifdef CONJ
+	lxv 0,-128(9)
+	vpermr 17,6,6,1
+	xvmaddmsp 38,32,11
+	lxv 11,-96(9)
+	vpermr 18,7,7,1
+	vpermr 19,8,8,1
+	vpermr 2,9,9,1
+	vpermr 3,10,10,1
+	vpermr 4,11,11,1
+	xvmaddasp 0,32,45
+	vpermr 5,12,12,1
+	xvmaddmsp 39,32,11
+	lxv 11,-80(9)
+	vpermr 13,13,13,1
+	xvmaddasp 38,12,49
+	xvmaddmsp 40,32,11
+	lxv 11,-64(9)
+	xvmaddmsp 45,12,0
+	xvmaddasp 39,12,50
+	stxv 38,-112(9)
+	xvmaddmsp 41,32,11
+	lxv 11,-48(9)
+	xvmaddasp 40,12,51
+	stxv 45,-128(9)
+	stxv 39,-96(9)
+	xvmaddmsp 42,32,11
+	lxv 11,-32(9)
+	xvmaddasp 41,12,34
+	stxv 40,-80(9)
+	xvmaddmsp 43,32,11
+	lxv 11,-16(9)
+	xvmaddasp 42,12,35
+	stxv 41,-64(9)
+	xvmaddmsp 44,32,11
+	xvmaddasp 43,12,36
+	stxv 42,-48(9)
+	xvmaddasp 44,12,37
+#else
+	lxv 12,-128(9)
+	vpermr 17,6,6,1
+	xvmaddmsp 38,0,11
+	lxv 11,-96(9)
+	vpermr 18,7,7,1
+	vpermr 19,8,8,1
+	vpermr 2,9,9,1
+	vpermr 3,10,10,1
+	vpermr 4,11,11,1
+	xvmaddasp 12,0,45
+	vpermr 5,12,12,1
+	xvmaddmsp 39,0,11
+	lxv 11,-80(9)
+	vpermr 13,13,13,1
+	xvmaddasp 38,32,49
+	xvmaddmsp 40,0,11
+	lxv 11,-64(9)
+	xvmaddmsp 45,32,12
+	xvmaddasp 39,32,50
+	stxv 38,-112(9)
+	xvmaddmsp 41,0,11
+	lxv 11,-48(9)
+	xvmaddasp 40,32,51
+	stxv 45,-128(9)
+	stxv 39,-96(9)
+	xvmaddmsp 42,0,11
+	lxv 11,-32(9)
+	xvmaddasp 41,32,34
+	stxv 40,-80(9)
+	xvmaddmsp 43,0,11
+	lxv 11,-16(9)
+	xvmaddasp 42,32,35
+	stxv 41,-64(9)
+	xvmaddmsp 44,0,11
+	xvmaddasp 43,32,36
+	stxv 42,-48(9)
+	xvmaddasp 44,32,37
+#endif
+	stxv 43,-32(9)
+	stxv 44,-16(9)
+	bgt 7,.L6
+.L5:
+	cmpd 7,7,4
+	ble 7,.L33
+	sldi 11,4,1
+	b .L4
+.L7:
+	addi 10,4,1
+	subf 8,4,7
+	cmpd 7,10,7
+	mtctr 8
+	bgt 7,.L26
+	li 10,-1
+	rldicr 10,10,0,0
+	cmpd 7,7,10
+	beq 7,.L26
+	.p2align 4,,15
+.L13:
+	lfs 10,4(3)
+	lfs 11,0(3)
+	lfs 12,0(9)
+	lfs 0,4(9)
+	addi 3,3,8
+	addi 9,9,8
+	fmuls 10,2,10
+#ifdef CONJ
+	fmadds 11,1,11,10
+#else
+	fmsubs 11,1,11,10
+#endif
+	fadds 12,12,11
+	stfs 12,-8(9)
+	lfs 11,-8(3)
+	lfs 12,-4(3)
+	fmuls 11,2,11
+#ifdef CONJ
+	fmsubs 12,1,12,11
+	fsubs 0,0,12
+#else
+	fmadds 12,1,12,11
+	fadds 0,0,12
+#endif
+	stfs 0,-4(9)
+	bdnz .L13
+	b .L33
+.L40:
+	li 31,1
+	mtctr 31
+	b .L11
+.L26:
+	li 10,1
+	mtctr 10
+	b .L13
+	.long 0
+	.byte 0,0,0,0,0,1,0,0
+	.size	caxpy_k,.-caxpy_k
+	.section	.rodata
+	.align 4
+	.set	.LANCHOR0,. + 0
+	.type	swap_mask_arr, @object
+	.size	swap_mask_arr, 16
+swap_mask_arr:
+	.byte	4
+	.byte	5
+	.byte	6
+	.byte	7
+	.byte	0
+	.byte	1
+	.byte	2
+	.byte	3
+	.byte	12
+	.byte	13
+	.byte	14
+	.byte	15
+	.byte	8
+	.byte	9
+	.byte	10
+	.byte	11
+	.section	.rodata.cst16,"aM",@progbits,16
+	.align 4
+.LC2:
+	.byte	31
+	.byte	30
+	.byte	29
+	.byte	28
+	.byte	23
+	.byte	22
+	.byte	21
+	.byte	20
+	.byte	15
+	.byte	14
+	.byte	13
+	.byte	12
+	.byte	7
+	.byte	6
+	.byte	5
+	.byte	4
+.LC3:
+	.byte	27
+	.byte	26
+	.byte	25
+	.byte	24
+	.byte	19
+	.byte	18
+	.byte	17
+	.byte	16
+	.byte	11
+	.byte	10
+	.byte	9
+	.byte	8
+	.byte	3
+	.byte	2
+	.byte	1
+	.byte	0
+	.ident	"GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
+	.gnu_attribute 4, 1
+	.section	.note.GNU-stack,"",@progbits
diff --git a/kernel/power/cdot_power9.S b/kernel/power/cdot_power9.S
new file mode 100644
index 000000000..01d194c0c
--- /dev/null
+++ b/kernel/power/cdot_power9.S
@@ -0,0 +1,242 @@
+	.file	"cdot.c"
+	.abiversion 2
+	.section	".text"
+	.align 2
+	.p2align 4,,15
+	.globl cdot_k
+	.type	cdot_k, @function
+cdot_k:
+.LCF0:
+0:	addis 2,12,.TOC.-.LCF0@ha
+	addi 2,2,.TOC.-.LCF0@l
+	.localentry	cdot_k,.-cdot_k
+	mr. 9,3
+	ble 0,.L10
+	cmpdi 7,5,1
+	beq 7,.L18
+.L3:
+	mtctr 9
+	xxlxor 2,2,2
+	sldi 5,5,3
+	sldi 7,7,3
+#ifdef CONJ
+	fmr 12,2
+#endif
+	fmr 8,2
+#ifndef CONJ
+	fmr 9,2
+#endif
+	fmr 1,2
+	.p2align 4,,15
+.L9:
+#ifdef CONJ
+	lfs 9,0(4)
+	lfs 11,0(6)
+	lfs 10,4(6)
+	lfs 0,4(4)
+	add 6,6,7
+	add 4,4,5
+	fmadds 1,9,11,1
+	fmadds 12,9,10,12
+	fmadds 8,0,10,8
+	fmadds 2,11,0,2
+#else
+	lfs 10,0(4)
+	lfs 12,0(6)
+	lfs 11,4(6)
+	lfs 0,4(4)
+	add 6,6,7
+	add 4,4,5
+	fmadds 1,10,12,1
+	fmadds 8,10,11,8
+	fmadds 9,0,11,9
+	fmadds 2,12,0,2
+#endif
+	bdnz .L9
+.L7:
+#ifdef CONJ
+	fsubs 2,12,2
+	fadds 1,1,8
+#else
+	fadds 2,2,8
+	fsubs 1,1,9
+#endif
+	blr
+	.p2align 4,,15
+.L18:
+	cmpdi 7,7,1
+	bne 7,.L3
+	rldicr. 10,9,0,60
+	bne 0,.L19
+	xxlxor 2,2,2
+	li 8,0
+#ifdef CONJ
+	fmr 12,2
+#endif
+	fmr 8,2
+#ifndef CONJ
+	fmr 9,2
+#endif
+	fmr 1,2
+.L4:
+	addi 7,10,1
+	sldi 8,8,2
+	subf 10,10,9
+	cmpd 7,7,9
+	mtctr 10
+	add 4,4,8
+	add 6,6,8
+	bgt 7,.L16
+	li 10,-1
+	rldicr 10,10,0,0
+	cmpd 7,9,10
+	beq 7,.L16
+	.p2align 4,,15
+.L8:
+#ifdef CONJ
+	lfs 9,0(4)
+	lfs 11,0(6)
+	lfs 10,4(6)
+	lfs 0,4(4)
+	addi 6,6,8
+	addi 4,4,8
+	fmadds 1,9,11,1
+	fmadds 12,9,10,12
+	fmadds 8,0,10,8
+	fmadds 2,11,0,2
+#else
+	lfs 10,0(4)
+	lfs 12,0(6)
+	lfs 11,4(6)
+	lfs 0,4(4)
+	addi 6,6,8
+	addi 4,4,8
+	fmadds 1,10,12,1
+	fmadds 8,10,11,8
+	fmadds 9,0,11,9
+	fmadds 2,12,0,2
+#endif
+	bdnz .L8
+	b .L7
+	.p2align 4,,15
+.L10:
+	xxlxor 1,1,1
+	fmr 2,1
+	blr
+.L19:
+	addis 8,2,.LANCHOR0@toc@ha
+	sradi. 3,10,1
+	xxspltib 42,0
+	addi 8,8,.LANCHOR0@toc@l
+	lxv 32,0(8)
+	beq 0,.L12
+	xxlor 6,42,42
+	xxlor 4,42,42
+	xxlor 0,42,42
+	xxlor 7,42,42
+	xxlor 5,42,42
+	xxlor 3,42,42
+	xxlor 12,42,42
+	mr 7,4
+	mr 8,6
+	li 5,0
+	.p2align 4,,15
+.L6:
+	lxv 43,0(8)
+	lxv 44,16(8)
+	addi 5,5,4
+	addi 8,8,64
+	addi 7,7,64
+	lxv 45,-32(8)
+	lxv 33,-16(8)
+	lxv 8,-64(7)
+	lxv 9,-48(7)
+	cmpd 7,3,5
+	lxv 10,-32(7)
+	lxv 11,-16(7)
+	vpermr 6,11,11,0
+	vpermr 7,12,12,0
+	vpermr 8,13,13,0
+	vpermr 9,1,1,0
+	xvmaddasp 12,43,8
+	xvmaddasp 3,44,9
+	xvmaddasp 0,8,38
+	xvmaddasp 4,9,39
+	xvmaddasp 6,10,40
+	xvmaddasp 5,45,10
+	xvmaddasp 42,11,41
+	xvmaddasp 7,33,11
+	bgt 7,.L6
+	xvaddsp 12,12,3
+	xvaddsp 0,0,4
+	xvaddsp 12,12,5
+	xvaddsp 0,0,6
+	xvaddsp 12,12,7
+	xvaddsp 42,0,42
+.L5:
+#ifdef CONJ
+	xxpermdi 8,12,12,2
+	xxpermdi 0,42,42,2
+	cmpd 7,9,10
+	sldi 8,10,1
+	xvaddsp 8,8,12
+	xvaddsp 0,0,42
+	xxsldwi 1,8,8,3
+	xxsldwi 12,0,0,3
+	xxsldwi 8,8,8,2
+	xxsldwi 0,0,0,2
+	xscvspdp 1,1
+	xscvspdp 12,12
+	xscvspdp 8,8
+#else
+	xxpermdi 9,12,12,2
+	xxpermdi 0,42,42,2
+	cmpd 7,9,10
+	sldi 8,10,1
+	xvaddsp 9,9,12
+	xvaddsp 0,0,42
+	xxsldwi 1,9,9,3
+	xxsldwi 2,0,0,3
+	xxsldwi 9,9,9,2
+	xxsldwi 0,0,0,2
+	xscvspdp 8,2
+	xscvspdp 1,1
+	xscvspdp 9,9
+#endif
+	xscvspdp 2,0
+	bgt 7,.L4
+	b .L7
+.L12:
+	xxlor 12,42,42
+	b .L5
+.L16:
+	li 9,1
+	mtctr 9
+	b .L8
+	.long 0
+	.byte 0,0,0,0,0,0,0,0
+	.size	cdot_k,.-cdot_k
+	.section	.rodata
+	.align 4
+	.set	.LANCHOR0,. + 0
+	.type	swap_mask_arr, @object
+	.size	swap_mask_arr, 16
+swap_mask_arr:
+	.byte	4
+	.byte	5
+	.byte	6
+	.byte	7
+	.byte	0
+	.byte	1
+	.byte	2
+	.byte	3
+	.byte	12
+	.byte	13
+	.byte	14
+	.byte	15
+	.byte	8
+	.byte	9
+	.byte	10
+	.byte	11
+	.ident	"GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
+	.section	.note.GNU-stack,"",@progbits
diff --git a/kernel/power/icamax_power8.S b/kernel/power/icamax_power8.S
new file mode 100644
index 000000000..4872aff40
--- /dev/null
+++ b/kernel/power/icamax_power8.S
@@ -0,0 +1,458 @@
+/*	.file	"icamax.c"
+	.abiversion 2
+	.section	".text"
+	.align 2
+	.p2align 4,,15
+	.globl icamax_k
+	.type	icamax_k, @function
+*/
+#define ASSEMBLER
+#include "common.h"
+
+	PROLOGUE
+
+icamax_k:
+.LCF0:
+0:	addis 2,12,.TOC.-.LCF0@ha
+	addi 2,2,.TOC.-.LCF0@l
+	.localentry	icamax_k,.-icamax_k
+	mr. 9,3
+	ble 0,.L25
+	cmpdi 7,5,0
+	li 3,0
+	blelr 7
+	cmpdi 7,5,1
+	beq 7,.L54
+	lfs 11,0(4)
+	lfs 0,4(4)
+	cmpdi 7,9,1
+	fabs 11,11
+	fabs 0,0
+	fadds 11,11,0
+	beq 7,.L29
+	addi 9,9,-1
+	sldi 5,5,3
+	mtctr 9
+	add 4,4,5
+	li 3,0
+	li 9,1
+	.p2align 4,,15
+.L24:
+	lfs 0,4(4)
+	lfs 12,0(4)
+	add 4,4,5
+	fabs 0,0
+	fabs 12,12
+	fadds 0,0,12
+	fcmpu 7,0,11
+	bng 7,.L23
+	fmr 11,0
+	mr 3,9
+.L23:
+	addi 9,9,1
+	bdnz .L24
+.L52:
+	addi 3,3,1
+	blr
+	.p2align 4,,15
+.L25:
+	li 3,0
+	blr
+	.p2align 4,,15
+.L54:
+	rldicr. 8,9,0,58
+	bne 0,.L55
+	addi 7,8,1
+	li 10,0
+	xxlxor 11,11,11
+	cmpd 7,7,9
+	sldi 10,10,2
+	add 4,4,10
+	subf 10,8,9
+	mtctr 10
+	li 3,0
+	bgt 7,.L43
+	li 10,-1
+	rldicr 10,10,0,0
+	cmpd 7,9,10
+	beq 7,.L43
+	.p2align 4,,15
+.L44:
+	lfs 0,4(4)
+	lfs 12,0(4)
+	addi 4,4,8
+	fabs 0,0
+	fabs 12,12
+	fadds 0,0,12
+	fcmpu 7,0,11
+	bng 7,.L46
+	fmr 11,0
+	mr 3,8
+.L46:
+	addi 8,8,1
+	bdnz .L44
+	b .L52
+	.p2align 4,,15
+.L55:
+	li 0,-144
+	std 31,-8(1)
+	addis 5,2,.LC2@toc@ha
+	vspltisw 18,0
+	vspltisw 19,0
+	addis 6,2,.LC3@toc@ha
+	addi 5,5,.LC2@toc@l
+	stvx 24,1,0
+	li 0,-128
+	addi 6,6,.LC3@toc@l
+	xxlor 49,50,50
+	addis 7,2,.LC4@toc@ha
+	lxvd2x 44,0,5
+	addis 10,2,.LC5@toc@ha
+	stvx 25,1,0
+	li 0,-112
+	addi 7,7,.LC4@toc@l
+	lxvd2x 45,0,6
+	addis 5,2,.LC6@toc@ha
+	addis 6,2,.LC7@toc@ha
+	stvx 26,1,0
+	li 0,-96
+	addi 10,10,.LC5@toc@l
+	addi 6,6,.LC7@toc@l
+	addi 5,5,.LC6@toc@l
+	stvx 27,1,0
+	li 0,-80
+	lxvd2x 46,0,10
+	xxpermdi 44,44,44,2
+	mr 10,4
+	lxvd2x 48,0,6
+	lxvd2x 47,0,5
+	xxpermdi 45,45,45,2
+	li 6,0
+	stvx 28,1,0
+	li 0,-64
+	xxlnand 44,44,44
+	xxlnand 45,45,45
+	stvx 29,1,0
+	li 0,-48
+	vspltisw 29,8
+	vadduwm 29,29,29
+	xxpermdi 46,46,46,2
+	stvx 30,1,0
+	li 0,-32
+	xxpermdi 47,47,47,2
+	xxpermdi 48,48,48,2
+	stvx 31,1,0
+	lxvd2x 63,0,7
+	addis 7,2,.LC8@toc@ha
+	addi 7,7,.LC8@toc@l
+	lxvd2x 62,0,7
+	xxpermdi 63,63,63,2
+	.p2align 4,,15
+.L5:
+	addi 3,10,16
+	addi 5,10,32
+	lxvd2x 34,0,10
+	addi 7,10,64
+	addi 31,10,48
+	addi 12,10,80
+	addi 11,10,96
+	lxvd2x 36,0,3
+	lxvd2x 37,0,5
+	addi 3,10,112
+	addi 5,10,128
+	lxvd2x 38,0,7
+	lxvd2x 7,0,31
+	addi 7,10,160
+	addi 31,10,144
+	lxvd2x 33,0,12
+	lxvd2x 39,0,11
+	addi 12,10,176
+	addi 11,10,192
+	lxvd2x 8,0,3
+	lxvd2x 40,0,5
+	xxpermdi 34,34,34,2
+	addi 3,10,208
+	addi 5,10,224
+	lxvd2x 41,0,7
+	lxvd2x 9,0,31
+	addi 7,10,240
+	lxvd2x 10,0,12
+	lxvd2x 42,0,11
+	xxpermdi 37,37,37,2
+	xxpermdi 36,36,36,2
+	addi 6,6,32
+	lxvd2x 32,0,3
+	lxvd2x 43,0,5
+	xxpermdi 7,7,7,2
+	xxpermdi 38,38,38,2
+	cmpd 7,8,6
+	addi 10,10,256
+	lxvd2x 11,0,7
+	xxpermdi 39,39,39,2
+	xxpermdi 33,33,33,2
+	xxpermdi 40,40,40,2
+	xxpermdi 8,8,8,2
+	xxpermdi 41,41,41,2
+	xxpermdi 9,9,9,2
+	xxpermdi 10,10,10,2
+	xxpermdi 42,42,42,2
+	xxpermdi 43,43,43,2
+	xxpermdi 32,32,32,2
+	xxpermdi 11,11,11,2
+	xvabssp 57,37
+	xvabssp 58,39
+	xvabssp 35,40
+	xvabssp 59,41
+	xvabssp 34,34
+	xvabssp 33,33
+	xvabssp 32,32
+	xvabssp 60,43
+	xvabssp 36,36
+	xvabssp 37,7
+	xvabssp 38,38
+	xvabssp 39,8
+	xvabssp 40,9
+	xvabssp 41,10
+	xvabssp 42,42
+	xvabssp 43,11
+	vperm 24,4,2,12
+	vperm 4,4,2,13
+	vperm 2,5,25,12
+	vperm 5,5,25,13
+	vperm 25,1,6,12
+	vperm 6,1,6,13
+	vperm 1,7,26,12
+	vperm 7,7,26,13
+	vperm 26,8,3,12
+	vperm 8,8,3,13
+	vperm 3,9,27,12
+	vperm 9,9,27,13
+	vperm 27,0,10,12
+	vperm 10,0,10,13
+	vperm 0,11,28,12
+	vperm 11,11,28,13
+	xvaddsp 12,33,39
+	xvaddsp 38,57,38
+	xvaddsp 0,32,43
+	xvaddsp 42,59,42
+	xvaddsp 36,56,36
+	xvaddsp 37,34,37
+	xvaddsp 40,58,40
+	xvaddsp 41,35,41
+	xvcmpgtsp 32,12,38
+	xvcmpgtsp 33,0,42
+	xvcmpgtsp 43,37,36
+	xvcmpgtsp 39,41,40
+	xxsel 12,38,12,32
+	xxsel 38,47,48,32
+	xxsel 0,42,0,33
+	xxsel 42,47,48,33
+	xxsel 37,36,37,43
+	xxsel 43,63,46,43
+	xxsel 41,40,41,39
+	xxsel 39,63,46,39
+	xvcmpgtsp 32,12,37
+	xvcmpgtsp 33,0,41
+	xxsel 12,37,12,32
+	xxsel 43,43,38,32
+	xxsel 0,41,0,33
+	xxsel 33,39,42,33
+	xvcmpgtsp 32,0,12
+	vadduwm 1,1,29
+	xxsel 0,12,0,32
+	xxsel 32,43,33,32
+	xvcmpgtsp 33,0,51
+	vadduwm 0,17,0
+	vadduwm 17,17,30
+	xxsel 50,50,32,33
+	xxsel 51,51,0,33
+	bgt 7,.L5
+	xxsldwi 11,51,51,3
+	xxsldwi 12,51,51,2
+	vspltw 0,18,3
+	xxsldwi 0,51,51,1
+	xscvspdp 11,11
+	xscvspdp 12,12
+	mfvsrwz 6,32
+	vspltw 0,18,2
+	xscvspdp 0,0
+	mfvsrwz 7,50
+	mfvsrwz 5,32
+	vspltw 0,18,0
+	xscvspdp 51,51
+	mfvsrwz 10,32
+	fcmpu 7,11,12
+	rldicl 3,6,0,32
+	fmr 10,0
+	rldicl 11,7,0,32
+	rldicl 31,5,0,32
+	rldicl 0,10,0,32
+	beq 7,.L56
+	bnl 7,.L8
+	fmr 11,12
+	mr 3,31
+.L8:
+	xscmpudp 7,0,51
+	bne 7,.L11
+	cmplw 7,7,10
+	ble 7,.L12
+	mr 7,10
+.L12:
+	rldicl 11,7,0,32
+.L13:
+	fcmpu 7,11,10
+	beq 7,.L57
+	blt 7,.L58
+.L17:
+	cmpd 7,9,8
+	ble 7,.L19
+	addi 7,8,1
+	sldi 10,8,1
+	cmpd 7,7,9
+	sldi 10,10,2
+	add 4,4,10
+	subf 10,8,9
+	mtctr 10
+	bgt 7,.L37
+	li 10,-1
+	rldicr 10,10,0,0
+	cmpd 7,9,10
+	beq 7,.L37
+	.p2align 4,,15
+.L21:
+	lfs 0,4(4)
+	lfs 12,0(4)
+	addi 4,4,8
+	fabs 0,0
+	fabs 12,12
+	fadds 0,0,12
+	fcmpu 7,0,11
+	bng 7,.L20
+	fmr 11,0
+	mr 3,8
+.L20:
+	addi 8,8,1
+	bdnz .L21
+.L19:
+	li 0,-144
+	ld 31,-8(1)
+	addi 3,3,1
+	lvx 24,1,0
+	li 0,-128
+	lvx 25,1,0
+	li 0,-112
+	lvx 26,1,0
+	li 0,-96
+	lvx 27,1,0
+	li 0,-80
+	lvx 28,1,0
+	li 0,-64
+	lvx 29,1,0
+	li 0,-48
+	lvx 30,1,0
+	li 0,-32
+	lvx 31,1,0
+	blr
+	.p2align 4,,15
+.L56:
+	cmplw 7,6,5
+	ble 7,.L7
+	mr 6,5
+.L7:
+	rldicl 3,6,0,32
+	b .L8
+	.p2align 4,,15
+.L29:
+	li 3,1
+	blr
+	.p2align 4,,15
+.L11:
+	bnl 7,.L13
+	xscpsgndp 10,51,51
+	mr 11,0
+	b .L13
+	.p2align 4,,15
+.L57:
+	cmpd 7,3,11
+	ble 7,.L17
+	mr 3,11
+	b .L17
+	.p2align 4,,15
+.L58:
+	fmr 11,10
+	mr 3,11
+	b .L17
+.L43:
+	li 9,1
+	mtctr 9
+	b .L44
+.L37:
+	li 9,1
+	mtctr 9
+	b .L21
+	.long 0
+	.byte 0,0,0,0,0,1,0,0
+	.size	icamax_k,.-icamax_k
+	.section	.rodata.cst16,"aM",@progbits,16
+	.align 4
+.LC2:
+	.byte	0
+	.byte	1
+	.byte	2
+	.byte	3
+	.byte	8
+	.byte	9
+	.byte	10
+	.byte	11
+	.byte	16
+	.byte	17
+	.byte	18
+	.byte	19
+	.byte	24
+	.byte	25
+	.byte	26
+	.byte	27
+.LC3:
+	.byte	4
+	.byte	5
+	.byte	6
+	.byte	7
+	.byte	12
+	.byte	13
+	.byte	14
+	.byte	15
+	.byte	20
+	.byte	21
+	.byte	22
+	.byte	23
+	.byte	28
+	.byte	29
+	.byte	30
+	.byte	31
+.LC4:
+	.long	0
+	.long	1
+	.long	2
+	.long	3
+.LC5:
+	.long	4
+	.long	5
+	.long	6
+	.long	7
+.LC6:
+	.long	8
+	.long	9
+	.long	10
+	.long	11
+.LC7:
+	.long	12
+	.long	13
+	.long	14
+	.long	15
+.LC8:
+	.long	32
+	.long	32
+	.long	32
+	.long	32
+	.ident	"GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
+	.section	.note.GNU-stack,"",@progbits
diff --git a/kernel/power/icamax_power9.S b/kernel/power/icamax_power9.S
new file mode 100644
index 000000000..2968b3f8b
--- /dev/null
+++ b/kernel/power/icamax_power9.S
@@ -0,0 +1,387 @@
+	.file	"icamax.c"
+	.abiversion 2
+	.section	".text"
+	.align 2
+	.p2align 4,,15
+	.globl icamax_k
+	.type	icamax_k, @function
+icamax_k:
+.LCF0:
+0:	addis 2,12,.TOC.-.LCF0@ha
+	addi 2,2,.TOC.-.LCF0@l
+	.localentry	icamax_k,.-icamax_k
+	mr. 9,3
+	ble 0,.L25
+	cmpdi 7,5,0
+	li 3,0
+	blelr 7
+	cmpdi 7,5,1
+	beq 7,.L53
+	lfs 11,0(4)
+	lfs 0,4(4)
+	cmpdi 7,9,1
+	fabs 11,11
+	fabs 0,0
+	fadds 11,11,0
+	beq 7,.L29
+	addi 9,9,-1
+	sldi 5,5,3
+	li 3,0
+	mtctr 9
+	add 4,4,5
+	li 9,1
+	.p2align 4,,15
+.L24:
+	lfs 0,4(4)
+	lfs 12,0(4)
+	add 4,4,5
+	fabs 0,0
+	fabs 12,12
+	fadds 0,0,12
+	fcmpu 7,0,11
+	bng 7,.L23
+	fmr 11,0
+	mr 3,9
+.L23:
+	addi 9,9,1
+	bdnz .L24
+.L51:
+	addi 3,3,1
+	blr
+	.p2align 4,,15
+.L25:
+	li 3,0
+	blr
+	.p2align 4,,15
+.L53:
+	rldicr. 8,9,0,58
+	bne 0,.L54
+	addi 7,8,1
+	li 10,0
+	subf 6,8,9
+	li 3,0
+	xxlxor 11,11,11
+	cmpd 7,7,9
+	sldi 10,10,2
+	mtctr 6
+	add 4,4,10
+	bgt 7,.L43
+	li 10,-1
+	rldicr 10,10,0,0
+	cmpd 7,9,10
+	beq 7,.L43
+	.p2align 4,,15
+.L44:
+	lfs 0,4(4)
+	lfs 12,0(4)
+	addi 4,4,8
+	fabs 0,0
+	fabs 12,12
+	fadds 0,0,12
+	fcmpu 7,0,11
+	bng 7,.L46
+	fmr 11,0
+	mr 3,8
+.L46:
+	addi 8,8,1
+	bdnz .L44
+	b .L51
+	.p2align 4,,15
+.L54:
+	addis 11,2,.LC2@toc@ha
+	addis 3,2,.LC3@toc@ha
+	addis 5,2,.LC6@toc@ha
+	addis 6,2,.LC7@toc@ha
+	xxspltib 47,0
+	addis 7,2,.LC4@toc@ha
+	addis 10,2,.LC5@toc@ha
+	stxv 58,-96(1)
+	stxv 59,-80(1)
+	addi 11,11,.LC2@toc@l
+	addi 3,3,.LC3@toc@l
+	addi 5,5,.LC6@toc@l
+	addi 6,6,.LC7@toc@l
+	stxv 62,-32(1)
+	stxv 63,-16(1)
+	xxspltib 58,16
+	addi 7,7,.LC4@toc@l
+	addi 10,10,.LC5@toc@l
+	xxspltib 59,32
+	lxv 44,0(11)
+	lxv 45,0(3)
+	xxspltib 48,0
+	lxv 62,0(5)
+	xxlor 46,47,47
+	lxv 63,0(6)
+	stxv 60,-64(1)
+	stxv 61,-48(1)
+	lxv 60,0(7)
+	lxv 61,0(10)
+	li 7,0
+	mr 10,4
+	vextsb2w 26,26
+	vextsb2w 27,27
+	stxv 56,-128(1)
+	stxv 57,-112(1)
+	.p2align 4,,15
+.L5:
+	lxv 0,0(10)
+	addi 7,7,32
+	addi 10,10,256
+	cmpd 7,8,7
+	xvabssp 34,0
+	lxv 0,-240(10)
+	xvabssp 42,0
+	lxv 0,-224(10)
+	xvabssp 49,0
+	lxv 0,-208(10)
+	vpermr 25,10,2,12
+	vpermr 2,10,2,13
+	xvabssp 35,0
+	lxv 0,-192(10)
+	xvaddsp 34,57,34
+	xvabssp 36,0
+	lxv 0,-176(10)
+	vpermr 10,3,17,12
+	vpermr 3,3,17,13
+	xvabssp 33,0
+	lxv 0,-160(10)
+	xvaddsp 10,42,35
+	xvabssp 50,0
+	lxv 0,-144(10)
+	vpermr 17,1,4,12
+	vpermr 4,1,4,13
+	xvabssp 37,0
+	lxv 0,-128(10)
+	xvaddsp 36,49,36
+	xvabssp 38,0
+	lxv 0,-112(10)
+	vpermr 1,5,18,12
+	vpermr 5,5,18,13
+	xvabssp 43,0
+	lxv 0,-96(10)
+	xvaddsp 12,33,37
+	xvabssp 51,0
+	lxv 0,-80(10)
+	vpermr 18,11,6,12
+	vpermr 6,11,6,13
+	xvabssp 39,0
+	lxv 0,-64(10)
+	xvaddsp 38,50,38
+	xvabssp 40,0
+	lxv 0,-48(10)
+	vpermr 11,7,19,12
+	vpermr 7,7,19,13
+	xvabssp 32,0
+	lxv 0,-32(10)
+	xvaddsp 11,43,39
+	xvcmpgtsp 39,10,34
+	xvcmpgtsp 43,12,36
+	xvabssp 56,0
+	lxv 0,-16(10)
+	vpermr 19,0,8,12
+	vpermr 8,0,8,13
+	xxsel 10,34,10,39
+	xxsel 12,36,12,43
+	xxsel 39,60,61,39
+	xxsel 43,62,63,43
+	xvabssp 41,0
+	xvaddsp 40,51,40
+	vpermr 0,9,24,12
+	vpermr 9,9,24,13
+	xvaddsp 0,32,41
+	xvcmpgtsp 41,11,38
+	xvcmpgtsp 32,12,10
+	xvcmpgtsp 42,0,40
+	xxsel 11,38,11,41
+	xxsel 12,10,12,32
+	xxsel 43,39,43,32
+	xxsel 41,60,61,41
+	xxsel 0,40,0,42
+	xxsel 42,62,63,42
+	xvcmpgtsp 33,0,11
+	xxsel 0,11,0,33
+	xxsel 33,41,42,33
+	xvcmpgtsp 32,0,12
+	vadduwm 1,1,26
+	xxsel 0,12,0,32
+	xxsel 32,43,33,32
+	xvcmpgtsp 33,0,48
+	vadduwm 0,14,0
+	vadduwm 14,14,27
+	xxsel 47,47,32,33
+	xxsel 48,48,0,33
+	bgt 7,.L5
+	xxsldwi 11,48,48,3
+	xxsldwi 12,48,48,2
+	li 10,0
+	li 3,12
+	xxsldwi 0,48,48,1
+	xscvspdp 48,48
+	vextuwrx 6,10,15
+	li 10,4
+	xscvspdp 11,11
+	xscvspdp 12,12
+	xscvspdp 0,0
+	vextuwrx 5,10,15
+	li 10,8
+	vextuwrx 7,10,15
+	vextuwrx 10,3,15
+	rldicl 12,5,0,32
+	rldicl 3,6,0,32
+	rldicl 11,7,0,32
+	rldicl 0,10,0,32
+	fcmpu 7,11,12
+	fmr 10,0
+	beq 7,.L55
+	bnl 7,.L8
+	mr 3,12
+	fmr 11,12
+.L8:
+	xscmpudp 7,0,48
+	bne 7,.L11
+	cmplw 7,7,10
+	ble 7,.L12
+	mr 7,10
+.L12:
+	rldicl 11,7,0,32
+.L13:
+	fcmpu 7,11,10
+	beq 7,.L56
+	bnl 7,.L17
+	mr 3,11
+	fmr 11,10
+.L17:
+	cmpd 7,9,8
+	ble 7,.L19
+	addi 7,8,1
+	sldi 10,8,1
+	subf 6,8,9
+	cmpd 7,7,9
+	sldi 10,10,2
+	mtctr 6
+	add 4,4,10
+	bgt 7,.L37
+	li 10,-1
+	rldicr 10,10,0,0
+	cmpd 7,9,10
+	beq 7,.L37
+	.p2align 4,,15
+.L21:
+	lfs 0,4(4)
+	lfs 12,0(4)
+	addi 4,4,8
+	fabs 0,0
+	fabs 12,12
+	fadds 0,0,12
+	fcmpu 7,0,11
+	bng 7,.L20
+	fmr 11,0
+	mr 3,8
+.L20:
+	addi 8,8,1
+	bdnz .L21
+.L19:
+	lxv 56,-128(1)
+	lxv 57,-112(1)
+	addi 3,3,1
+	lxv 58,-96(1)
+	lxv 59,-80(1)
+	lxv 60,-64(1)
+	lxv 61,-48(1)
+	lxv 62,-32(1)
+	lxv 63,-16(1)
+	blr
+	.p2align 4,,15
+.L55:
+	cmplw 7,6,5
+	ble 7,.L7
+	mr 6,5
+.L7:
+	rldicl 3,6,0,32
+	b .L8
+	.p2align 4,,15
+.L29:
+	li 3,1
+	blr
+	.p2align 4,,15
+.L11:
+	bnl 7,.L13
+	mr 11,0
+	xscpsgndp 10,48,48
+	b .L13
+	.p2align 4,,15
+.L56:
+	cmpd 7,3,11
+	ble 7,.L17
+	mr 3,11
+	b .L17
+.L37:
+	li 9,1
+	mtctr 9
+	b .L21
+.L43:
+	li 9,1
+	mtctr 9
+	b .L44
+	.long 0
+	.byte 0,0,0,0,0,0,0,0
+	.size	icamax_k,.-icamax_k
+	.section	.rodata.cst16,"aM",@progbits,16
+	.align 4
+.LC2:
+	.byte	0
+	.byte	1
+	.byte	2
+	.byte	3
+	.byte	8
+	.byte	9
+	.byte	10
+	.byte	11
+	.byte	16
+	.byte	17
+	.byte	18
+	.byte	19
+	.byte	24
+	.byte	25
+	.byte	26
+	.byte	27
+.LC3:
+	.byte	4
+	.byte	5
+	.byte	6
+	.byte	7
+	.byte	12
+	.byte	13
+	.byte	14
+	.byte	15
+	.byte	20
+	.byte	21
+	.byte	22
+	.byte	23
+	.byte	28
+	.byte	29
+	.byte	30
+	.byte	31
+.LC4:
+	.long	0
+	.long	1
+	.long	2
+	.long	3
+.LC5:
+	.long	4
+	.long	5
+	.long	6
+	.long	7
+.LC6:
+	.long	8
+	.long	9
+	.long	10
+	.long	11
+.LC7:
+	.long	12
+	.long	13
+	.long	14
+	.long	15
+	.ident	"GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
+	.section	.note.GNU-stack,"",@progbits
diff --git a/kernel/power/icamin_power8.S b/kernel/power/icamin_power8.S
new file mode 100644
index 000000000..e3d66798e
--- /dev/null
+++ b/kernel/power/icamin_power8.S
@@ -0,0 +1,454 @@
+/*	.file	"icamin.c"
+	.abiversion 2
+	.section	".text"
+	.align 2
+	.p2align 4,,15
+	.globl icamin_k
+	.type	icamin_k, @function
+*/
+#define ASSEMBLER
+#include "common.h"
+
+	PROLOGUE
+
+icamin_k:
+.LCF0:
+0:	addis 2,12,.TOC.-.LCF0@ha
+	addi 2,2,.TOC.-.LCF0@l
+	.localentry	icamin_k,.-icamin_k
+	mr. 9,3
+	ble 0,.L25
+	cmpdi 7,5,0
+	li 3,0
+	blelr 7
+	lfs 11,0(4)
+	lfs 0,4(4)
+	cmpdi 7,5,1
+	fabs 11,11
+	fabs 0,0
+	fadds 11,11,0
+	beq 7,.L54
+	cmpdi 7,9,1
+	beq 7,.L29
+	addi 9,9,-1
+	sldi 5,5,3
+	mtctr 9
+	add 4,4,5
+	li 3,0
+	li 9,1
+	.p2align 4,,15
+.L24:
+	lfs 0,4(4)
+	lfs 12,0(4)
+	add 4,4,5
+	fabs 0,0
+	fabs 12,12
+	fadds 0,0,12
+	fcmpu 7,0,11
+	bnl 7,.L23
+	fmr 11,0
+	mr 3,9
+.L23:
+	addi 9,9,1
+	bdnz .L24
+.L52:
+	addi 3,3,1
+	blr
+	.p2align 4,,15
+.L25:
+	li 3,0
+	blr
+	.p2align 4,,15
+.L54:
+	rldicr. 8,9,0,58
+	bne 0,.L55
+	addi 7,8,1
+	li 10,0
+	cmpd 7,7,9
+	sldi 10,10,2
+	add 4,4,10
+	subf 10,8,9
+	mtctr 10
+	li 3,0
+	bgt 7,.L43
+	li 10,-1
+	rldicr 10,10,0,0
+	cmpd 7,9,10
+	beq 7,.L43
+	.p2align 4,,15
+.L44:
+	lfs 0,0(4)
+	lfs 12,4(4)
+	addi 4,4,8
+	fabs 0,0
+	fabs 12,12
+	fadds 0,0,12
+	fcmpu 7,11,0
+	bng 7,.L46
+	fmr 11,0
+	mr 3,8
+.L46:
+	addi 8,8,1
+	bdnz .L44
+	b .L52
+	.p2align 4,,15
+.L55:
+	li 0,-128
+	std 31,-8(1)
+	addis 5,2,.LC2@toc@ha
+	xscvdpspn 11,11
+	vspltisw 19,0
+	addis 6,2,.LC3@toc@ha
+	addi 5,5,.LC2@toc@l
+	stvx 25,1,0
+	li 0,-112
+	addi 6,6,.LC3@toc@l
+	xxlor 50,51,51
+	addis 7,2,.LC4@toc@ha
+	lxvd2x 44,0,5
+	addis 10,2,.LC5@toc@ha
+	stvx 26,1,0
+	li 0,-96
+	addi 7,7,.LC4@toc@l
+	lxvd2x 45,0,6
+	addis 5,2,.LC6@toc@ha
+	addis 6,2,.LC7@toc@ha
+	stvx 27,1,0
+	li 0,-80
+	addi 10,10,.LC5@toc@l
+	xxspltw 5,11,0
+	addi 6,6,.LC7@toc@l
+	addi 5,5,.LC6@toc@l
+	stvx 28,1,0
+	li 0,-64
+	lxvd2x 47,0,10
+	xxpermdi 44,44,44,2
+	mr 10,4
+	lxvd2x 49,0,6
+	lxvd2x 48,0,5
+	xxpermdi 45,45,45,2
+	li 6,0
+	stvx 29,1,0
+	li 0,-48
+	xxlnand 44,44,44
+	xxlnand 45,45,45
+	stvx 30,1,0
+	lxvd2x 62,0,7
+	addis 7,2,.LC8@toc@ha
+	li 0,-32
+	addi 7,7,.LC8@toc@l
+	xxpermdi 47,47,47,2
+	stvx 31,1,0
+	vspltisw 31,8
+	xxpermdi 48,48,48,2
+	lxvd2x 46,0,7
+	vadduwm 31,31,31
+	xxpermdi 49,49,49,2
+	xxpermdi 62,62,62,2
+	.p2align 4,,15
+.L5:
+	addi 3,10,16
+	addi 5,10,32
+	lxvd2x 34,0,10
+	addi 7,10,64
+	addi 31,10,48
+	addi 12,10,80
+	addi 11,10,96
+	lxvd2x 36,0,3
+	lxvd2x 37,0,5
+	addi 3,10,112
+	addi 5,10,128
+	lxvd2x 38,0,7
+	lxvd2x 6,0,31
+	addi 7,10,160
+	addi 31,10,144
+	lxvd2x 33,0,12
+	lxvd2x 39,0,11
+	addi 12,10,176
+	addi 11,10,192
+	lxvd2x 7,0,3
+	lxvd2x 40,0,5
+	xxpermdi 34,34,34,2
+	addi 3,10,208
+	addi 5,10,224
+	lxvd2x 41,0,7
+	lxvd2x 8,0,31
+	addi 7,10,240
+	lxvd2x 9,0,12
+	lxvd2x 42,0,11
+	xxpermdi 37,37,37,2
+	xxpermdi 36,36,36,2
+	addi 6,6,32
+	lxvd2x 32,0,3
+	lxvd2x 43,0,5
+	xxpermdi 6,6,6,2
+	xxpermdi 38,38,38,2
+	cmpd 7,8,6
+	addi 10,10,256
+	lxvd2x 10,0,7
+	xxpermdi 39,39,39,2
+	xxpermdi 33,33,33,2
+	xxpermdi 40,40,40,2
+	xxpermdi 7,7,7,2
+	xxpermdi 41,41,41,2
+	xxpermdi 8,8,8,2
+	xxpermdi 9,9,9,2
+	xxpermdi 42,42,42,2
+	xxpermdi 43,43,43,2
+	xxpermdi 32,32,32,2
+	xxpermdi 10,10,10,2
+	xvabssp 58,37
+	xvabssp 59,39
+	xvabssp 35,40
+	xvabssp 60,41
+	xvabssp 34,34
+	xvabssp 33,33
+	xvabssp 32,32
+	xvabssp 61,43
+	xvabssp 36,36
+	xvabssp 37,6
+	xvabssp 38,38
+	xvabssp 39,7
+	xvabssp 40,8
+	xvabssp 41,9
+	xvabssp 42,42
+	xvabssp 43,10
+	vperm 25,4,2,12
+	vperm 4,4,2,13
+	vperm 2,5,26,12
+	vperm 5,5,26,13
+	vperm 26,1,6,12
+	vperm 6,1,6,13
+	vperm 1,7,27,12
+	vperm 7,7,27,13
+	vperm 27,8,3,12
+	vperm 8,8,3,13
+	vperm 3,9,28,12
+	vperm 9,9,28,13
+	vperm 28,0,10,12
+	vperm 10,0,10,13
+	vperm 0,11,29,12
+	vperm 11,11,29,13
+	xvaddsp 12,33,39
+	xvaddsp 38,58,38
+	xvaddsp 0,32,43
+	xvaddsp 42,60,42
+	xvaddsp 36,57,36
+	xvaddsp 37,34,37
+	xvaddsp 40,59,40
+	xvaddsp 41,35,41
+	xvcmpgtsp 32,38,12
+	xvcmpgtsp 33,42,0
+	xvcmpgtsp 43,36,37
+	xvcmpgtsp 39,40,41
+	xxsel 12,38,12,32
+	xxsel 38,48,49,32
+	xxsel 0,42,0,33
+	xxsel 42,48,49,33
+	xxsel 37,36,37,43
+	xxsel 43,62,47,43
+	xxsel 41,40,41,39
+	xxsel 39,62,47,39
+	xvcmpgtsp 32,37,12
+	xvcmpgtsp 33,41,0
+	xxsel 12,37,12,32
+	xxsel 43,43,38,32
+	xxsel 0,41,0,33
+	xxsel 33,39,42,33
+	xvcmpgtsp 32,12,0
+	vadduwm 1,1,31
+	xxsel 0,12,0,32
+	xxsel 32,43,33,32
+	xvcmpgtsp 33,5,0
+	vadduwm 0,0,18
+	vadduwm 18,18,14
+	xxsel 51,51,32,33
+	xxsel 5,5,0,33
+	bgt 7,.L5
+	xxsldwi 11,5,5,3
+	xxsldwi 12,5,5,2
+	vspltw 0,19,3
+	xxsldwi 0,5,5,1
+	xscvspdp 11,11
+	xscvspdp 12,12
+	mfvsrwz 6,32
+	vspltw 0,19,2
+	xscvspdp 0,0
+	mfvsrwz 7,51
+	mfvsrwz 5,32
+	vspltw 0,19,0
+	xscvspdp 5,5
+	mfvsrwz 10,32
+	fcmpu 7,11,12
+	rldicl 3,6,0,32
+	fmr 10,0
+	rldicl 11,7,0,32
+	rldicl 31,5,0,32
+	rldicl 0,10,0,32
+	beq 7,.L56
+	bng 7,.L8
+	fmr 11,12
+	mr 3,31
+.L8:
+	fcmpu 7,0,5
+	bne 7,.L11
+	cmplw 7,7,10
+	ble 7,.L12
+	mr 7,10
+.L12:
+	rldicl 11,7,0,32
+.L13:
+	fcmpu 7,11,10
+	beq 7,.L57
+	bgt 7,.L58
+.L17:
+	cmpd 7,9,8
+	ble 7,.L19
+	addi 7,8,1
+	sldi 10,8,1
+	cmpd 7,7,9
+	sldi 10,10,2
+	add 4,4,10
+	subf 10,8,9
+	mtctr 10
+	bgt 7,.L37
+	li 10,-1
+	rldicr 10,10,0,0
+	cmpd 7,9,10
+	beq 7,.L37
+	.p2align 4,,15
+.L21:
+	lfs 0,0(4)
+	lfs 12,4(4)
+	addi 4,4,8
+	fabs 0,0
+	fabs 12,12
+	fadds 0,0,12
+	fcmpu 7,11,0
+	bng 7,.L20
+	fmr 11,0
+	mr 3,8
+.L20:
+	addi 8,8,1
+	bdnz .L21
+.L19:
+	li 0,-128
+	ld 31,-8(1)
+	addi 3,3,1
+	lvx 25,1,0
+	li 0,-112
+	lvx 26,1,0
+	li 0,-96
+	lvx 27,1,0
+	li 0,-80
+	lvx 28,1,0
+	li 0,-64
+	lvx 29,1,0
+	li 0,-48
+	lvx 30,1,0
+	li 0,-32
+	lvx 31,1,0
+	blr
+	.p2align 4,,15
+.L56:
+	cmplw 7,6,5
+	ble 7,.L7
+	mr 6,5
+.L7:
+	rldicl 3,6,0,32
+	b .L8
+	.p2align 4,,15
+.L29:
+	li 3,1
+	blr
+	.p2align 4,,15
+.L11:
+	bng 7,.L13
+	fmr 10,5
+	mr 11,0
+	b .L13
+	.p2align 4,,15
+.L57:
+	cmpd 7,3,11
+	ble 7,.L17
+	mr 3,11
+	b .L17
+	.p2align 4,,15
+.L58:
+	fmr 11,10
+	mr 3,11
+	b .L17
+.L43:
+	li 9,1
+	mtctr 9
+	b .L44
+.L37:
+	li 9,1
+	mtctr 9
+	b .L21
+	.long 0
+	.byte 0,0,0,0,0,1,0,0
+	.size	icamin_k,.-icamin_k
+	.section	.rodata.cst16,"aM",@progbits,16
+	.align 4
+.LC2:
+	.byte	0
+	.byte	1
+	.byte	2
+	.byte	3
+	.byte	8
+	.byte	9
+	.byte	10
+	.byte	11
+	.byte	16
+	.byte	17
+	.byte	18
+	.byte	19
+	.byte	24
+	.byte	25
+	.byte	26
+	.byte	27
+.LC3:
+	.byte	4
+	.byte	5
+	.byte	6
+	.byte	7
+	.byte	12
+	.byte	13
+	.byte	14
+	.byte	15
+	.byte	20
+	.byte	21
+	.byte	22
+	.byte	23
+	.byte	28
+	.byte	29
+	.byte	30
+	.byte	31
+.LC4:
+	.long	0
+	.long	1
+	.long	2
+	.long	3
+.LC5:
+	.long	4
+	.long	5
+	.long	6
+	.long	7
+.LC6:
+	.long	8
+	.long	9
+	.long	10
+	.long	11
+.LC7:
+	.long	12
+	.long	13
+	.long	14
+	.long	15
+.LC8:
+	.long	32
+	.long	32
+	.long	32
+	.long	32
+	.ident	"GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
+	.section	.note.GNU-stack,"",@progbits
diff --git a/kernel/power/icamin_power9.S b/kernel/power/icamin_power9.S
new file mode 100644
index 000000000..8eaa79f33
--- /dev/null
+++ b/kernel/power/icamin_power9.S
@@ -0,0 +1,385 @@
+	.file	"icamin.c"
+	.abiversion 2
+	.section	".text"
+	.align 2
+	.p2align 4,,15
+	.globl icamin_k
+	.type	icamin_k, @function
+icamin_k:
+.LCF0:
+0:	addis 2,12,.TOC.-.LCF0@ha
+	addi 2,2,.TOC.-.LCF0@l
+	.localentry	icamin_k,.-icamin_k
+	mr. 9,3
+	ble 0,.L25
+	cmpdi 7,5,0
+	li 3,0
+	blelr 7
+	lfs 11,0(4)
+	lfs 0,4(4)
+	cmpdi 7,5,1
+	fabs 11,11
+	fabs 0,0
+	fadds 11,11,0
+	beq 7,.L53
+	cmpdi 7,9,1
+	beq 7,.L29
+	addi 9,9,-1
+	sldi 5,5,3
+	li 3,0
+	mtctr 9
+	add 4,4,5
+	li 9,1
+	.p2align 4,,15
+.L24:
+	lfs 0,4(4)
+	lfs 12,0(4)
+	add 4,4,5
+	fabs 0,0
+	fabs 12,12
+	fadds 0,0,12
+	fcmpu 7,0,11
+	bnl 7,.L23
+	fmr 11,0
+	mr 3,9
+.L23:
+	addi 9,9,1
+	bdnz .L24
+.L51:
+	addi 3,3,1
+	blr
+	.p2align 4,,15
+.L25:
+	li 3,0
+	blr
+	.p2align 4,,15
+.L53:
+	rldicr. 8,9,0,58
+	bne 0,.L54
+	addi 7,8,1
+	li 10,0
+	subf 6,8,9
+	li 3,0
+	cmpd 7,7,9
+	sldi 10,10,2
+	mtctr 6
+	add 4,4,10
+	bgt 7,.L43
+	li 10,-1
+	rldicr 10,10,0,0
+	cmpd 7,9,10
+	beq 7,.L43
+	.p2align 4,,15
+.L44:
+	lfs 0,0(4)
+	lfs 12,4(4)
+	addi 4,4,8
+	fabs 0,0
+	fabs 12,12
+	fadds 0,0,12
+	fcmpu 7,11,0
+	bng 7,.L46
+	fmr 11,0
+	mr 3,8
+.L46:
+	addi 8,8,1
+	bdnz .L44
+	b .L51
+	.p2align 4,,15
+.L54:
+	xscvdpspn 9,11
+	addis 11,2,.LC2@toc@ha
+	addis 3,2,.LC3@toc@ha
+	addis 5,2,.LC6@toc@ha
+	addis 6,2,.LC7@toc@ha
+	addis 7,2,.LC4@toc@ha
+	addis 10,2,.LC5@toc@ha
+	xxspltib 48,0
+	addi 11,11,.LC2@toc@l
+	addi 3,3,.LC3@toc@l
+	addi 5,5,.LC6@toc@l
+	stxv 59,-80(1)
+	addi 6,6,.LC7@toc@l
+	stxv 60,-64(1)
+	stxv 63,-16(1)
+	addi 7,7,.LC4@toc@l
+	xxspltib 59,16
+	lxv 44,0(11)
+	xxspltib 60,32
+	lxv 45,0(3)
+	lxv 63,0(5)
+	xxlor 47,48,48
+	lxv 46,0(6)
+	addi 10,10,.LC5@toc@l
+	stxv 61,-48(1)
+	stxv 62,-32(1)
+	xxspltw 9,9,0
+	lxv 61,0(7)
+	lxv 62,0(10)
+	li 7,0
+	mr 10,4
+	vextsb2w 27,27
+	vextsb2w 28,28
+	stxv 57,-112(1)
+	stxv 58,-96(1)
+	.p2align 4,,15
+.L5:
+	lxv 0,0(10)
+	addi 7,7,32
+	addi 10,10,256
+	cmpd 7,8,7
+	xvabssp 34,0
+	lxv 0,-240(10)
+	xvabssp 42,0
+	lxv 0,-224(10)
+	xvabssp 49,0
+	lxv 0,-208(10)
+	vpermr 26,10,2,12
+	vpermr 2,10,2,13
+	xvabssp 35,0
+	lxv 0,-192(10)
+	xvaddsp 34,58,34
+	xvabssp 36,0
+	lxv 0,-176(10)
+	vpermr 10,3,17,12
+	vpermr 3,3,17,13
+	xvabssp 33,0
+	lxv 0,-160(10)
+	xvaddsp 10,42,35
+	xvabssp 50,0
+	lxv 0,-144(10)
+	vpermr 17,1,4,12
+	vpermr 4,1,4,13
+	xvabssp 37,0
+	lxv 0,-128(10)
+	xvaddsp 36,49,36
+	xvabssp 38,0
+	lxv 0,-112(10)
+	vpermr 1,5,18,12
+	vpermr 5,5,18,13
+	xvabssp 43,0
+	lxv 0,-96(10)
+	xvaddsp 12,33,37
+	xvabssp 51,0
+	lxv 0,-80(10)
+	vpermr 18,11,6,12
+	vpermr 6,11,6,13
+	xvabssp 39,0
+	lxv 0,-64(10)
+	xvaddsp 38,50,38
+	xvabssp 40,0
+	lxv 0,-48(10)
+	vpermr 11,7,19,12
+	vpermr 7,7,19,13
+	xvabssp 32,0
+	lxv 0,-32(10)
+	xvaddsp 11,43,39
+	xvcmpgtsp 39,34,10
+	xvcmpgtsp 43,36,12
+	xvabssp 57,0
+	lxv 0,-16(10)
+	vpermr 19,0,8,12
+	vpermr 8,0,8,13
+	xxsel 10,34,10,39
+	xxsel 12,36,12,43
+	xxsel 39,61,62,39
+	xxsel 43,63,46,43
+	xvabssp 41,0
+	xvaddsp 40,51,40
+	vpermr 0,9,25,12
+	vpermr 9,9,25,13
+	xvaddsp 0,32,41
+	xvcmpgtsp 41,38,11
+	xvcmpgtsp 32,10,12
+	xvcmpgtsp 42,40,0
+	xxsel 11,38,11,41
+	xxsel 12,10,12,32
+	xxsel 43,39,43,32
+	xxsel 41,61,62,41
+	xxsel 0,40,0,42
+	xxsel 42,63,46,42
+	xvcmpgtsp 33,11,0
+	xxsel 0,11,0,33
+	xxsel 33,41,42,33
+	xvcmpgtsp 32,12,0
+	vadduwm 1,1,27
+	xxsel 0,12,0,32
+	xxsel 32,43,33,32
+	xvcmpgtsp 33,9,0
+	vadduwm 0,0,15
+	vadduwm 15,15,28
+	xxsel 48,48,32,33
+	xxsel 9,9,0,33
+	bgt 7,.L5
+	xxsldwi 11,9,9,3
+	xxsldwi 12,9,9,2
+	li 10,0
+	li 3,12
+	xxsldwi 0,9,9,1
+	xscvspdp 9,9
+	vextuwrx 6,10,16
+	li 10,4
+	xscvspdp 11,11
+	xscvspdp 12,12
+	xscvspdp 0,0
+	vextuwrx 5,10,16
+	li 10,8
+	vextuwrx 7,10,16
+	vextuwrx 10,3,16
+	rldicl 12,5,0,32
+	rldicl 3,6,0,32
+	rldicl 11,7,0,32
+	rldicl 0,10,0,32
+	fcmpu 7,11,12
+	fmr 10,0
+	beq 7,.L55
+	bng 7,.L8
+	mr 3,12
+	fmr 11,12
+.L8:
+	fcmpu 7,0,9
+	bne 7,.L11
+	cmplw 7,7,10
+	ble 7,.L12
+	mr 7,10
+.L12:
+	rldicl 11,7,0,32
+.L13:
+	fcmpu 7,11,10
+	beq 7,.L56
+	bng 7,.L17
+	mr 3,11
+	fmr 11,10
+.L17:
+	cmpd 7,9,8
+	ble 7,.L19
+	addi 7,8,1
+	sldi 10,8,1
+	subf 6,8,9
+	cmpd 7,7,9
+	sldi 10,10,2
+	mtctr 6
+	add 4,4,10
+	bgt 7,.L37
+	li 10,-1
+	rldicr 10,10,0,0
+	cmpd 7,9,10
+	beq 7,.L37
+	.p2align 4,,15
+.L21:
+	lfs 0,0(4)
+	lfs 12,4(4)
+	addi 4,4,8
+	fabs 0,0
+	fabs 12,12
+	fadds 0,0,12
+	fcmpu 7,11,0
+	bng 7,.L20
+	fmr 11,0
+	mr 3,8
+.L20:
+	addi 8,8,1
+	bdnz .L21
+.L19:
+	lxv 57,-112(1)
+	lxv 58,-96(1)
+	addi 3,3,1
+	lxv 59,-80(1)
+	lxv 60,-64(1)
+	lxv 61,-48(1)
+	lxv 62,-32(1)
+	lxv 63,-16(1)
+	blr
+	.p2align 4,,15
+.L55:
+	cmplw 7,6,5
+	ble 7,.L7
+	mr 6,5
+.L7:
+	rldicl 3,6,0,32
+	b .L8
+	.p2align 4,,15
+.L29:
+	li 3,1
+	blr
+	.p2align 4,,15
+.L11:
+	bng 7,.L13
+	mr 11,0
+	fmr 10,9
+	b .L13
+	.p2align 4,,15
+.L56:
+	cmpd 7,3,11
+	ble 7,.L17
+	mr 3,11
+	b .L17
+.L37:
+	li 9,1
+	mtctr 9
+	b .L21
+.L43:
+	li 9,1
+	mtctr 9
+	b .L44
+	.long 0
+	.byte 0,0,0,0,0,0,0,0
+	.size	icamin_k,.-icamin_k
+	.section	.rodata.cst16,"aM",@progbits,16
+	.align 4
+.LC2:
+	.byte	0
+	.byte	1
+	.byte	2
+	.byte	3
+	.byte	8
+	.byte	9
+	.byte	10
+	.byte	11
+	.byte	16
+	.byte	17
+	.byte	18
+	.byte	19
+	.byte	24
+	.byte	25
+	.byte	26
+	.byte	27
+.LC3:
+	.byte	4
+	.byte	5
+	.byte	6
+	.byte	7
+	.byte	12
+	.byte	13
+	.byte	14
+	.byte	15
+	.byte	20
+	.byte	21
+	.byte	22
+	.byte	23
+	.byte	28
+	.byte	29
+	.byte	30
+	.byte	31
+.LC4:
+	.long	0
+	.long	1
+	.long	2
+	.long	3
+.LC5:
+	.long	4
+	.long	5
+	.long	6
+	.long	7
+.LC6:
+	.long	8
+	.long	9
+	.long	10
+	.long	11
+.LC7:
+	.long	12
+	.long	13
+	.long	14
+	.long	15
+	.ident	"GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
+	.section	.note.GNU-stack,"",@progbits
diff --git a/kernel/power/isamax_power8.S b/kernel/power/isamax_power8.S
new file mode 100644
index 000000000..c8fcaecc3
--- /dev/null
+++ b/kernel/power/isamax_power8.S
@@ -0,0 +1,434 @@
+/*	.file	"isamax.c"
+	.abiversion 2
+	.section	".text"
+	.align 2
+	.p2align 4,,15
+	.globl isamax_k
+	.type	isamax_k, @function
+*/
+
+#define ASSEMBLER
+#include "common.h"
+
+	PROLOGUE
+
+isamax_k:
+.LCF0:
+0:	addis 2,12,.TOC.-.LCF0@ha
+	addi 2,2,.TOC.-.LCF0@l
+	.localentry	isamax_k,.-isamax_k
+	mr. 11,3
+	ble 0,.L36
+	cmpdi 7,5,0
+	li 3,0
+	blelr 7
+	cmpdi 7,5,1
+	beq 7,.L69
+	rldicr. 7,11,0,61
+	beq 0,.L40
+	sldi 3,5,1
+	xxlxor 0,0,0
+	sldi 6,5,2
+	add 3,3,5
+	sldi 0,5,4
+	sldi 3,3,2
+	sldi 5,5,3
+	mr 9,4
+	li 8,0
+	li 10,0
+	.p2align 4,,15
+.L31:
+	lfs 12,0(9)
+	fabs 12,12
+	fcmpu 7,12,0
+	bng 7,.L23
+	fmr 0,12
+	mr 8,10
+.L23:
+	lfsx 12,9,6
+	fabs 12,12
+	fcmpu 7,12,0
+	bng 7,.L25
+	fmr 0,12
+	addi 8,10,1
+.L25:
+	lfsx 12,9,5
+	fabs 12,12
+	fcmpu 7,12,0
+	bng 7,.L27
+	fmr 0,12
+	addi 8,10,2
+.L27:
+	lfsx 12,9,3
+	add 9,9,0
+	fabs 12,12
+	fcmpu 7,12,0
+	bng 7,.L29
+	fmr 0,12
+	addi 8,10,3
+.L29:
+	addi 10,10,4
+	cmpd 7,7,10
+	bgt 7,.L31
+	addi 7,7,-1
+	srdi 7,7,2
+	addi 7,7,1
+	sldi 9,7,2
+	mulld 7,6,7
+	cmpd 7,11,9
+	ble 7,.L67
+.L22:
+	addi 10,9,1
+	sldi 7,7,2
+	cmpd 7,10,11
+	subf 10,9,11
+	mtctr 10
+	add 4,4,7
+	bgt 7,.L54
+	li 3,-1
+	rldicr 3,3,0,0
+	cmpd 7,11,3
+	beq 7,.L54
+	.p2align 4,,15
+.L35:
+	lfs 12,0(4)
+	add 4,4,6
+	fabs 12,12
+	fcmpu 7,12,0
+	bng 7,.L33
+	fmr 0,12
+	mr 8,9
+.L33:
+	addi 9,9,1
+	bdnz .L35
+.L67:
+	addi 3,8,1
+	blr
+	.p2align 4,,15
+.L36:
+	li 3,0
+	blr
+	.p2align 4,,15
+.L69:
+	rldicr. 10,11,0,57
+	bne 0,.L70
+	addi 7,10,1
+	sldi 9,10,2
+	xxlxor 12,12,12
+	cmpd 7,7,11
+	add 4,4,9
+	subf 9,10,11
+	li 8,0
+	mtctr 9
+	bgt 7,.L60
+	li 3,-1
+	rldicr 3,3,0,0
+	cmpd 7,11,3
+	beq 7,.L60
+	.p2align 4,,15
+.L61:
+	lfs 0,0(4)
+	addi 4,4,4
+	fabs 0,0
+	fcmpu 7,0,12
+	bng 7,.L63
+	fmr 12,0
+	mr 8,10
+.L63:
+	addi 10,10,1
+	bdnz .L61
+	b .L67
+	.p2align 4,,15
+.L70:
+	li 0,-64
+	std 31,-8(1)
+	addis 3,2,.LC2@toc@ha
+	vspltisw 18,0
+	vspltisw 12,0
+	addis 5,2,.LC3@toc@ha
+	addis 6,2,.LC6@toc@ha
+	stvx 29,1,0
+	li 0,-48
+	addis 8,2,.LC7@toc@ha
+	xxlor 35,50,50
+	addi 3,3,.LC2@toc@l
+	addi 5,5,.LC3@toc@l
+	stvx 30,1,0
+	addi 6,6,.LC6@toc@l
+	li 0,-32
+	addi 8,8,.LC7@toc@l
+	lxvd2x 51,0,3
+	lxvd2x 34,0,5
+	addis 7,2,.LC4@toc@ha
+	stvx 31,1,0
+	lxvd2x 47,0,6
+	addis 9,2,.LC5@toc@ha
+	addi 7,7,.LC4@toc@l
+	lxvd2x 48,0,8
+	addi 9,9,.LC5@toc@l
+	vspltisw 17,8
+	vadduwm 17,17,17
+	lxvd2x 36,0,7
+	li 7,0
+	lxvd2x 37,0,9
+	mr 9,4
+	.p2align 4,,15
+.L5:
+	addi 5,9,16
+	addi 6,9,32
+	lxvd2x 41,0,9
+	vadduwm 31,3,15
+	addi 8,9,64
+	addi 31,9,48
+	addi 12,9,80
+	addi 3,9,96
+	lxvd2x 5,0,5
+	lxvd2x 43,0,6
+	addi 5,9,112
+	addi 6,9,128
+	lxvd2x 1,0,8
+	lxvd2x 9,0,31
+	addi 8,9,160
+	addi 31,9,144
+	lxvd2x 6,0,12
+	lxvd2x 13,0,3
+	addi 12,9,176
+	addi 3,9,192
+	lxvd2x 11,0,5
+	lxvd2x 2,0,6
+	xvabssp 41,41
+	addi 5,9,208
+	addi 6,9,224
+	lxvd2x 3,0,8
+	lxvd2x 7,0,31
+	addi 8,9,240
+	lxvd2x 10,0,12
+	lxvd2x 4,0,3
+	xvabssp 43,43
+	xvabssp 5,5
+	addi 7,7,64
+	lxvd2x 8,0,5
+	lxvd2x 0,0,6
+	xvabssp 9,9
+	xvabssp 1,1
+	cmpd 7,10,7
+	addi 9,9,256
+	lxvd2x 12,0,8
+	xvabssp 6,6
+	xvabssp 13,13
+	xvabssp 11,11
+	xvabssp 2,2
+	xvabssp 7,7
+	xvabssp 3,3
+	xvabssp 10,10
+	xvabssp 4,4
+	xvabssp 8,8
+	xvabssp 0,0
+	xvabssp 12,12
+	xvcmpgtsp 32,5,41
+	xvcmpgtsp 61,9,43
+	xvcmpgtsp 45,6,1
+	xvcmpgtsp 62,11,13
+	xvcmpgtsp 38,7,2
+	xvcmpgtsp 46,10,3
+	xvcmpgtsp 40,8,4
+	xvcmpgtsp 39,12,0
+	xxsel 5,41,5,32
+	xxsel 32,51,34,32
+	xxsel 9,43,9,61
+	xxsel 6,1,6,45
+	xxsel 11,13,11,62
+	xxsel 43,51,34,45
+	xxsel 7,2,7,38
+	xvcmpgtsp 41,9,5
+	xxsel 10,3,10,46
+	xvcmpgtsp 45,11,6
+	xxsel 8,4,8,40
+	xxsel 62,36,37,62
+	xxsel 0,0,12,39
+	xvcmpgtsp 42,10,7
+	xxsel 61,36,37,61
+	xxsel 40,51,34,40
+	xvcmpgtsp 33,0,8
+	xxsel 39,36,37,39
+	xxsel 38,51,34,38
+	xxsel 46,36,37,46
+	xxsel 9,5,9,41
+	xxsel 41,32,61,41
+	xxsel 12,6,11,45
+	xxsel 45,43,62,45
+	xxsel 11,7,10,42
+	xvcmpgtsp 32,12,9
+	vadduwm 13,13,17
+	xxsel 42,38,46,42
+	xxsel 0,8,0,33
+	xxsel 33,40,39,33
+	xvcmpgtsp 43,0,11
+	vadduwm 1,1,17
+	xxsel 12,9,12,32
+	xxsel 32,41,45,32
+	vadduwm 0,3,0
+	vadduwm 3,3,16
+	xxsel 0,11,0,43
+	xxsel 33,42,33,43
+	xvcmpgtsp 45,0,12
+	vadduwm 1,31,1
+	xxsel 0,12,0,45
+	xxsel 32,32,33,45
+	xvcmpgtsp 33,0,44
+	xxsel 50,50,32,33
+	xxsel 44,44,0,33
+	bgt 7,.L5
+	xxsldwi 12,44,44,1
+	xscvspdp 10,44
+	vspltw 0,18,0
+	xxsldwi 0,44,44,3
+	xscvspdp 12,12
+	mfvsrwz 3,50
+	mfvsrwz 6,32
+	vspltw 0,18,3
+	xscvspdp 0,0
+	xxsldwi 44,44,44,2
+	mfvsrwz 7,32
+	vspltw 0,18,2
+	xscvspdp 44,44
+	mfvsrwz 9,32
+	fcmpu 7,12,10
+	rldicl 8,3,0,32
+	rldicl 31,6,0,32
+	fmr 11,0
+	rldicl 0,7,0,32
+	rldicl 5,9,0,32
+	beq 7,.L71
+	bnl 7,.L8
+	fmr 12,10
+	mr 8,31
+.L8:
+	xscmpudp 7,0,44
+	bne 7,.L11
+	cmplw 7,7,9
+	ble 7,.L12
+	mr 7,9
+.L12:
+	rldicl 5,7,0,32
+.L13:
+	fcmpu 7,12,11
+	beq 7,.L72
+	bnl 7,.L17
+	fmr 12,11
+	mr 8,5
+.L17:
+	cmpd 7,11,10
+	ble 7,.L16
+	addi 7,10,1
+	sldi 9,10,2
+	cmpd 7,7,11
+	add 4,4,9
+	subf 9,10,11
+	mtctr 9
+	bgt 7,.L53
+	li 3,-1
+	rldicr 3,3,0,0
+	cmpd 7,11,3
+	beq 7,.L53
+	.p2align 4,,15
+.L21:
+	lfs 0,0(4)
+	addi 4,4,4
+	fabs 0,0
+	fcmpu 7,0,12
+	bng 7,.L19
+	fmr 12,0
+	mr 8,10
+.L19:
+	addi 10,10,1
+	bdnz .L21
+.L16:
+	li 0,-64
+	ld 31,-8(1)
+	addi 3,8,1
+	lvx 29,1,0
+	li 0,-48
+	lvx 30,1,0
+	li 0,-32
+	lvx 31,1,0
+	blr
+	.p2align 4,,15
+.L71:
+	cmplw 7,3,6
+	ble 7,.L7
+	mr 3,6
+.L7:
+	rldicl 8,3,0,32
+	b .L8
+	.p2align 4,,15
+.L40:
+	xxlxor 0,0,0
+	sldi 6,5,2
+	li 8,0
+	li 9,0
+	b .L22
+	.p2align 4,,15
+.L11:
+	blt 7,.L39
+	mr 5,0
+	b .L13
+	.p2align 4,,15
+.L72:
+	cmpd 7,8,5
+	ble 7,.L17
+	mr 8,5
+	b .L17
+	.p2align 4,,15
+.L39:
+	xscpsgndp 11,44,44
+	b .L13
+.L53:
+	li 9,1
+	mtctr 9
+	b .L21
+.L54:
+	li 10,1
+	mtctr 10
+	b .L35
+.L60:
+	li 9,1
+	mtctr 9
+	b .L61
+	.long 0
+	.byte 0,0,0,0,0,1,0,0
+	.size	isamax_k,.-isamax_k
+	.section	.rodata.cst16,"aM",@progbits,16
+	.align 4
+.LC2:
+	.long	0
+	.long	1
+	.long	2
+	.long	3
+.LC3:
+	.long	4
+	.long	5
+	.long	6
+	.long	7
+.LC4:
+	.long	8
+	.long	9
+	.long	10
+	.long	11
+.LC5:
+	.long	12
+	.long	13
+	.long	14
+	.long	15
+.LC6:
+	.long	32
+	.long	32
+	.long	32
+	.long	32
+.LC7:
+	.long	64
+	.long	64
+	.long	64
+	.long	64
+	.ident	"GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
+	.section	.note.GNU-stack,"",@progbits
diff --git a/kernel/power/isamax_power9.S b/kernel/power/isamax_power9.S
new file mode 100644
index 000000000..9df1e773c
--- /dev/null
+++ b/kernel/power/isamax_power9.S
@@ -0,0 +1,397 @@
+	.file	"isamax.c"
+	.abiversion 2
+	.section	".text"
+	.align 2
+	.p2align 4,,15
+	.globl isamax_k
+	.type	isamax_k, @function
+isamax_k:
+.LCF0:
+0:	addis 2,12,.TOC.-.LCF0@ha
+	addi 2,2,.TOC.-.LCF0@l
+	.localentry	isamax_k,.-isamax_k
+	mr. 11,3
+	ble 0,.L36
+	cmpdi 7,5,0
+	li 3,0
+	blelr 7
+	cmpdi 7,5,1
+	beq 7,.L69
+	rldicr. 7,11,0,61
+	beq 0,.L40
+	sldi 10,5,1
+	sldi 6,5,2
+	sldi 0,5,4
+	sldi 3,5,3
+	mr 9,4
+	xxlxor 0,0,0
+	li 8,0
+	add 5,10,5
+	li 10,0
+	sldi 5,5,2
+	.p2align 4,,15
+.L31:
+	lfs 12,0(9)
+	fabs 12,12
+	fcmpu 7,12,0
+	bng 7,.L23
+	fmr 0,12
+	mr 8,10
+.L23:
+	lfsx 12,9,6
+	fabs 12,12
+	fcmpu 7,12,0
+	bng 7,.L25
+	fmr 0,12
+	addi 8,10,1
+.L25:
+	lfsx 12,9,3
+	fabs 12,12
+	fcmpu 7,12,0
+	bng 7,.L27
+	fmr 0,12
+	addi 8,10,2
+.L27:
+	lfsx 12,9,5
+	add 9,9,0
+	fabs 12,12
+	fcmpu 7,12,0
+	bng 7,.L29
+	fmr 0,12
+	addi 8,10,3
+.L29:
+	addi 10,10,4
+	cmpd 7,7,10
+	bgt 7,.L31
+	addi 7,7,-1
+	srdi 7,7,2
+	addi 7,7,1
+	sldi 9,7,2
+	mulld 7,6,7
+	cmpd 7,11,9
+	ble 7,.L67
+.L22:
+	addi 10,9,1
+	sldi 7,7,2
+	subf 5,9,11
+	cmpd 7,10,11
+	mtctr 5
+	add 4,4,7
+	bgt 7,.L54
+	li 3,-1
+	rldicr 3,3,0,0
+	cmpd 7,11,3
+	beq 7,.L54
+	.p2align 4,,15
+.L35:
+	lfs 12,0(4)
+	add 4,4,6
+	fabs 12,12
+	fcmpu 7,12,0
+	bng 7,.L33
+	fmr 0,12
+	mr 8,9
+.L33:
+	addi 9,9,1
+	bdnz .L35
+.L67:
+	addi 3,8,1
+	blr
+	.p2align 4,,15
+.L36:
+	li 3,0
+	blr
+	.p2align 4,,15
+.L69:
+	rldicr. 10,11,0,57
+	bne 0,.L70
+	addi 7,10,1
+	sldi 9,10,2
+	subf 6,10,11
+	li 8,0
+	xxlxor 12,12,12
+	cmpd 7,7,11
+	mtctr 6
+	add 4,4,9
+	bgt 7,.L60
+	li 3,-1
+	rldicr 3,3,0,0
+	cmpd 7,11,3
+	beq 7,.L60
+	.p2align 4,,15
+.L61:
+	lfs 0,0(4)
+	addi 4,4,4
+	fabs 0,0
+	fcmpu 7,0,12
+	bng 7,.L63
+	fmr 12,0
+	mr 8,10
+.L63:
+	addi 10,10,1
+	bdnz .L61
+	b .L67
+	.p2align 4,,15
+.L70:
+	addis 6,2,.LC2@toc@ha
+	addis 7,2,.LC3@toc@ha
+	addis 8,2,.LC4@toc@ha
+	addis 9,2,.LC5@toc@ha
+	xxspltib 46,0
+	stxv 61,-48(1)
+	stxv 62,-32(1)
+	addi 6,6,.LC2@toc@l
+	addi 7,7,.LC3@toc@l
+	stxv 63,-16(1)
+	xxspltib 61,32
+	xxspltib 63,16
+	xxspltib 62,64
+	addi 8,8,.LC4@toc@l
+	addi 9,9,.LC5@toc@l
+	lxv 47,0(6)
+	xxspltib 34,0
+	lxv 48,0(7)
+	xxlor 51,46,46
+	lxv 49,0(8)
+	lxv 50,0(9)
+	li 8,0
+	mr 9,4
+	vextsb2w 29,29
+	vextsb2w 31,31
+	vextsb2w 30,30
+	stxv 59,-80(1)
+	stxv 60,-64(1)
+	.p2align 4,,15
+.L5:
+	lxv 0,0(9)
+	vadduwm 27,19,29
+	lxv 12,240(9)
+	addi 8,8,64
+	addi 9,9,256
+	cmpd 7,10,8
+	xvabssp 44,0
+	lxv 0,-240(9)
+	xvabssp 12,12
+	xvabssp 5,0
+	lxv 0,-224(9)
+	xvabssp 32,0
+	lxv 0,-208(9)
+	xvcmpgtsp 35,5,44
+	xvabssp 9,0
+	lxv 0,-192(9)
+	xxsel 5,44,5,35
+	xxsel 35,47,48,35
+	xvabssp 1,0
+	lxv 0,-176(9)
+	xvcmpgtsp 60,9,32
+	xvabssp 6,0
+	lxv 0,-160(9)
+	xxsel 9,32,9,60
+	xxsel 60,49,50,60
+	xvabssp 13,0
+	lxv 0,-144(9)
+	xvcmpgtsp 42,9,5
+	xvcmpgtsp 37,6,1
+	xvabssp 11,0
+	lxv 0,-128(9)
+	xxsel 9,5,9,42
+	xxsel 42,35,60,42
+	xxsel 6,1,6,37
+	xxsel 37,47,48,37
+	xvabssp 2,0
+	lxv 0,-112(9)
+	xvcmpgtsp 36,11,13
+	xvabssp 7,0
+	lxv 0,-96(9)
+	xxsel 11,13,11,36
+	xxsel 36,49,50,36
+	xvabssp 3,0
+	lxv 0,-80(9)
+	xvcmpgtsp 45,11,6
+	xvcmpgtsp 39,7,2
+	xvabssp 10,0
+	lxv 0,-64(9)
+	xxsel 7,2,7,39
+	xxsel 39,47,48,39
+	xvabssp 4,0
+	lxv 0,-48(9)
+	xvcmpgtsp 38,10,3
+	xvabssp 8,0
+	lxv 0,-32(9)
+	xxsel 10,3,10,38
+	xxsel 38,49,50,38
+	xvabssp 0,0
+	xvcmpgtsp 43,10,7
+	xvcmpgtsp 41,8,4
+	xvcmpgtsp 40,12,0
+	xxsel 8,4,8,41
+	xxsel 41,47,48,41
+	xxsel 0,0,12,40
+	xxsel 12,6,11,45
+	xxsel 11,7,10,43
+	xxsel 45,37,36,45
+	xvcmpgtsp 33,0,8
+	xvcmpgtsp 32,12,9
+	vadduwm 13,13,31
+	xxsel 40,49,50,40
+	xxsel 43,39,38,43
+	xxsel 0,8,0,33
+	xxsel 12,9,12,32
+	xxsel 33,41,40,33
+	xxsel 32,42,45,32
+	xvcmpgtsp 44,0,11
+	vadduwm 1,1,31
+	vadduwm 0,19,0
+	vadduwm 19,19,30
+	xxsel 0,11,0,44
+	xxsel 33,43,33,44
+	xvcmpgtsp 45,0,12
+	vadduwm 1,27,1
+	xxsel 0,12,0,45
+	xxsel 32,32,33,45
+	xvcmpgtsp 33,0,34
+	xxsel 46,46,32,33
+	xxsel 34,34,0,33
+	bgt 7,.L5
+	xxsldwi 12,34,34,3
+	xxsldwi 11,34,34,2
+	li 9,0
+	li 8,12
+	xxsldwi 0,34,34,1
+	xscvspdp 34,34
+	vextuwrx 3,9,14
+	li 9,4
+	xscvspdp 12,12
+	xscvspdp 11,11
+	xscvspdp 0,0
+	vextuwrx 6,9,14
+	li 9,8
+	vextuwrx 7,9,14
+	vextuwrx 9,8,14
+	rldicl 12,6,0,32
+	rldicl 8,3,0,32
+	rldicl 0,7,0,32
+	rldicl 5,9,0,32
+	fcmpu 7,12,11
+	fmr 10,0
+	beq 7,.L71
+	bnl 7,.L8
+	mr 8,12
+	fmr 12,11
+.L8:
+	xscmpudp 7,0,34
+	bne 7,.L11
+	cmplw 7,7,9
+	ble 7,.L12
+	mr 7,9
+.L12:
+	rldicl 5,7,0,32
+.L13:
+	fcmpu 7,12,10
+	beq 7,.L72
+	bnl 7,.L17
+	mr 8,5
+	fmr 12,10
+.L17:
+	cmpd 7,11,10
+	ble 7,.L16
+	addi 7,10,1
+	sldi 9,10,2
+	subf 6,10,11
+	cmpd 7,7,11
+	mtctr 6
+	add 4,4,9
+	bgt 7,.L53
+	li 3,-1
+	rldicr 3,3,0,0
+	cmpd 7,11,3
+	beq 7,.L53
+	.p2align 4,,15
+.L21:
+	lfs 0,0(4)
+	addi 4,4,4
+	fabs 0,0
+	fcmpu 7,0,12
+	bng 7,.L19
+	fmr 12,0
+	mr 8,10
+.L19:
+	addi 10,10,1
+	bdnz .L21
+.L16:
+	lxv 59,-80(1)
+	lxv 60,-64(1)
+	addi 3,8,1
+	lxv 61,-48(1)
+	lxv 62,-32(1)
+	lxv 63,-16(1)
+	blr
+	.p2align 4,,15
+.L71:
+	cmplw 7,3,6
+	ble 7,.L7
+	mr 3,6
+.L7:
+	rldicl 8,3,0,32
+	b .L8
+	.p2align 4,,15
+.L40:
+	sldi 6,5,2
+	li 8,0
+	li 9,0
+	xxlxor 0,0,0
+	b .L22
+	.p2align 4,,15
+.L11:
+	blt 7,.L39
+	mr 5,0
+	b .L13
+	.p2align 4,,15
+.L72:
+	cmpd 7,8,5
+	ble 7,.L17
+	mr 8,5
+	b .L17
+	.p2align 4,,15
+.L39:
+	xscpsgndp 10,34,34
+	b .L13
+.L53:
+	li 9,1
+	mtctr 9
+	b .L21
+.L54:
+	li 10,1
+	mtctr 10
+	b .L35
+.L60:
+	li 9,1
+	mtctr 9
+	b .L61
+	.long 0
+	.byte 0,0,0,0,0,0,0,0
+	.size	isamax_k,.-isamax_k
+	.section	.rodata.cst16,"aM",@progbits,16
+	.align 4
+.LC2:
+	.long	0
+	.long	1
+	.long	2
+	.long	3
+.LC3:
+	.long	4
+	.long	5
+	.long	6
+	.long	7
+.LC4:
+	.long	8
+	.long	9
+	.long	10
+	.long	11
+.LC5:
+	.long	12
+	.long	13
+	.long	14
+	.long	15
+	.ident	"GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
+	.section	.note.GNU-stack,"",@progbits
diff --git a/kernel/power/isamin_power8.S b/kernel/power/isamin_power8.S
new file mode 100644
index 000000000..3873e879b
--- /dev/null
+++ b/kernel/power/isamin_power8.S
@@ -0,0 +1,417 @@
+/*	.file	"isamin.c"
+	.abiversion 2
+	.section	".text"
+	.align 2
+	.p2align 4,,15
+	.globl isamin_k
+	.type	isamin_k, @function
+*/
+#define ASSEMBLER
+#include "common.h"
+
+	PROLOGUE
+
+isamin_k:
+.LCF0:
+0:	addis 2,12,.TOC.-.LCF0@ha
+	addi 2,2,.TOC.-.LCF0@l
+	.localentry	isamin_k,.-isamin_k
+	mr. 11,3
+	ble 0,.L36
+	cmpdi 7,5,0
+	li 3,0
+	blelr 7
+	lfs 0,0(4)
+	li 0,-48
+	cmpdi 7,5,1
+	stvx 30,1,0
+	li 0,-32
+	stvx 31,1,0
+	fabs 0,0
+	beq 7,.L62
+	rldicr. 6,11,0,61
+	beq 0,.L40
+	sldi 0,5,1
+	sldi 12,5,2
+	std 31,-8(1)
+	add 0,0,5
+	neg 31,5
+	sldi 3,5,4
+	sldi 0,0,2
+	add 7,4,12
+	sldi 31,31,2
+	sldi 5,5,3
+	li 9,0
+	li 10,0
+	b .L24
+	.p2align 4,,15
+.L41:
+	mr 10,9
+.L25:
+	fmr 0,12
+	add 7,7,3
+.L24:
+	lfs 12,0(7)
+	fabs 12,12
+	fcmpu 7,12,0
+	bnl 7,.L26
+	fmr 0,12
+	addi 10,9,1
+.L26:
+	add 8,31,7
+	lfsx 12,8,5
+	fabs 12,12
+	fcmpu 7,12,0
+	bnl 7,.L28
+	fmr 0,12
+	addi 10,9,2
+.L28:
+	lfsx 12,8,0
+	fabs 12,12
+	fcmpu 7,12,0
+	bnl 7,.L30
+	fmr 0,12
+	addi 10,9,3
+.L30:
+	addi 9,9,4
+	cmpd 7,6,9
+	ble 7,.L63
+	lfsx 12,8,3
+	fabs 12,12
+	fcmpu 7,12,0
+	blt 7,.L41
+	fmr 12,0
+	b .L25
+	.p2align 4,,15
+.L36:
+	li 3,0
+	blr
+	.p2align 4,,15
+.L63:
+	addi 6,6,-1
+	ld 31,-8(1)
+	srdi 6,6,2
+	addi 6,6,1
+	sldi 9,6,2
+	mulld 6,12,6
+	cmpd 7,11,9
+	ble 7,.L33
+.L23:
+	addi 8,9,1
+	sldi 6,6,2
+	cmpd 7,8,11
+	subf 8,9,11
+	mtctr 8
+	add 4,4,6
+	bgt 7,.L52
+	li 3,-1
+	rldicr 3,3,0,0
+	cmpd 7,11,3
+	beq 7,.L52
+	.p2align 4,,15
+.L35:
+	lfs 12,0(4)
+	add 4,4,12
+	fabs 12,12
+	fcmpu 7,12,0
+	bnl 7,.L34
+	fmr 0,12
+	mr 10,9
+.L34:
+	addi 9,9,1
+	bdnz .L35
+.L33:
+	li 0,-48
+	addi 3,10,1
+	lvx 30,1,0
+	li 0,-32
+	lvx 31,1,0
+	blr
+	.p2align 4,,15
+.L62:
+	rldicr. 8,11,0,57
+	li 10,0
+	bne 0,.L64
+.L4:
+	addi 7,8,1
+	sldi 9,8,2
+	cmpd 7,7,11
+	add 4,4,9
+	subf 9,8,11
+	mtctr 9
+	bgt 7,.L51
+	li 3,-1
+	rldicr 3,3,0,0
+	cmpd 7,11,3
+	beq 7,.L51
+	.p2align 4,,15
+.L22:
+	lfs 12,0(4)
+	addi 4,4,4
+	fabs 12,12
+	fcmpu 7,0,12
+	bng 7,.L21
+	fmr 0,12
+	mr 10,8
+.L21:
+	addi 8,8,1
+	bdnz .L22
+	li 0,-48
+	addi 3,10,1
+	lvx 30,1,0
+	li 0,-32
+	lvx 31,1,0
+	blr
+	.p2align 4,,15
+.L64:
+	lxvd2x 4,0,4
+	addis 10,2,.LC2@toc@ha
+	addis 5,2,.LC3@toc@ha
+	std 31,-8(1)
+	vspltisw 2,0
+	addi 10,10,.LC2@toc@l
+	addis 7,2,.LC4@toc@ha
+	addis 9,2,.LC5@toc@ha
+	addis 6,2,.LC6@toc@ha
+	lxvd2x 51,0,10
+	addis 10,2,.LC7@toc@ha
+	addi 7,7,.LC4@toc@l
+	addi 9,9,.LC5@toc@l
+	addi 5,5,.LC3@toc@l
+	xvabssp 4,4
+	addi 6,6,.LC6@toc@l
+	addi 10,10,.LC7@toc@l
+	lxvd2x 36,0,7
+	vspltisw 18,8
+	lxvd2x 37,0,9
+	lxvd2x 35,0,5
+	mr 9,4
+	li 7,0
+	lxvd2x 48,0,6
+	lxvd2x 49,0,10
+	vadduwm 18,18,18
+	xxlor 38,51,51
+	xxlor 40,4,4
+	b .L6
+	.p2align 4,,15
+.L65:
+	lxvd2x 5,0,9
+	xvabssp 40,5
+.L6:
+	addi 5,9,16
+	addi 6,9,32
+	vadduwm 14,2,16
+	addi 10,9,64
+	addi 12,9,48
+	addi 31,9,80
+	addi 3,9,96
+	lxvd2x 5,0,5
+	lxvd2x 42,0,6
+	addi 5,9,112
+	addi 6,9,128
+	lxvd2x 44,0,10
+	lxvd2x 9,0,12
+	addi 10,9,160
+	addi 12,9,144
+	lxvd2x 6,0,31
+	lxvd2x 1,0,3
+	addi 31,9,176
+	addi 3,9,192
+	lxvd2x 11,0,5
+	lxvd2x 13,0,6
+	addi 5,9,208
+	addi 6,9,224
+	lxvd2x 2,0,10
+	lxvd2x 7,0,12
+	addi 10,9,240
+	lxvd2x 10,0,31
+	lxvd2x 3,0,3
+	xvabssp 42,42
+	xvabssp 5,5
+	addi 7,7,64
+	lxvd2x 8,0,5
+	lxvd2x 0,0,6
+	xvabssp 44,44
+	xvabssp 9,9
+	cmpd 7,8,7
+	addi 9,9,256
+	lxvd2x 12,0,10
+	xvabssp 6,6
+	xvabssp 1,1
+	xvabssp 11,11
+	xvabssp 13,13
+	xvabssp 7,7
+	xvabssp 2,2
+	xvabssp 10,10
+	xvabssp 3,3
+	xvabssp 8,8
+	xvabssp 0,0
+	xvabssp 12,12
+	xvcmpgtsp 32,40,5
+	xvcmpgtsp 62,42,9
+	xvcmpgtsp 45,44,6
+	xvcmpgtsp 63,1,11
+	xvcmpgtsp 39,13,7
+	xvcmpgtsp 47,2,10
+	xvcmpgtsp 41,3,8
+	xvcmpgtsp 33,0,12
+	xxsel 5,40,5,32
+	xxsel 32,38,35,32
+	xxsel 9,42,9,62
+	xxsel 6,44,6,45
+	xxsel 11,1,11,63
+	xxsel 44,38,35,45
+	xxsel 7,13,7,39
+	xvcmpgtsp 42,5,9
+	xxsel 10,2,10,47
+	xvcmpgtsp 45,6,11
+	xxsel 8,3,8,41
+	xxsel 63,36,37,63
+	xxsel 0,0,12,33
+	xvcmpgtsp 43,7,10
+	xxsel 40,36,37,33
+	xxsel 62,36,37,62
+	xvcmpgtsp 33,8,0
+	xxsel 41,38,35,41
+	xxsel 39,38,35,39
+	xxsel 47,36,37,47
+	xxsel 9,5,9,42
+	xxsel 42,32,62,42
+	xxsel 12,6,11,45
+	xxsel 45,44,63,45
+	xxsel 11,7,10,43
+	xvcmpgtsp 32,9,12
+	vadduwm 13,13,18
+	xxsel 43,39,47,43
+	xxsel 0,8,0,33
+	xxsel 33,41,40,33
+	xvcmpgtsp 44,11,0
+	vadduwm 1,1,18
+	xxsel 12,9,12,32
+	xxsel 32,42,45,32
+	vadduwm 0,2,0
+	vadduwm 2,2,17
+	xxsel 0,11,0,44
+	xxsel 33,43,33,44
+	xvcmpgtsp 45,12,0
+	vadduwm 1,14,1
+	xxsel 0,12,0,45
+	xxsel 32,32,33,45
+	xvcmpgtsp 33,4,0
+	xxsel 51,51,32,33
+	xxsel 4,4,0,33
+	bgt 7,.L65
+	xxsldwi 0,4,4,1
+	xscvspdp 10,4
+	vspltw 0,19,0
+	xxsldwi 12,4,4,3
+	xscvspdp 0,0
+	mfvsrwz 3,51
+	mfvsrwz 6,32
+	vspltw 0,19,3
+	xscvspdp 12,12
+	xxsldwi 4,4,4,2
+	mfvsrwz 7,32
+	vspltw 0,19,2
+	xscvspdp 4,4
+	mfvsrwz 9,32
+	fcmpu 7,0,10
+	rldicl 10,3,0,32
+	rldicl 31,6,0,32
+	fmr 11,12
+	rldicl 5,7,0,32
+	rldicl 0,9,0,32
+	beq 7,.L66
+	bng 7,.L9
+	fmr 0,10
+	mr 10,31
+.L9:
+	fcmpu 7,12,4
+	bne 7,.L12
+	cmplw 7,7,9
+	ble 7,.L13
+	mr 7,9
+.L13:
+	rldicl 5,7,0,32
+.L14:
+	fcmpu 7,0,11
+	beq 7,.L67
+	bng 7,.L19
+	fmr 0,11
+	mr 10,5
+.L19:
+	cmpd 7,11,8
+	ld 31,-8(1)
+	bgt 7,.L4
+	b .L33
+	.p2align 4,,15
+.L66:
+	cmplw 7,3,6
+	ble 7,.L8
+	mr 3,6
+.L8:
+	rldicl 10,3,0,32
+	b .L9
+	.p2align 4,,15
+.L40:
+	sldi 12,5,2
+	li 10,0
+	li 9,0
+	b .L23
+	.p2align 4,,15
+.L12:
+	bng 7,.L14
+	fmr 11,4
+	mr 5,0
+	b .L14
+	.p2align 4,,15
+.L67:
+	cmpd 7,10,5
+	ble 7,.L19
+	mr 10,5
+	b .L19
+.L51:
+	li 9,1
+	mtctr 9
+	b .L22
+.L52:
+	li 8,1
+	mtctr 8
+	b .L35
+	.long 0
+	.byte 0,0,0,0,0,1,0,0
+	.size	isamin_k,.-isamin_k
+	.section	.rodata.cst16,"aM",@progbits,16
+	.align 4
+.LC2:
+	.long	0
+	.long	1
+	.long	2
+	.long	3
+.LC3:
+	.long	4
+	.long	5
+	.long	6
+	.long	7
+.LC4:
+	.long	8
+	.long	9
+	.long	10
+	.long	11
+.LC5:
+	.long	12
+	.long	13
+	.long	14
+	.long	15
+.LC6:
+	.long	32
+	.long	32
+	.long	32
+	.long	32
+.LC7:
+	.long	64
+	.long	64
+	.long	64
+	.long	64
+	.ident	"GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
+	.section	.note.GNU-stack,"",@progbits
diff --git a/kernel/power/isamin_power9.S b/kernel/power/isamin_power9.S
new file mode 100644
index 000000000..0475edf46
--- /dev/null
+++ b/kernel/power/isamin_power9.S
@@ -0,0 +1,382 @@
+	.file	"isamin.c"
+	.abiversion 2
+	.section	".text"
+	.align 2
+	.p2align 4,,15
+	.globl isamin_k
+	.type	isamin_k, @function
+isamin_k:
+.LCF0:
+0:	addis 2,12,.TOC.-.LCF0@ha
+	addi 2,2,.TOC.-.LCF0@l
+	.localentry	isamin_k,.-isamin_k
+	mr. 11,3
+	ble 0,.L36
+	cmpdi 7,5,0
+	li 3,0
+	blelr 7
+	lfs 0,0(4)
+	cmpdi 7,5,1
+	stxv 61,-64(1)
+	stxv 62,-48(1)
+	stxv 63,-32(1)
+	fabs 0,0
+	beq 7,.L62
+	rldicr. 6,11,0,61
+	beq 0,.L40
+	sldi 8,5,1
+	sldi 0,5,2
+	neg 12,5
+	std 31,-8(1)
+	sldi 3,5,4
+	sldi 31,5,3
+	li 9,0
+	li 10,0
+	add 5,8,5
+	add 7,4,0
+	sldi 12,12,2
+	sldi 5,5,2
+	b .L24
+	.p2align 4,,15
+.L41:
+	mr 10,9
+.L25:
+	add 7,7,3
+	fmr 0,12
+.L24:
+	lfs 12,0(7)
+	fabs 12,12
+	fcmpu 7,12,0
+	bnl 7,.L26
+	fmr 0,12
+	addi 10,9,1
+.L26:
+	add 8,7,12
+	lfsx 12,8,31
+	fabs 12,12
+	fcmpu 7,12,0
+	bnl 7,.L28
+	fmr 0,12
+	addi 10,9,2
+.L28:
+	lfsx 12,8,5
+	fabs 12,12
+	fcmpu 7,12,0
+	bnl 7,.L30
+	fmr 0,12
+	addi 10,9,3
+.L30:
+	addi 9,9,4
+	cmpd 7,6,9
+	ble 7,.L63
+	lfsx 12,8,3
+	fabs 12,12
+	fcmpu 7,12,0
+	blt 7,.L41
+	fmr 12,0
+	b .L25
+	.p2align 4,,15
+.L36:
+	li 3,0
+	blr
+	.p2align 4,,15
+.L63:
+	addi 6,6,-1
+	ld 31,-8(1)
+	srdi 6,6,2
+	addi 6,6,1
+	sldi 9,6,2
+	mulld 6,0,6
+	cmpd 7,11,9
+	ble 7,.L33
+.L23:
+	addi 8,9,1
+	sldi 6,6,2
+	subf 7,9,11
+	cmpd 7,8,11
+	mtctr 7
+	add 4,4,6
+	bgt 7,.L52
+	li 3,-1
+	rldicr 3,3,0,0
+	cmpd 7,11,3
+	beq 7,.L52
+	.p2align 4,,15
+.L35:
+	lfs 12,0(4)
+	add 4,4,0
+	fabs 12,12
+	fcmpu 7,12,0
+	bnl 7,.L34
+	fmr 0,12
+	mr 10,9
+.L34:
+	addi 9,9,1
+	bdnz .L35
+.L33:
+	lxv 61,-64(1)
+	lxv 62,-48(1)
+	addi 3,10,1
+	lxv 63,-32(1)
+	blr
+	.p2align 4,,15
+.L62:
+	rldicr. 8,11,0,57
+	li 10,0
+	bne 0,.L64
+.L4:
+	addi 7,8,1
+	sldi 9,8,2
+	subf 6,8,11
+	cmpd 7,7,11
+	mtctr 6
+	add 4,4,9
+	bgt 7,.L51
+	li 3,-1
+	rldicr 3,3,0,0
+	cmpd 7,11,3
+	beq 7,.L51
+	.p2align 4,,15
+.L22:
+	lfs 12,0(4)
+	addi 4,4,4
+	fabs 12,12
+	fcmpu 7,0,12
+	bng 7,.L21
+	fmr 0,12
+	mr 10,8
+.L21:
+	addi 8,8,1
+	bdnz .L22
+	lxv 61,-64(1)
+	lxv 62,-48(1)
+	addi 3,10,1
+	lxv 63,-32(1)
+	blr
+	.p2align 4,,15
+.L64:
+	lxv 0,0(4)
+	xxspltib 47,16
+	addis 6,2,.LC2@toc@ha
+	addis 7,2,.LC3@toc@ha
+	addis 10,2,.LC4@toc@ha
+	addis 9,2,.LC5@toc@ha
+	xxspltib 63,32
+	xxspltib 46,64
+	addi 6,6,.LC2@toc@l
+	addi 10,10,.LC4@toc@l
+	addi 7,7,.LC3@toc@l
+	std 31,-8(1)
+	addi 9,9,.LC5@toc@l
+	xxspltib 50,0
+	vextsb2w 15,15
+	lxv 48,0(6)
+	lxv 51,0(10)
+	vextsb2w 31,31
+	vextsb2w 14,14
+	xvabssp 4,0
+	lxv 34,0(9)
+	lxv 49,0(7)
+	mr 9,4
+	li 10,0
+	xxlor 35,48,48
+	xxlor 40,4,4
+	b .L6
+	.p2align 4,,15
+.L65:
+	lxv 0,0(9)
+	xvabssp 40,0
+.L6:
+	lxv 0,16(9)
+	vadduwm 29,18,31
+	lxv 12,240(9)
+	addi 10,10,64
+	addi 9,9,256
+	cmpd 7,8,10
+	xvabssp 5,0
+	lxv 0,-224(9)
+	xvabssp 12,12
+	xvabssp 32,0
+	lxv 0,-208(9)
+	xvcmpgtsp 42,40,5
+	xvabssp 9,0
+	lxv 0,-192(9)
+	xxsel 5,40,5,42
+	xvabssp 44,0
+	lxv 0,-176(9)
+	xvcmpgtsp 62,32,9
+	xvabssp 6,0
+	lxv 0,-160(9)
+	xxsel 9,32,9,62
+	xxsel 32,35,49,42
+	xvabssp 1,0
+	lxv 0,-144(9)
+	xxsel 62,51,34,62
+	xvcmpgtsp 42,5,9
+	xvcmpgtsp 37,44,6
+	xvabssp 11,0
+	lxv 0,-128(9)
+	xxsel 9,5,9,42
+	xxsel 42,32,62,42
+	xxsel 6,44,6,37
+	xxsel 37,35,49,37
+	xvabssp 13,0
+	lxv 0,-112(9)
+	xvcmpgtsp 36,1,11
+	xvabssp 7,0
+	lxv 0,-96(9)
+	xxsel 11,1,11,36
+	xxsel 36,51,34,36
+	xvabssp 2,0
+	lxv 0,-80(9)
+	xvcmpgtsp 45,6,11
+	xvcmpgtsp 39,13,7
+	xvabssp 10,0
+	lxv 0,-64(9)
+	xxsel 7,13,7,39
+	xxsel 39,35,49,39
+	xvabssp 3,0
+	lxv 0,-48(9)
+	xvcmpgtsp 38,2,10
+	xvabssp 8,0
+	lxv 0,-32(9)
+	xxsel 10,2,10,38
+	xxsel 38,51,34,38
+	xvabssp 0,0
+	xvcmpgtsp 43,7,10
+	xvcmpgtsp 41,3,8
+	xvcmpgtsp 33,0,12
+	xxsel 8,3,8,41
+	xxsel 41,35,49,41
+	xxsel 0,0,12,33
+	xxsel 40,51,34,33
+	xxsel 12,6,11,45
+	xxsel 11,7,10,43
+	xvcmpgtsp 33,8,0
+	xxsel 45,37,36,45
+	xvcmpgtsp 32,9,12
+	xxsel 43,39,38,43
+	vadduwm 13,13,15
+	xxsel 0,8,0,33
+	xxsel 33,41,40,33
+	xxsel 12,9,12,32
+	xxsel 32,42,45,32
+	xvcmpgtsp 44,11,0
+	vadduwm 1,1,15
+	vadduwm 0,18,0
+	vadduwm 18,18,14
+	xxsel 0,11,0,44
+	xxsel 33,43,33,44
+	xvcmpgtsp 45,12,0
+	vadduwm 1,29,1
+	xxsel 0,12,0,45
+	xxsel 32,32,33,45
+	xvcmpgtsp 33,4,0
+	xxsel 48,48,32,33
+	xxsel 4,4,0,33
+	bgt 7,.L65
+	xxsldwi 0,4,4,3
+	xxsldwi 11,4,4,2
+	li 9,0
+	li 10,12
+	xxsldwi 12,4,4,1
+	xscvspdp 4,4
+	vextuwrx 3,9,16
+	li 9,4
+	xscvspdp 0,0
+	xscvspdp 11,11
+	xscvspdp 12,12
+	vextuwrx 6,9,16
+	li 9,8
+	vextuwrx 7,9,16
+	vextuwrx 9,10,16
+	rldicl 31,6,0,32
+	rldicl 10,3,0,32
+	rldicl 5,7,0,32
+	rldicl 0,9,0,32
+	fcmpu 7,0,11
+	fmr 10,12
+	beq 7,.L66
+	bng 7,.L9
+	mr 10,31
+	fmr 0,11
+.L9:
+	fcmpu 7,12,4
+	bne 7,.L12
+	cmplw 7,7,9
+	ble 7,.L13
+	mr 7,9
+.L13:
+	rldicl 5,7,0,32
+.L14:
+	fcmpu 7,0,10
+	beq 7,.L67
+	bng 7,.L19
+	mr 10,5
+	fmr 0,10
+.L19:
+	cmpd 7,11,8
+	ld 31,-8(1)
+	bgt 7,.L4
+	b .L33
+	.p2align 4,,15
+.L66:
+	cmplw 7,3,6
+	ble 7,.L8
+	mr 3,6
+.L8:
+	rldicl 10,3,0,32
+	b .L9
+	.p2align 4,,15
+.L40:
+	sldi 0,5,2
+	li 10,0
+	li 9,0
+	b .L23
+	.p2align 4,,15
+.L12:
+	bng 7,.L14
+	mr 5,0
+	fmr 10,4
+	b .L14
+	.p2align 4,,15
+.L67:
+	cmpd 7,10,5
+	ble 7,.L19
+	mr 10,5
+	b .L19
+.L51:
+	li 9,1
+	mtctr 9
+	b .L22
+.L52:
+	li 8,1
+	mtctr 8
+	b .L35
+	.long 0
+	.byte 0,0,0,0,0,1,0,0
+	.size	isamin_k,.-isamin_k
+	.section	.rodata.cst16,"aM",@progbits,16
+	.align 4
+.LC2:
+	.long	0
+	.long	1
+	.long	2
+	.long	3
+.LC3:
+	.long	4
+	.long	5
+	.long	6
+	.long	7
+.LC4:
+	.long	8
+	.long	9
+	.long	10
+	.long	11
+.LC5:
+	.long	12
+	.long	13
+	.long	14
+	.long	15
+	.ident	"GCC: (SUSE Linux) 7.3.1 20180323 [gcc-7-branch revision 258812]"
+	.section	.note.GNU-stack,"",@progbits

From 6b830793686c06fed3f517ca1e95854891bbed6f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 25 Sep 2019 23:13:24 +0200
Subject: [PATCH 045/210] Count cpu cores on ARMV8 and use that to pick the
 GEMM_PQ parameters (#2267)

There is currently no simple way to query cache sizes on ARMV8, so this takes the number of cores as a trivial indication if the target is a server-class device with a big cache, or just a single-board toy or smartphone.
---
 cpuid_arm64.c | 30 ++++++++++++++++++++++++++++--
 param.h       | 31 +++++++++++++++++++++++--------
 2 files changed, 51 insertions(+), 10 deletions(-)

diff --git a/cpuid_arm64.c b/cpuid_arm64.c
index e8aa29813..9e019fe3e 100644
--- a/cpuid_arm64.c
+++ b/cpuid_arm64.c
@@ -206,6 +206,33 @@ void get_subdirname(void)
 	printf("arm64");
 }
 
+void get_cpucount(void)
+{
+int n=0;
+
+#ifdef linux
+	FILE *infile;
+  	char buffer[2048], *p,*t;
+  	p = (char *) NULL ;
+
+  	infile = fopen("/proc/cpuinfo", "r");
+
+	while (fgets(buffer, sizeof(buffer), infile))
+	{
+
+		if (!strncmp("processor", buffer, 9))
+		n++;
+  	}
+
+  	fclose(infile);
+
+	printf("#define NUM_CORES %d\n",n);
+#endif
+
+}
+
+
+
 void get_cpuconfig(void)
 {
 
@@ -309,6 +336,7 @@ void get_cpuconfig(void)
 			printf("#define DTB_SIZE             4096     \n");
 			break;	
 	}
+	get_cpucount();
 }
 
 
@@ -351,5 +379,3 @@ void get_features(void)
 #endif
 	return;
 }
-
-
diff --git a/param.h b/param.h
index 5fbdbcdcd..0ff59f400 100644
--- a/param.h
+++ b/param.h
@@ -2636,15 +2636,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_UNROLL_M  4
 #define ZGEMM_DEFAULT_UNROLL_N  4
 
-#define SGEMM_DEFAULT_P	512
-#define DGEMM_DEFAULT_P	256
-#define CGEMM_DEFAULT_P 256
-#define ZGEMM_DEFAULT_P 128
+/*FIXME: this should be using the cache size, but there is currently no easy way to
+query that on ARM. So if getarch counted more than 8 cores we simply assume the host
+is a big desktop or server with abundant cache rather than a phone or embedded device */ 
+#if NUM_CORES > 8
+  #define SGEMM_DEFAULT_P 512
+  #define DGEMM_DEFAULT_P 256
+  #define CGEMM_DEFAULT_P 256
+  #define ZGEMM_DEFAULT_P 128
+
+  #define SGEMM_DEFAULT_Q 1024
+  #define DGEMM_DEFAULT_Q 512
+  #define CGEMM_DEFAULT_Q 512
+  #define ZGEMM_DEFAULT_Q 512
+#else
+  #define SGEMM_DEFAULT_P 128
+  #define DGEMM_DEFAULT_P 160
+  #define CGEMM_DEFAULT_P 128
+  #define ZGEMM_DEFAULT_P 128
 
-#define SGEMM_DEFAULT_Q 1024
-#define DGEMM_DEFAULT_Q 512
-#define CGEMM_DEFAULT_Q 512
-#define ZGEMM_DEFAULT_Q 512
+  #define SGEMM_DEFAULT_Q 352
+  #define DGEMM_DEFAULT_Q 128
+  #define CGEMM_DEFAULT_Q 224
+  #define ZGEMM_DEFAULT_Q 112
+#endif
 
 #define SGEMM_DEFAULT_R 4096
 #define DGEMM_DEFAULT_R 4096

From 7f58f3ad0e10304965a6573bb11208cb6e1df446 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 27 Sep 2019 00:44:26 +0200
Subject: [PATCH 046/210] Fix mis-edits in the gcc-derived power8 caxpy kernel

---
 kernel/power/caxpy_power8.S | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/kernel/power/caxpy_power8.S b/kernel/power/caxpy_power8.S
index 09a423571..0ce61ca3b 100644
--- a/kernel/power/caxpy_power8.S
+++ b/kernel/power/caxpy_power8.S
@@ -34,9 +34,9 @@ caxpy_k:
 	lfs 0,4(10)
 	fmuls 10,2,10
 #ifdef CONJ	
-	fmsubs 11,11,1,10
-#else	
 	fmadds 11,11,1,10
+#else	
+	fmsubs 11,11,1,10
 #endif
 	fadds 12,12,11
 	stfs 12,0(10)
@@ -241,8 +241,13 @@ caxpy_k:
 	lfsx 12,8,5
 	lfsx 0,10,5
 	fmuls 11,2,11
+#ifdef CONJ
 	fmsubs 12,1,12,11
 	fsubs 0,0,12
+#else
+	fmadds 12,1,12,11
+	fadds  0,0,12
+#endif
 	stfsx 0,10,5
 	ble 7,.L39
 	sldi 6,6,2

From 596a22325a1123bed772c61d298c8d14d187cfe3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 27 Sep 2019 00:47:18 +0200
Subject: [PATCH 047/210] Fix prologue of power9 assembly cdot(c) kernel to
 provide cdotc

---
 kernel/power/cdot_power9.S | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/kernel/power/cdot_power9.S b/kernel/power/cdot_power9.S
index 01d194c0c..9ec7cdd85 100644
--- a/kernel/power/cdot_power9.S
+++ b/kernel/power/cdot_power9.S
@@ -1,10 +1,16 @@
-	.file	"cdot.c"
+#define ASSEMBLER
+#include "common.h"
+/*
+.file	"cdot.c"
 	.abiversion 2
 	.section	".text"
 	.align 2
 	.p2align 4,,15
 	.globl cdot_k
 	.type	cdot_k, @function
+*/
+	PROLOGUE
+	
 cdot_k:
 .LCF0:
 0:	addis 2,12,.TOC.-.LCF0@ha

From ede5efebabb5dbde46175b996df59c755248bf29 Mon Sep 17 00:00:00 2001
From: AbdelRauf <quickwritereader@gmail.com>
Date: Sun, 29 Sep 2019 02:27:50 +0000
Subject: [PATCH 048/210] trmm fix

---
 kernel/power/sgemm_logic_power9.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/power/sgemm_logic_power9.S b/kernel/power/sgemm_logic_power9.S
index 053836cbf..a34ed32b8 100644
--- a/kernel/power/sgemm_logic_power9.S
+++ b/kernel/power/sgemm_logic_power9.S
@@ -136,8 +136,8 @@ LSGEMM_L8x16_BEGIN:
 #endif 
  
     ZERO8x16 
-	mtctr		L 
 	ble		LSGEMM_L8x16_SUB0
+	mtctr		L 
     bl      LSGEMM_L8x16_LMAIN_SUB
 	andi.		L,	T12,	127
 	ble		LSGEMM_L8x16_SAVE
@@ -146,7 +146,7 @@ LSGEMM_L8x16_BEGIN:
 LSGEMM_L8x16_SUB0:
 #if defined(TRMMKERNEL)
 	andi.		L,	T11,	255
-    cmpwi   T11,128
+    cmpwi   T11,129
 #else
 	andi.		L,	K,	255
     cmpwi   K,129

From 6355c25dde1ccba0fe6521dc0b36c0fcdddda0ef Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastian@sipsolutions.net>
Date: Sun, 29 Sep 2019 22:03:12 -0700
Subject: [PATCH 049/210] Avoid taking root of negative number in symv_thread.c

This is similar to fixes in gh-1929, but there was one remaining
occurance of this type of pattern in the driver/level2/*_thread.c
files.
---
 driver/level2/symv_thread.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/driver/level2/symv_thread.c b/driver/level2/symv_thread.c
index ab783de2b..d7cc01768 100644
--- a/driver/level2/symv_thread.c
+++ b/driver/level2/symv_thread.c
@@ -166,7 +166,11 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i
     if (nthreads - num_cpu > 1) {
 
       double di = (double)i;
-      width = ((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask;
+      if (di * di - dnum > 0) {
+        width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
+      } else {
+        width = m - i;
+      }
 
       if (width < 4) width = 4;
       if (width > m - i) width = m - i;
@@ -212,9 +216,9 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i
 
       double di = (double)(m - i);
       if (di * di - dnum > 0) {
-	width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
+        width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
       } else {
-	width = m - i;
+        width = m - i;
       }
 
       if (width < 4) width = 4;

From 8617d75548ae7be8f78406c15f98e218ad89a42a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 1 Oct 2019 23:50:41 +0200
Subject: [PATCH 050/210] Revert "Avoid taking root of negative number in
 symv_thread.c"

---
 driver/level2/symv_thread.c | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/driver/level2/symv_thread.c b/driver/level2/symv_thread.c
index d7cc01768..ab783de2b 100644
--- a/driver/level2/symv_thread.c
+++ b/driver/level2/symv_thread.c
@@ -166,11 +166,7 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i
     if (nthreads - num_cpu > 1) {
 
       double di = (double)i;
-      if (di * di - dnum > 0) {
-        width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
-      } else {
-        width = m - i;
-      }
+      width = ((BLASLONG)(sqrt(di * di + dnum) - di) + mask) & ~mask;
 
       if (width < 4) width = 4;
       if (width > m - i) width = m - i;
@@ -216,9 +212,9 @@ int CNAME(BLASLONG m, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG i
 
       double di = (double)(m - i);
       if (di * di - dnum > 0) {
-        width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
+	width = ((BLASLONG)(-sqrt(di * di - dnum) + di) + mask) & ~mask;
       } else {
-        width = m - i;
+	width = m - i;
       }
 
       if (width < 4) width = 4;

From ac10236cc8a7b61e3fa37741ca903ea4d990a62e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 2 Oct 2019 22:35:34 +0200
Subject: [PATCH 051/210] Update the OSX BINARY=32 test to xcode9.2

in response to Homebrew updates
---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 2b1b99b26..51c55acf5 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -169,7 +169,7 @@ matrix:
         - BTYPE="BINARY=64 INTERFACE64=1"
 
     - <<: *test-macos
-      osx_image: xcode8.3
+      osx_image: xcode9.2
       env:
         - BTYPE="BINARY=32"
 

From 32f5907fef1b1a68a3af20278c4f3b3b54b5268b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 3 Oct 2019 01:09:02 +0200
Subject: [PATCH 052/210] Update 32bit macOS again to xcode 9.3

os version 10.13 "High Sierra" appears to be the oldest release now for which Homebrew provides a gcc package.
Anything older and the Travis job will run out of time building gcc from source
---
 .travis.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 51c55acf5..28f95f5e2 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -169,7 +169,7 @@ matrix:
         - BTYPE="BINARY=64 INTERFACE64=1"
 
     - <<: *test-macos
-      osx_image: xcode9.2
+      osx_image: xcode9.3
       env:
         - BTYPE="BINARY=32"
 

From bb5413863fbf52dc5b8f2fd1b814b80c938d8c39 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 4 Oct 2019 14:50:03 +0200
Subject: [PATCH 053/210] Rewrite ARM64 PROLOGUE to make it compatible with
 xcode/ios

---
 common_arm64.h | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/common_arm64.h b/common_arm64.h
index c6ef2fb5d..c5e6948dc 100644
--- a/common_arm64.h
+++ b/common_arm64.h
@@ -103,12 +103,14 @@ static inline int blas_quickdivide(blasint x, blasint y){
 
 #if defined(ASSEMBLER) && !defined(NEEDPARAM)
 
-#define PROLOGUE \
-	.text ;\
-	.align	4 ;\
-	.global	REALNAME ;\
-	.type	REALNAME, %function ;\
+.macro PROLOGUE 
+	.text ;
+	.p2align 2 ;
+	.global	REALNAME ;
+	.type	REALNAME, %function ;
 REALNAME:
+.endm
+
 
 #define EPILOGUE
 

From 56837e9d92c41290b07bc924915c633e39401abb Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 4 Oct 2019 14:53:23 +0200
Subject: [PATCH 054/210] Make local labels in macro compatible with the xcode
 assembler

... which does not perform the automatic numbering on instantiation that the _@ suffix signifies
---
 kernel/arm64/nrm2.S  | 19 ++++++++++---------
 kernel/arm64/znrm2.S | 38 +++++++++++++++++++-------------------
 2 files changed, 29 insertions(+), 28 deletions(-)

diff --git a/kernel/arm64/nrm2.S b/kernel/arm64/nrm2.S
index e2cbd4def..d4f0374cb 100644
--- a/kernel/arm64/nrm2.S
+++ b/kernel/arm64/nrm2.S
@@ -54,37 +54,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 	ldr	s4, [X], #4
 	fcmp	s4, REGZERO
-	beq	KERNEL_F1_NEXT_\@
+	beq	2f     /* KERNEL_F1_NEXT_\@ */
+	beq     2f
 	fabs	s4, s4
 	fcmp	SCALE, s4
-	bge	KERNEL_F1_SCALE_GE_X_\@
+	bge	1f     /* KERNEL_F1_SCALE_GE_X_\@ */
 	fdiv	s2, SCALE, s4
 	fmul	s2, s2, s2
 	fmul	s3, SSQ, s2
 	fadd	SSQ, REGONE, s3
 	fmov	SCALE, s4
-	b	KERNEL_F1_NEXT_\@
-KERNEL_F1_SCALE_GE_X_\@:
+	b	2f     /* KERNEL_F1_NEXT_\@ */
+1:               /* KERNEL_F1_SCALE_GE_X_\@: */
 	fdiv	s2, s4, SCALE
 	fmla	SSQ, s2, v2.s[0]
 #else
 	ldr	d4, [X], #8
 	fcmp	d4, REGZERO
-	beq	KERNEL_F1_NEXT_\@
+	beq	2f     /* KERNEL_F1_NEXT_\@ */
 	fabs	d4, d4
 	fcmp	SCALE, d4
-	bge	KERNEL_F1_SCALE_GE_X_\@
+	bge	1f     /* KERNEL_F1_SCALE_GE_X_\@ */
 	fdiv	d2, SCALE, d4
 	fmul	d2, d2, d2
 	fmul	d3, SSQ, d2
 	fadd	SSQ, REGONE, d3
 	fmov	SCALE, d4
-	b	KERNEL_F1_NEXT_\@
-KERNEL_F1_SCALE_GE_X_\@:
+	b	2f     /* KERNEL_F1_NEXT_\@ */
+1:                     /* KERNEL_F1_SCALE_GE_X_\@: */
 	fdiv	d2, d4, SCALE
 	fmla	SSQ, d2, v2.d[0]
 #endif
-KERNEL_F1_NEXT_\@:
+2:                     /* KERNEL_F1_NEXT_\@: */
 .endm
 
 .macro KERNEL_S1
diff --git a/kernel/arm64/znrm2.S b/kernel/arm64/znrm2.S
index 1c89685ea..ce3f7d4ed 100644
--- a/kernel/arm64/znrm2.S
+++ b/kernel/arm64/znrm2.S
@@ -54,69 +54,69 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 	ldr	s4, [X], #4
 	fcmp	s4, REGZERO
-	beq	KERNEL_F1_NEXT_\@
+	beq	2f    /* KERNEL_F1_NEXT_\@ */
 	fabs	s4, s4
 	fcmp	SCALE, s4
-	bge	KERNEL_F1_SCALE_GE_XR_\@
+	bge	1f    /* KERNEL_F1_SCALE_GE_XR_\@ */
 	fdiv	s2, SCALE, s4
 	fmul	s2, s2, s2
 	fmul	s3, SSQ, s2
 	fadd	SSQ, REGONE, s3
 	fmov	SCALE, s4
-	b	KERNEL_F1_NEXT_\@
-KERNEL_F1_SCALE_GE_XR_\@:
+	b	2f    /* KERNEL_F1_NEXT_\@ */
+1:                    /* KERNEL_F1_SCALE_GE_XR_\@: */
 	fdiv	s2, s4, SCALE
 	fmla	SSQ, s2, v2.s[0]
-KERNEL_F1_NEXT_\@:
+2:                    /* KERNEL_F1_NEXT_\@: */
 	ldr	s5, [X], #4
 	fcmp	s5, REGZERO
-	beq	KERNEL_F1_END_\@
+	beq	4f    /* KERNEL_F1_END_\@ */
 	fabs	s5, s5
 	fcmp	SCALE, s5
-	bge	KERNEL_F1_SCALE_GE_XI_\@
+	bge	3f    /* KERNEL_F1_SCALE_GE_XI_\@ */
 	fdiv	s2, SCALE, s5
 	fmul	s2, s2, s2
 	fmul	s3, SSQ, s2
 	fadd	SSQ, REGONE, s3
 	fmov	SCALE, s5
-	b	KERNEL_F1_END_\@
-KERNEL_F1_SCALE_GE_XI_\@:
+	b	4f    /* KERNEL_F1_END_\@ */
+3:                    /* KERNEL_F1_SCALE_GE_XI_\@: */
 	fdiv	s2, s5, SCALE
 	fmla	SSQ, s2, v2.s[0]
 #else
 	ldr	d4, [X], #8
 	fcmp	d4, REGZERO
-	beq	KERNEL_F1_NEXT_\@
+	beq	2f    /* KERNEL_F1_NEXT_\@ */
 	fabs	d4, d4
 	fcmp	SCALE, d4
-	bge	KERNEL_F1_SCALE_GE_XR_\@
+	bge	1f    /* KERNEL_F1_SCALE_GE_XR_\@ */
 	fdiv	d2, SCALE, d4
 	fmul	d2, d2, d2
 	fmul	d3, SSQ, d2
 	fadd	SSQ, REGONE, d3
 	fmov	SCALE, d4
-	b	KERNEL_F1_NEXT_\@
-KERNEL_F1_SCALE_GE_XR_\@:
+	b	2f    /* KERNEL_F1_NEXT_\@ */
+1:                    /* KERNEL_F1_SCALE_GE_XR_\@: */
 	fdiv	d2, d4, SCALE
 	fmla	SSQ, d2, v2.d[0]
-KERNEL_F1_NEXT_\@:
+2:                   /* KERNEL_F1_NEXT_\@: */
 	ldr	d5, [X], #8
 	fcmp	d5, REGZERO
-	beq	KERNEL_F1_END_\@
+	beq	4f  /* KERNEL_F1_END_\@ */
 	fabs	d5, d5
 	fcmp	SCALE, d5
-	bge	KERNEL_F1_SCALE_GE_XI_\@
+	bge	3f    /* KERNEL_F1_SCALE_GE_XI_\@ */
 	fdiv	d2, SCALE, d5
 	fmul	d2, d2, d2
 	fmul	d3, SSQ, d2
 	fadd	SSQ, REGONE, d3
 	fmov	SCALE, d5
-	b	KERNEL_F1_END_\@
-KERNEL_F1_SCALE_GE_XI_\@:
+	b	4f    /* KERNEL_F1_END_\@ */
+3:                    /* KERNEL_F1_SCALE_GE_XI_\@: */
 	fdiv	d2, d5, SCALE
 	fmla	SSQ, d2, v2.d[0]
 #endif
-KERNEL_F1_END_\@:
+4:                    /* KERNEL_F1_END_\@: */
 .endm
 
 .macro KERNEL_S1

From 258ac56e0aa46e9b7120bcb5635d1bba48f4c2aa Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 5 Oct 2019 10:52:47 +0200
Subject: [PATCH 055/210] Move 32bit OSX build back to xcode 8.3 but switch to
 gcc8

---
 .travis.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 28f95f5e2..72e29091d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -162,16 +162,16 @@ matrix:
       before_script:
         - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
         - brew update
-        - brew install gcc # for gfortran
+        - brew install gcc@8 # for gfortran
       script:
         - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
       env:
-        - BTYPE="BINARY=64 INTERFACE64=1"
+        - BTYPE="BINARY=64 INTERFACE64=1 FC=gfortran-8"
 
     - <<: *test-macos
-      osx_image: xcode9.3
+      osx_image: xcode8.3
       env:
-        - BTYPE="BINARY=32"
+        - BTYPE="BINARY=32 FC=gfortran-8"
 
 # whitelist
 branches:

From 3a2df19db6b9bacd88974fbf87ef5b335fa2856f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 8 Oct 2019 08:09:26 +0200
Subject: [PATCH 056/210] Fix accidental duplication of jump instruction

---
 kernel/arm64/nrm2.S | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/arm64/nrm2.S b/kernel/arm64/nrm2.S
index d4f0374cb..0e5a8eed1 100644
--- a/kernel/arm64/nrm2.S
+++ b/kernel/arm64/nrm2.S
@@ -55,7 +55,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	ldr	s4, [X], #4
 	fcmp	s4, REGZERO
 	beq	2f     /* KERNEL_F1_NEXT_\@ */
-	beq     2f
 	fabs	s4, s4
 	fcmp	SCALE, s4
 	bge	1f     /* KERNEL_F1_SCALE_GE_X_\@ */

From a448884a63f59f54da197a7e1fe921be715ce6d5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 8 Oct 2019 08:37:50 +0200
Subject: [PATCH 057/210] Remove automatic label postfixes from macro included
 only once

---
 kernel/arm64/znrm2.S | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/kernel/arm64/znrm2.S b/kernel/arm64/znrm2.S
index ce3f7d4ed..a530b80f0 100644
--- a/kernel/arm64/znrm2.S
+++ b/kernel/arm64/znrm2.S
@@ -123,69 +123,69 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 	ldr	s4, [X]
 	fcmp	s4, REGZERO
-	beq	KERNEL_S1_NEXT_\@
+	beq	KERNEL_S1_NEXT
 	fabs	s4, s4
 	fcmp	SCALE, s4
-	bge	KERNEL_S1_SCALE_GE_XR_\@
+	bge	KERNEL_S1_SCALE_GE_XR
 	fdiv	s2, SCALE, s4
 	fmul	s2, s2, s2
 	fmul	s3, SSQ, s2
 	fadd	SSQ, REGONE, s3
 	fmov	SCALE, s4
-	b	KERNEL_S1_NEXT_\@
-KERNEL_S1_SCALE_GE_XR_\@:
+	b	KERNEL_S1_NEXT
+KERNEL_S1_SCALE_GE_XR:
 	fdiv	s2, s4, SCALE
 	fmla	SSQ, s2, v2.s[0]
-KERNEL_S1_NEXT_\@:
+KERNEL_S1_NEXT:
 	ldr	s5, [X, #4]
 	fcmp	s5, REGZERO
-	beq	KERNEL_S1_END_\@
+	beq	KERNEL_S1_END
 	fabs	s5, s5
 	fcmp	SCALE, s5
-	bge	KERNEL_S1_SCALE_GE_XI_\@
+	bge	KERNEL_S1_SCALE_GE_XI
 	fdiv	s2, SCALE, s5
 	fmul	s2, s2, s2
 	fmul	s3, SSQ, s2
 	fadd	SSQ, REGONE, s3
 	fmov	SCALE, s5
-	b	KERNEL_S1_END_\@
-KERNEL_S1_SCALE_GE_XI_\@:
+	b	KERNEL_S1_END
+KERNEL_S1_SCALE_GE_XI:
 	fdiv	s2, s5, SCALE
 	fmla	SSQ, s2, v2.s[0]
 #else
 	ldr	d4, [X]
 	fcmp	d4, REGZERO
-	beq	KERNEL_S1_NEXT_\@
+	beq	KERNEL_S1_NEXT
 	fabs	d4, d4
 	fcmp	SCALE, d4
-	bge	KERNEL_S1_SCALE_GE_XR_\@
+	bge	KERNEL_S1_SCALE_GE_XR
 	fdiv	d2, SCALE, d4
 	fmul	d2, d2, d2
 	fmul	d3, SSQ, d2
 	fadd	SSQ, REGONE, d3
 	fmov	SCALE, d4
-	b	KERNEL_S1_NEXT_\@
-KERNEL_S1_SCALE_GE_XR_\@:
+	b	KERNEL_S1_NEXT
+KERNEL_S1_SCALE_GE_XR:
 	fdiv	d2, d4, SCALE
 	fmla	SSQ, d2, v2.d[0]
-KERNEL_S1_NEXT_\@:
+KERNEL_S1_NEXT:
 	ldr	d5, [X, #8]
 	fcmp	d5, REGZERO
-	beq	KERNEL_S1_END_\@
+	beq	KERNEL_S1_END
 	fabs	d5, d5
 	fcmp	SCALE, d5
-	bge	KERNEL_S1_SCALE_GE_XI_\@
+	bge	KERNEL_S1_SCALE_GE_XI
 	fdiv	d2, SCALE, d5
 	fmul	d2, d2, d2
 	fmul	d3, SSQ, d2
 	fadd	SSQ, REGONE, d3
 	fmov	SCALE, d5
-	b	KERNEL_S1_END_\@
-KERNEL_S1_SCALE_GE_XI_\@:
+	b	KERNEL_S1_END
+KERNEL_S1_SCALE_GE_XI:
 	fdiv	d2, d5, SCALE
 	fmla	SSQ, d2, v2.d[0]
 #endif
-KERNEL_S1_END_\@:
+KERNEL_S1_END:
 	add	X, X, INC_X
 .endm
 

From f2cde2ccfb5c58a38300cf003c3edbe2a607c516 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 8 Oct 2019 20:12:08 +0200
Subject: [PATCH 058/210] Update common_arm64.h

---
 common_arm64.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/common_arm64.h b/common_arm64.h
index c5e6948dc..13718af5a 100644
--- a/common_arm64.h
+++ b/common_arm64.h
@@ -78,7 +78,17 @@ static void __inline blas_lock(volatile BLASULONG *address){
 
 #define BLAS_LOCK_DEFINED
 
+static __inline BLASULONG rpcc(void){
+  BLASULONG ret = 0;
+ 
+  __asm__ __volatile__ ("mrs %0,cntvct_el0":"=r"(ret));
 
+  return ret;
+}
+
+#define RPCC_DEFINED
+#define RPCC64BIT
+ 
 
 static inline int blas_quickdivide(blasint x, blasint y){
   return x / y;

From 5f6206fa2de4f533f003379588ca2a8b294e6c2f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 8 Oct 2019 20:13:14 +0200
Subject: [PATCH 059/210] Simplify OSX/IOS cross-compilation and add a CI test
 for it (#2279)

* Add automatic fixups for OSX/IOS cross-compilation

* Add OSX/IOS cross-compilation test to Travis CI

* Handle platforms that lack hwcap.h by falling back to ARMV8

* Fix PROLOGUE for OSX/IOS
---
 .travis.yml                   |  8 ++++++++
 c_check                       | 13 +++++++++++++
 common_arm64.h                |  2 ++
 driver/others/dynamic_arm64.c |  8 +++++++-
 4 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index 72e29091d..6016ec1fe 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -173,6 +173,14 @@ matrix:
       env:
         - BTYPE="BINARY=32 FC=gfortran-8"
 
+    - <<: *test-macos
+      osx_image: xcode10.1
+      env:
+        - COMMON_FLAGS="NUM_THREADS=32"
+        - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk"
+        - CFLAGS="-O2 -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
+        - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang"
+
 # whitelist
 branches:
   only:
diff --git a/c_check b/c_check
index 271182c54..3d82aa73c 100644
--- a/c_check
+++ b/c_check
@@ -260,6 +260,19 @@ if ($architecture ne $hostarch) {
 
 $cross = 1 if ($os ne $hostos);
 
+# rework cross suffix and architecture if we are on OSX cross-compiling for ARMV8-based IOS
+# the initial autodetection will have been confused by the command-line arguments to clang
+# and the cross-compiler apparently still claims to build for x86_64 in its CC -E output
+if (($os eq "Darwin") && ($cross_suffix ne "")) {
+  my $tmpnam = `xcrun --sdk iphoneos --find clang`;
+  $cross_suffix = substr($tmpnam, 0, rindex($tmpnam, "/")+1 ); 
+# this should produce something like $cross_suffix="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/";
+  $cross =1;
+  $architecture = arm64;
+}
+
+
+
 $openmp = "" if $ENV{USE_OPENMP} != 1;
 
 $linker_L = "";
diff --git a/common_arm64.h b/common_arm64.h
index c5e6948dc..f27ca8c63 100644
--- a/common_arm64.h
+++ b/common_arm64.h
@@ -107,7 +107,9 @@ static inline int blas_quickdivide(blasint x, blasint y){
 	.text ;
 	.p2align 2 ;
 	.global	REALNAME ;
+#ifndef __APPLE__
 	.type	REALNAME, %function ;
+#endif
 REALNAME:
 .endm
 
diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index b4ce6b67d..9db9ba17d 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -37,8 +37,10 @@
 /*********************************************************************/
 
 #include "common.h"
+#if (defined OS_LINUX || defined OS_ANDROID)
 #include <asm/hwcap.h>
 #include <sys/auxv.h>
+#endif
 
 extern gotoblas_t  gotoblas_ARMV8;
 extern gotoblas_t  gotoblas_CORTEXA57;
@@ -105,13 +107,17 @@ static gotoblas_t *force_coretype(char *coretype) {
 
 static gotoblas_t *get_coretype(void) {
   int implementer, variant, part, arch, revision, midr_el1;
-  
+
+#if (defined OS_LINUX || defined OS_ANDROID)
   if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
     char coremsg[128];
     snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
     openblas_warning(1, coremsg);
     return NULL;
   }
+#else
+   return NULL;
+#endif
 
   get_cpu_ftr(MIDR_EL1, midr_el1);
   /*

From f262031685ee8f912f8b2b1f0cdc136fec1f550c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 8 Oct 2019 22:30:02 +0200
Subject: [PATCH 060/210] Support QEMU virtual cpu as CORE2

qemu itself claims it is a 64bit P6, which does not exist in the wild.
---
 cpuid_x86.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/cpuid_x86.c b/cpuid_x86.c
index 8c954bf21..2181db4db 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1197,7 +1197,11 @@ int get_cpuname(void){
 	case  3:
 	case  5:
 	case  6:
+#ifdef __64BIT__
+	  return CPUTYPE_CORE2;
+#else			
 	  return CPUTYPE_PENTIUM2;
+#endif
 	case  7:
 	case  8:
 	case 10:

From e8a2aed2b9ccf4dbc78e622df408e1f77f837ab0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 9 Oct 2019 18:24:13 +0200
Subject: [PATCH 061/210] Support QEMU cpu calling itself 64bit AMD Athlon as
 well

Some QEMU instances pretend to be "AuthenticAMD" with the same family 6/model 6 even when running on an Intel host
(could be related to qemu or libvirt version and/or kvm availability). Also fix the define to depend on __x86_64__ set by the
compiler, the defines using __64BIT__ will only work for getarch_2nd.
---
 cpuid_x86.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/cpuid_x86.c b/cpuid_x86.c
index 2181db4db..92c8e1b67 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1197,11 +1197,7 @@ int get_cpuname(void){
 	case  3:
 	case  5:
 	case  6:
-#ifdef __64BIT__
-	  return CPUTYPE_CORE2;
-#else			
 	  return CPUTYPE_PENTIUM2;
-#endif
 	case  7:
 	case  8:
 	case 10:
@@ -1383,8 +1379,6 @@ int get_cpuname(void){
       break;
       case 7: // family 6 exmodel 7
         switch (model) {
-        case 10: // Goldmont Plus
-          return CPUTYPE_NEHALEM;
         case 14: // Ice Lake
           if(support_avx512())
             return CPUTYPE_SKYLAKEX;
@@ -1431,7 +1425,11 @@ int get_cpuname(void){
     case 0x5:
       return CPUTYPE_AMDK6;
     case 0x6:
+#if defined(__x86_64__) || defined(__amd64__)
+      return CPUTYPE_BARCELONA;
+#else
       return CPUTYPE_ATHLON;
+#endif
     case 0xf:
       switch (exfamily) {
       case  0:
@@ -1814,7 +1812,11 @@ int get_coretype(void){
 	case  4:
 	case  5:
 	case  6:
+#if defined(__x86_64__) || defined(__amd64__)
+	  return CORE_CORE2;
+#else
 	  return CORE_P6;
+#endif
 	case  7:
 	  return CORE_KATMAI;
 	case  8:
@@ -2021,7 +2023,11 @@ int get_coretype(void){
 
   if (vendor == VENDOR_AMD){
     if (family <= 0x5) return CORE_80486;
+#if defined(__x86_64__) || defined(__amd64__)
+    if (family <= 0xe) return CORE_BARCELONA;
+#else
     if (family <= 0xe) return CORE_ATHLON;
+#endif
     if (family == 0xf){
       if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
       else if (exfamily == 5) return CORE_BOBCAT;

From 844629af5702148b5eaee909472f4a80b368498d Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Wed, 16 Oct 2019 02:00:34 +0800
Subject: [PATCH 062/210] Add files via upload

---
 kernel/x86_64/KERNEL.SKYLAKEX             |   4 +-
 kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 811 ++++++++++++++++++++++
 2 files changed, 812 insertions(+), 3 deletions(-)
 create mode 100644 kernel/x86_64/dgemm_kernel_8x8_skylakex.c

diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index d61c51628..d73a47925 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -7,10 +7,8 @@ SGEMMITCOPY    =  sgemm_tcopy_16_skylakex.c
 SGEMMONCOPY    =  sgemm_ncopy_4_skylakex.c
 SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
 
-#DGEMMKERNEL    =  dgemm_kernel_4x8_skylakex.c
+DGEMMKERNEL    =  dgemm_kernel_8x8_skylakex.c
 
-#DGEMMINCOPY    =  dgemm_ncopy_8_skylakex.c
-#DGEMMITCOPY    =  dgemm_tcopy_8_skylakex.c
 DGEMMONCOPY    =  dgemm_ncopy_8_skylakex.c
 DGEMMOTCOPY    =  dgemm_tcopy_8_skylakex.c
 
diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
new file mode 100644
index 000000000..b4a87cbce
--- /dev/null
+++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
@@ -0,0 +1,811 @@
+#include "common.h"
+#include <immintrin.h>
+/* row-major c_block */
+/* 64-bit pointer registers: a_block_pointer,b_block_pointer,c_pointer;*/
+#define INNER_KERNEL_k1m1n8 \
+    "prefetcht0 384(%1);"\
+    "prefetcht0 768(%0); vmovupd (%1),%%zmm5; addq $64,%1;"\
+    "vbroadcastsd   (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;"
+
+#define INNER_KERNEL_k1m2n8 \
+    INNER_KERNEL_k1m1n8\
+    "vbroadcastsd  8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;"
+
+#define INNER_KERNEL_k1m4n8 \
+    INNER_KERNEL_k1m2n8\
+    "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\
+    "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;"
+
+#define INNER_KERNEL_k1m8n8 \
+    INNER_KERNEL_k1m4n8\
+    "vbroadcastsd 32(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\
+    "vbroadcastsd 40(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\
+    "vbroadcastsd 48(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\
+    "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;"
+
+#define INNER_KERNEL_k1m1n16 \
+    "prefetcht0 384(%1); prefetcht0 448(%1);"\
+    "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; addq $128,%1;"\
+    "vbroadcastsd   (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;"
+
+#define INNER_KERNEL_k1m2n16 \
+    INNER_KERNEL_k1m1n16\
+    "vbroadcastsd  8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;"
+
+#define INNER_KERNEL_k1m4n16 \
+    INNER_KERNEL_k1m2n16\
+    "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\
+    "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;"
+
+#define INNER_KERNEL_k1m8n16 \
+    INNER_KERNEL_k1m4n16\
+    "vbroadcastsd 32(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\
+    "vbroadcastsd 40(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\
+    "vbroadcastsd 48(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\
+    "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;"
+
+#define INNER_KERNEL_k1m1n24 \
+    "prefetcht0 384(%1); prefetcht0 448(%1); prefetcht0 512(%1);"\
+    "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; vmovupd 128(%1),%%zmm7; addq $192,%1;"\
+    "vbroadcastsd   (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;"
+
+#define INNER_KERNEL_k1m2n24 \
+    INNER_KERNEL_k1m1n24\
+    "vbroadcastsd  8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;"
+
+#define INNER_KERNEL_k1m4n24 \
+    INNER_KERNEL_k1m2n24\
+    "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\
+    "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;"
+
+#define INNER_KERNEL_k1m8n24 \
+    INNER_KERNEL_k1m4n24\
+    "vbroadcastsd 32(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\
+    "vbroadcastsd 40(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\
+    "vbroadcastsd 48(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\
+    "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;"
+
+#define INNER_KERNELm1(nn) \
+    "cmpq $1,%2;jb "#nn"3f;"\
+    #nn"4:\n\t"\
+    INNER_KERNEL_k1m1n##nn "addq $8,%0;"\
+    "decq %2;cmpq $1,%2;jnb "#nn"4b;"\
+    #nn"3:\n\t"
+
+#define INNER_KERNELm2(nn) \
+    "cmpq $1,%2;jb "#nn"0f;"\
+    #nn"1:\n\t"\
+    INNER_KERNEL_k1m2n##nn "addq $16,%0;"\
+    "decq %2;cmpq $1,%2;jnb "#nn"1b;"\
+    #nn"0:\n\t"
+
+#define INNER_KERNELm4(nn) \
+    "cmpq $1,%2;jb "#nn"00f;"\
+    #nn"01:\n\t"\
+    INNER_KERNEL_k1m4n##nn "addq $32,%0;"\
+    "decq %2;cmpq $1,%2;jnb "#nn"01b;"\
+    #nn"00:\n\t"
+
+#define INNER_KERNELm8(nn) \
+    "cmpq $8,%2;jb "#nn"001f;"\
+    #nn"008:\n\t"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
+    "subq $8,%2;cmpq $8,%2;jnb "#nn"008b;"\
+    #nn"001:\n\t"\
+    "cmpq $1,%2;jb "#nn"000f;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%0;"\
+    "decq %2;cmpq $1,%2;jnb "#nn"001b;"\
+    ""#nn"000:\n\t"
+
+#define INNER_INIT_m1n8 \
+    "vpxorq %%zmm8, %%zmm8, %%zmm8;"
+
+#define INNER_INIT_m2n8 \
+    "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;"
+
+#define INNER_INIT_m4n8 \
+    "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;"
+
+#define INNER_INIT_m8n8 \
+    INNER_INIT_m4n8\
+    "vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;"
+
+#define INNER_INIT_m1n16 INNER_INIT_m2n8
+
+#define INNER_INIT_m2n16 INNER_INIT_m4n8
+
+#define INNER_INIT_m4n16 INNER_INIT_m8n8
+
+#define INNER_INIT_m8n16 \
+    INNER_INIT_m8n8\
+    "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\
+    "vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq %%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;"
+
+#define INNER_INIT_m1n24 \
+    "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;"
+
+#define INNER_INIT_m2n24 \
+    INNER_INIT_m1n24\
+    "vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;"
+
+#define INNER_INIT_m4n24 \
+    INNER_INIT_m4n16\
+    "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"
+
+#define INNER_INIT_m8n24 \
+    INNER_INIT_m8n16\
+    "vpxorq %%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\
+    "vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;"
+
+#define INNER_SETINDEX \
+    "vpinsrq $0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\
+    "kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\
+    "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
+    "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
+    "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
+    "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
+    "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
+    "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"
+
+#define INNER_STORE_m1n8(c1,disp) \
+    "kxnorw %%k1,%%k1,%%k1;"\
+    "vgatherqpd "#disp"(%3,%%zmm6,1), %%zmm7 %{%%k1%};"\
+    "vaddpd %%zmm7,"#c1","#c1";"\
+    "kxnorw %%k1,%%k1,%%k1;"\
+    "vscatterqpd "#c1", "#disp"(%3,%%zmm6,1) %{%%k1%};"
+
+#define INNER_SAVE_m1n8 \
+    INNER_SETINDEX\
+    INNER_STORE_m1n8(%%zmm8,0)
+
+#define INNER_SAVE_m1n16 \
+    INNER_SAVE_m1n8\
+    "leaq (%3,%4,8),%3;"\
+    INNER_STORE_m1n8(%%zmm9,0)
+
+#define INNER_SAVE_m1n24 \
+    INNER_SAVE_m1n16\
+    "leaq (%3,%4,8),%3;"\
+    INNER_STORE_m1n8(%%zmm10,0)
+
+#define INNER_SAVE_m2n8 \
+    INNER_SETINDEX\
+    INNER_STORE_m1n8(%%zmm8,0)\
+    INNER_STORE_m1n8(%%zmm9,8)
+
+#define INNER_SAVE_m2n16 \
+    INNER_SETINDEX\
+    INNER_STORE_m1n8(%%zmm8,0)\
+    INNER_STORE_m1n8(%%zmm10,8)\
+    "leaq (%3,%4,8),%3;"\
+    INNER_STORE_m1n8(%%zmm9,0)\
+    INNER_STORE_m1n8(%%zmm11,8)
+
+#define INNER_SAVE_m2n24 \
+    INNER_SETINDEX\
+    INNER_STORE_m1n8(%%zmm8,0)\
+    INNER_STORE_m1n8(%%zmm11,8)\
+    "leaq (%3,%4,8),%3;"\
+    INNER_STORE_m1n8(%%zmm9,0)\
+    INNER_STORE_m1n8(%%zmm12,8)\
+    "leaq (%3,%4,8),%3;"\
+    INNER_STORE_m1n8(%%zmm10,0)\
+    INNER_STORE_m1n8(%%zmm13,8)
+
+#define INNER_PREF_8x8 \
+    "prefetcht0 (%3); prefetcht0 56(%3); prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2);"\
+    "prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,2),%3;"\
+    "prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,1),%3;"\
+    "prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4);"\
+    "subq %4,%3; subq %4,%3; subq %4,%3;"
+
+#define INNER_TRANS_4x8(c1,c2,c3,c4) \
+    "vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\
+    "vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\
+    "vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\
+    "vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\
+    "vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\
+    "vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";"
+
+#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
+    INNER_TRANS_4x8(c1,c2,c3,c4)\
+    INNER_TRANS_4x8(c5,c6,c7,c8)\
+    "vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\
+    "vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd  %%zmm4,"#c5","#c5"%{%5%};"\
+    "vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\
+    "vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd  %%zmm5,"#c6","#c6"%{%5%};"\
+    "vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\
+    "vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd  %%zmm6,"#c7","#c7"%{%5%};"\
+    "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
+    "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd  %%zmm7,"#c8","#c8"%{%5%};"
+
+#define INNER_STORE_4x8(c1,c2,c3,c4) \
+    "vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vaddpd %%zmm4,"#c1","#c1";"\
+    "vmovupd "#c1",(%3)%{%5%}; vmovupd "#c1",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
+    "vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vaddpd %%zmm5,"#c2","#c2";"\
+    "vmovupd "#c2",(%3)%{%5%}; vmovupd "#c2",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
+    "vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vaddpd %%zmm6,"#c3","#c3";"\
+    "vmovupd "#c3",(%3)%{%5%}; vmovupd "#c3",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
+    "vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vaddpd %%zmm7,"#c4","#c4";"\
+    "vmovupd "#c4",(%3)%{%5%}; vmovupd "#c4",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
+    "leaq (%3,%4,4),%3;"
+
+#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
+    "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
+    "vaddpd (%3),"#c1","#c1"; vmovupd "#c1",(%3); vaddpd (%3,%4,1),"#c2","#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\
+    "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
+    "vaddpd (%3),"#c3","#c3"; vmovupd "#c3",(%3); vaddpd (%3,%4,1),"#c4","#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\
+    "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
+    "vaddpd (%3),"#c5","#c5"; vmovupd "#c5",(%3); vaddpd (%3,%4,1),"#c6","#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\
+    "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
+    "vaddpd (%3),"#c7","#c7"; vmovupd "#c7",(%3); vaddpd (%3,%4,1),"#c8","#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;"
+
+#define INNER_SAVE_m4n8 \
+    INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\
+    INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)
+
+#define INNER_SAVE_m4n16 \
+    INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
+    INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
+    INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\
+    INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)
+
+#define INNER_SAVE_m4n24 \
+    INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
+    INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
+    INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
+    INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
+    INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\
+    INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)
+
+#define INNER_SAVE_m8n8 \
+    INNER_PREF_8x8\
+    INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
+    INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)
+
+#define INNER_SAVE_m8n16 \
+    INNER_PREF_8x8\
+    INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
+    INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
+    INNER_PREF_8x8\
+    INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\
+    INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)
+
+#define INNER_SAVE_m8n24 \
+    INNER_PREF_8x8\
+    INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
+    INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
+    INNER_PREF_8x8\
+    INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
+    INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
+    INNER_PREF_8x8\
+    INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\
+    INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)
+
+#define COMPUTE_m1n8 {\
+    __asm__ __volatile__(\
+    INNER_INIT_m1n8\
+    INNER_KERNELm1(8)\
+    INNER_SAVE_m1n8\
+    :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes)\
+    :"zmm4","zmm5","zmm6","zmm7","zmm8","cc","memory","k1");\
+    c_pointer += 1;\
+}
+#define COMPUTE_m2n8 {\
+    __asm__ __volatile__(\
+    INNER_INIT_m2n8\
+    INNER_KERNELm2(8)\
+    INNER_SAVE_m2n8\
+    :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes)\
+    :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","cc","memory","k1");\
+    c_pointer += 2;\
+}
+#define COMPUTE_m4n8 {\
+    __asm__ __volatile__(\
+    INNER_INIT_m4n8\
+    INNER_KERNELm4(8)\
+    INNER_SAVE_m4n8\
+    :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes),"Yk"(k02),"Yk"(k03),"Yk"(k01)\
+    :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","cc","memory");\
+    c_pointer += 4;\
+}
+#define COMPUTE_m8n8 {\
+    __asm__ __volatile__(\
+    INNER_INIT_m8n8\
+    INNER_KERNELm8(8)\
+    INNER_SAVE_m8n8\
+    :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes),"Yk"(k02),"Yk"(k03)\
+    :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory");\
+    c_pointer += 8;\
+}
+
+#define COMPUTE_n8 {\
+    __asm__ __volatile__(\
+    "movq %8,%%r14;movq %2,%%r13;"\
+    "cmpq $8,%8; jb 42222f;"\
+    "42221:\n\t"\
+    INNER_INIT_m8n8\
+    INNER_KERNELm8(8)\
+    INNER_SAVE_m8n8\
+    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\
+    "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $64,%3;"\
+    "subq $8,%8; cmpq $8,%8; jnb 42221b;"\
+    "42222:\n\t"\
+    "cmpq $4,%8; jb 42223f;"\
+    INNER_INIT_m4n8\
+    INNER_KERNELm4(8)\
+    INNER_SAVE_m4n8\
+    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\
+    "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $32,%3;"\
+    "subq $4,%8;"\
+    "42223:\n\t"\
+    "cmpq $2,%8; jb 42224f;"\
+    INNER_INIT_m2n8\
+    INNER_KERNELm2(8)\
+    INNER_SAVE_m2n8\
+    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\
+    "addq $16,%3;"\
+    "subq $2,%8;"\
+    "42224:\n\t"\
+    "cmpq $1,%8; jb 42225f;"\
+    INNER_INIT_m1n8\
+    INNER_KERNELm1(8)\
+    INNER_SAVE_m1n8\
+    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\
+    "addq $8,%3;"\
+    "42225:\n\t"\
+    "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
+    "shlq $3,%4;addq %4,%3;shrq $3,%4;"\
+    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
+    ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r13","r14");\
+    a_block_pointer -= M * K;\
+}
+#define COMPUTE_n16 {\
+    __asm__ __volatile__(\
+    "movq %8,%%r14;movq %2,%%r13;"\
+    "cmpq $8,%8; jb 32222f;"\
+    "32221:\n\t"\
+    INNER_INIT_m8n16\
+    INNER_KERNELm8(16)\
+    INNER_SAVE_m8n16\
+    "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\
+    "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
+    "subq $8,%8; cmpq $8,%8; jnb 32221b;"\
+    "32222:\n\t"\
+    "cmpq $4,%8; jb 32223f;"\
+    INNER_INIT_m4n16\
+    INNER_KERNELm4(16)\
+    INNER_SAVE_m4n16\
+    "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\
+    "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
+    "subq $4,%8;"\
+    "32223:\n\t"\
+    "cmpq $2,%8; jb 32224f;"\
+    INNER_INIT_m2n16\
+    INNER_KERNELm2(16)\
+    INNER_SAVE_m2n16\
+    "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\
+    "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $16,%3;"\
+    "subq $2,%8;"\
+    "32224:\n\t"\
+    "cmpq $1,%8; jb 32225f;"\
+    INNER_INIT_m1n16\
+    INNER_KERNELm1(16)\
+    INNER_SAVE_m1n16\
+    "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\
+    "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $8,%3;"\
+    "32225:\n\t"\
+    "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
+    "shlq $4,%4;addq %4,%3;shrq $4,%4;"\
+    :"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
+    ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
+    "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r13","r14");\
+    a_block_pointer -= M * K;\
+}
+#define COMPUTE_n24 {\
+    __asm__ __volatile__(\
+    "movq %8,%%r14;movq %9,%%r15;movq %2,%%r13;"\
+    "cmpq $8,%8; jb 22222f;"\
+    "22221:\n\t"\
+    INNER_INIT_m8n24\
+    "prefetcht2 (%%r15); prefetcht2 64(%%r15);"\
+    INNER_KERNELm8(24)\
+    "prefetcht2 128(%%r15); prefetcht2 192(%%r15);"\
+    INNER_SAVE_m8n24\
+    "prefetcht2 256(%%r15); prefetcht2 320(%%r15); addq $384,%%r15;"\
+    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+    "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
+    "subq $8,%8; cmpq $8,%8; jnb 22221b;"\
+    "22222:\n\t"\
+    "cmpq $4,%8; jb 22223f;"\
+    INNER_INIT_m4n24\
+    INNER_KERNELm4(24)\
+    INNER_SAVE_m4n24\
+    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+    "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
+    "subq $4,%8;"\
+    "22223:\n\t"\
+    "cmpq $2,%8; jb 22224f;"\
+    INNER_INIT_m2n24\
+    INNER_KERNELm2(24)\
+    INNER_SAVE_m2n24\
+    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+    "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $16,%3;"\
+    "subq $2,%8;"\
+    "22224:\n\t"\
+    "cmpq $1,%8; jb 22225f;"\
+    INNER_INIT_m1n24\
+    INNER_KERNELm1(24)\
+    INNER_SAVE_m1n24\
+    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+    "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $8,%3;"\
+    "22225:\n\t"\
+    "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
+    "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
+    :"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),\
+    "+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(packed_b_pointer)\
+    ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
+    "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r13","r14","r15");\
+    a_block_pointer -= M * K;\
+}
+
+static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8
+//perform C += A<pack> B<pack>
+    if(k==0 || m==0 || ndiv8==0) return;
+    int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double);
+    int64_t K = (int64_t)k; int64_t M = (int64_t)m;
+    double *a_block_pointer;
+    double *c_pointer = c;
+    __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033;
+    BLASLONG ndiv8_count;
+    double *b_scratch = (double *)aligned_alloc(64,192*k);
+    double *packed_b_pointer = packed_b;
+    a_block_pointer = packed_a;
+    for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){
+      __asm__ __volatile__ (
+        "testq %2,%2; jz 100002f;movq %2,%%r13;shlq $6,%%r13;"
+        "100001:\n\t"
+        "vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; vmovupd (%0,%%r13,2),%%zmm7; addq $64,%0;"
+        "vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); vmovupd %%zmm7,128(%1); addq $192,%1;"
+        "decq %2; testq %2,%2; jnz 100001b;"
+        "100002:\n\t"
+        "movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,2),%0;subq %%r13,%1;subq %%r13,%1;subq %%r13,%1;"
+        :"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6","zmm7");
+      COMPUTE_n24
+    }
+    for(;ndiv8_count>1;ndiv8_count-=2){
+      __asm__ __volatile__ (
+        "testq %2,%2; jz 1000002f;movq %2,%%r13;shlq $6,%%r13;"
+        "1000001:\n\t"
+        "vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; addq $64,%0;"
+        "vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); addq $128,%1;"
+        "decq %2; testq %2,%2; jnz 1000001b;"
+        "1000002:\n\t"
+        "movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,1),%0;subq %%r13,%1;subq %%r13,%1;"
+        :"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6");
+      COMPUTE_n16
+    }
+    if(ndiv8_count>0){
+      COMPUTE_n8
+    }
+    free(b_scratch);b_scratch=NULL;
+}
+
+/* __m512d accumulators: zc1-zc4; temporary variables: za1,zb1-zb2 */
+/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */
+/* __m128d accumulators: xc1-xc4; temporary variables: xa1,xb1-xb2 */
+/*  double accumulator:  sc1;     temporary variables: sa1,sb1 */
+/* column-major c_block */
+#define KERNEL_m8n4k1 {\
+    __asm__ __volatile__(\
+    "vmovupd (%0),%2; addq $64,%0;"\
+    "vbroadcastsd   (%1),%3; vfmadd231pd %2,%3,%5; "\
+    "vbroadcastsd  8(%1),%4; vfmadd231pd %2,%4,%6; "\
+    "vbroadcastsd 16(%1),%3; vfmadd231pd %2,%3,%7; "\
+    "vbroadcastsd 24(%1),%4; vfmadd231pd %2,%4,%8; "\
+    "addq $32,%1;"\
+    :"+r"(a_block_pointer),"+r"(b_block_pointer),"+v"(za1),"+v"(zb1),"+v"(zb2),"+v"(zc1),"+v"(zc2),"+v"(zc3),"+v"(zc4)::"cc","memory");\
+}
+#define KERNEL_m8n2k1 {\
+    __asm__ __volatile__(\
+    "vmovupd (%0),%2; addq $64,%0;"\
+    "vbroadcastsd   (%1),%3; vfmadd231pd %2,%3,%5; "\
+    "vbroadcastsd  8(%1),%4; vfmadd231pd %2,%4,%6; "\
+    "addq $16,%1;"\
+    :"+r"(a_block_pointer),"+r"(b_block_pointer),"+v"(za1),"+v"(zb1),"+v"(zb2),"+v"(zc1),"+v"(zc2)::"cc","memory");\
+}
+#define KERNEL_m8n1k1 {\
+    __asm__ __volatile__(\
+    "vmovupd (%0),%2; addq $64,%0;"\
+    "vbroadcastsd   (%1),%3; vfmadd231pd %2,%3,%4; "\
+    "addq $8,%1;"\
+    :"+r"(a_block_pointer),"+r"(b_block_pointer),"+v"(za1),"+v"(zb1),"+v"(zc1)::"cc","memory");\
+}
+#define INIT_m8n1 zc1=_mm512_setzero_pd();
+#define INIT_m8n2 zc2=INIT_m8n1
+#define INIT_m8n4 zc4=zc3=INIT_m8n2
+#define SAVE_m8n1 {\
+    za1 = _mm512_loadu_pd(c_pointer);\
+    zc1 = _mm512_add_pd(zc1,za1);\
+    _mm512_storeu_pd(c_pointer,zc1);\
+    c_pointer += 8;\
+}
+#define SAVE_m8n2 {\
+    zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\
+    zc1 = _mm512_add_pd(zc1,zb1); zc2 = _mm512_add_pd(zc2,zb2);\
+    _mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\
+    c_pointer += 8;\
+}
+#define SAVE_m8n4 {\
+    zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\
+    zc1 = _mm512_add_pd(zc1,zb1); zc2 = _mm512_add_pd(zc2,zb2);\
+    _mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\
+    c_pointer += LDC*2;\
+    zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\
+    zc3 = _mm512_add_pd(zc3,zb1); zc4 = _mm512_add_pd(zc4,zb2);\
+    _mm512_storeu_pd(c_pointer,zc3); _mm512_storeu_pd(c_pointer+LDC,zc4);\
+    c_pointer += 8-LDC*2;\
+}
+#define KERNEL_m4n4k1 {\
+    ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\
+    yb1 = _mm256_broadcast_sd(b_block_pointer);   yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
+    yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\
+    yb1 = _mm256_broadcast_sd(b_block_pointer+2); yc3 = _mm256_fmadd_pd(ya1,yb1,yc3);\
+    yb2 = _mm256_broadcast_sd(b_block_pointer+3); yc4 = _mm256_fmadd_pd(ya1,yb2,yc4);\
+    b_block_pointer+=4;\
+}
+#define KERNEL_m4n2k1 {\
+    ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\
+    yb1 = _mm256_broadcast_sd(b_block_pointer);   yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
+    yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\
+    b_block_pointer+=2;\
+}
+#define KERNEL_m4n1k1 {\
+    ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\
+    yb1 = _mm256_broadcast_sd(b_block_pointer);   yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
+    b_block_pointer++;\
+}
+#define INIT_m4n1 yc1=_mm256_setzero_pd();
+#define INIT_m4n2 yc2=INIT_m4n1
+#define INIT_m4n4 yc4=yc3=INIT_m4n2
+#define SAVE_m4n1 {\
+    ya1 = _mm256_loadu_pd(c_pointer);\
+    yc1 = _mm256_add_pd(yc1,ya1);\
+    _mm256_storeu_pd(c_pointer,yc1);\
+    c_pointer += 4;\
+}
+#define SAVE_m4n2 {\
+    yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
+    yc1 = _mm256_add_pd(yc1,yb1); yc2 = _mm256_add_pd(yc2,yb2);\
+    _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\
+    c_pointer += 4;\
+}
+#define SAVE_m4n4 {\
+    yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
+    yc1 = _mm256_add_pd(yc1,yb1); yc2 = _mm256_add_pd(yc2,yb2);\
+    _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\
+    c_pointer += LDC*2;\
+    yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
+    yc3 = _mm256_add_pd(yc3,yb1); yc4 = _mm256_add_pd(yc4,yb2);\
+    _mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\
+    c_pointer += 4-LDC*2;\
+}
+#define KERNEL_m2n2k1 {\
+    xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\
+    xb1 = _mm_loaddup_pd(b_block_pointer);   xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\
+    xb2 = _mm_loaddup_pd(b_block_pointer+1); xc2 = _mm_fmadd_pd(xa1,xb2,xc2);\
+    b_block_pointer += 2;\
+}
+#define KERNEL_m2n1k1 {\
+    xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\
+    xb1 = _mm_loaddup_pd(b_block_pointer);   xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\
+    b_block_pointer ++;\
+}
+#define INIT_m2n1 xc1=_mm_setzero_pd();
+#define INIT_m2n2 xc2=INIT_m2n1
+#define SAVE_m2n1 {\
+    xa1 = _mm_loadu_pd(c_pointer);\
+    xc1 = _mm_add_pd(xc1,xa1);\
+    _mm_storeu_pd(c_pointer,xc1);\
+    c_pointer += 2;\
+}
+#define SAVE_m2n2 {\
+    xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\
+    xc1 = _mm_add_pd(xc1,xb1); xc2 = _mm_add_pd(xc2,xb2);\
+    _mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\
+    c_pointer += 2;\
+}
+#define KERNEL_m1n1k1 {\
+    sa1 = *a_block_pointer; a_block_pointer++;\
+    sb1 = *b_block_pointer; sc1 += sa1 * sb1;\
+    b_block_pointer ++;\
+}
+#define INIT_m1n1 sc1=0.0;
+#define SAVE_m1n1 {\
+    *c_pointer += sc1;\
+    c_pointer++;\
+}
+
+/* row-major c_block */
+#define KERNEL_m2n4k1 {\
+    yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\
+    ya1 = _mm256_broadcast_sd(a_block_pointer);  yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
+    ya1 = _mm256_broadcast_sd(a_block_pointer+1);yc2 = _mm256_fmadd_pd(ya1,yb1,yc2);\
+    a_block_pointer += 2;\
+}
+#define KERNEL_m1n4k1 {\
+    yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\
+    ya1 = _mm256_broadcast_sd(a_block_pointer);  yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
+    a_block_pointer ++;\
+}
+#define KERNEL_m1n2k1 {\
+    xb1 = _mm_loadu_pd(b_block_pointer);b_block_pointer+=2;\
+    xa1 = _mm_loaddup_pd(a_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\
+    a_block_pointer ++;\
+}
+#define INIT_m1n2 INIT_m2n1
+#define INIT_m1n4 INIT_m4n1
+#define INIT_m2n4 INIT_m4n2
+#define SAVE_m2n4 {\
+    yb1 = _mm256_unpacklo_pd(yc1,yc2);\
+    yb2 = _mm256_unpackhi_pd(yc1,yc2);\
+    xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\
+    xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+LDC),_mm256_extractf128_pd(yb2,0));\
+    _mm_storeu_pd(c_pointer,xb1);\
+    _mm_storeu_pd(c_pointer+LDC,xb2);\
+    xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer+2*LDC),_mm256_extractf128_pd(yb1,1));\
+    xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+3*LDC),_mm256_extractf128_pd(yb2,1));\
+    _mm_storeu_pd(c_pointer+2*LDC,xb1);\
+    _mm_storeu_pd(c_pointer+3*LDC,xb2);\
+    c_pointer += 2;\
+}
+#define SAVE_m1n2 {\
+    *c_pointer += _mm_cvtsd_f64(xc1);\
+    xa1 = _mm_unpackhi_pd(xc1,xc1);\
+    c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\
+    c_pointer ++;\
+}
+#define SAVE_m1n4 {\
+    *c_pointer += _mm256_cvtsd_f64(yc1);\
+    ya1 = _mm256_unpackhi_pd(yc1,yc1);\
+    c_pointer[LDC] += _mm256_cvtsd_f64(ya1);\
+    xb1 = _mm256_extractf128_pd(yc1,1);\
+    c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\
+    xb2 = _mm_unpackhi_pd(xb1,xb1);\
+    c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\
+    c_pointer ++;\
+}
+
+static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8
+//perform C += A<pack> B<pack> , edge_n<8 must be satisfied !
+    if(k==0 || m==0 || edge_n==0) return;
+    double *a_block_pointer,*b_block_pointer,*b_base_pointer;
+    double *c_pointer = c;
+    __m512d zb1,zb2,za1,zc1,zc2,zc3,zc4;
+    __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2;
+    __m128d xc1,xc2,xa1,xb1,xb2;
+    double sc1,sa1,sb1;
+    BLASLONG m_count,n_count,k_count;
+    b_base_pointer = packed_b;
+//now start calculation of the edge part
+    for(n_count=edge_n;n_count>3;n_count-=4){
+      a_block_pointer = packed_a;
+      for(m_count=m;m_count>7;m_count-=8){
+        b_block_pointer = b_base_pointer;
+        INIT_m8n4
+        for(k_count=0;k_count<k;k_count++) KERNEL_m8n4k1
+        SAVE_m8n4
+      }
+      for(;m_count>3;m_count-=4){
+        b_block_pointer = b_base_pointer;
+        INIT_m4n4
+        for(k_count=0;k_count<k;k_count++) KERNEL_m4n4k1
+        SAVE_m4n4
+      }
+      for(;m_count>1;m_count-=2){
+        b_block_pointer = b_base_pointer;
+        INIT_m2n4
+        for(k_count=0;k_count<k;k_count++) KERNEL_m2n4k1
+        SAVE_m2n4
+      }
+      if(m_count>0){
+        b_block_pointer = b_base_pointer;
+        INIT_m1n4
+        for(k_count=0;k_count<k;k_count++) KERNEL_m1n4k1
+        SAVE_m1n4
+      }
+      b_base_pointer += 4*k;
+      c_pointer += 4 * LDC - m;
+    }
+    for(;n_count>1;n_count-=2){
+      a_block_pointer = packed_a;
+      for(m_count=m;m_count>7;m_count-=8){
+        b_block_pointer = b_base_pointer;
+        INIT_m8n2
+        for(k_count=0;k_count<k;k_count++) KERNEL_m8n2k1
+        SAVE_m8n2
+      }
+      for(;m_count>3;m_count-=4){
+        b_block_pointer = b_base_pointer;
+        INIT_m4n2
+        for(k_count=0;k_count<k;k_count++) KERNEL_m4n2k1
+        SAVE_m4n2
+      }
+      for(;m_count>1;m_count-=2){
+        b_block_pointer = b_base_pointer;
+        INIT_m2n2
+        for(k_count=0;k_count<k;k_count++) KERNEL_m2n2k1
+        SAVE_m2n2
+      }
+      if(m_count>0){
+        b_block_pointer = b_base_pointer;
+        INIT_m1n2
+        for(k_count=0;k_count<k;k_count++) KERNEL_m1n2k1
+        SAVE_m1n2
+      }
+      b_base_pointer += 2*k;
+      c_pointer += 2 * LDC - m;
+    }
+    if(n_count>0){
+      a_block_pointer = packed_a;
+      for(m_count=m;m_count>7;m_count-=8){
+        b_block_pointer = b_base_pointer;
+        INIT_m8n1
+        for(k_count=0;k_count<k;k_count++) KERNEL_m8n1k1
+        SAVE_m8n1
+      }
+      for(;m_count>3;m_count-=4){
+        b_block_pointer = b_base_pointer;
+        INIT_m4n1
+        for(k_count=0;k_count<k;k_count++) KERNEL_m4n1k1
+        SAVE_m4n1
+      }
+      for(;m_count>1;m_count-=2){
+        b_block_pointer = b_base_pointer;
+        INIT_m2n1
+        for(k_count=0;k_count<k;k_count++) KERNEL_m2n1k1
+        SAVE_m2n1
+      }
+      if(m_count>0){
+        b_block_pointer = b_base_pointer;
+        INIT_m1n1
+        for(k_count=0;k_count<k;k_count++) KERNEL_m1n1k1
+        SAVE_m1n1
+      }
+    }
+}
+static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k,double alpha){
+    BLASLONG m_count,k_count;double *src1,*dst1,*src2;__m256d tmp,alp;
+    src1 = src; dst1 = dst; src2 = src1 + 4 * k; alp = _mm256_set1_pd(alpha);
+    for(m_count=m;m_count>7;m_count-=8){
+      for(k_count=k;k_count>0;k_count--){
+        tmp = _mm256_loadu_pd(src1);tmp = _mm256_mul_pd(tmp,alp);_mm256_storeu_pd(dst1+0,tmp);src1+=4;
+        tmp = _mm256_loadu_pd(src2);tmp = _mm256_mul_pd(tmp,alp);_mm256_storeu_pd(dst1+4,tmp);src2+=4;
+        dst1+=8;
+      }
+      src1+=4*k;src2+=4*k;
+    }
+    for(;m_count>0;m_count--){
+      for(k_count=k;k_count>0;k_count--){
+        *dst1 = (*src1) * alpha; src1++; dst1++;
+      }
+    }
+}
+int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){
+    if(m==0 || n==0 || k==0) return 0;
+    BLASLONG ndiv8 = n/8;
+    double *packed_a = (double *)malloc(m*k*sizeof(double));
+    copy_4_to_8(A,packed_a,m,k,alpha);
+    if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C);
+    if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8);
+    free(packed_a);packed_a=NULL;
+    return 0;
+}
+

From 5da9484d932cd220934207f83ff111df248bba7f Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Wed, 16 Oct 2019 02:01:13 +0800
Subject: [PATCH 063/210] Add files via upload

---
 param.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/param.h b/param.h
index 0ff59f400..860106991 100644
--- a/param.h
+++ b/param.h
@@ -1700,7 +1700,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define DGEMM_DEFAULT_Q 128
 #else
 #define SGEMM_DEFAULT_Q 384
-#define DGEMM_DEFAULT_Q 256
+#define DGEMM_DEFAULT_Q 128
 #endif
 #define CGEMM_DEFAULT_Q 192
 #define ZGEMM_DEFAULT_Q 128

From 6bd67ddbab5ef752e1cafca4e4b7b66ecbb57452 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Wed, 16 Oct 2019 03:20:08 +0800
Subject: [PATCH 064/210] Update dgemm_kernel_8x8_skylakex.c

---
 kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 38 +----------------------
 1 file changed, 1 insertion(+), 37 deletions(-)

diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
index b4a87cbce..69437e665 100644
--- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
+++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
@@ -1,4 +1,5 @@
 #include "common.h"
+#include <stdint.h>
 #include <immintrin.h>
 /* row-major c_block */
 /* 64-bit pointer registers: a_block_pointer,b_block_pointer,c_pointer;*/
@@ -289,43 +290,6 @@
     INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\
     INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)
 
-#define COMPUTE_m1n8 {\
-    __asm__ __volatile__(\
-    INNER_INIT_m1n8\
-    INNER_KERNELm1(8)\
-    INNER_SAVE_m1n8\
-    :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes)\
-    :"zmm4","zmm5","zmm6","zmm7","zmm8","cc","memory","k1");\
-    c_pointer += 1;\
-}
-#define COMPUTE_m2n8 {\
-    __asm__ __volatile__(\
-    INNER_INIT_m2n8\
-    INNER_KERNELm2(8)\
-    INNER_SAVE_m2n8\
-    :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes)\
-    :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","cc","memory","k1");\
-    c_pointer += 2;\
-}
-#define COMPUTE_m4n8 {\
-    __asm__ __volatile__(\
-    INNER_INIT_m4n8\
-    INNER_KERNELm4(8)\
-    INNER_SAVE_m4n8\
-    :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes),"Yk"(k02),"Yk"(k03),"Yk"(k01)\
-    :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","cc","memory");\
-    c_pointer += 4;\
-}
-#define COMPUTE_m8n8 {\
-    __asm__ __volatile__(\
-    INNER_INIT_m8n8\
-    INNER_KERNELm8(8)\
-    INNER_SAVE_m8n8\
-    :"+r"(a_block_pointer):"r"(packed_b_pointer),"r"((int64_t)k),"r"(c_pointer),"r"(ldc_in_bytes),"Yk"(k02),"Yk"(k03)\
-    :"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory");\
-    c_pointer += 8;\
-}
-
 #define COMPUTE_n8 {\
     __asm__ __volatile__(\
     "movq %8,%%r14;movq %2,%%r13;"\

From 9b19e9e1b01c6820aaff0c7683ac6b317c9dfaba Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Wed, 16 Oct 2019 10:14:51 +0800
Subject: [PATCH 065/210] Update dgemm_kernel_8x8_skylakex.c

---
 kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
index 69437e665..1db955776 100644
--- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
+++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
@@ -429,7 +429,8 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG
     double *c_pointer = c;
     __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033;
     BLASLONG ndiv8_count;
-    double *b_scratch = (double *)aligned_alloc(64,192*k);
+    double *b_scratch;
+    posix_memalign(&b_scratch,64,192*k);
     double *packed_b_pointer = packed_b;
     a_block_pointer = packed_a;
     for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){
@@ -637,9 +638,10 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG
     c_pointer ++;\
 }
 #define SAVE_m1n4 {\
-    *c_pointer += _mm256_cvtsd_f64(yc1);\
-    ya1 = _mm256_unpackhi_pd(yc1,yc1);\
-    c_pointer[LDC] += _mm256_cvtsd_f64(ya1);\
+    xb1 = _mm256_extractf128_pd(yc1,0);\
+    *c_pointer += _mm_cvtsd_f64(xb1);\
+    xb2 = _mm_unpackhi_pd(xb1,xb1);\
+    c_pointer[LDC] += _mm_cvtsd_f64(xb2);\
     xb1 = _mm256_extractf128_pd(yc1,1);\
     c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\
     xb2 = _mm_unpackhi_pd(xb1,xb1);\

From b7315f8401089a91ae382b87be7e2683745828da Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Wed, 16 Oct 2019 19:23:36 +0800
Subject: [PATCH 066/210] Add files via upload

---
 kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 75 ++++++++---------------
 1 file changed, 26 insertions(+), 49 deletions(-)

diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
index 1db955776..b8b3234d1 100644
--- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
+++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
@@ -25,8 +25,8 @@
     "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;"
 
 #define INNER_KERNEL_k1m1n16 \
-    "prefetcht0 384(%1); prefetcht0 448(%1);"\
-    "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; addq $128,%1;"\
+    "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,1);"\
+    "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,1),%%zmm6; addq $64,%1;"\
     "vbroadcastsd   (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;"
 
 #define INNER_KERNEL_k1m2n16 \
@@ -46,8 +46,8 @@
     "vbroadcastsd 56(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;"
 
 #define INNER_KERNEL_k1m1n24 \
-    "prefetcht0 384(%1); prefetcht0 448(%1); prefetcht0 512(%1);"\
-    "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd 64(%1),%%zmm6; vmovupd 128(%1),%%zmm7; addq $192,%1;"\
+    "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,1); prefetcht0 128(%1,%%r12,2);"\
+    "prefetcht0 768(%0); vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,1),%%zmm6; vmovupd (%1,%%r12,2),%%zmm7; addq $64,%1;"\
     "vbroadcastsd   (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;"
 
 #define INNER_KERNEL_k1m2n24 \
@@ -292,13 +292,13 @@
 
 #define COMPUTE_n8 {\
     __asm__ __volatile__(\
-    "movq %8,%%r14;movq %2,%%r13;"\
+    "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
     "cmpq $8,%8; jb 42222f;"\
     "42221:\n\t"\
     INNER_INIT_m8n8\
     INNER_KERNELm8(8)\
     INNER_SAVE_m8n8\
-    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $64,%3;"\
     "subq $8,%8; cmpq $8,%8; jnb 42221b;"\
     "42222:\n\t"\
@@ -306,7 +306,7 @@
     INNER_INIT_m4n8\
     INNER_KERNELm4(8)\
     INNER_SAVE_m4n8\
-    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $32,%3;"\
     "subq $4,%8;"\
     "42223:\n\t"\
@@ -314,7 +314,7 @@
     INNER_INIT_m2n8\
     INNER_KERNELm2(8)\
     INNER_SAVE_m2n8\
-    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "addq $16,%3;"\
     "subq $2,%8;"\
     "42224:\n\t"\
@@ -322,7 +322,7 @@
     INNER_INIT_m1n8\
     INNER_KERNELm1(8)\
     INNER_SAVE_m1n8\
-    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shrq $6,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "addq $8,%3;"\
     "42225:\n\t"\
     "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
@@ -333,13 +333,13 @@
 }
 #define COMPUTE_n16 {\
     __asm__ __volatile__(\
-    "movq %8,%%r14;movq %2,%%r13;"\
+    "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
     "cmpq $8,%8; jb 32222f;"\
     "32221:\n\t"\
     INNER_INIT_m8n16\
     INNER_KERNELm8(16)\
     INNER_SAVE_m8n16\
-    "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
     "subq $8,%8; cmpq $8,%8; jnb 32221b;"\
     "32222:\n\t"\
@@ -347,7 +347,7 @@
     INNER_INIT_m4n16\
     INNER_KERNELm4(16)\
     INNER_SAVE_m4n16\
-    "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
     "subq $4,%8;"\
     "32223:\n\t"\
@@ -355,7 +355,7 @@
     INNER_INIT_m2n16\
     INNER_KERNELm2(16)\
     INNER_SAVE_m2n16\
-    "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $16,%3;"\
     "subq $2,%8;"\
     "32224:\n\t"\
@@ -363,28 +363,26 @@
     INNER_INIT_m1n16\
     INNER_KERNELm1(16)\
     INNER_SAVE_m1n16\
-    "movq %%r13,%2; shlq $7,%2;subq %2,%1;shrq $7,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $8,%3;"\
     "32225:\n\t"\
     "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
     "shlq $4,%4;addq %4,%3;shrq $4,%4;"\
-    :"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
+    "leaq (%1,%%r12,2),%1;"\
+    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
     ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
-    "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r13","r14");\
+    "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
 }
 #define COMPUTE_n24 {\
     __asm__ __volatile__(\
-    "movq %8,%%r14;movq %9,%%r15;movq %2,%%r13;"\
+    "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
     "cmpq $8,%8; jb 22222f;"\
     "22221:\n\t"\
     INNER_INIT_m8n24\
-    "prefetcht2 (%%r15); prefetcht2 64(%%r15);"\
     INNER_KERNELm8(24)\
-    "prefetcht2 128(%%r15); prefetcht2 192(%%r15);"\
     INNER_SAVE_m8n24\
-    "prefetcht2 256(%%r15); prefetcht2 320(%%r15); addq $384,%%r15;"\
-    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
     "subq $8,%8; cmpq $8,%8; jnb 22221b;"\
     "22222:\n\t"\
@@ -392,7 +390,7 @@
     INNER_INIT_m4n24\
     INNER_KERNELm4(24)\
     INNER_SAVE_m4n24\
-    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
     "subq $4,%8;"\
     "22223:\n\t"\
@@ -400,7 +398,7 @@
     INNER_INIT_m2n24\
     INNER_KERNELm2(24)\
     INNER_SAVE_m2n24\
-    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $16,%3;"\
     "subq $2,%8;"\
     "22224:\n\t"\
@@ -408,19 +406,19 @@
     INNER_INIT_m1n24\
     INNER_KERNELm1(24)\
     INNER_SAVE_m1n24\
-    "movq %%r13,%2; shlq $6,%2;subq %2,%1;shlq $1,%2;subq %2,%1;shrq $7,%2;"\
+    "movq %%r13,%2; subq %%r12,%1;"\
     "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $8,%3;"\
     "22225:\n\t"\
     "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
     "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
-    :"+r"(a_block_pointer),"+r"(b_scratch),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),\
-    "+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(packed_b_pointer)\
+    "leaq (%1,%%r12,2),%1; addq %%r12,%1;"\
+    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
     ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
-    "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r13","r14","r15");\
+    "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
 }
 
-static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8
+static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8
 //perform C += A<pack> B<pack>
     if(k==0 || m==0 || ndiv8==0) return;
     int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double);
@@ -429,38 +427,17 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG
     double *c_pointer = c;
     __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033;
     BLASLONG ndiv8_count;
-    double *b_scratch;
-    posix_memalign(&b_scratch,64,192*k);
     double *packed_b_pointer = packed_b;
     a_block_pointer = packed_a;
     for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){
-      __asm__ __volatile__ (
-        "testq %2,%2; jz 100002f;movq %2,%%r13;shlq $6,%%r13;"
-        "100001:\n\t"
-        "vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; vmovupd (%0,%%r13,2),%%zmm7; addq $64,%0;"
-        "vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); vmovupd %%zmm7,128(%1); addq $192,%1;"
-        "decq %2; testq %2,%2; jnz 100001b;"
-        "100002:\n\t"
-        "movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,2),%0;subq %%r13,%1;subq %%r13,%1;subq %%r13,%1;"
-        :"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6","zmm7");
       COMPUTE_n24
     }
     for(;ndiv8_count>1;ndiv8_count-=2){
-      __asm__ __volatile__ (
-        "testq %2,%2; jz 1000002f;movq %2,%%r13;shlq $6,%%r13;"
-        "1000001:\n\t"
-        "vmovupd (%0),%%zmm5; vmovupd (%0,%%r13,1),%%zmm6; addq $64,%0;"
-        "vmovupd %%zmm5,(%1); vmovupd %%zmm6,64(%1); addq $128,%1;"
-        "decq %2; testq %2,%2; jnz 1000001b;"
-        "1000002:\n\t"
-        "movq %%r13,%2;shrq $6,%2;leaq (%0,%%r13,1),%0;subq %%r13,%1;subq %%r13,%1;"
-        :"+r"(packed_b_pointer),"+r"(b_scratch),"+r"(K)::"r13","cc","memory","zmm5","zmm6");
       COMPUTE_n16
     }
     if(ndiv8_count>0){
       COMPUTE_n8
     }
-    free(b_scratch);b_scratch=NULL;
 }
 
 /* __m512d accumulators: zc1-zc4; temporary variables: za1,zb1-zb2 */

From 6bcb06fcb1d3f2b79fbe67c79db0e0af1d76297c Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Fri, 18 Oct 2019 10:47:31 +0800
Subject: [PATCH 067/210] make further changes to icopy_8 easier

---
 kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 103 +++++++++++++---------
 1 file changed, 61 insertions(+), 42 deletions(-)

diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
index b8b3234d1..49facd751 100644
--- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
+++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
@@ -1,8 +1,8 @@
 #include "common.h"
 #include <stdint.h>
 #include <immintrin.h>
+//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
 /* row-major c_block */
-/* 64-bit pointer registers: a_block_pointer,b_block_pointer,c_pointer;*/
 #define INNER_KERNEL_k1m1n8 \
     "prefetcht0 384(%1);"\
     "prefetcht0 768(%0); vmovupd (%1),%%zmm5; addq $64,%1;"\
@@ -158,7 +158,7 @@
 #define INNER_STORE_m1n8(c1,disp) \
     "kxnorw %%k1,%%k1,%%k1;"\
     "vgatherqpd "#disp"(%3,%%zmm6,1), %%zmm7 %{%%k1%};"\
-    "vaddpd %%zmm7,"#c1","#c1";"\
+    "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\
     "kxnorw %%k1,%%k1,%%k1;"\
     "vscatterqpd "#c1", "#disp"(%3,%%zmm6,1) %{%%k1%};"
 
@@ -227,26 +227,27 @@
     "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
     "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd  %%zmm7,"#c8","#c8"%{%5%};"
 
+//%7 for k01(input) only when m=4
 #define INNER_STORE_4x8(c1,c2,c3,c4) \
-    "vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vaddpd %%zmm4,"#c1","#c1";"\
+    "vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\
     "vmovupd "#c1",(%3)%{%5%}; vmovupd "#c1",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
-    "vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vaddpd %%zmm5,"#c2","#c2";"\
+    "vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\
     "vmovupd "#c2",(%3)%{%5%}; vmovupd "#c2",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
-    "vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vaddpd %%zmm6,"#c3","#c3";"\
+    "vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\
     "vmovupd "#c3",(%3)%{%5%}; vmovupd "#c3",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
-    "vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vaddpd %%zmm7,"#c4","#c4";"\
+    "vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\
     "vmovupd "#c4",(%3)%{%5%}; vmovupd "#c4",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
     "leaq (%3,%4,4),%3;"
 
 #define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
     "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
-    "vaddpd (%3),"#c1","#c1"; vmovupd "#c1",(%3); vaddpd (%3,%4,1),"#c2","#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\
+    "vfmadd213pd (%3),%%zmm3,"#c1"; vmovupd "#c1",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\
     "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
-    "vaddpd (%3),"#c3","#c3"; vmovupd "#c3",(%3); vaddpd (%3,%4,1),"#c4","#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\
+    "vfmadd213pd (%3),%%zmm3,"#c3"; vmovupd "#c3",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\
     "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
-    "vaddpd (%3),"#c5","#c5"; vmovupd "#c5",(%3); vaddpd (%3,%4,1),"#c6","#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\
+    "vfmadd213pd (%3),%%zmm3,"#c5"; vmovupd "#c5",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\
     "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
-    "vaddpd (%3),"#c7","#c7"; vmovupd "#c7",(%3); vaddpd (%3,%4,1),"#c8","#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;"
+    "vfmadd213pd (%3),%%zmm3,"#c7"; vmovupd "#c7",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;"
 
 #define INNER_SAVE_m4n8 \
     INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\
@@ -292,6 +293,7 @@
 
 #define COMPUTE_n8 {\
     __asm__ __volatile__(\
+    "vbroadcastsd (%9),%%zmm3;"\
     "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
     "cmpq $8,%8; jb 42222f;"\
     "42221:\n\t"\
@@ -327,12 +329,13 @@
     "42225:\n\t"\
     "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
     "shlq $3,%4;addq %4,%3;shrq $3,%4;"\
-    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
-    ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r13","r14");\
+    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+    ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r13","r14");\
     a_block_pointer -= M * K;\
 }
 #define COMPUTE_n16 {\
     __asm__ __volatile__(\
+    "vbroadcastsd (%9),%%zmm3;"\
     "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
     "cmpq $8,%8; jb 32222f;"\
     "32221:\n\t"\
@@ -369,13 +372,14 @@
     "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
     "shlq $4,%4;addq %4,%3;shrq $4,%4;"\
     "leaq (%1,%%r12,2),%1;"\
-    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
-    ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
+    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+    ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
     "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
 }
 #define COMPUTE_n24 {\
     __asm__ __volatile__(\
+    "vbroadcastsd (%9),%%zmm3;"\
     "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $6,%%r12;"\
     "cmpq $8,%8; jb 22222f;"\
     "22221:\n\t"\
@@ -412,13 +416,13 @@
     "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
     "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
     "leaq (%1,%%r12,2),%1; addq %%r12,%1;"\
-    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M)\
-    ::"zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
+    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+    ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
     "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
 }
 
-static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8
+static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8
 //perform C += A<pack> B<pack>
     if(k==0 || m==0 || ndiv8==0) return;
     int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double);
@@ -426,7 +430,7 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac
     double *a_block_pointer;
     double *c_pointer = c;
     __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033;
-    BLASLONG ndiv8_count;
+    BLASLONG m_count,ndiv8_count,k_count;
     double *packed_b_pointer = packed_b;
     a_block_pointer = packed_a;
     for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){
@@ -474,24 +478,27 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac
 #define INIT_m8n2 zc2=INIT_m8n1
 #define INIT_m8n4 zc4=zc3=INIT_m8n2
 #define SAVE_m8n1 {\
-    za1 = _mm512_loadu_pd(c_pointer);\
-    zc1 = _mm512_add_pd(zc1,za1);\
+    __asm__ __volatile__("vbroadcastsd (%0),%1;":"+r"(alpha),"+v"(za1)::"memory");\
+    zb1 = _mm512_loadu_pd(c_pointer);\
+    zc1 = _mm512_fmadd_pd(zc1,za1,zb1);\
     _mm512_storeu_pd(c_pointer,zc1);\
     c_pointer += 8;\
 }
 #define SAVE_m8n2 {\
+    __asm__ __volatile__("vbroadcastsd (%0),%1;":"+r"(alpha),"+v"(za1)::"memory");\
     zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\
-    zc1 = _mm512_add_pd(zc1,zb1); zc2 = _mm512_add_pd(zc2,zb2);\
+    zc1 = _mm512_fmadd_pd(zc1,za1,zb1); zc2 = _mm512_fmadd_pd(zc2,za1,zb2);\
     _mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\
     c_pointer += 8;\
 }
 #define SAVE_m8n4 {\
+    __asm__ __volatile__("vbroadcastsd (%0),%1;":"+r"(alpha),"+v"(za1)::"memory");\
     zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\
-    zc1 = _mm512_add_pd(zc1,zb1); zc2 = _mm512_add_pd(zc2,zb2);\
+    zc1 = _mm512_fmadd_pd(zc1,za1,zb1); zc2 = _mm512_fmadd_pd(zc2,za1,zb2);\
     _mm512_storeu_pd(c_pointer,zc1); _mm512_storeu_pd(c_pointer+LDC,zc2);\
     c_pointer += LDC*2;\
     zb1 = _mm512_loadu_pd(c_pointer); zb2 = _mm512_loadu_pd(c_pointer+LDC);\
-    zc3 = _mm512_add_pd(zc3,zb1); zc4 = _mm512_add_pd(zc4,zb2);\
+    zc3 = _mm512_fmadd_pd(zc3,za1,zb1); zc4 = _mm512_fmadd_pd(zc4,za1,zb2);\
     _mm512_storeu_pd(c_pointer,zc3); _mm512_storeu_pd(c_pointer+LDC,zc4);\
     c_pointer += 8-LDC*2;\
 }
@@ -518,24 +525,27 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac
 #define INIT_m4n2 yc2=INIT_m4n1
 #define INIT_m4n4 yc4=yc3=INIT_m4n2
 #define SAVE_m4n1 {\
+    yb1 = _mm256_broadcast_sd(alpha);\
     ya1 = _mm256_loadu_pd(c_pointer);\
-    yc1 = _mm256_add_pd(yc1,ya1);\
+    yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\
     _mm256_storeu_pd(c_pointer,yc1);\
     c_pointer += 4;\
 }
 #define SAVE_m4n2 {\
+    ya1 = _mm256_broadcast_sd(alpha);\
     yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
-    yc1 = _mm256_add_pd(yc1,yb1); yc2 = _mm256_add_pd(yc2,yb2);\
+    yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\
     _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\
     c_pointer += 4;\
 }
 #define SAVE_m4n4 {\
+    ya1 = _mm256_broadcast_sd(alpha);\
     yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
-    yc1 = _mm256_add_pd(yc1,yb1); yc2 = _mm256_add_pd(yc2,yb2);\
+    yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\
     _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\
     c_pointer += LDC*2;\
     yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
-    yc3 = _mm256_add_pd(yc3,yb1); yc4 = _mm256_add_pd(yc4,yb2);\
+    yc3 = _mm256_fmadd_pd(yc3,ya1,yb1); yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\
     _mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\
     c_pointer += 4-LDC*2;\
 }
@@ -553,14 +563,16 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac
 #define INIT_m2n1 xc1=_mm_setzero_pd();
 #define INIT_m2n2 xc2=INIT_m2n1
 #define SAVE_m2n1 {\
+    xb1 = _mm_loaddup_pd(alpha);\
     xa1 = _mm_loadu_pd(c_pointer);\
-    xc1 = _mm_add_pd(xc1,xa1);\
+    xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\
     _mm_storeu_pd(c_pointer,xc1);\
     c_pointer += 2;\
 }
 #define SAVE_m2n2 {\
+    xa1 = _mm_loaddup_pd(alpha);\
     xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\
-    xc1 = _mm_add_pd(xc1,xb1); xc2 = _mm_add_pd(xc2,xb2);\
+    xc1 = _mm_fmadd_pd(xc1,xa1,xb1); xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\
     _mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\
     c_pointer += 2;\
 }
@@ -571,7 +583,7 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac
 }
 #define INIT_m1n1 sc1=0.0;
 #define SAVE_m1n1 {\
-    *c_pointer += sc1;\
+    *c_pointer += sc1 * (*alpha);\
     c_pointer++;\
 }
 
@@ -596,6 +608,9 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac
 #define INIT_m1n4 INIT_m4n1
 #define INIT_m2n4 INIT_m4n2
 #define SAVE_m2n4 {\
+    ya1 = _mm256_broadcast_sd(alpha);\
+    yc1 = _mm256_mul_pd(yc1,ya1);\
+    yc2 = _mm256_mul_pd(yc2,ya1);\
     yb1 = _mm256_unpacklo_pd(yc1,yc2);\
     yb2 = _mm256_unpackhi_pd(yc1,yc2);\
     xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\
@@ -609,12 +624,16 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac
     c_pointer += 2;\
 }
 #define SAVE_m1n2 {\
+    xb1 = _mm_loaddup_pd(alpha);\
+    xc1 = _mm_mul_pd(xc1,xb1);\
     *c_pointer += _mm_cvtsd_f64(xc1);\
     xa1 = _mm_unpackhi_pd(xc1,xc1);\
     c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\
     c_pointer ++;\
 }
 #define SAVE_m1n4 {\
+    ya1 = _mm256_broadcast_sd(alpha);\
+    yc1 = _mm256_mul_pd(yc1,ya1);\
     xb1 = _mm256_extractf128_pd(yc1,0);\
     *c_pointer += _mm_cvtsd_f64(xb1);\
     xb2 = _mm_unpackhi_pd(xb1,xb1);\
@@ -626,7 +645,7 @@ static void __attribute__ ((noinline)) KERNEL_MAIN(double *packed_a, double *pac
     c_pointer ++;\
 }
 
-static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c){//icopy=8,ocopy=8
+static void __attribute__ ((noinline)) KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8
 //perform C += A<pack> B<pack> , edge_n<8 must be satisfied !
     if(k==0 || m==0 || edge_n==0) return;
     double *a_block_pointer,*b_block_pointer,*b_base_pointer;
@@ -724,30 +743,30 @@ static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG
       }
     }
 }
-static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k,double alpha){
-    BLASLONG m_count,k_count;double *src1,*dst1,*src2;__m256d tmp,alp;
-    src1 = src; dst1 = dst; src2 = src1 + 4 * k; alp = _mm256_set1_pd(alpha);
+static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k){
+    BLASLONG m_count,k_count;double *src1,*dst1,*src2;__m256d tmp;
+    src1 = src; dst1 = dst; src2 = src1 + 4 * k;
     for(m_count=m;m_count>7;m_count-=8){
       for(k_count=k;k_count>0;k_count--){
-        tmp = _mm256_loadu_pd(src1);tmp = _mm256_mul_pd(tmp,alp);_mm256_storeu_pd(dst1+0,tmp);src1+=4;
-        tmp = _mm256_loadu_pd(src2);tmp = _mm256_mul_pd(tmp,alp);_mm256_storeu_pd(dst1+4,tmp);src2+=4;
+        tmp = _mm256_loadu_pd(src1);_mm256_storeu_pd(dst1+0,tmp);src1+=4;
+        tmp = _mm256_loadu_pd(src2);_mm256_storeu_pd(dst1+4,tmp);src2+=4;
         dst1+=8;
       }
       src1+=4*k;src2+=4*k;
     }
     for(;m_count>0;m_count--){
       for(k_count=k;k_count>0;k_count--){
-        *dst1 = (*src1) * alpha; src1++; dst1++;
+        *dst1 = (*src1); src1++; dst1++;
       }
     }
 }
 int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){
-    if(m==0 || n==0 || k==0) return 0;
-    BLASLONG ndiv8 = n/8;
+    if(m==0 || n==0 || k==0 || alpha == 0.0) return 0;
+    BLASLONG ndiv8 = n/8;double ALPHA = alpha;
     double *packed_a = (double *)malloc(m*k*sizeof(double));
-    copy_4_to_8(A,packed_a,m,k,alpha);
-    if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C);
-    if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8);
+    copy_4_to_8(A,packed_a,m,k);
+    if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA);
+    if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA);
     free(packed_a);packed_a=NULL;
     return 0;
 }

From 17cdd9f9e17728e4d8a044aa23028aa40b5e9946 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Fri, 18 Oct 2019 14:58:07 +0800
Subject: [PATCH 068/210] some correction

---
 kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
index 49facd751..bfd63bbc7 100644
--- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
+++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
@@ -330,7 +330,7 @@
     "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
     "shlq $3,%4;addq %4,%3;shrq $3,%4;"\
     :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
-    ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r13","r14");\
+    ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
 }
 #define COMPUTE_n16 {\
@@ -645,8 +645,8 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG
     c_pointer ++;\
 }
 
-static void __attribute__ ((noinline)) KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8
-//perform C += A<pack> B<pack> , edge_n<8 must be satisfied !
+static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8
+//perform C += A<pack> B<pack> , edge_n<8 must be satisfied.
     if(k==0 || m==0 || edge_n==0) return;
     double *a_block_pointer,*b_block_pointer,*b_base_pointer;
     double *c_pointer = c;
@@ -763,11 +763,16 @@ static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k){
 int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){
     if(m==0 || n==0 || k==0 || alpha == 0.0) return 0;
     BLASLONG ndiv8 = n/8;double ALPHA = alpha;
+#ifdef ICOPY_4
     double *packed_a = (double *)malloc(m*k*sizeof(double));
     copy_4_to_8(A,packed_a,m,k);
+#else //ICOPY_8
+    double *packed_a = A;
+#endif
     if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA);
     if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA);
+#ifdef ICOPY_4
     free(packed_a);packed_a=NULL;
+#endif
     return 0;
 }
-

From 0d669e04bb2716b6a7767f0f449a2b09caf2d456 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Fri, 18 Oct 2019 15:00:17 +0800
Subject: [PATCH 069/210] Update dgemm_kernel_8x8_skylakex.c

---
 kernel/x86_64/dgemm_kernel_8x8_skylakex.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
index bfd63bbc7..1139090e2 100644
--- a/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
+++ b/kernel/x86_64/dgemm_kernel_8x8_skylakex.c
@@ -1,6 +1,8 @@
 #include "common.h"
 #include <stdint.h>
 #include <immintrin.h>
+
+#define ICOPY_4
 //register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
 /* row-major c_block */
 #define INNER_KERNEL_k1m1n8 \
@@ -743,6 +745,7 @@ static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG
       }
     }
 }
+#ifdef ICOPY_4
 static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k){
     BLASLONG m_count,k_count;double *src1,*dst1,*src2;__m256d tmp;
     src1 = src; dst1 = dst; src2 = src1 + 4 * k;
@@ -760,6 +763,7 @@ static void copy_4_to_8(double *src,double *dst,BLASLONG m,BLASLONG k){
       }
     }
 }
+#endif
 int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){
     if(m==0 || n==0 || k==0 || alpha == 0.0) return 0;
     BLASLONG ndiv8 = n/8;double ALPHA = alpha;

From 6ff013bae0965d791841aead9c9dcb0e27e1b7be Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Sat, 19 Oct 2019 03:54:44 +0800
Subject: [PATCH 070/210] native support for icopy_4

90% MKL 1-thread performance.
---
 kernel/x86_64/KERNEL.SKYLAKEX               |   2 +-
 kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c | 666 ++++++++++++++++++++
 2 files changed, 667 insertions(+), 1 deletion(-)
 create mode 100644 kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c

diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index d73a47925..82a455b44 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -7,7 +7,7 @@ SGEMMITCOPY    =  sgemm_tcopy_16_skylakex.c
 SGEMMONCOPY    =  sgemm_ncopy_4_skylakex.c
 SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
 
-DGEMMKERNEL    =  dgemm_kernel_8x8_skylakex.c
+DGEMMKERNEL    =  dgemm_kernel_4x8_skylakex_2.c
 
 DGEMMONCOPY    =  dgemm_ncopy_8_skylakex.c
 DGEMMOTCOPY    =  dgemm_tcopy_8_skylakex.c
diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
new file mode 100644
index 000000000..a958a1a6f
--- /dev/null
+++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
@@ -0,0 +1,666 @@
+#include "common.h"
+#include <stdint.h>
+#include <immintrin.h>
+
+//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
+/* row-major c_block */
+#define INNER_KERNEL_k1m1n8 \
+    "prefetcht0 384(%1);"\
+    "vmovupd (%1),%%zmm5; addq $64,%1;"\
+    "vbroadcastsd   (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8;"
+
+#define INNER_KERNEL_k1m2n8 \
+    INNER_KERNEL_k1m1n8\
+    "vbroadcastsd  8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;"
+
+#define INNER_KERNEL_k1m4n8 \
+    INNER_KERNEL_k1m2n8\
+    "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\
+    "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;"
+
+#define INNER_KERNEL_k1m8n8 \
+    INNER_KERNEL_k1m4n8\
+    "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\
+    "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\
+    "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\
+    "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;"
+
+#define INNER_KERNEL_k1m1n16 \
+    "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\
+    "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\
+    "vbroadcastsd   (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9;"
+
+#define INNER_KERNEL_k1m2n16 \
+    INNER_KERNEL_k1m1n16\
+    "vbroadcastsd  8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;"
+
+#define INNER_KERNEL_k1m4n16 \
+    INNER_KERNEL_k1m2n16\
+    "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\
+    "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;"
+
+#define INNER_KERNEL_k1m8n16 \
+    INNER_KERNEL_k1m4n16\
+    "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\
+    "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\
+    "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\
+    "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;"
+
+#define INNER_KERNEL_k1m1n24 \
+    "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\
+    "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\
+    "vbroadcastsd   (%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm8; vfmadd231pd %%zmm6,%%zmm4,%%zmm9; vfmadd231pd %%zmm7,%%zmm4,%%zmm10;"
+
+#define INNER_KERNEL_k1m2n24 \
+    INNER_KERNEL_k1m1n24\
+    "vbroadcastsd  8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;"
+
+#define INNER_KERNEL_k1m4n24 \
+    INNER_KERNEL_k1m2n24\
+    "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\
+    "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;"
+
+#define INNER_KERNEL_k1m8n24 \
+    INNER_KERNEL_k1m4n24\
+    "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\
+    "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\
+    "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\
+    "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;"
+
+#define INNER_KERNELm1(nn) \
+    "cmpq $1,%2;jb "#nn"3f;"\
+    #nn"4:\n\t"\
+    INNER_KERNEL_k1m1n##nn "addq $8,%0;"\
+    "decq %2;cmpq $1,%2;jnb "#nn"4b;"\
+    #nn"3:\n\t"
+
+#define INNER_KERNELm2(nn) \
+    "cmpq $1,%2;jb "#nn"0f;"\
+    #nn"1:\n\t"\
+    INNER_KERNEL_k1m2n##nn "addq $16,%0;"\
+    "decq %2;cmpq $1,%2;jnb "#nn"1b;"\
+    #nn"0:\n\t"
+
+#define INNER_KERNELm4(nn) \
+    "cmpq $1,%2;jb "#nn"00f;"\
+    #nn"01:\n\t"\
+    INNER_KERNEL_k1m4n##nn "addq $32,%0;"\
+    "decq %2;cmpq $1,%2;jnb "#nn"01b;"\
+    #nn"00:\n\t"
+
+#define INNER_KERNELm8(nn) \
+    "cmpq $8,%2;jb "#nn"001f;"\
+    #nn"008:\n\t"\
+    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+    "subq $8,%2;cmpq $8,%2;jnb "#nn"008b;"\
+    #nn"001:\n\t"\
+    "cmpq $1,%2;jb "#nn"000f;"\
+    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+    "decq %2;jmp "#nn"001b;"\
+    ""#nn"000:\n\t"
+
+#define INNER_INIT_m1n8 \
+    "vpxorq %%zmm8, %%zmm8, %%zmm8;"
+
+#define INNER_INIT_m2n8 \
+    "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9;"
+
+#define INNER_INIT_m4n8 \
+    "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;"
+
+#define INNER_INIT_m8n8 \
+    INNER_INIT_m4n8\
+    "vpxorq %%zmm12,%%zmm12,%%zmm12;vpxorq %%zmm13,%%zmm13,%%zmm13;vpxorq %%zmm14,%%zmm14,%%zmm14;vpxorq %%zmm15,%%zmm15,%%zmm15;"
+
+#define INNER_INIT_m1n16 INNER_INIT_m2n8
+
+#define INNER_INIT_m2n16 INNER_INIT_m4n8
+
+#define INNER_INIT_m4n16 INNER_INIT_m8n8
+
+#define INNER_INIT_m8n16 \
+    INNER_INIT_m8n8\
+    "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"\
+    "vpxorq %%zmm20,%%zmm20,%%zmm20;vpxorq %%zmm21,%%zmm21,%%zmm21;vpxorq %%zmm22,%%zmm22,%%zmm22;vpxorq %%zmm23,%%zmm23,%%zmm23;"
+
+#define INNER_INIT_m1n24 \
+    "vpxorq %%zmm8, %%zmm8, %%zmm8; vpxorq %%zmm9, %%zmm9, %%zmm9; vpxorq %%zmm10,%%zmm10,%%zmm10;"
+
+#define INNER_INIT_m2n24 \
+    INNER_INIT_m1n24\
+    "vpxorq %%zmm11,%%zmm11,%%zmm11; vpxorq %%zmm12,%%zmm12,%%zmm12; vpxorq %%zmm13,%%zmm13,%%zmm13;"
+
+#define INNER_INIT_m4n24 \
+    INNER_INIT_m4n16\
+    "vpxorq %%zmm16,%%zmm16,%%zmm16;vpxorq %%zmm17,%%zmm17,%%zmm17;vpxorq %%zmm18,%%zmm18,%%zmm18;vpxorq %%zmm19,%%zmm19,%%zmm19;"
+
+#define INNER_INIT_m8n24 \
+    INNER_INIT_m8n16\
+    "vpxorq %%zmm24,%%zmm24,%%zmm24;vpxorq %%zmm25,%%zmm25,%%zmm25;vpxorq %%zmm26,%%zmm26,%%zmm26;vpxorq %%zmm27,%%zmm27,%%zmm27;"\
+    "vpxorq %%zmm28,%%zmm28,%%zmm28;vpxorq %%zmm29,%%zmm29,%%zmm29;vpxorq %%zmm30,%%zmm30,%%zmm30;vpxorq %%zmm31,%%zmm31,%%zmm31;"
+
+#define INNER_SETINDEX \
+    "vpinsrq $0,%4,%%xmm4,%%xmm4; vbroadcastsd %%xmm4,%%zmm4;"\
+    "kxnorw %%k1,%%k1,%%k1; kshiftlw $1,%%k1,%%k1; vpxorq %%zmm6,%%zmm6,%%zmm6; vmovapd %%zmm4,%%zmm6%{%%k1%};"\
+    "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
+    "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
+    "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
+    "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
+    "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"\
+    "kshiftlw $1,%%k1,%%k1; vpaddq %%zmm4,%%zmm6,%%zmm6%{%%k1%};"
+
+#define INNER_STORE_m1n8(c1,disp) \
+    "kxnorw %%k1,%%k1,%%k1;"\
+    "vgatherqpd "#disp"(%3,%%zmm6,1), %%zmm7 %{%%k1%};"\
+    "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\
+    "kxnorw %%k1,%%k1,%%k1;"\
+    "vscatterqpd "#c1", "#disp"(%3,%%zmm6,1) %{%%k1%};"
+
+#define INNER_SAVE_m1n8 \
+    INNER_SETINDEX\
+    INNER_STORE_m1n8(%%zmm8,0)
+
+#define INNER_SAVE_m1n16 \
+    INNER_SAVE_m1n8\
+    "leaq (%3,%4,8),%3;"\
+    INNER_STORE_m1n8(%%zmm9,0)
+
+#define INNER_SAVE_m1n24 \
+    INNER_SAVE_m1n16\
+    "leaq (%3,%4,8),%3;"\
+    INNER_STORE_m1n8(%%zmm10,0)
+
+#define INNER_SAVE_m2n8 \
+    INNER_SETINDEX\
+    INNER_STORE_m1n8(%%zmm8,0)\
+    INNER_STORE_m1n8(%%zmm9,8)
+
+#define INNER_SAVE_m2n16 \
+    INNER_SETINDEX\
+    INNER_STORE_m1n8(%%zmm8,0)\
+    INNER_STORE_m1n8(%%zmm10,8)\
+    "leaq (%3,%4,8),%3;"\
+    INNER_STORE_m1n8(%%zmm9,0)\
+    INNER_STORE_m1n8(%%zmm11,8)
+#define INNER_SAVE_m2n24 \
+    INNER_SETINDEX\
+    INNER_STORE_m1n8(%%zmm8,0)\
+    INNER_STORE_m1n8(%%zmm11,8)\
+    "leaq (%3,%4,8),%3;"\
+    INNER_STORE_m1n8(%%zmm9,0)\
+    INNER_STORE_m1n8(%%zmm12,8)\
+    "leaq (%3,%4,8),%3;"\
+    INNER_STORE_m1n8(%%zmm10,0)\
+    INNER_STORE_m1n8(%%zmm13,8)
+#define INNER_PREF_8x8 \
+    "prefetcht0 (%3); prefetcht0 56(%3); prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2);"\
+    "prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,2),%3;"\
+    "prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,1),%3;"\
+    "prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4);"\
+    "subq %4,%3; subq %4,%3; subq %4,%3;"
+#define INNER_TRANS_4x8(c1,c2,c3,c4) \
+    "vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\
+    "vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\
+    "vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\
+    "vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\
+    "vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\
+    "vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";"
+#define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
+    INNER_TRANS_4x8(c1,c2,c3,c4)\
+    INNER_TRANS_4x8(c5,c6,c7,c8)\
+    "vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\
+    "vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd  %%zmm4,"#c5","#c5"%{%5%};"\
+    "vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\
+    "vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd  %%zmm5,"#c6","#c6"%{%5%};"\
+    "vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\
+    "vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd  %%zmm6,"#c7","#c7"%{%5%};"\
+    "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
+    "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd  %%zmm7,"#c8","#c8"%{%5%};"
+//%7 for k01(input) only when m=4
+#define INNER_STORE_4x8(c1,c2,c3,c4) \
+    "vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\
+    "vmovupd "#c1",(%3)%{%5%}; vmovupd "#c1",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
+    "vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\
+    "vmovupd "#c2",(%3)%{%5%}; vmovupd "#c2",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
+    "vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\
+    "vmovupd "#c3",(%3)%{%5%}; vmovupd "#c3",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
+    "vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\
+    "vmovupd "#c4",(%3)%{%5%}; vmovupd "#c4",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
+    "leaq (%3,%4,4),%3;"
+#define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
+    "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
+    "vfmadd213pd (%3),%%zmm3,"#c1"; vmovupd "#c1",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\
+    "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
+    "vfmadd213pd (%3),%%zmm3,"#c3"; vmovupd "#c3",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\
+    "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
+    "vfmadd213pd (%3),%%zmm3,"#c5"; vmovupd "#c5",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\
+    "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
+    "vfmadd213pd (%3),%%zmm3,"#c7"; vmovupd "#c7",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;"
+#define INNER_SAVE_m4n8 \
+    INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\
+    INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)
+#define INNER_SAVE_m4n16 \
+    INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
+    INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
+    INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\
+    INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)
+#define INNER_SAVE_m4n24 \
+    INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
+    INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
+    INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
+    INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
+    INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\
+    INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)
+#define INNER_SAVE_m8n8 \
+    INNER_PREF_8x8\
+    INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
+    INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)
+#define INNER_SAVE_m8n16 \
+    INNER_PREF_8x8\
+    INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
+    INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
+    INNER_PREF_8x8\
+    INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\
+    INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)
+#define INNER_SAVE_m8n24 \
+    INNER_PREF_8x8\
+    INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
+    INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
+    INNER_PREF_8x8\
+    INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
+    INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
+    INNER_PREF_8x8\
+    INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\
+    INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)
+
+#define COMPUTE_n8 {\
+    __asm__ __volatile__(\
+    "vbroadcastsd (%9),%%zmm3;"\
+    "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
+    "cmpq $8,%8; jb 42222f;"\
+    "42221:\n\t"\
+    INNER_INIT_m8n8\
+    INNER_KERNELm8(8)\
+    INNER_SAVE_m8n8\
+    "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
+    "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $64,%3;"\
+    "subq $8,%8; cmpq $8,%8; jnb 42221b;"\
+    "42222:\n\t"\
+    "cmpq $4,%8; jb 42223f;"\
+    INNER_INIT_m4n8\
+    INNER_KERNELm4(8)\
+    INNER_SAVE_m4n8\
+    "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
+    "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $32,%3;"\
+    "subq $4,%8;"\
+    "42223:\n\t"\
+    "cmpq $2,%8; jb 42224f;"\
+    INNER_INIT_m2n8\
+    INNER_KERNELm2(8)\
+    INNER_SAVE_m2n8\
+    "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
+    "addq $16,%3;"\
+    "subq $2,%8;"\
+    "42224:\n\t"\
+    "cmpq $1,%8; jb 42225f;"\
+    INNER_INIT_m1n8\
+    INNER_KERNELm1(8)\
+    INNER_SAVE_m1n8\
+    "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
+    "addq $8,%3;"\
+    "42225:\n\t"\
+    "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
+    "shlq $3,%4;addq %4,%3;shrq $3,%4;"\
+    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+    ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
+    a_block_pointer -= M * K;\
+}
+#define COMPUTE_n16 {\
+    __asm__ __volatile__(\
+    "vbroadcastsd (%9),%%zmm3;"\
+    "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
+    "cmpq $8,%8; jb 32222f;"\
+    "32221:\n\t"\
+    INNER_INIT_m8n16\
+    INNER_KERNELm8(16)\
+    INNER_SAVE_m8n16\
+    "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
+    "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
+    "subq $8,%8; cmpq $8,%8; jnb 32221b;"\
+    "32222:\n\t"\
+    "cmpq $4,%8; jb 32223f;"\
+    INNER_INIT_m4n16\
+    INNER_KERNELm4(16)\
+    INNER_SAVE_m4n16\
+    "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
+    "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
+    "subq $4,%8;"\
+    "32223:\n\t"\
+    "cmpq $2,%8; jb 32224f;"\
+    INNER_INIT_m2n16\
+    INNER_KERNELm2(16)\
+    INNER_SAVE_m2n16\
+    "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
+    "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $16,%3;"\
+    "subq $2,%8;"\
+    "32224:\n\t"\
+    "cmpq $1,%8; jb 32225f;"\
+    INNER_INIT_m1n16\
+    INNER_KERNELm1(16)\
+    INNER_SAVE_m1n16\
+    "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
+    "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $8,%3;"\
+    "32225:\n\t"\
+    "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
+    "shlq $4,%4;addq %4,%3;shrq $4,%4;"\
+    "leaq (%1,%%r12,4),%1;"\
+    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+    ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
+    "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
+    a_block_pointer -= M * K;\
+}
+#define COMPUTE_n24 {\
+    __asm__ __volatile__(\
+    "vbroadcastsd (%9),%%zmm3;"\
+    "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
+    "cmpq $8,%8; jb 22222f;"\
+    "22221:\n\t"\
+    INNER_INIT_m8n24\
+    INNER_KERNELm8(24)\
+    INNER_SAVE_m8n24\
+    "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
+    "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
+    "subq $8,%8; cmpq $8,%8; jnb 22221b;"\
+    "22222:\n\t"\
+    "cmpq $4,%8; jb 22223f;"\
+    INNER_INIT_m4n24\
+    INNER_KERNELm4(24)\
+    INNER_SAVE_m4n24\
+    "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
+    "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
+    "subq $4,%8;"\
+    "22223:\n\t"\
+    "cmpq $2,%8; jb 22224f;"\
+    INNER_INIT_m2n24\
+    INNER_KERNELm2(24)\
+    INNER_SAVE_m2n24\
+    "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
+    "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $16,%3;"\
+    "subq $2,%8;"\
+    "22224:\n\t"\
+    "cmpq $1,%8; jb 22225f;"\
+    INNER_INIT_m1n24\
+    INNER_KERNELm1(24)\
+    INNER_SAVE_m1n24\
+    "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
+    "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $8,%3;"\
+    "22225:\n\t"\
+    "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
+    "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
+    "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\
+    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+    ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
+    "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
+    a_block_pointer -= M * K;\
+}
+static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8
+//perform C += A<pack> B<pack>
+    if(k==0 || m==0 || ndiv8==0) return;
+    int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double);
+    int64_t K = (int64_t)k; int64_t M = (int64_t)m;
+    double *a_block_pointer;
+    double *c_pointer = c;
+    __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033;
+    BLASLONG ndiv8_count;
+    double *packed_b_pointer = packed_b;
+    a_block_pointer = packed_a;
+    for(ndiv8_count=ndiv8;ndiv8_count>2;ndiv8_count-=3){
+      COMPUTE_n24
+    }
+    for(;ndiv8_count>1;ndiv8_count-=2){
+      COMPUTE_n16
+    }
+    if(ndiv8_count>0){
+      COMPUTE_n8
+    }
+}
+
+/* __m256d accumulators: yc1-yc4; temporary variables: ya1,yb1-yb2 */
+/* __m128d accumulators: xc1-xc2; temporary variables: xa1,xb1-xb2 */
+/*  double accumulator:  sc1;     temporary variables: sa1,sb1 */
+/* column-major c_block */
+#define KERNEL_m4n4k1 {\
+    ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\
+    yb1 = _mm256_broadcast_sd(b_block_pointer);   yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
+    yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\
+    yb1 = _mm256_broadcast_sd(b_block_pointer+2); yc3 = _mm256_fmadd_pd(ya1,yb1,yc3);\
+    yb2 = _mm256_broadcast_sd(b_block_pointer+3); yc4 = _mm256_fmadd_pd(ya1,yb2,yc4);\
+    b_block_pointer+=4;\
+}
+#define KERNEL_m4n2k1 {\
+    ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\
+    yb1 = _mm256_broadcast_sd(b_block_pointer);   yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
+    yb2 = _mm256_broadcast_sd(b_block_pointer+1); yc2 = _mm256_fmadd_pd(ya1,yb2,yc2);\
+    b_block_pointer+=2;\
+}
+#define KERNEL_m4n1k1 {\
+    ya1 = _mm256_loadu_pd(a_block_pointer);a_block_pointer+=4;\
+    yb1 = _mm256_broadcast_sd(b_block_pointer);   yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
+    b_block_pointer++;\
+}
+#define INIT_m4n1 yc1=_mm256_setzero_pd();
+#define INIT_m4n2 yc2=INIT_m4n1
+#define INIT_m4n4 yc4=yc3=INIT_m4n2
+#define SAVE_m4n1 {\
+    yb1 = _mm256_broadcast_sd(alpha);\
+    ya1 = _mm256_loadu_pd(c_pointer);\
+    yc1 = _mm256_fmadd_pd(yc1,yb1,ya1);\
+    _mm256_storeu_pd(c_pointer,yc1);\
+    c_pointer += 4;\
+}
+#define SAVE_m4n2 {\
+    ya1 = _mm256_broadcast_sd(alpha);\
+    yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
+    yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\
+    _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\
+    c_pointer += 4;\
+}
+#define SAVE_m4n4 {\
+    ya1 = _mm256_broadcast_sd(alpha);\
+    yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
+    yc1 = _mm256_fmadd_pd(yc1,ya1,yb1); yc2 = _mm256_fmadd_pd(yc2,ya1,yb2);\
+    _mm256_storeu_pd(c_pointer,yc1); _mm256_storeu_pd(c_pointer+LDC,yc2);\
+    c_pointer += LDC*2;\
+    yb1 = _mm256_loadu_pd(c_pointer); yb2 = _mm256_loadu_pd(c_pointer+LDC);\
+    yc3 = _mm256_fmadd_pd(yc3,ya1,yb1); yc4 = _mm256_fmadd_pd(yc4,ya1,yb2);\
+    _mm256_storeu_pd(c_pointer,yc3); _mm256_storeu_pd(c_pointer+LDC,yc4);\
+    c_pointer += 4-LDC*2;\
+}
+#define KERNEL_m2n2k1 {\
+    xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\
+    xb1 = _mm_loaddup_pd(b_block_pointer);   xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\
+    xb2 = _mm_loaddup_pd(b_block_pointer+1); xc2 = _mm_fmadd_pd(xa1,xb2,xc2);\
+    b_block_pointer += 2;\
+}
+#define KERNEL_m2n1k1 {\
+    xa1 = _mm_loadu_pd(a_block_pointer); a_block_pointer+=2;\
+    xb1 = _mm_loaddup_pd(b_block_pointer);   xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\
+    b_block_pointer ++;\
+}
+#define INIT_m2n1 xc1=_mm_setzero_pd();
+#define INIT_m2n2 xc2=INIT_m2n1
+#define SAVE_m2n1 {\
+    xb1 = _mm_loaddup_pd(alpha);\
+    xa1 = _mm_loadu_pd(c_pointer);\
+    xc1 = _mm_fmadd_pd(xc1,xb1,xa1);\
+    _mm_storeu_pd(c_pointer,xc1);\
+    c_pointer += 2;\
+}
+#define SAVE_m2n2 {\
+    xa1 = _mm_loaddup_pd(alpha);\
+    xb1 = _mm_loadu_pd(c_pointer); xb2 = _mm_loadu_pd(c_pointer+LDC);\
+    xc1 = _mm_fmadd_pd(xc1,xa1,xb1); xc2 = _mm_fmadd_pd(xc2,xa1,xb2);\
+    _mm_storeu_pd(c_pointer,xc1); _mm_storeu_pd(c_pointer+LDC,xc2);\
+    c_pointer += 2;\
+}
+#define KERNEL_m1n1k1 {\
+    sa1 = *a_block_pointer; a_block_pointer++;\
+    sb1 = *b_block_pointer; sc1 += sa1 * sb1;\
+    b_block_pointer ++;\
+}
+#define INIT_m1n1 sc1=0.0;
+#define SAVE_m1n1 {\
+    *c_pointer += sc1 * (*alpha);\
+    c_pointer++;\
+}
+/* row-major c_block */
+#define KERNEL_m2n4k1 {\
+    yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\
+    ya1 = _mm256_broadcast_sd(a_block_pointer);  yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
+    ya1 = _mm256_broadcast_sd(a_block_pointer+1);yc2 = _mm256_fmadd_pd(ya1,yb1,yc2);\
+    a_block_pointer += 2;\
+}
+#define KERNEL_m1n4k1 {\
+    yb1 = _mm256_loadu_pd(b_block_pointer);b_block_pointer+=4;\
+    ya1 = _mm256_broadcast_sd(a_block_pointer);  yc1 = _mm256_fmadd_pd(ya1,yb1,yc1);\
+    a_block_pointer ++;\
+}
+#define KERNEL_m1n2k1 {\
+    xb1 = _mm_loadu_pd(b_block_pointer);b_block_pointer+=2;\
+    xa1 = _mm_loaddup_pd(a_block_pointer); xc1 = _mm_fmadd_pd(xa1,xb1,xc1);\
+    a_block_pointer ++;\
+}
+#define INIT_m1n2 INIT_m2n1
+#define INIT_m1n4 INIT_m4n1
+#define INIT_m2n4 INIT_m4n2
+#define SAVE_m2n4 {\
+    ya1 = _mm256_broadcast_sd(alpha);\
+    yc1 = _mm256_mul_pd(yc1,ya1);\
+    yc2 = _mm256_mul_pd(yc2,ya1);\
+    yb1 = _mm256_unpacklo_pd(yc1,yc2);\
+    yb2 = _mm256_unpackhi_pd(yc1,yc2);\
+    xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer),_mm256_extractf128_pd(yb1,0));\
+    xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+LDC),_mm256_extractf128_pd(yb2,0));\
+    _mm_storeu_pd(c_pointer,xb1);\
+    _mm_storeu_pd(c_pointer+LDC,xb2);\
+    xb1 = _mm_add_pd(_mm_loadu_pd(c_pointer+2*LDC),_mm256_extractf128_pd(yb1,1));\
+    xb2 = _mm_add_pd(_mm_loadu_pd(c_pointer+3*LDC),_mm256_extractf128_pd(yb2,1));\
+    _mm_storeu_pd(c_pointer+2*LDC,xb1);\
+    _mm_storeu_pd(c_pointer+3*LDC,xb2);\
+    c_pointer += 2;\
+}
+#define SAVE_m1n2 {\
+    xb1 = _mm_loaddup_pd(alpha);\
+    xc1 = _mm_mul_pd(xc1,xb1);\
+    *c_pointer += _mm_cvtsd_f64(xc1);\
+    xa1 = _mm_unpackhi_pd(xc1,xc1);\
+    c_pointer[LDC]+= _mm_cvtsd_f64(xa1);\
+    c_pointer ++;\
+}
+#define SAVE_m1n4 {\
+    ya1 = _mm256_broadcast_sd(alpha);\
+    yc1 = _mm256_mul_pd(yc1,ya1);\
+    xb1 = _mm256_extractf128_pd(yc1,0);\
+    *c_pointer += _mm_cvtsd_f64(xb1);\
+    xb2 = _mm_unpackhi_pd(xb1,xb1);\
+    c_pointer[LDC] += _mm_cvtsd_f64(xb2);\
+    xb1 = _mm256_extractf128_pd(yc1,1);\
+    c_pointer[LDC*2] += _mm_cvtsd_f64(xb1);\
+    xb2 = _mm_unpackhi_pd(xb1,xb1);\
+    c_pointer[LDC*3] += _mm_cvtsd_f64(xb2);\
+    c_pointer ++;\
+}
+static void KERNEL_EDGE(double *packed_a, double *packed_b, BLASLONG m, BLASLONG edge_n, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=8,ocopy=8
+//perform C += A<pack> B<pack> , edge_n<8 must be satisfied.
+    if(k==0 || m==0 || edge_n==0 || (*alpha)==0.0) return;
+    double *a_block_pointer,*b_block_pointer,*b_base_pointer;
+    double *c_pointer = c;
+    __m256d yc1,yc2,yc3,yc4,ya1,yb1,yb2;
+    __m128d xc1,xc2,xa1,xb1,xb2;
+    double sc1,sa1,sb1;
+    BLASLONG m_count,n_count,k_count;
+    b_base_pointer = packed_b;
+//now start calculation of the edge part
+    for(n_count=edge_n;n_count>3;n_count-=4){
+      a_block_pointer = packed_a;
+      for(m_count=m;m_count>3;m_count-=4){
+        b_block_pointer = b_base_pointer;
+        INIT_m4n4
+        for(k_count=0;k_count<k;k_count++) KERNEL_m4n4k1
+        SAVE_m4n4
+      }
+      for(;m_count>1;m_count-=2){
+        b_block_pointer = b_base_pointer;
+        INIT_m2n4
+        for(k_count=0;k_count<k;k_count++) KERNEL_m2n4k1
+        SAVE_m2n4
+      }
+      if(m_count>0){
+        b_block_pointer = b_base_pointer;
+        INIT_m1n4
+        for(k_count=0;k_count<k;k_count++) KERNEL_m1n4k1
+        SAVE_m1n4
+      }
+      b_base_pointer += 4*k;
+      c_pointer += 4 * LDC - m;
+    }
+    for(;n_count>1;n_count-=2){
+      a_block_pointer = packed_a;
+      for(m_count=m;m_count>3;m_count-=4){
+        b_block_pointer = b_base_pointer;
+        INIT_m4n2
+        for(k_count=0;k_count<k;k_count++) KERNEL_m4n2k1
+        SAVE_m4n2
+      }
+      for(;m_count>1;m_count-=2){
+        b_block_pointer = b_base_pointer;
+        INIT_m2n2
+        for(k_count=0;k_count<k;k_count++) KERNEL_m2n2k1
+        SAVE_m2n2
+      }
+      if(m_count>0){
+        b_block_pointer = b_base_pointer;
+        INIT_m1n2
+        for(k_count=0;k_count<k;k_count++) KERNEL_m1n2k1
+        SAVE_m1n2
+      }
+      b_base_pointer += 2*k;
+      c_pointer += 2 * LDC - m;
+    }
+    if(n_count>0){
+      a_block_pointer = packed_a;
+      for(m_count=m;m_count>3;m_count-=4){
+        b_block_pointer = b_base_pointer;
+        INIT_m4n1
+        for(k_count=0;k_count<k;k_count++) KERNEL_m4n1k1
+        SAVE_m4n1
+      }
+      for(;m_count>1;m_count-=2){
+        b_block_pointer = b_base_pointer;
+        INIT_m2n1
+        for(k_count=0;k_count<k;k_count++) KERNEL_m2n1k1
+        SAVE_m2n1
+      }
+      if(m_count>0){
+        b_block_pointer = b_base_pointer;
+        INIT_m1n1
+        for(k_count=0;k_count<k;k_count++) KERNEL_m1n1k1
+        SAVE_m1n1
+      }
+    }
+}
+int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc){
+    if(m==0 || n==0 || k==0 || alpha == 0.0) return 0;
+    BLASLONG ndiv8 = n/8;double ALPHA = alpha;
+    double *packed_a = A;
+    if(ndiv8>0) KERNEL_MAIN(packed_a,B,m,ndiv8,k,ldc,C,&ALPHA);
+    if(n>ndiv8*8) KERNEL_EDGE(packed_a,B+(int64_t)k*(int64_t)ndiv8*8,m,n-ndiv8*8,k,ldc,C+(int64_t)ldc*(int64_t)ndiv8*8,&ALPHA);
+    return 0;
+}

From e9437eebd26ce9d6b4a51b5d87fda5dedf329527 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 24 Oct 2019 18:45:27 +0200
Subject: [PATCH 071/210] Restore Goldmont ID and improve QEMU support

#2283 had inadvertently removed Goldmont+, and cpuid was reporting a mix of Core2 and Pentium2 for some QEMU configurations
---
 cpuid_x86.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cpuid_x86.c b/cpuid_x86.c
index 92c8e1b67..9e1c8e752 100644
--- a/cpuid_x86.c
+++ b/cpuid_x86.c
@@ -1197,7 +1197,11 @@ int get_cpuname(void){
 	case  3:
 	case  5:
 	case  6:
+#if defined(__x86_64__) || defined(__amd64__)
+	  return CPUTYPE_CORE2;
+#else
 	  return CPUTYPE_PENTIUM2;
+#endif
 	case  7:
 	case  8:
 	case 10:
@@ -1379,6 +1383,8 @@ int get_cpuname(void){
       break;
       case 7: // family 6 exmodel 7
         switch (model) {
+	case 10: // Goldmont Plus
+	    return CPUTYPE_NEHALEM;
         case 14: // Ice Lake
           if(support_avx512())
             return CPUTYPE_SKYLAKEX;

From 46a8c2519a85553e63f6bcfd7970731edbf9f013 Mon Sep 17 00:00:00 2001
From: luzpaz <luzpaz@users.noreply.github.com>
Date: Thu, 24 Oct 2019 12:56:53 -0400
Subject: [PATCH 072/210] Remove prototype of unused, unimplemented function
 (#2274)

* Fix source typo

Found via `codespell -q 3 -L amin,als,ba,dum,mone,nd,nto,orign -S Changelog.txt,./lapack*`

* Remove beta-thread function per request
---
 common_thread.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/common_thread.h b/common_thread.h
index bd964445e..6ec40e096 100644
--- a/common_thread.h
+++ b/common_thread.h
@@ -194,10 +194,6 @@ int trsm_thread(int mode, BLASLONG m, BLASLONG n,
 
 int syrk_thread(int mode, blas_arg_t *, BLASLONG *, BLASLONG *, int (*function)(), void *, void *, BLASLONG);
 
-int beta_thread(int mode, BLASLONG m, BLASLONG n,
-		double alpha_r, double alpha_i,
-		void *c, BLASLONG ldc, int (*fuction)());
-
 int getrf_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k,
 		 void *offsetA, BLASLONG lda,
 		 void *offsetB, BLASLONG jb,

From b687fba5bcb9192499d84f2d0d250230cad09407 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 24 Oct 2019 21:18:17 +0200
Subject: [PATCH 073/210] Disable direct clock register access on IOS and
 Android

as I find conflicting information on accessibility from non-priviledged processes
---
 common_arm64.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/common_arm64.h b/common_arm64.h
index 13718af5a..376f81e60 100644
--- a/common_arm64.h
+++ b/common_arm64.h
@@ -78,17 +78,18 @@ static void __inline blas_lock(volatile BLASULONG *address){
 
 #define BLAS_LOCK_DEFINED
 
+#if !defined(OS_DARWIN) && !defined (OS_ANDROID)
 static __inline BLASULONG rpcc(void){
   BLASULONG ret = 0;
  
-  __asm__ __volatile__ ("mrs %0,cntvct_el0":"=r"(ret));
+  __asm__ __volatile__ ("isb; mrs %0,cntvct_el0":"=r"(ret));
 
   return ret;
 }
 
 #define RPCC_DEFINED
 #define RPCC64BIT
- 
+#endif 
 
 static inline int blas_quickdivide(blasint x, blasint y){
   return x / y;

From fab49e49e5edbcb2b39560cd419b38faaf9694c1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 24 Oct 2019 21:26:20 +0200
Subject: [PATCH 074/210] Move most lapack 3.7/3.8 additions to the
 embedded_underscores list

to allow linktest to pass with a compiler that adds a second underscore to such names
---
 exports/gensymbol | 197 +++++++++++++---------------------------------
 1 file changed, 54 insertions(+), 143 deletions(-)

diff --git a/exports/gensymbol b/exports/gensymbol
index 21a1b703d..37ba0b191 100644
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -618,19 +618,6 @@
     # functions added for lapack-3.7.0
 
     slarfy,
-    slasyf_rk,
-    ssyconvf_rook,
-    ssytf2_rk,
-    ssytrf_rk,
-    ssytrs_3,
-    ssytri_3,
-    ssytri_3x,
-    ssycon_3,
-    ssysv_rk,
-    slasyf_aa,
-    ssysv_aa,
-    ssytrf_aa,
-    ssytrs_aa,
     strevc3,
     sgelqt,
     sgelqt3,
@@ -647,33 +634,8 @@
     stplqt,
     stplqt2,
     stpmlqt,
-    ssytrd_2stage,
-    ssytrd_sy2sb,
-    ssytrd_sb2st,
-    ssb2st_kernels,
-    ssyevd_2stage,
-    ssyev_2stage,
-    ssyevx_2stage,
-    ssyevr_2stage,
-    ssbev_2stage,
-    ssbevx_2stage,
-    ssbevd_2stage,
-    ssygv_2stage,
     dlarfy,
-    dlasyf_rk,
     dsyconvf,
-    dsyconvf_rook,
-    dsytf2_rk,
-    dsytrf_rk,
-    dsytrs_3,
-    dsytri_3,
-    dsytri_3x,
-    dsycon_3,
-    dsysv_rk,
-    dlasyf_aa,
-    dsysv_aa,
-    dsytrf_aa,
-    dsytrs_aa,
     dtrevc3,
     dgelqt,
     dgelqt3,
@@ -690,45 +652,8 @@
     dtplqt,
     dtplqt2,
     dtpmlqt,
-    dsytrd_2stage,
-    dsytrd_sy2sb,
-    dsytrd_sb2st,
-    dsb2st_kernels,
-    dsyevd_2stage,
-    dsyev_2stage,
-    dsyevx_2stage,
-    dsyevr_2stage,
-    dsbev_2stage,
-    dsbevx_2stage,
-    dsbevd_2stage,
-    dsygv_2stage,
-    chetf2_rk,
-    chetrf_rk,
-    chetri_3,
-    chetri_3x,
-    chetrs_3,
-    checon_3,
-    chesv_rk,
-    chesv_aa,
-    chetrf_aa,
-    chetrs_aa,
-    clahef_aa,
-    clahef_rk,
     clarfy,
-    clasyf_rk,
-    clasyf_aa,
     csyconvf,
-    csyconvf_rook,
-    csytf2_rk,
-    csytrf_rk,
-    csytrf_aa,
-    csytrs_3,
-    csytrs_aa,
-    csytri_3,
-    csytri_3x,
-    csycon_3,
-    csysv_rk,
-    csysv_aa,
     ctrevc3,
     cgelqt,
     cgelqt3,
@@ -745,45 +670,8 @@
     ctplqt,
     ctplqt2,
     ctpmlqt,
-    chetrd_2stage,
-    chetrd_he2hb,
-    chetrd_hb2st,
-    chb2st_kernels,
-    cheevd_2stage,
-    cheev_2stage,
-    cheevx_2stage,
-    cheevr_2stage,
-    chbev_2stage,
-    chbevx_2stage,
-    chbevd_2stage,
-    chegv_2stage,
-    zhetf2_rk,
-    zhetrf_rk,
-    zhetri_3,
-    zhetri_3x,
-    zhetrs_3,
-    zhecon_3,
-    zhesv_rk,
-    zhesv_aa,
-    zhetrf_aa,
-    zhetrs_aa,
-    zlahef_aa,
-    zlahef_rk,
     zlarfy,
-    zlasyf_rk,
-    zlasyf_aa,
     zsyconvf,
-    zsyconvf_rook,
-    zsytrs_aa,
-    zsytf2_rk,
-    zsytrf_rk,
-    zsytrf_aa,
-    zsytrs_3,
-    zsytri_3,
-    zsytri_3x,
-    zsycon_3,
-    zsysv_rk,
-    zsysv_aa,
     ztrevc3,
     ztplqt,
     ztplqt2,
@@ -800,43 +688,13 @@
     zlaswlq,
     zlamswlq,
     zgemlq,
-    zhetrd_2stage,
-    zhetrd_he2hb,
-    zhetrd_hb2st,
-    zhb2st_kernels,
-    zheevd_2stage,
-    zheev_2stage,
-    zheevx_2stage,
-    zheevr_2stage,
-    zhbev_2stage,
-    zhbevx_2stage,
-    zhbevd_2stage,
-    zhegv_2stage,
     sladiv1,
     dladiv1,
     iparam2stage,
     
     # functions added for lapack-3.8.0
 
-    ilaenv2stage,
-    ssysv_aa_2stage,
-    ssytrf_aa_2stage,
-    ssytrs_aa_2stage,
-    chesv_aa_2stage,
-    chetrf_aa_2stage,
-    chetrs_aa_2stage,
-    csysv_aa_2stage,
-    csytrf_aa_2stage,
-    csytrs_aa_2stage,
-    dsysv_aa_2stage,
-    dsytrf_aa_2stage,
-    dsytrs_aa_2stage,
-    zhesv_aa_2stage,
-    zhetrf_aa_2stage,
-    zhetrs_aa_2stage,
-    zsysv_aa_2stage,
-    zsytrf_aa_2stage,
-    zsytrs_aa_2stage
+    ilaenv2stage
 );
 
 @lapack_extendedprecision_objs = (
@@ -3509,6 +3367,59 @@
     zlahef_rook, zlasyf_rook,
     zsytf2_rook, zsytrf_rook, zsytrs_rook,
     zsytri_rook, zsycon_rook, zsysv_rook,
+# 3.7.0
+    slasyf_rk, ssyconvf_rook, ssytf2_rk,
+    ssytrf_rk, ssytrs_3, ssytri_3,
+    ssytri_3x, ssycon_3, ssysv_rk,
+    slasyf_aa, ssysv_aa, ssytrf_aa,
+    ssytrs_aa, ssytrd_2stage, ssytrd_sy2sb,
+    ssytrd_sb2st, ssb2st_kernels, ssyevd_2stage,
+    ssyev_2stage, ssyevx_2stage, ssyevr_2stage,
+    ssbev_2stage, ssbevx_2stage, ssbevd_2stage,
+    ssygv_2stage, dlasyf_rk, dsyconvf_rook,
+    dsytf2_rk, dsytrf_rk, dsytrs_3,
+    dsytri_3, dsytri_3x, dsycon_3,
+    dsysv_rk, dlasyf_aa, dsysv_aa,
+    dsytrf_aa, dsytrs_aa, dsytrd_2stage,
+    dsytrd_sy2sb, dsytrd_sb2st, dsb2st_kernels,
+    dsyevd_2stage, dsyev_2stage, dsyevx_2stage,
+    dsyevr_2stage, dsbev_2stage, dsbevx_2stage,
+    dsbevd_2stage, dsygv_2stage, chetf2_rk,
+    chetrf_rk, chetri_3, chetri_3x,
+    chetrs_3, checon_3, chesv_rk,
+    chesv_aa, chetrf_aa, chetrs_aa,
+    clahef_aa, clahef_rk, clasyf_rk,
+    clasyf_aa, csytf2_rk, csytrf_rk,
+    csytrf_aa, csytrs_3, csytrs_aa,
+    csytri_3, csytri_3x, csycon_3,
+    csysv_rk, csysv_aa, csyconvf_rook,
+    chetrd_2stage, chetrd_he2hb, chetrd_hb2st,
+    chb2st_kernels, cheevd_2stage, cheev_2stage,
+    cheevx_2stage, cheevr_2stage, chbev_2stage,
+    chbevx_2stage, chbevd_2stage, chegv_2stage,
+    zhetf2_rk, zhetrf_rk, zhetri_3,
+    zhetri_3x, zhetrs_3, zhecon_3,
+    zhesv_rk, zhesv_aa, zhetrf_aa,
+    zhetrs_aa, zlahef_aa, zlahef_rk,
+    zlasyf_rk, zlasyf_aa, zsyconvf_rook,
+    zsytrs_aa, zsytf2_rk, zsytrf_rk,
+    zsytrf_aa, zsytrs_3, zsytri_3,
+    zsytri_3x, zsycon_3, zsysv_rk,
+    zsysv_aa, zhetrd_2stage, zhetrd_he2hb,
+    zhetrd_hb2st, zhb2st_kernels, zheevd_2stage,
+    zheev_2stage, zheevx_2stage, zheevr_2stage,
+    zhbev_2stage, zhbevx_2stage, zhbevd_2stage,
+    zhegv_2stage,
+# 3.8.0
+    ssysv_aa_2stage, ssytrf_aa_2stage,
+    ssytrs_aa_2stage, chesv_aa_2stage,
+    chetrf_aa_2stage, chetrs_aa_2stage,
+    csysv_aa_2stage, csytrf_aa_2stage,
+    csytrs_aa_2stage, dsysv_aa_2stage,
+    dsytrf_aa_2stage, dsytrs_aa_2stage,
+    zhesv_aa_2stage, zhetrf_aa_2stage,
+    zhetrs_aa_2stage, zsysv_aa_2stage,
+    zsytrf_aa_2stage, zsytrs_aa_2stage
 );
 
 

From 911c3e2f4b4d557e8b65624beb90d5138289a4f3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 24 Oct 2019 22:43:27 +0200
Subject: [PATCH 075/210] Improve support for g95 and non-GNU ld

Auto-add "-fno-second-underscore" option to make LAPACKE compile (as it calls LAPACK functions that may have gotten a second underscore added otherwise). Also support -R for rpath when parsing compiler directives in f_check
---
 Makefile.system | 3 +++
 f_check         | 7 +++++++
 2 files changed, 10 insertions(+)

diff --git a/Makefile.system b/Makefile.system
index 8843d0ad3..4cb4dc954 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -769,6 +769,9 @@ else
 FCOMMON_OPT += -m32
 endif
 endif
+ifneq ($(NO_LAPACKE), 1)
+FCOMMON_OPT += -fno-second-underscore
+endif
 endif
 endif
 
diff --git a/f_check b/f_check
index b05db85bd..0afbab23a 100644
--- a/f_check
+++ b/f_check
@@ -130,6 +130,11 @@ if ($compiler eq "") {
 	if ($data =~ / zho_ge__/) {
 	    $need2bu       = 1;
 	}
+	if ($vendor =~ /G95/) {
+    	  if ($ENV{NO_LAPACKE} != 1) {
+		$need2bu = "";
+	  }
+	}
     }
 
     if ($vendor eq "") {
@@ -277,6 +282,8 @@ $linker_a = "";
 if ($link ne "") {
 
     $link =~ s/\-Y\sP\,/\-Y/g;
+    
+    $link =~ s/\-R+/\-rpath\@/g;
 
     $link =~ s/\-rpath\s+/\-rpath\@/g;
 

From e3e8b5cdca829e49edfc3e2d1a691e0c0ae33837 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 25 Oct 2019 12:51:06 +0200
Subject: [PATCH 076/210] Add NetBSD

---
 getarch.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/getarch.c b/getarch.c
index 4d960356c..1f590390a 100644
--- a/getarch.c
+++ b/getarch.c
@@ -82,7 +82,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef OS_WINDOWS
 #include <windows.h>
 #endif
-#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
+#if defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
 #include <sys/types.h>
 #include <sys/sysctl.h>
 #endif
@@ -1201,7 +1201,7 @@ static int get_num_cores(void) {
 
 #ifdef OS_WINDOWS
   SYSTEM_INFO sysinfo;
-#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
+#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
   int m[2], count;
   size_t len;
 #endif
@@ -1215,7 +1215,7 @@ static int get_num_cores(void) {
   GetSystemInfo(&sysinfo);
   return sysinfo.dwNumberOfProcessors;
 
-#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__DragonFly__) || defined(__APPLE__)
+#elif defined(__FreeBSD__) || defined(__OpenBSD__) || defined(__NetBSD__) || defined(__DragonFly__) || defined(__APPLE__)
   m[0] = CTL_HW;
   m[1] = HW_NCPU;
   len = sizeof(int);

From 1b9098966242810b7b53c4a8d3009e5a38df63e9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 25 Oct 2019 12:52:49 +0200
Subject: [PATCH 077/210] Add NetBSD to the xBSD conditionals

---
 driver/others/memory.c | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index 534d6d9fc..55dce72b8 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -129,7 +129,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <unistd.h>
 #endif
 
-#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
+#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
 #include <sys/sysctl.h>
 #include <sys/resource.h>
 #endif
@@ -192,7 +192,7 @@ void goto_set_num_threads(int num_threads) {};
 
 #else
 
-#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
+#if defined(OS_LINUX) || defined(OS_SUNOS)
 #ifndef NO_AFFINITY
 int get_num_procs(void);
 #else
@@ -312,7 +312,7 @@ int get_num_procs(void) {
 
 #endif
 
-#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
+#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
 
 int get_num_procs(void) {
 
@@ -404,7 +404,7 @@ extern int openblas_goto_num_threads_env();
 extern int openblas_omp_num_threads_env();
 
 int blas_get_cpu_number(void){
-#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
+#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
   int max_num;
 #endif
   int blas_goto_num   = 0;
@@ -412,7 +412,7 @@ int blas_get_cpu_number(void){
 
   if (blas_num_threads) return blas_num_threads;
 
-#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
+#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
   max_num = get_num_procs();
 #endif
 
@@ -436,7 +436,7 @@ int blas_get_cpu_number(void){
   else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
   else blas_num_threads = MAX_CPU_NUMBER;
 
-#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
+#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
   if (blas_num_threads > max_num) blas_num_threads = max_num;
 #endif
 
@@ -1673,7 +1673,7 @@ void gotoblas_dummy_for_PGI(void) {
 #include <sys/resource.h>
 #endif
 
-#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
+#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
 #include <sys/sysctl.h>
 #include <sys/resource.h>
 #endif
@@ -1736,7 +1736,7 @@ void goto_set_num_threads(int num_threads) {};
 
 #else
 
-#if defined(OS_LINUX) || defined(OS_SUNOS) || defined(OS_NETBSD)
+#if defined(OS_LINUX) || defined(OS_SUNOS)
 #ifndef NO_AFFINITY
 int get_num_procs(void);
 #else
@@ -1855,7 +1855,7 @@ int get_num_procs(void) {
 
 #endif
 
-#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)
+#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
 
 int get_num_procs(void) {
 
@@ -1945,7 +1945,7 @@ extern int openblas_goto_num_threads_env();
 extern int openblas_omp_num_threads_env();
 
 int blas_get_cpu_number(void){
-#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
+#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
   int max_num;
 #endif
   int blas_goto_num   = 0;
@@ -1953,7 +1953,7 @@ int blas_get_cpu_number(void){
 
   if (blas_num_threads) return blas_num_threads;
 
-#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
+#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
   max_num = get_num_procs();
 #endif
 
@@ -1977,7 +1977,7 @@ int blas_get_cpu_number(void){
   else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
   else blas_num_threads = MAX_CPU_NUMBER;
 
-#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
+#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
   if (blas_num_threads > max_num) blas_num_threads = max_num;
 #endif
 

From aeabe0a83fffce9ab43ab8d10795e1696574887c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 25 Oct 2019 22:52:30 +0200
Subject: [PATCH 078/210] Fix regex to parse -R options with and without
 whitespace

Both forms are seen on NetBSD (#2288)
---
 f_check | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/f_check b/f_check
index 0afbab23a..993ad9a35 100644
--- a/f_check
+++ b/f_check
@@ -19,7 +19,7 @@ $nofortran = 0;
 
 $compiler = join(" ", @ARGV);
 $compiler_bin = shift(@ARGV);
-
+ 
 # f77 is too ambiguous
 $compiler = "" if $compiler eq "f77";
 
@@ -283,7 +283,7 @@ if ($link ne "") {
 
     $link =~ s/\-Y\sP\,/\-Y/g;
     
-    $link =~ s/\-R+/\-rpath\@/g;
+    $link =~ s/\-R\s*/\-rpath\@/g;
 
     $link =~ s/\-rpath\s+/\-rpath\@/g;
 

From 85ccdce8c4bfeb3f8de6dd939317631c52f1cca7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 25 Oct 2019 23:02:37 +0200
Subject: [PATCH 079/210] Remove the IOS fallbacks to generic C kernels

---
 kernel/arm64/KERNEL.ARMV8 | 36 ------------------------------------
 1 file changed, 36 deletions(-)

diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8
index a2a435738..efc1ec8bc 100644
--- a/kernel/arm64/KERNEL.ARMV8
+++ b/kernel/arm64/KERNEL.ARMV8
@@ -91,12 +91,10 @@ IDAMAXKERNEL   = iamax.S
 ICAMAXKERNEL   = izamax.S
 IZAMAXKERNEL   = izamax.S
 
-ifneq ($(OS_DARWIN)$(CROSS),11)
 SNRM2KERNEL    = nrm2.S
 DNRM2KERNEL    = nrm2.S
 CNRM2KERNEL    = znrm2.S
 ZNRM2KERNEL    = znrm2.S
-endif
 
 DDOTKERNEL     = dot.S
 SDOTKERNEL     = dot.S
@@ -104,38 +102,6 @@ CDOTKERNEL     = zdot.S
 ZDOTKERNEL     = zdot.S
 DSDOTKERNEL    = dot.S
 
-ifeq ($(OS_DARWIN)$(CROSS),11)
-
-STRMMKERNEL	= ../generic/trmmkernel_2x2.c
-DTRMMKERNEL	= ../generic/trmmkernel_2x2.c
-CTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
-ZTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
-
-SGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
-SGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
-SGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
-DGEMMONCOPY    = ../generic/gemm_ncopy_2.c
-DGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
-DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
-CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
-CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
-ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
-ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-else
 SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
@@ -202,5 +168,3 @@ ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
 ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
 ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
 ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-endif

From df857551c0d7062ebe03c9600de97fcf0620e0c2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 25 Oct 2019 23:07:00 +0200
Subject: [PATCH 080/210] Remove special parameter set for obsolete IOS/ARMV8
 workaround

---
 param.h | 34 ----------------------------------
 1 file changed, 34 deletions(-)

diff --git a/param.h b/param.h
index 860106991..238089e60 100644
--- a/param.h
+++ b/param.h
@@ -2588,38 +2588,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define SYMV_P	16
 
-// Darwin / Cross
-#if defined(OS_DARWIN) && defined(CROSS)
-
-#define SGEMM_DEFAULT_UNROLL_M  2
-#define SGEMM_DEFAULT_UNROLL_N  2
-
-#define DGEMM_DEFAULT_UNROLL_M  2
-#define DGEMM_DEFAULT_UNROLL_N  2
-
-#define CGEMM_DEFAULT_UNROLL_M  2
-#define CGEMM_DEFAULT_UNROLL_N  2
-
-#define ZGEMM_DEFAULT_UNROLL_M  2
-#define ZGEMM_DEFAULT_UNROLL_N  2
-
-#define SGEMM_DEFAULT_P	128
-#define DGEMM_DEFAULT_P	128
-#define CGEMM_DEFAULT_P 96
-#define ZGEMM_DEFAULT_P 64
-
-#define SGEMM_DEFAULT_Q 240
-#define DGEMM_DEFAULT_Q 120
-#define CGEMM_DEFAULT_Q 120
-#define ZGEMM_DEFAULT_Q 120
-
-#define SGEMM_DEFAULT_R 12288
-#define DGEMM_DEFAULT_R 8192
-#define CGEMM_DEFAULT_R 4096
-#define ZGEMM_DEFAULT_R 4096
-
-#else // Linux / Native
-
 #if defined(CORTEXA53) || defined(CORTEXA57) || \
     defined(CORTEXA72) || defined(CORTEXA73) || \
     defined(FALKOR)    || defined(TSV110)
@@ -2755,8 +2723,6 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 
 #endif // Cores
 
-#endif // Linux / Darwin
-
 #endif // ARMv8
 
 #if defined(ARMV5)

From 8691825944521a6706988b12daa2a6c77cba4f53 Mon Sep 17 00:00:00 2001
From: "k.dunikowski" <k.dunikowski@pixel.com.pl>
Date: Mon, 28 Oct 2019 08:51:05 +0100
Subject: [PATCH 081/210] Fixed a minor cmake problem, occuring when
 DYNAMIC_CORE=ON and CMAKE_C_FLAGS was empty

---
 cmake/arch.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index 5a7434551..f3ae84fe0 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -73,7 +73,7 @@ if (DYNAMIC_ARCH)
     endif ()
     if (NOT NO_AVX512)
       set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
-      string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS})
+      string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
    endif ()
     if (DYNAMIC_LIST)
 	set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})

From 274ff5cdb884f869c8cb99afceb56b1e8f59f87f Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Fri, 1 Nov 2019 23:59:18 +0800
Subject: [PATCH 082/210] update sgemm_q on skylakex cpus

---
 param.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/param.h b/param.h
index 860106991..198839b4f 100644
--- a/param.h
+++ b/param.h
@@ -1699,7 +1699,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_Q 320
 #define DGEMM_DEFAULT_Q 128
 #else
-#define SGEMM_DEFAULT_Q 384
+#define SGEMM_DEFAULT_Q 192
 #define DGEMM_DEFAULT_Q 128
 #endif
 #define CGEMM_DEFAULT_Q 192

From 1df9a2013d4793af36064a2ccb1bf147ca9a17c2 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Sat, 2 Nov 2019 00:00:48 +0800
Subject: [PATCH 083/210] new sgemm kernel for skylakex

---
 kernel/x86_64/KERNEL.SKYLAKEX                |   2 +-
 kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 871 +++++++++++++++++++
 2 files changed, 872 insertions(+), 1 deletion(-)
 create mode 100644 kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c

diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index 82a455b44..a39030c53 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -1,6 +1,6 @@
 include $(KERNELDIR)/KERNEL.HASWELL
 
-SGEMMKERNEL    =  sgemm_kernel_16x4_skylakex.c
+SGEMMKERNEL    =  sgemm_kernel_16x4_skylakex_2.c
 
 SGEMMINCOPY    =  ../generic/gemm_ncopy_16.c
 SGEMMITCOPY    =  sgemm_tcopy_16_skylakex.c
diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
new file mode 100644
index 000000000..79c70e4f6
--- /dev/null
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
@@ -0,0 +1,871 @@
+/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */
+/* r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */
+
+#include "common.h"
+#include <stdint.h>
+
+/* m = 16 */ /* zmm8-zmm31 for accumulators, zmm1-zmm7 for temporary use, zmm0 for alpha */
+#define KERNEL_k1m16n1 \
+    "vmovups (%0),%%zmm4; addq $64,%0;"\
+    "vbroadcastss (%1),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,%%zmm8;"\
+    "addq $4,%1;"
+#define KERNEL_h_k1m16n2 \
+    "vmovsldup (%0),%%zmm4; vmovshdup (%0),%%zmm5; prefetcht0 512(%0); addq $64,%0;"\
+    "vbroadcastsd (%1),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,%%zmm8; vfmadd231ps %%zmm5,%%zmm6,%%zmm9;"
+#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $8,%1;"
+#define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "vbroadcastsd 8(%1),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,%%zmm10; vfmadd231ps %%zmm5,%%zmm7,%%zmm11;"
+#define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;"
+#define unit_kernel_k1m16n4(c1,c2,c3,c4, ...) \
+    "vbroadcastsd  ("#__VA_ARGS__"),%%zmm6; vfmadd231ps %%zmm4,%%zmm6,"#c1"; vfmadd231ps %%zmm5,%%zmm6,"#c2";"\
+    "vbroadcastsd 8("#__VA_ARGS__"),%%zmm7; vfmadd231ps %%zmm4,%%zmm7,"#c3"; vfmadd231ps %%zmm5,%%zmm7,"#c4";"
+#define KERNEL_h_k1m16n8 KERNEL_h_k1m16n4 unit_kernel_k1m16n4(%%zmm12,%%zmm13,%%zmm14,%%zmm15,%1,%%r12,1)
+#define KERNEL_k1m16n8 KERNEL_h_k1m16n8 "addq $16,%1;"
+#define KERNEL_h_k1m16n12 KERNEL_h_k1m16n8 unit_kernel_k1m16n4(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%1,%%r12,2)
+#define KERNEL_k1m16n12 KERNEL_h_k1m16n12 "addq $16,%1;"
+#define KERNEL_h_k1m16n16 KERNEL_k1m16n12 unit_kernel_k1m16n4(%%zmm20,%%zmm21,%%zmm22,%%zmm23,%%r15)
+#define KERNEL_k1m16n16 KERNEL_h_k1m16n16 "addq $16,%%r15;"
+#define KERNEL_h_k1m16n20 KERNEL_h_k1m16n16 unit_kernel_k1m16n4(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%r15,%%r12,1)
+#define KERNEL_k1m16n20 KERNEL_h_k1m16n20 "addq $16,%%r15;"
+#define KERNEL_h_k1m16n24 KERNEL_h_k1m16n20 unit_kernel_k1m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31,%%r15,%%r12,2)
+#define KERNEL_k1m16n24 KERNEL_h_k1m16n24 "addq $16,%%r15;"
+#define INIT_m16n1 "vpxorq %%zmm8,%%zmm8,%%zmm8;"
+#define INIT_m16n2 INIT_m16n1 "vpxorq %%zmm9,%%zmm9,%%zmm9;"
+#define INIT_m16n4 INIT_m16n2 "vpxorq %%zmm10,%%zmm10,%%zmm10;vpxorq %%zmm11,%%zmm11,%%zmm11;"
+#define unit_init_m16n4(c1,c2,c3,c4) \
+    "vpxorq "#c1","#c1","#c1";vpxorq "#c2","#c2","#c2";vpxorq "#c3","#c3","#c3";vpxorq "#c4","#c4","#c4";"
+#define INIT_m16n8 INIT_m16n4 unit_init_m16n4(%%zmm12,%%zmm13,%%zmm14,%%zmm15)
+#define INIT_m16n12 INIT_m16n8 unit_init_m16n4(%%zmm16,%%zmm17,%%zmm18,%%zmm19)
+#define INIT_m16n16 INIT_m16n12 unit_init_m16n4(%%zmm20,%%zmm21,%%zmm22,%%zmm23)
+#define INIT_m16n20 INIT_m16n16 unit_init_m16n4(%%zmm24,%%zmm25,%%zmm26,%%zmm27)
+#define INIT_m16n24 INIT_m16n20 unit_init_m16n4(%%zmm28,%%zmm29,%%zmm30,%%zmm31)
+#define SAVE_h_m16n1 "vfmadd213ps (%2),%%zmm0,%%zmm8; vmovups %%zmm8,(%2);"
+#define unit_save_m16n2(c1,c2) \
+    "vunpcklps "#c2","#c1",%%zmm6; vunpckhps "#c2","#c1",%%zmm7; vunpcklpd %%zmm7,%%zmm6,%%zmm4; vunpckhpd %%zmm7,%%zmm6,%%zmm5;"\
+    "vfmadd213ps (%5),%%zmm0,%%zmm4; vfmadd213ps (%5,%3,1),%%zmm0,%%zmm5;"\
+    "prefetcht1 127(%5); prefetcht1 127(%5,%3,1);"\
+    "vmovups %%zmm4,(%5); vmovups %%zmm5,(%5,%3,1); leaq (%5,%3,2),%5;"
+#define SAVE_h_m16n2 "movq %2,%5;" unit_save_m16n2(%%zmm8,%%zmm9)
+#define SAVE_h_m16n4  SAVE_h_m16n2  unit_save_m16n2(%%zmm10,%%zmm11)
+#define SAVE_h_m16n8  SAVE_h_m16n4  unit_save_m16n2(%%zmm12,%%zmm13) unit_save_m16n2(%%zmm14,%%zmm15)
+#define SAVE_h_m16n12 SAVE_h_m16n8  unit_save_m16n2(%%zmm16,%%zmm17) unit_save_m16n2(%%zmm18,%%zmm19)
+#define SAVE_h_m16n16 SAVE_h_m16n12 unit_save_m16n2(%%zmm20,%%zmm21) unit_save_m16n2(%%zmm22,%%zmm23)
+#define SAVE_h_m16n20 SAVE_h_m16n16 unit_save_m16n2(%%zmm24,%%zmm25) unit_save_m16n2(%%zmm26,%%zmm27)
+#define SAVE_h_m16n24 SAVE_h_m16n20 unit_save_m16n2(%%zmm28,%%zmm29) unit_save_m16n2(%%zmm30,%%zmm31)
+#define SAVE_m16(ndim) SAVE_h_m16n##ndim "addq $64,%2;"
+#define COMPUTE_m16(ndim) \
+    INIT_m16n##ndim\
+    "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\
+    "cmpq $4,%4; jb "#ndim"016162f;"\
+    #ndim"016161:\n\t"\
+    KERNEL_k1m16n##ndim\
+    KERNEL_k1m16n##ndim\
+    KERNEL_k1m16n##ndim\
+    KERNEL_k1m16n##ndim\
+    "subq $4,%4; cmpq $4,%4; jnb "#ndim"016161b;"\
+    #ndim"016162:\n\t"\
+    "testq %4,%4; jz "#ndim"016163f;"\
+    KERNEL_k1m16n##ndim\
+    "decq %4; jmp "#ndim"016162b;"\
+    #ndim"016163:\n\t"\
+    SAVE_m16(ndim)
+
+/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */
+#define KERNEL_k1m8n1(b_addr) \
+    "vmovups (%0),%%ymm1; addq $32,%0;"\
+    "vbroadcastss ("#b_addr"),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\
+    "addq $4,"#b_addr";"
+#define KERNEL_h_k1m8n2(b_addr) \
+    "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\
+    "vbroadcastsd ("#b_addr"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"
+#define KERNEL_k1m8n2(b_addr) KERNEL_h_k1m8n2(b_addr) "addq $8,"#b_addr";"
+#define KERNEL_h_k1m8n4(b_addr) \
+    KERNEL_h_k1m8n2(b_addr) "vbroadcastsd 8("#b_addr"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"
+#define KERNEL_k1m8n4(b_addr) KERNEL_h_k1m8n4(b_addr) "addq $16,"#b_addr";"
+#define unit_kernel_k1m8n4(c1,c2,c3,c4,...) \
+    "vbroadcastsd  ("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\
+    "vbroadcastsd 8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";"
+#define KERNEL_h_k1m8n8(b_addr) KERNEL_h_k1m8n4(b_addr) unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,b_addr,%%r12,1)
+#define KERNEL_k1m8n8(b_addr) KERNEL_h_k1m8n8(b_addr) "addq $16,"#b_addr";"
+#define KERNEL_h_k1m8n12(b_addr) KERNEL_h_k1m8n8(b_addr) unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,b_addr,%%r12,2)
+#define KERNEL_k1m8n12(b_addr) KERNEL_h_k1m8n12(b_addr) "addq $16,"#b_addr";"
+#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
+#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;"
+#define INIT_m8n4 INIT_m8n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;"
+#define unit_init_m8n4(c1,c2,c3,c4) \
+    "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
+#define INIT_m8n8  INIT_m8n4 unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11)
+#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15)
+#define SAVE_L_m8n1 "vfmadd213ps (%2),%%ymm0,%%ymm4; vmovups %%ymm4,(%2);"
+#define unit_save_m8n2(c1,c2) \
+    "vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3;"\
+    "vunpcklpd %%ymm3,%%ymm2,%%ymm1;vfmadd213ps (%5),     %%ymm0,%%ymm1;vmovups %%ymm1,(%5);"\
+    "vunpckhpd %%ymm3,%%ymm2,%%ymm1;vfmadd213ps (%5,%3,1),%%ymm0,%%ymm1;vmovups %%ymm1,(%5,%3,1);"\
+    "leaq (%5,%3,2),%5;"
+#define SAVE_L_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5)
+#define SAVE_L_m8n4  SAVE_L_m8n2  unit_save_m8n2(%%ymm6,%%ymm7)
+#define SAVE_L_m8n8  SAVE_L_m8n4  unit_save_m8n2(%%ymm8,%%ymm9)   unit_save_m8n2(%%ymm10,%%ymm11)
+#define SAVE_L_m8n12 SAVE_L_m8n8  unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15)
+#define SAVE_R_m8n4               unit_save_m8n2(%%ymm4,%%ymm5)   unit_save_m8n2(%%ymm6,%%ymm7)
+#define SAVE_R_m8n8  SAVE_R_m8n4  unit_save_m8n2(%%ymm8,%%ymm9)   unit_save_m8n2(%%ymm10,%%ymm11)
+#define SAVE_R_m8n12 SAVE_R_m8n8  unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15)
+#define COMPUTE_L_m8(ndim,sim) \
+    INIT_m8n##ndim\
+    "movq %%r13,%4; movq %%r14,%1;"\
+    #ndim""#sim"882:\n\t"\
+    "testq %4,%4; jz "#ndim""#sim"883f;"\
+    KERNEL_k1m8n##ndim(%1)\
+    "decq %4; jmp "#ndim""#sim"882b;"\
+    #ndim""#sim"883:\n\t"\
+    SAVE_L_m8n##ndim "addq $32,%2;"
+#define COMPUTE_R_m8(ndim,sim) \
+    "subq %%r12,%0; subq %%r12,%0;"\
+    INIT_m8n##ndim\
+    "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\
+    #ndim""#sim"882:\n\t"\
+    "testq %4,%4; jz "#ndim""#sim"883f;"\
+    KERNEL_k1m8n##ndim(%%r15)\
+    "decq %4; jmp "#ndim""#sim"882b;"\
+    #ndim""#sim"883:\n\t"\
+    SAVE_R_m8n##ndim
+#define COMPUTE_m8_n1  COMPUTE_L_m8(1,33833)
+#define COMPUTE_m8_n2  COMPUTE_L_m8(2,33833)
+#define COMPUTE_m8_n4  COMPUTE_L_m8(4,33833)
+#define COMPUTE_m8_n8  COMPUTE_L_m8(8,33833)
+#define COMPUTE_m8_n12 COMPUTE_L_m8(12,33833)
+#define COMPUTE_m8_n16 COMPUTE_L_m8(12,33733) COMPUTE_R_m8(4,33933)
+#define COMPUTE_m8_n20 COMPUTE_L_m8(12,33633) COMPUTE_R_m8(8,33933)
+#define COMPUTE_m8_n24 COMPUTE_L_m8(12,33533) COMPUTE_R_m8(12,33933)
+#define COMPUTE_m8(ndim) COMPUTE_m8_n##ndim
+
+/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */
+#define KERNEL_k1m4n1(b_addr) \
+    "vmovups (%0),%%xmm1; addq $16,%0;"\
+    "vbroadcastss ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
+    "addq $4,"#b_addr";"
+#define KERNEL_h_k1m4n2(b_addr) \
+    "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\
+    "vmovddup ("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;"
+#define KERNEL_k1m4n2(b_addr) KERNEL_h_k1m4n2(b_addr) "addq $8,"#b_addr";"
+#define KERNEL_h_k1m4n4(b_addr) \
+    KERNEL_h_k1m4n2(b_addr) "vmovddup 8("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;"
+#define KERNEL_k1m4n4(b_addr) KERNEL_h_k1m4n4(b_addr) "addq $16,"#b_addr";"
+#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \
+    "vmovddup  ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\
+    "vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";"
+#define KERNEL_h_k1m4n8(b_addr) KERNEL_h_k1m4n4(b_addr) unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,b_addr,%%r12,1)
+#define KERNEL_k1m4n8(b_addr) KERNEL_h_k1m4n8(b_addr) "addq $16,"#b_addr";"
+#define KERNEL_h_k1m4n12(b_addr) KERNEL_h_k1m4n8(b_addr) unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,b_addr,%%r12,2)
+#define KERNEL_k1m4n12(b_addr) KERNEL_h_k1m4n12(b_addr) "addq $16,"#b_addr";"
+#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
+#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
+#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;"
+#define unit_init_m4n4(c1,c2,c3,c4) \
+    "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
+#define INIT_m4n8  INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11)
+#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15)
+#define SAVE_L_m4n1 "vfmadd213ps (%2),%%xmm0,%%xmm4; vmovups %%xmm4,(%2);"
+#define unit_save_m4n2(c1,c2) \
+    "vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3;"\
+    "vunpcklpd %%xmm3,%%xmm2,%%xmm1;vfmadd213ps (%5),     %%xmm0,%%xmm1;vmovups %%xmm1,(%5);"\
+    "vunpckhpd %%xmm3,%%xmm2,%%xmm1;vfmadd213ps (%5,%3,1),%%xmm0,%%xmm1;vmovups %%xmm1,(%5,%3,1);"\
+    "leaq (%5,%3,2),%5;"
+#define SAVE_L_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5)
+#define SAVE_L_m4n4  SAVE_L_m4n2  unit_save_m4n2(%%xmm6,%%xmm7)
+#define SAVE_L_m4n8  SAVE_L_m4n4  unit_save_m4n2(%%xmm8,%%xmm9)   unit_save_m4n2(%%xmm10,%%xmm11)
+#define SAVE_L_m4n12 SAVE_L_m4n8  unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15)
+#define SAVE_R_m4n4               unit_save_m4n2(%%xmm4,%%xmm5)   unit_save_m4n2(%%xmm6,%%xmm7)
+#define SAVE_R_m4n8  SAVE_R_m4n4  unit_save_m4n2(%%xmm8,%%xmm9)   unit_save_m4n2(%%xmm10,%%xmm11)
+#define SAVE_R_m4n12 SAVE_R_m4n8  unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15)
+#define COMPUTE_L_m4(ndim,sim) \
+    INIT_m4n##ndim\
+    "movq %%r13,%4; movq %%r14,%1;"\
+    #ndim""#sim"442:\n\t"\
+    "testq %4,%4; jz "#ndim""#sim"443f;"\
+    KERNEL_k1m4n##ndim(%1)\
+    "decq %4; jmp "#ndim""#sim"442b;"\
+    #ndim""#sim"443:\n\t"\
+    SAVE_L_m4n##ndim "addq $16,%2;"
+#define COMPUTE_R_m4(ndim,sim) \
+    "subq %%r12,%0;"\
+    INIT_m4n##ndim\
+    "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\
+    #ndim""#sim"442:\n\t"\
+    "testq %4,%4; jz "#ndim""#sim"443f;"\
+    KERNEL_k1m4n##ndim(%%r15)\
+    "decq %4; jmp "#ndim""#sim"442b;"\
+    #ndim""#sim"443:\n\t"\
+    SAVE_R_m4n##ndim
+#define COMPUTE_m4_n1  COMPUTE_L_m4(1,55855)
+#define COMPUTE_m4_n2  COMPUTE_L_m4(2,55855)
+#define COMPUTE_m4_n4  COMPUTE_L_m4(4,55855)
+#define COMPUTE_m4_n8  COMPUTE_L_m4(8,55855)
+#define COMPUTE_m4_n12 COMPUTE_L_m4(12,55855)
+#define COMPUTE_m4_n16 COMPUTE_L_m4(12,55755) COMPUTE_R_m4(4,55955)
+#define COMPUTE_m4_n20 COMPUTE_L_m4(12,55655) COMPUTE_R_m4(8,55955)
+#define COMPUTE_m4_n24 COMPUTE_L_m4(12,55555) COMPUTE_R_m4(12,55955)
+#define COMPUTE_m4(ndim) COMPUTE_m4_n##ndim
+
+/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */
+#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
+#define KERNEL_k1m2n1(b_addr) \
+    "vmovsd (%0),%%xmm1; addq $8,%0;"\
+    "vbroadcastss ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
+    "addq $4,"#b_addr";"
+#define SAVE_L_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);"
+#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
+#define KERNEL_k1m2n2(b_addr) \
+    "vmovsd (%0),%%xmm1; addq $8,%0;"\
+    "vbroadcastss  ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
+    "vbroadcastss 4("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\
+    "addq $8,"#b_addr";"
+#define SAVE_L_m2n2 SAVE_L_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);"
+#define INIT_m2n4  INIT_m2n2
+#define INIT_m2n8  INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;"
+#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;"
+#define KERNEL_k1m2n4(b_addr) \
+    "vmovups ("#b_addr"),%%xmm3; addq $16,"#b_addr";"\
+    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
+    "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\
+    "addq $8,%0;"
+#define KERNEL_k1m2n8(b_addr) \
+    "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; addq $16,"#b_addr";"\
+    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\
+    "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\
+    "addq $8,%0;"
+#define KERNEL_k1m2n12(b_addr) \
+    "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; vmovups ("#b_addr",%%r12,2),%%xmm1; addq $16,"#b_addr";"\
+    "vbroadcastss  (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\
+    "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\
+    "addq $8,%0;"
+#define unit_save_m2n4(c1,c2) \
+    "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\
+    "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1; vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1);"\
+    "leaq (%5,%3,2),%5;"\
+    "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2; vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1);"\
+    "leaq (%5,%3,2),%5;"
+#define SAVE_L_m2n4  "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5)
+#define SAVE_L_m2n8  SAVE_L_m2n4   unit_save_m2n4(%%xmm6,%%xmm7)
+#define SAVE_L_m2n12 SAVE_L_m2n8   unit_save_m2n4(%%xmm8,%%xmm9)
+#define SAVE_R_m2n4                unit_save_m2n4(%%xmm4,%%xmm5)
+#define SAVE_R_m2n8  SAVE_R_m2n4   unit_save_m2n4(%%xmm6,%%xmm7)
+#define SAVE_R_m2n12 SAVE_R_m2n8   unit_save_m2n4(%%xmm8,%%xmm9)
+#define COMPUTE_L_m2(ndim,sim) \
+    INIT_m2n##ndim\
+    "movq %%r13,%4; movq %%r14,%1;"\
+    #ndim""#sim"222:\n\t"\
+    "testq %4,%4; jz "#ndim""#sim"223f;"\
+    KERNEL_k1m2n##ndim(%1)\
+    "decq %4; jmp "#ndim""#sim"222b;"\
+    #ndim""#sim"223:\n\t"\
+    SAVE_L_m2n##ndim "addq $8,%2;"
+#define COMPUTE_R_m2(ndim,sim) \
+    "salq $3,%%r13;subq %%r13,%0;sarq $3,%%r13;"\
+    INIT_m2n##ndim\
+    "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\
+    #ndim""#sim"222:\n\t"\
+    "testq %4,%4; jz "#ndim""#sim"223f;"\
+    KERNEL_k1m2n##ndim(%%r15)\
+    "decq %4; jmp "#ndim""#sim"222b;"\
+    #ndim""#sim"223:\n\t"\
+    SAVE_R_m2n##ndim
+#define COMPUTE_m2_n1  COMPUTE_L_m2(1,77877)
+#define COMPUTE_m2_n2  COMPUTE_L_m2(2,77877)
+#define COMPUTE_m2_n4  COMPUTE_L_m2(4,77877)
+#define COMPUTE_m2_n8  COMPUTE_L_m2(8,77877)
+#define COMPUTE_m2_n12 COMPUTE_L_m2(12,77877)
+#define COMPUTE_m2_n16 COMPUTE_L_m2(12,77777) COMPUTE_R_m2(4,77977)
+#define COMPUTE_m2_n20 COMPUTE_L_m2(12,77677) COMPUTE_R_m2(8,77977)
+#define COMPUTE_m2_n24 COMPUTE_L_m2(12,77577) COMPUTE_R_m2(12,77977)
+#define COMPUTE_m2(ndim) COMPUTE_m2_n##ndim
+
+/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */
+#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
+#define KERNEL_k1m1n1(b_addr) \
+    "vmovss ("#b_addr"),%%xmm3; addq $4,"#b_addr";"\
+    "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\
+    "addq $4,%0;"
+#define SAVE_L_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);"
+#define INIT_m1n2 INIT_m1n1
+#define KERNEL_k1m1n2(b_addr) \
+    "vmovsd ("#b_addr"),%%xmm3; addq $8,"#b_addr";"\
+    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
+    "addq $4,%0;"
+#define SAVE_L_m1n2 \
+    "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\
+    "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);"
+#define INIT_m1n4  INIT_m1n2
+#define INIT_m1n8  INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;"
+#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;"
+#define KERNEL_k1m1n4(b_addr) \
+    "vmovups ("#b_addr"),%%xmm3; addq $16,"#b_addr";"\
+    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
+    "addq $4,%0;"
+#define KERNEL_k1m1n8(b_addr) \
+    "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; addq $16,"#b_addr";"\
+    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\
+    "addq $4,%0;"
+#define KERNEL_k1m1n12(b_addr) \
+    "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; vmovups ("#b_addr",%%r12,2),%%xmm1; addq $16,"#b_addr";"\
+    "vbroadcastss  (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\
+    "addq $4,%0;"
+#define unit_save_m1n4(c1) \
+    "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\
+    "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\
+    "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\
+    "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\
+    "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"
+#define SAVE_L_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4)
+#define SAVE_L_m1n8  SAVE_L_m1n4  unit_save_m1n4(%%xmm5)
+#define SAVE_L_m1n12 SAVE_L_m1n8  unit_save_m1n4(%%xmm6)
+#define SAVE_R_m1n4               unit_save_m1n4(%%xmm4)
+#define SAVE_R_m1n8  SAVE_R_m1n4  unit_save_m1n4(%%xmm5)
+#define SAVE_R_m1n12 SAVE_R_m1n8  unit_save_m1n4(%%xmm6)
+#define COMPUTE_L_m1(ndim,sim) \
+    INIT_m1n##ndim\
+    "movq %%r13,%4; movq %%r14,%1;"\
+    #ndim""#sim"112:\n\t"\
+    "testq %4,%4; jz "#ndim""#sim"113f;"\
+    KERNEL_k1m1n##ndim(%1)\
+    "decq %4; jmp "#ndim""#sim"112b;"\
+    #ndim""#sim"113:\n\t"\
+    SAVE_L_m1n##ndim "addq $4,%2;"
+#define COMPUTE_R_m1(ndim,sim) \
+    "salq $2,%%r13;subq %%r13,%0;sarq $2,%%r13;"\
+    INIT_m1n##ndim\
+    "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\
+    #ndim""#sim"112:\n\t"\
+    "testq %4,%4; jz "#ndim""#sim"113f;"\
+    KERNEL_k1m1n##ndim(%%r15)\
+    "decq %4; jmp "#ndim""#sim"112b;"\
+    #ndim""#sim"113:\n\t"\
+    SAVE_R_m1n##ndim
+#define COMPUTE_m1_n1  COMPUTE_L_m1(1,99899)
+#define COMPUTE_m1_n2  COMPUTE_L_m1(2,99899)
+#define COMPUTE_m1_n4  COMPUTE_L_m1(4,99899)
+#define COMPUTE_m1_n8  COMPUTE_L_m1(8,99899)
+#define COMPUTE_m1_n12 COMPUTE_L_m1(12,99899)
+#define COMPUTE_m1_n16 COMPUTE_L_m1(12,99799) COMPUTE_R_m1(4,99999)
+#define COMPUTE_m1_n20 COMPUTE_L_m1(12,99699) COMPUTE_R_m1(8,99999)
+#define COMPUTE_m1_n24 COMPUTE_L_m1(12,99599) COMPUTE_R_m1(12,99999)
+#define COMPUTE_m1(ndim) COMPUTE_m1_n##ndim
+
+/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */
+/* %6 = "+r"(&alpha), %7 = "+r"(M) */
+/* r11 = m(const), r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */
+
+#define COMPUTE(ndim) {\
+    __asm__ __volatile__(\
+    "vbroadcastss (%6),%%zmm0;"\
+    "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %7,%%r11;"\
+    "cmpq $16,%7;jb 33101"#ndim"f;"\
+    "33109"#ndim":\n\t"\
+    COMPUTE_m16(ndim)\
+    "subq $16,%7;cmpq $16,%7;jnb 33109"#ndim"b;"\
+    "33101"#ndim":\n\t"\
+    "cmpq $8,%7;jb 33102"#ndim"f;"\
+    COMPUTE_m8(ndim)\
+    "subq $8,%7;"\
+    "33102"#ndim":\n\t"\
+    "cmpq $4,%7;jb 33103"#ndim"f;"\
+    COMPUTE_m4(ndim)\
+    "subq $4,%7;"\
+    "33103"#ndim":\n\t"\
+    "cmpq $2,%7;jb 33104"#ndim"f;"\
+    COMPUTE_m2(ndim)\
+    "subq $2,%7;"\
+    "33104"#ndim":\n\t"\
+    "testq %7,%7;jz 33105"#ndim"f;"\
+    COMPUTE_m1(ndim)\
+    "33105"#ndim":\n\t"\
+    "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\
+    :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(alp),"+r"(M)\
+    ::"r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\
+    "zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\
+    "cc","memory");\
+    a_pointer -= M * K; b_pointer += ndim * K;c_pointer += LDC * ndim - M;\
+}
+int __attribute__ ((noinline))
+CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC)
+{
+    if(m==0||n==0||k==0||alpha==(float)0.0) return 0;
+    int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float);float ALPHA = alpha;
+    int64_t M = (int64_t)m, K = (int64_t)k;
+    BLASLONG n_count = n;
+    float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*alp = &ALPHA;
+    for(;n_count>23;n_count-=24) COMPUTE(24)
+    for(;n_count>19;n_count-=20) COMPUTE(20)
+    for(;n_count>15;n_count-=16) COMPUTE(16)
+    for(;n_count>11;n_count-=12) COMPUTE(12)
+    for(;n_count>7;n_count-=8) COMPUTE(8)
+    for(;n_count>3;n_count-=4) COMPUTE(4)
+    for(;n_count>1;n_count-=2) COMPUTE(2)
+    if(n_count>0) COMPUTE(1)
+    return 0;
+}
+
+#include <immintrin.h>
+/* codes below are copied from the sgemm kernel written by Arjan van der Ven */
+
+/*
+ * "Direct sgemm" code. This code operates directly on the inputs and outputs
+ * of the sgemm call, avoiding the copies, memory realignments and threading,
+ * and only supports alpha = 1 and beta = 0.
+ * This is a common case and provides value for relatively small matrixes.
+ * For larger matrixes the "regular" sgemm code is superior, there the cost of
+ * copying/shuffling the B matrix really pays off.
+ */
+
+
+
+#define DECLARE_RESULT_512(N,M) __m512 result##N##M = _mm512_setzero_ps()
+#define BROADCAST_LOAD_A_512(N,M) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k  + strideA * (i+M)]))
+#define LOAD_B_512(N,M)  __m512 Bval##N = _mm512_loadu_ps(&B[strideB * k + j + (N*16)])
+#define MATMUL_512(N,M)  result##N##M = _mm512_fmadd_ps(Aval##M, Bval##N , result##N##M)
+#define STORE_512(N,M) _mm512_storeu_ps(&R[(i+M) * strideR + j+(N*16)], result##N##M)
+
+
+#define DECLARE_RESULT_256(N,M) __m256 result##N##M = _mm256_setzero_ps()
+#define BROADCAST_LOAD_A_256(N,M) __m256 Aval##M = _mm256_broadcastss_ps(_mm_load_ss(&A[k  + strideA * (i+M)]))
+#define LOAD_B_256(N,M)  __m256 Bval##N = _mm256_loadu_ps(&B[strideB * k + j + (N*8)])
+#define MATMUL_256(N,M)  result##N##M = _mm256_fmadd_ps(Aval##M, Bval##N , result##N##M)
+#define STORE_256(N,M) _mm256_storeu_ps(&R[(i+M) * strideR + j+(N*8)], result##N##M)
+
+#define DECLARE_RESULT_128(N,M) __m128 result##N##M = _mm_setzero_ps()
+#define BROADCAST_LOAD_A_128(N,M) __m128 Aval##M = _mm_broadcastss_ps(_mm_load_ss(&A[k  + strideA * (i+M)]))
+#define LOAD_B_128(N,M)  __m128 Bval##N = _mm_loadu_ps(&B[strideB * k + j + (N*4)])
+#define MATMUL_128(N,M)  result##N##M = _mm_fmadd_ps(Aval##M, Bval##N , result##N##M)
+#define STORE_128(N,M) _mm_storeu_ps(&R[(i+M) * strideR + j+(N*4)], result##N##M)
+
+#define DECLARE_RESULT_SCALAR(N,M) float result##N##M = 0;
+#define BROADCAST_LOAD_A_SCALAR(N,M) float Aval##M = A[k + strideA * (i + M)];
+#define LOAD_B_SCALAR(N,M)  float Bval##N  = B[k * strideB + j + N];
+#define MATMUL_SCALAR(N,M) result##N##M +=  Aval##M * Bval##N;
+#define STORE_SCALAR(N,M)  R[(i+M) * strideR + j + N] = result##N##M;
+
+int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
+{
+	int mnk = M * N * K;
+	/* large matrixes -> not performant */
+	if (mnk >= 28 * 512 * 512)
+		return 0;
+
+	/*
+	 * if the B matrix is not a nice multiple if 4 we get many unaligned accesses,
+	 * and the regular sgemm copy/realignment of data pays off much quicker
+	 */
+	if ((N & 3) != 0 && (mnk >= 8 * 512 * 512))
+		return 0;
+
+#ifdef SMP
+	/* if we can run multithreaded, the threading changes the based threshold */
+	if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1)
+		return 0;
+#endif
+
+	return 1;
+}
+
+
+
+void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
+{
+	int i, j, k;
+
+        int m4 = M & ~3;
+	int m2 = M & ~1;
+
+	int n64 = N & ~63;
+	int n32 = N & ~31;
+	int n16 = N & ~15;
+	int n8 = N & ~7;
+	int n4 = N & ~3;
+	int n2 = N & ~1;
+
+	i = 0;
+
+	for (i = 0; i < m4; i+=4) {
+
+		for (j = 0; j < n64; j+= 64) {
+			k = 0;
+			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);    			DECLARE_RESULT_512(2, 0);    DECLARE_RESULT_512(3, 0);
+			DECLARE_RESULT_512(0, 1);    DECLARE_RESULT_512(1, 1);    			DECLARE_RESULT_512(2, 1);    DECLARE_RESULT_512(3, 1);
+			DECLARE_RESULT_512(0, 2);    DECLARE_RESULT_512(1, 2);    			DECLARE_RESULT_512(2, 2);    DECLARE_RESULT_512(3, 2);
+			DECLARE_RESULT_512(0, 3);    DECLARE_RESULT_512(1, 3);    			DECLARE_RESULT_512(2, 3);    DECLARE_RESULT_512(3, 3);
+
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+				BROADCAST_LOAD_A_512(x, 1);
+				BROADCAST_LOAD_A_512(x, 2);
+				BROADCAST_LOAD_A_512(x, 3);
+
+				LOAD_B_512(0, x);		LOAD_B_512(1, x);			LOAD_B_512(2, x);		LOAD_B_512(3, x);
+
+				MATMUL_512(0, 0);		MATMUL_512(1, 0);			MATMUL_512(2, 0);		MATMUL_512(3, 0);
+				MATMUL_512(0, 1);		MATMUL_512(1, 1);			MATMUL_512(2, 1);		MATMUL_512(3, 1);
+				MATMUL_512(0, 2);		MATMUL_512(1, 2);			MATMUL_512(2, 2);		MATMUL_512(3, 2);
+				MATMUL_512(0, 3);		MATMUL_512(1, 3);			MATMUL_512(2, 3);		MATMUL_512(3, 3);
+			}
+			STORE_512(0, 0);		STORE_512(1, 0);			STORE_512(2, 0);		STORE_512(3, 0);
+			STORE_512(0, 1);		STORE_512(1, 1);			STORE_512(2, 1);		STORE_512(3, 1);
+			STORE_512(0, 2);		STORE_512(1, 2);			STORE_512(2, 2);		STORE_512(3, 2);
+			STORE_512(0, 3);		STORE_512(1, 3);			STORE_512(2, 3);		STORE_512(3, 3);
+		}
+
+		for (; j < n32; j+= 32) {
+			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);
+			DECLARE_RESULT_512(0, 1);    DECLARE_RESULT_512(1, 1);
+			DECLARE_RESULT_512(0, 2);    DECLARE_RESULT_512(1, 2);
+			DECLARE_RESULT_512(0, 3);    DECLARE_RESULT_512(1, 3);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+				BROADCAST_LOAD_A_512(x, 1);
+				BROADCAST_LOAD_A_512(x, 2);
+				BROADCAST_LOAD_A_512(x, 3);
+
+				LOAD_B_512(0, x);		LOAD_B_512(1, x);
+
+				MATMUL_512(0, 0);		MATMUL_512(1, 0);
+				MATMUL_512(0, 1);		MATMUL_512(1, 1);
+				MATMUL_512(0, 2);		MATMUL_512(1, 2);
+				MATMUL_512(0, 3);		MATMUL_512(1, 3);
+			}
+			STORE_512(0, 0);		STORE_512(1, 0);
+			STORE_512(0, 1);		STORE_512(1, 1);
+			STORE_512(0, 2);		STORE_512(1, 2);
+			STORE_512(0, 3);		STORE_512(1, 3);
+		}
+
+		for (; j < n16; j+= 16) {
+			DECLARE_RESULT_512(0, 0);
+			DECLARE_RESULT_512(0, 1);
+			DECLARE_RESULT_512(0, 2);
+			DECLARE_RESULT_512(0, 3);
+
+		 	for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+				BROADCAST_LOAD_A_512(x, 1);
+				BROADCAST_LOAD_A_512(x, 2);
+				BROADCAST_LOAD_A_512(x, 3);
+
+				LOAD_B_512(0, x);
+
+				MATMUL_512(0, 0);
+				MATMUL_512(0, 1);
+				MATMUL_512(0, 2);
+				MATMUL_512(0, 3);
+			}
+			STORE_512(0, 0);
+			STORE_512(0, 1);
+			STORE_512(0, 2);
+			STORE_512(0, 3);
+		}
+
+		for (; j < n8; j+= 8) {
+			DECLARE_RESULT_256(0, 0);
+			DECLARE_RESULT_256(0, 1);
+			DECLARE_RESULT_256(0, 2);
+			DECLARE_RESULT_256(0, 3);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_256(x, 0);
+				BROADCAST_LOAD_A_256(x, 1);
+				BROADCAST_LOAD_A_256(x, 2);
+				BROADCAST_LOAD_A_256(x, 3);
+
+				LOAD_B_256(0, x);
+
+				MATMUL_256(0, 0);
+				MATMUL_256(0, 1);
+				MATMUL_256(0, 2);
+				MATMUL_256(0, 3);
+			}
+			STORE_256(0, 0);
+			STORE_256(0, 1);
+			STORE_256(0, 2);
+			STORE_256(0, 3);
+		}
+
+		for (; j < n4; j+= 4) {
+			DECLARE_RESULT_128(0, 0);
+			DECLARE_RESULT_128(0, 1);
+			DECLARE_RESULT_128(0, 2);
+			DECLARE_RESULT_128(0, 3);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_128(x, 0);
+				BROADCAST_LOAD_A_128(x, 1);
+				BROADCAST_LOAD_A_128(x, 2);
+				BROADCAST_LOAD_A_128(x, 3);
+
+				LOAD_B_128(0, x);
+
+				MATMUL_128(0, 0);
+				MATMUL_128(0, 1);
+				MATMUL_128(0, 2);
+				MATMUL_128(0, 3);
+			}
+			STORE_128(0, 0);
+			STORE_128(0, 1);
+			STORE_128(0, 2);
+			STORE_128(0, 3);
+		}
+
+		for (; j < n2; j+= 2) {
+			DECLARE_RESULT_SCALAR(0, 0);	DECLARE_RESULT_SCALAR(1, 0);
+			DECLARE_RESULT_SCALAR(0, 1);	DECLARE_RESULT_SCALAR(1, 1);
+			DECLARE_RESULT_SCALAR(0, 2);	DECLARE_RESULT_SCALAR(1, 2);
+			DECLARE_RESULT_SCALAR(0, 3);	DECLARE_RESULT_SCALAR(1, 3);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_SCALAR(x, 0);
+				BROADCAST_LOAD_A_SCALAR(x, 1);
+				BROADCAST_LOAD_A_SCALAR(x, 2);
+				BROADCAST_LOAD_A_SCALAR(x, 3);
+
+				LOAD_B_SCALAR(0, x);	LOAD_B_SCALAR(1, x);
+
+				MATMUL_SCALAR(0, 0);	MATMUL_SCALAR(1, 0);
+				MATMUL_SCALAR(0, 1);	MATMUL_SCALAR(1, 1);
+				MATMUL_SCALAR(0, 2);	MATMUL_SCALAR(1, 2);
+				MATMUL_SCALAR(0, 3);	MATMUL_SCALAR(1, 3);
+			}
+			STORE_SCALAR(0, 0);	STORE_SCALAR(1, 0);
+			STORE_SCALAR(0, 1);	STORE_SCALAR(1, 1);
+			STORE_SCALAR(0, 2);	STORE_SCALAR(1, 2);
+			STORE_SCALAR(0, 3);	STORE_SCALAR(1, 3);
+		}
+
+		for (; j < N; j++) {
+			DECLARE_RESULT_SCALAR(0, 0)
+			DECLARE_RESULT_SCALAR(0, 1)
+			DECLARE_RESULT_SCALAR(0, 2)
+			DECLARE_RESULT_SCALAR(0, 3)
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_SCALAR(0, 0);
+				BROADCAST_LOAD_A_SCALAR(0, 1);
+				BROADCAST_LOAD_A_SCALAR(0, 2);
+				BROADCAST_LOAD_A_SCALAR(0, 3);
+
+				LOAD_B_SCALAR(0, 0);
+
+				MATMUL_SCALAR(0, 0);
+				MATMUL_SCALAR(0, 1);
+				MATMUL_SCALAR(0, 2);
+				MATMUL_SCALAR(0, 3);
+			}
+			STORE_SCALAR(0, 0);
+			STORE_SCALAR(0, 1);
+			STORE_SCALAR(0, 2);
+			STORE_SCALAR(0, 3);
+		}
+	}
+
+	for (; i < m2; i+=2) {
+		j = 0;
+
+		for (; j < n64; j+= 64) {
+			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);    			DECLARE_RESULT_512(2, 0);    DECLARE_RESULT_512(3, 0);
+			DECLARE_RESULT_512(0, 1);    DECLARE_RESULT_512(1, 1);    			DECLARE_RESULT_512(2, 1);    DECLARE_RESULT_512(3, 1);
+
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+				BROADCAST_LOAD_A_512(x, 1);
+
+				LOAD_B_512(0, x);		LOAD_B_512(1, x);			LOAD_B_512(2, x);		LOAD_B_512(3, x);
+
+				MATMUL_512(0, 0);		MATMUL_512(1, 0);			MATMUL_512(2, 0);		MATMUL_512(3, 0);
+				MATMUL_512(0, 1);		MATMUL_512(1, 1);			MATMUL_512(2, 1);		MATMUL_512(3, 1);
+			}
+			STORE_512(0, 0);		STORE_512(1, 0);			STORE_512(2, 0);		STORE_512(3, 0);
+			STORE_512(0, 1);		STORE_512(1, 1);			STORE_512(2, 1);		STORE_512(3, 1);
+		}
+
+		for (; j < n32; j+= 32) {
+			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);
+			DECLARE_RESULT_512(0, 1);    DECLARE_RESULT_512(1, 1);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+				BROADCAST_LOAD_A_512(x, 1);
+
+				LOAD_B_512(0, x);		LOAD_B_512(1, x);
+
+				MATMUL_512(0, 0);		MATMUL_512(1, 0);
+				MATMUL_512(0, 1);		MATMUL_512(1, 1);
+			}
+			STORE_512(0, 0);		STORE_512(1, 0);
+			STORE_512(0, 1);		STORE_512(1, 1);
+		}
+
+
+		for (; j < n16; j+= 16) {
+			DECLARE_RESULT_512(0, 0);
+			DECLARE_RESULT_512(0, 1);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+				BROADCAST_LOAD_A_512(x, 1);
+
+				LOAD_B_512(0, x);
+
+				MATMUL_512(0, 0);
+				MATMUL_512(0, 1);
+			}
+			STORE_512(0, 0);
+			STORE_512(0, 1);
+		}
+
+		for (; j < n8; j+= 8) {
+			DECLARE_RESULT_256(0, 0);
+			DECLARE_RESULT_256(0, 1);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_256(x, 0);
+				BROADCAST_LOAD_A_256(x, 1);
+
+				LOAD_B_256(0, x);
+
+				MATMUL_256(0, 0);
+				MATMUL_256(0, 1);
+			}
+			STORE_256(0, 0);
+			STORE_256(0, 1);
+		}
+
+		for (; j < n4; j+= 4) {
+			DECLARE_RESULT_128(0, 0);
+			DECLARE_RESULT_128(0, 1);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_128(x, 0);
+				BROADCAST_LOAD_A_128(x, 1);
+
+				LOAD_B_128(0, x);
+
+				MATMUL_128(0, 0);
+				MATMUL_128(0, 1);
+			}
+			STORE_128(0, 0);
+			STORE_128(0, 1);
+		}
+		for (; j < n2; j+= 2) {
+			DECLARE_RESULT_SCALAR(0, 0);	DECLARE_RESULT_SCALAR(1, 0);
+			DECLARE_RESULT_SCALAR(0, 1);	DECLARE_RESULT_SCALAR(1, 1);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_SCALAR(x, 0);
+				BROADCAST_LOAD_A_SCALAR(x, 1);
+
+				LOAD_B_SCALAR(0, x);	LOAD_B_SCALAR(1, x);
+
+				MATMUL_SCALAR(0, 0);	MATMUL_SCALAR(1, 0);
+				MATMUL_SCALAR(0, 1);	MATMUL_SCALAR(1, 1);
+			}
+			STORE_SCALAR(0, 0);	STORE_SCALAR(1, 0);
+			STORE_SCALAR(0, 1);	STORE_SCALAR(1, 1);
+		}
+
+		for (; j < N; j++) {
+			DECLARE_RESULT_SCALAR(0, 0);
+			DECLARE_RESULT_SCALAR(0, 1);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_SCALAR(0, 0);
+				BROADCAST_LOAD_A_SCALAR(0, 1);
+
+				LOAD_B_SCALAR(0, 0);
+
+				MATMUL_SCALAR(0, 0);
+				MATMUL_SCALAR(0, 1);
+			}
+			STORE_SCALAR(0, 0);
+			STORE_SCALAR(0, 1);
+		}
+	}
+
+	for (; i < M; i+=1) {
+		j = 0;
+		for (; j < n64; j+= 64) {
+			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);    			DECLARE_RESULT_512(2, 0);    DECLARE_RESULT_512(3, 0);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+				LOAD_B_512(0, x);		LOAD_B_512(1, x);			LOAD_B_512(2, x);		LOAD_B_512(3, x);
+				MATMUL_512(0, 0);		MATMUL_512(1, 0);			MATMUL_512(2, 0);		MATMUL_512(3, 0);
+			}
+			STORE_512(0, 0);		STORE_512(1, 0);			STORE_512(2, 0);		STORE_512(3, 0);
+		}
+		for (; j < n32; j+= 32) {
+			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+				LOAD_B_512(0, x);		LOAD_B_512(1, x);
+				MATMUL_512(0, 0);		MATMUL_512(1, 0);
+			}
+			STORE_512(0, 0);		STORE_512(1, 0);
+		}
+
+
+		for (; j < n16; j+= 16) {
+			DECLARE_RESULT_512(0, 0);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+
+				LOAD_B_512(0, x);
+
+				MATMUL_512(0, 0);
+			}
+			STORE_512(0, 0);
+		}
+
+		for (; j < n8; j+= 8) {
+			DECLARE_RESULT_256(0, 0);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_256(x, 0);
+				LOAD_B_256(0, x);
+				MATMUL_256(0, 0);
+			}
+			STORE_256(0, 0);
+		}
+
+		for (; j < n4; j+= 4) {
+			DECLARE_RESULT_128(0, 0);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_128(x, 0);
+				LOAD_B_128(0, x);
+				MATMUL_128(0, 0);
+			}
+			STORE_128(0, 0);
+		}
+
+		for (; j < n2; j+= 2) {
+			DECLARE_RESULT_SCALAR(0, 0);	DECLARE_RESULT_SCALAR(1, 0);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_SCALAR(x, 0);
+				LOAD_B_SCALAR(0, 0);	LOAD_B_SCALAR(1, 0);
+				MATMUL_SCALAR(0, 0);	MATMUL_SCALAR(1, 0);
+			}
+			STORE_SCALAR(0, 0);	STORE_SCALAR(1, 0);
+		}
+
+		for (; j < N; j++) {
+			DECLARE_RESULT_SCALAR(0, 0);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_SCALAR(0, 0);
+				LOAD_B_SCALAR(0, 0);
+				MATMUL_SCALAR(0, 0);
+			}
+			STORE_SCALAR(0, 0);
+		}
+	}
+}

From ae43b75a6a12d17a6ad769c64a306198a2b032b1 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Sat, 2 Nov 2019 10:09:19 +0800
Subject: [PATCH 085/210] Add files via upload

---
 param.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/param.h b/param.h
index 198839b4f..aa7cede0c 100644
--- a/param.h
+++ b/param.h
@@ -1696,7 +1696,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_P 256
 
 #ifdef WINDOWS_ABI
-#define SGEMM_DEFAULT_Q 320
+#define SGEMM_DEFAULT_Q 192
 #define DGEMM_DEFAULT_Q 128
 #else
 #define SGEMM_DEFAULT_Q 192

From 928fe1b28e91aa55748bfc7d2abf2ed2786d3ef5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 3 Nov 2019 22:37:27 +0100
Subject: [PATCH 086/210] The assembly microkernel is not safe to use on ELFv1

---
 kernel/power/idamax.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c
index 5bdc0a13c..337fa54f8 100644
--- a/kernel/power/idamax.c
+++ b/kernel/power/idamax.c
@@ -324,6 +324,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
 
     if (inc_x == 1) {
 
+#if defined(_CALL_ELF) && (_CALL_ELF == 2)
         BLASLONG n1 = n & -32;
         if (n1 > 0) {
 
@@ -331,7 +332,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
 
             i = n1;
         }
-
+#endif
         while (i < n) {
             if (ABS(x[i]) > maxf) {
                 max = i;

From d999688d1a7a78ba8c69eedb8f26945aa1f04baf Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 3 Nov 2019 22:39:06 +0100
Subject: [PATCH 087/210] The assembly microkernel is not safe to use on ELFv1

---
 kernel/power/idamin.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c
index 7fe0f8a33..85dd49ac1 100644
--- a/kernel/power/idamin.c
+++ b/kernel/power/idamin.c
@@ -326,13 +326,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
     minf = ABS(x[0]); //index's not incremented
     if (inc_x == 1) {
 
+#if defined(_CALL_ELF) && (_CALL_ELF == 2)
         BLASLONG n1 = n & -32;
         if (n1 > 0) {
 
             min = diamin_kernel_32(n1, x, &minf);
             i = n1;
         }
-
+#endif
+ 
         while (i < n) {
             if (ABS(x[i]) < minf) {
                 min = i;

From d2a628554921577f7353256c5888fc09dc3ccdae Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 3 Nov 2019 22:41:19 +0100
Subject: [PATCH 088/210] The assembly microkernel is not safe to use on ELFv1

---
 kernel/power/izamin.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c
index 1ffa3ba8b..8da2189c6 100644
--- a/kernel/power/izamin.c
+++ b/kernel/power/izamin.c
@@ -314,6 +314,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
 
     if (inc_x == 1) {
         minf = CABS1(x,0); //index will not be incremented
+
+#if defined(_CALL_ELF) && (_CALL_ELF == 2)
         BLASLONG n1 = n & -16;
         if (n1 > 0) {
 
@@ -321,7 +323,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             i = n1;
             ix = n1 << 1;
         }
-      
+#endif      
 
         while(i < n)
         {

From 68597002ea1342e370c892cddb8845db61936e4f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 3 Nov 2019 22:42:46 +0100
Subject: [PATCH 089/210] The assembly microkernel is not safe to use on ELFv1

---
 kernel/power/izamax.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/power/izamax.c b/kernel/power/izamax.c
index cfe78c8c0..3c132f81a 100644
--- a/kernel/power/izamax.c
+++ b/kernel/power/izamax.c
@@ -316,6 +316,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
      
     if (inc_x == 1) {
 
+#if defined(_CALL_ELF) && (_CALL_ELF == 2)
       BLASLONG n1 = n & -16;
       if (n1 > 0) {
 
@@ -323,6 +324,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
             i = n1;
             ix = n1 << 1;
       }
+#endif
 
       while(i < n)
     {

From 6fa89b06a1f480065d29bfcafaedcae477ef8206 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 3 Nov 2019 22:55:31 +0100
Subject: [PATCH 090/210] Use the two-operand form of DCBT on all PPC970
 regardless of OS

There seems to be no advantage to the three-operand form used in the earliest GotoBLAS kernels, and it causes compilation problems  on other than the previously special-cased platforms as well
---
 common_power.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common_power.h b/common_power.h
index 5e15b7554..bcfc209a9 100644
--- a/common_power.h
+++ b/common_power.h
@@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define HAVE_PREFETCH
 #endif
 
-#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) )
+#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970)
 #define DCBT_ARG	0
 #else
 #define DCBT_ARG	8

From fbacd2605dd67f86f6c097b7b738138dd76913fa Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 4 Nov 2019 19:37:19 +0800
Subject: [PATCH 091/210] optimizations via software prefetches

---
 kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
index 79c70e4f6..3646c7dda 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
@@ -42,7 +42,6 @@
 #define unit_save_m16n2(c1,c2) \
     "vunpcklps "#c2","#c1",%%zmm6; vunpckhps "#c2","#c1",%%zmm7; vunpcklpd %%zmm7,%%zmm6,%%zmm4; vunpckhpd %%zmm7,%%zmm6,%%zmm5;"\
     "vfmadd213ps (%5),%%zmm0,%%zmm4; vfmadd213ps (%5,%3,1),%%zmm0,%%zmm5;"\
-    "prefetcht1 127(%5); prefetcht1 127(%5,%3,1);"\
     "vmovups %%zmm4,(%5); vmovups %%zmm5,(%5,%3,1); leaq (%5,%3,2),%5;"
 #define SAVE_h_m16n2 "movq %2,%5;" unit_save_m16n2(%%zmm8,%%zmm9)
 #define SAVE_h_m16n4  SAVE_h_m16n2  unit_save_m16n2(%%zmm10,%%zmm11)
@@ -54,19 +53,25 @@
 #define SAVE_m16(ndim) SAVE_h_m16n##ndim "addq $64,%2;"
 #define COMPUTE_m16(ndim) \
     INIT_m16n##ndim\
-    "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\
-    "cmpq $4,%4; jb "#ndim"016162f;"\
+    "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5;"\
+    "cmpq $16,%4; jb "#ndim"016162f;"\
     #ndim"016161:\n\t"\
     KERNEL_k1m16n##ndim\
     KERNEL_k1m16n##ndim\
+    "prefetcht1 (%5); prefetcht1 63(%5); addq %3,%5;"\
     KERNEL_k1m16n##ndim\
     KERNEL_k1m16n##ndim\
-    "subq $4,%4; cmpq $4,%4; jnb "#ndim"016161b;"\
+    "prefetcht1 (%8); addq $"#ndim",%8;"\
+    "subq $4,%4; cmpq $16,%4; jnb "#ndim"016161b;"\
+    "movq %2,%5;"\
     #ndim"016162:\n\t"\
     "testq %4,%4; jz "#ndim"016163f;"\
+    "prefetcht0 (%5); prefetcht0 63(%5); prefetcht0 (%5,%3,1); prefetcht0 63(%5,%3,1);"\
     KERNEL_k1m16n##ndim\
+    "leaq (%5,%3,2),%5;"\
     "decq %4; jmp "#ndim"016162b;"\
     #ndim"016163:\n\t"\
+    "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\
     SAVE_m16(ndim)
 
 /* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */
@@ -350,10 +355,11 @@
 #define COMPUTE_m1(ndim) COMPUTE_m1_n##ndim
 
 /* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */
-/* %6 = "+r"(&alpha), %7 = "+r"(M) */
+/* %6 = "+r"(&alpha), %7 = "+r"(M), %8 = "+r"(next_b) */
 /* r11 = m(const), r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */
 
 #define COMPUTE(ndim) {\
+    next_b = b_pointer + ndim * K;\
     __asm__ __volatile__(\
     "vbroadcastss (%6),%%zmm0;"\
     "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %7,%%r11;"\
@@ -378,7 +384,7 @@
     COMPUTE_m1(ndim)\
     "33105"#ndim":\n\t"\
     "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\
-    :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(alp),"+r"(M)\
+    :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(alp),"+r"(M),"+r"(next_b)\
     ::"r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\
     "zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\
     "cc","memory");\
@@ -391,7 +397,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
     int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float);float ALPHA = alpha;
     int64_t M = (int64_t)m, K = (int64_t)k;
     BLASLONG n_count = n;
-    float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*alp = &ALPHA;
+    float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*alp = &ALPHA,*next_b = B;
     for(;n_count>23;n_count-=24) COMPUTE(24)
     for(;n_count>19;n_count-=20) COMPUTE(20)
     for(;n_count>15;n_count-=16) COMPUTE(16)

From 430c11e1357b78f6a2872ea48ef6e71989488386 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 4 Nov 2019 20:10:12 +0800
Subject: [PATCH 092/210] Add files via upload

---
 kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
index 3646c7dda..5d491237b 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
@@ -54,15 +54,17 @@
 #define COMPUTE_m16(ndim) \
     INIT_m16n##ndim\
     "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5;"\
-    "cmpq $16,%4; jb "#ndim"016162f;"\
+    "cmpq $18,%4; jb "#ndim"016162f;"\
     #ndim"016161:\n\t"\
     KERNEL_k1m16n##ndim\
     KERNEL_k1m16n##ndim\
+    KERNEL_k1m16n##ndim\
     "prefetcht1 (%5); prefetcht1 63(%5); addq %3,%5;"\
     KERNEL_k1m16n##ndim\
     KERNEL_k1m16n##ndim\
-    "prefetcht1 (%8); addq $"#ndim",%8;"\
-    "subq $4,%4; cmpq $16,%4; jnb "#ndim"016161b;"\
+    KERNEL_k1m16n##ndim\
+    "prefetcht1 (%8); addq $32,%8;"\
+    "subq $6,%4; cmpq $18,%4; jnb "#ndim"016161b;"\
     "movq %2,%5;"\
     #ndim"016162:\n\t"\
     "testq %4,%4; jz "#ndim"016163f;"\

From 836c414e22a52b8fe2a4c714d9711ac8aa204b0c Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Tue, 5 Nov 2019 13:36:56 +0800
Subject: [PATCH 093/210] optimizations of software prefetching

---
 kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c | 126 +++++++++++---------
 1 file changed, 69 insertions(+), 57 deletions(-)

diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
index a958a1a6f..72878acfd 100644
--- a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
+++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
@@ -88,20 +88,21 @@
     "decq %2;cmpq $1,%2;jnb "#nn"01b;"\
     #nn"00:\n\t"
 
+/* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */
 #define INNER_KERNELm8(nn) \
-    "cmpq $8,%2;jb "#nn"001f;"\
+    "movq %3,%10;cmpq $16,%2;jb "#nn"001f;"\
     #nn"008:\n\t"\
     INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
     INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+    "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\
     INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
     INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
-    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
-    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
-    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
-    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
-    "subq $8,%2;cmpq $8,%2;jnb "#nn"008b;"\
+    "prefetcht1 (%11); addq $16,%11;"\
+    "subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\
+    "movq %3,%10;"\
     #nn"001:\n\t"\
     "cmpq $1,%2;jb "#nn"000f;"\
+    "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\
     INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
     "decq %2;jmp "#nn"001b;"\
     ""#nn"000:\n\t"
@@ -158,53 +159,53 @@
 
 #define INNER_STORE_m1n8(c1,disp) \
     "kxnorw %%k1,%%k1,%%k1;"\
-    "vgatherqpd "#disp"(%3,%%zmm6,1), %%zmm7 %{%%k1%};"\
+    "vgatherqpd "#disp"(%10,%%zmm6,1), %%zmm7 %{%%k1%};"\
     "vfmadd132pd %%zmm3,%%zmm7,"#c1";"\
     "kxnorw %%k1,%%k1,%%k1;"\
-    "vscatterqpd "#c1", "#disp"(%3,%%zmm6,1) %{%%k1%};"
+    "vscatterqpd "#c1", "#disp"(%10,%%zmm6,1) %{%%k1%};"
 
 #define INNER_SAVE_m1n8 \
+    "movq %3,%10;"\
     INNER_SETINDEX\
     INNER_STORE_m1n8(%%zmm8,0)
 
 #define INNER_SAVE_m1n16 \
     INNER_SAVE_m1n8\
-    "leaq (%3,%4,8),%3;"\
+    "leaq (%10,%4,8),%10;"\
     INNER_STORE_m1n8(%%zmm9,0)
 
 #define INNER_SAVE_m1n24 \
     INNER_SAVE_m1n16\
-    "leaq (%3,%4,8),%3;"\
+    "leaq (%10,%4,8),%10;"\
     INNER_STORE_m1n8(%%zmm10,0)
 
 #define INNER_SAVE_m2n8 \
+    "movq %3,%10;"\
     INNER_SETINDEX\
     INNER_STORE_m1n8(%%zmm8,0)\
     INNER_STORE_m1n8(%%zmm9,8)
 
 #define INNER_SAVE_m2n16 \
+    "movq %3,%10;"\
     INNER_SETINDEX\
     INNER_STORE_m1n8(%%zmm8,0)\
     INNER_STORE_m1n8(%%zmm10,8)\
-    "leaq (%3,%4,8),%3;"\
+    "leaq (%10,%4,8),%10;"\
     INNER_STORE_m1n8(%%zmm9,0)\
     INNER_STORE_m1n8(%%zmm11,8)
+
 #define INNER_SAVE_m2n24 \
+    "movq %3,%10;"\
     INNER_SETINDEX\
     INNER_STORE_m1n8(%%zmm8,0)\
     INNER_STORE_m1n8(%%zmm11,8)\
-    "leaq (%3,%4,8),%3;"\
+    "leaq (%10,%4,8),%10;"\
     INNER_STORE_m1n8(%%zmm9,0)\
     INNER_STORE_m1n8(%%zmm12,8)\
-    "leaq (%3,%4,8),%3;"\
+    "leaq (%10,%4,8),%10;"\
     INNER_STORE_m1n8(%%zmm10,0)\
     INNER_STORE_m1n8(%%zmm13,8)
-#define INNER_PREF_8x8 \
-    "prefetcht0 (%3); prefetcht0 56(%3); prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2);"\
-    "prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,2),%3;"\
-    "prefetcht0 (%3,%4,1); prefetcht0 56(%3,%4,1); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4); leaq (%3,%4,1),%3;"\
-    "prefetcht0 (%3,%4,2); prefetcht0 56(%3,%4,2); prefetcht0 (%3,%4,4); prefetcht0 56(%3,%4,4);"\
-    "subq %4,%3; subq %4,%3; subq %4,%3;"
+
 #define INNER_TRANS_4x8(c1,c2,c3,c4) \
     "vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\
     "vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\
@@ -212,6 +213,7 @@
     "vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\
     "vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\
     "vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";"
+
 #define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
     INNER_TRANS_4x8(c1,c2,c3,c4)\
     INNER_TRANS_4x8(c5,c6,c7,c8)\
@@ -223,64 +225,69 @@
     "vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd  %%zmm6,"#c7","#c7"%{%5%};"\
     "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
     "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd  %%zmm7,"#c8","#c8"%{%5%};"
+
 //%7 for k01(input) only when m=4
 #define INNER_STORE_4x8(c1,c2,c3,c4) \
-    "vmovupd (%3),%%zmm4%{%5%};vmovupd -32(%3,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\
-    "vmovupd "#c1",(%3)%{%5%}; vmovupd "#c1",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
-    "vmovupd (%3),%%zmm5%{%5%};vmovupd -32(%3,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\
-    "vmovupd "#c2",(%3)%{%5%}; vmovupd "#c2",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
-    "vmovupd (%3),%%zmm6%{%5%};vmovupd -32(%3,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\
-    "vmovupd "#c3",(%3)%{%5%}; vmovupd "#c3",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
-    "vmovupd (%3),%%zmm7%{%5%};vmovupd -32(%3,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\
-    "vmovupd "#c4",(%3)%{%5%}; vmovupd "#c4",-32(%3,%4,4)%{%7%}; leaq (%3,%4,1),%3;"\
-    "leaq (%3,%4,4),%3;"
+    "vmovupd (%10),%%zmm4%{%5%};vmovupd -32(%10,%4,4),%%zmm4%{%7%};vfmadd132pd %%zmm3,%%zmm4,"#c1";"\
+    "vmovupd "#c1",(%10)%{%5%}; vmovupd "#c1",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
+    "vmovupd (%10),%%zmm5%{%5%};vmovupd -32(%10,%4,4),%%zmm5%{%7%};vfmadd132pd %%zmm3,%%zmm5,"#c2";"\
+    "vmovupd "#c2",(%10)%{%5%}; vmovupd "#c2",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
+    "vmovupd (%10),%%zmm6%{%5%};vmovupd -32(%10,%4,4),%%zmm6%{%7%};vfmadd132pd %%zmm3,%%zmm6,"#c3";"\
+    "vmovupd "#c3",(%10)%{%5%}; vmovupd "#c3",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
+    "vmovupd (%10),%%zmm7%{%5%};vmovupd -32(%10,%4,4),%%zmm7%{%7%};vfmadd132pd %%zmm3,%%zmm7,"#c4";"\
+    "vmovupd "#c4",(%10)%{%5%}; vmovupd "#c4",-32(%10,%4,4)%{%7%}; leaq (%10,%4,1),%10;"\
+    "leaq (%10,%4,4),%10;"
+
 #define INNER_STORE_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
-    "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
-    "vfmadd213pd (%3),%%zmm3,"#c1"; vmovupd "#c1",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%3,%4,1); leaq (%3,%4,2),%3;"\
-    "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
-    "vfmadd213pd (%3),%%zmm3,"#c3"; vmovupd "#c3",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%3,%4,1); leaq (%3,%4,2),%3;"\
-    "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
-    "vfmadd213pd (%3),%%zmm3,"#c5"; vmovupd "#c5",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%3,%4,1); leaq (%3,%4,2),%3;"\
-    "prefetcht1 120(%3); prefetcht1 120(%3,%4,1);"\
-    "vfmadd213pd (%3),%%zmm3,"#c7"; vmovupd "#c7",(%3); vfmadd213pd (%3,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%3,%4,1); leaq (%3,%4,2),%3;"
+    "vfmadd213pd (%10),%%zmm3,"#c1"; vmovupd "#c1",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c2"; vmovupd "#c2",(%10,%4,1); leaq (%10,%4,2),%10;"\
+    "vfmadd213pd (%10),%%zmm3,"#c3"; vmovupd "#c3",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c4"; vmovupd "#c4",(%10,%4,1); leaq (%10,%4,2),%10;"\
+    "vfmadd213pd (%10),%%zmm3,"#c5"; vmovupd "#c5",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c6"; vmovupd "#c6",(%10,%4,1); leaq (%10,%4,2),%10;"\
+    "vfmadd213pd (%10),%%zmm3,"#c7"; vmovupd "#c7",(%10); vfmadd213pd (%10,%4,1),%%zmm3,"#c8"; vmovupd "#c8",(%10,%4,1); leaq (%10,%4,2),%10;"
+
 #define INNER_SAVE_m4n8 \
+    "movq %3,%10;"\
     INNER_TRANS_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)\
     INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)
+
 #define INNER_SAVE_m4n16 \
+    "movq %3,%10;"\
     INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
     INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
     INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\
     INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)
+
 #define INNER_SAVE_m4n24 \
+    "movq %3,%10;"\
     INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
     INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
     INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
     INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
     INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\
     INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)
+
 #define INNER_SAVE_m8n8 \
-    INNER_PREF_8x8\
+    "movq %3,%10;"\
     INNER_TRANS_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
     INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)
+
 #define INNER_SAVE_m8n16 \
-    INNER_PREF_8x8\
+    "movq %3,%10;"\
     INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
     INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
-    INNER_PREF_8x8\
     INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\
     INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)
+
 #define INNER_SAVE_m8n24 \
-    INNER_PREF_8x8\
+    "movq %3,%10;"\
     INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
     INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
-    INNER_PREF_8x8\
     INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
     INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
-    INNER_PREF_8x8\
     INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\
     INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)
 
 #define COMPUTE_n8 {\
+    b_pref = packed_b_pointer + 8 * K;\
     __asm__ __volatile__(\
     "vbroadcastsd (%9),%%zmm3;"\
     "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
@@ -290,7 +297,7 @@
     INNER_KERNELm8(8)\
     INNER_SAVE_m8n8\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
-    "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $64,%3;"\
+    "addq $64,%3;"\
     "subq $8,%8; cmpq $8,%8; jnb 42221b;"\
     "42222:\n\t"\
     "cmpq $4,%8; jb 42223f;"\
@@ -298,7 +305,7 @@
     INNER_KERNELm4(8)\
     INNER_SAVE_m4n8\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-    "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $32,%3;"\
+    "addq $32,%3;"\
     "subq $4,%8;"\
     "42223:\n\t"\
     "cmpq $2,%8; jb 42224f;"\
@@ -318,11 +325,13 @@
     "42225:\n\t"\
     "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
     "shlq $3,%4;addq %4,%3;shrq $3,%4;"\
-    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
+    "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
     ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
 }
 #define COMPUTE_n16 {\
+    b_pref = packed_b_pointer + 16 * K;\
     __asm__ __volatile__(\
     "vbroadcastsd (%9),%%zmm3;"\
     "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
@@ -332,7 +341,7 @@
     INNER_KERNELm8(16)\
     INNER_SAVE_m8n16\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
-    "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
+    "addq $64,%3;"\
     "subq $8,%8; cmpq $8,%8; jnb 32221b;"\
     "32222:\n\t"\
     "cmpq $4,%8; jb 32223f;"\
@@ -340,7 +349,7 @@
     INNER_KERNELm4(16)\
     INNER_SAVE_m4n16\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-    "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
+    "addq $32,%3;"\
     "subq $4,%8;"\
     "32223:\n\t"\
     "cmpq $2,%8; jb 32224f;"\
@@ -348,7 +357,7 @@
     INNER_KERNELm2(16)\
     INNER_SAVE_m2n16\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-    "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $16,%3;"\
+    "addq $16,%3;"\
     "subq $2,%8;"\
     "32224:\n\t"\
     "cmpq $1,%8; jb 32225f;"\
@@ -356,17 +365,19 @@
     INNER_KERNELm1(16)\
     INNER_SAVE_m1n16\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-    "shlq $3,%4;subq %4,%3;shrq $3,%4;addq $8,%3;"\
+    "addq $8,%3;"\
     "32225:\n\t"\
     "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
     "shlq $4,%4;addq %4,%3;shrq $4,%4;"\
     "leaq (%1,%%r12,4),%1;"\
-    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
+    "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
     ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
     "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
 }
 #define COMPUTE_n24 {\
+    b_pref = packed_b_pointer + 24 * K;\
     __asm__ __volatile__(\
     "vbroadcastsd (%9),%%zmm3;"\
     "movq %8,%%r14;movq %2,%%r13;movq %2,%%r12;shlq $5,%%r12;"\
@@ -376,7 +387,7 @@
     INNER_KERNELm8(24)\
     INNER_SAVE_m8n24\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1; addq %%r12,%0;"\
-    "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $64,%3;"\
+    "addq $64,%3;"\
     "subq $8,%8; cmpq $8,%8; jnb 22221b;"\
     "22222:\n\t"\
     "cmpq $4,%8; jb 22223f;"\
@@ -384,7 +395,7 @@
     INNER_KERNELm4(24)\
     INNER_SAVE_m4n24\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-    "shlq $3,%4;subq %4,%3;shlq $1,%4;subq %4,%3;shrq $4,%4;addq $32,%3;"\
+    "addq $32,%3;"\
     "subq $4,%8;"\
     "22223:\n\t"\
     "cmpq $2,%8; jb 22224f;"\
@@ -392,7 +403,7 @@
     INNER_KERNELm2(24)\
     INNER_SAVE_m2n24\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-    "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $16,%3;"\
+    "addq $16,%3;"\
     "subq $2,%8;"\
     "22224:\n\t"\
     "cmpq $1,%8; jb 22225f;"\
@@ -400,12 +411,13 @@
     INNER_KERNELm1(24)\
     INNER_SAVE_m1n24\
     "movq %%r13,%2; subq %%r12,%1; subq %%r12,%1;"\
-    "shlq $4,%4;subq %4,%3;shrq $4,%4;addq $8,%3;"\
+    "addq $8,%3;"\
     "22225:\n\t"\
     "movq %%r14,%8;shlq $3,%8;subq %8,%3;shrq $3,%8;"\
     "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
     "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\
-    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),"+r"(M),"+r"(alpha)\
+    :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
+    "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
     ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
     "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
@@ -415,8 +427,8 @@ static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG
     if(k==0 || m==0 || ndiv8==0) return;
     int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double);
     int64_t K = (int64_t)k; int64_t M = (int64_t)m;
-    double *a_block_pointer;
-    double *c_pointer = c;
+    double *a_block_pointer,*b_pref;
+    double *c_pointer = c,*c_store = c;
     __mmask16 k01 = 0x00f0,k02 = 0x000f,k03 = 0x0033;
     BLASLONG ndiv8_count;
     double *packed_b_pointer = packed_b;

From 819e852ae76f49931ddd0c242b8d5569729677f9 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 11 Nov 2019 20:04:52 +0800
Subject: [PATCH 094/210] AVX512 CGEMM & ZGEMM kernels

96-99% 1-thread performance of MKL2018
---
 kernel/x86_64/KERNEL.SKYLAKEX             |   3 +
 kernel/x86_64/cgemm_kernel_8x2_skylakex.c | 352 ++++++++++++++++++++++
 kernel/x86_64/zgemm_kernel_4x2_skylakex.c | 283 +++++++++++++++++
 3 files changed, 638 insertions(+)
 create mode 100644 kernel/x86_64/cgemm_kernel_8x2_skylakex.c
 create mode 100644 kernel/x86_64/zgemm_kernel_4x2_skylakex.c

diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index a39030c53..d5d32d1b3 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -14,3 +14,6 @@ DGEMMOTCOPY    =  dgemm_tcopy_8_skylakex.c
 
 SGEMM_BETA = sgemm_beta_skylakex.c
 DGEMM_BETA = dgemm_beta_skylakex.c
+
+CGEMMKERNEL    =  cgemm_kernel_8x2_skylakex.c
+ZGEMMKERNEL    =  zgemm_kernel_4x2_skylakex.c
diff --git a/kernel/x86_64/cgemm_kernel_8x2_skylakex.c b/kernel/x86_64/cgemm_kernel_8x2_skylakex.c
new file mode 100644
index 000000000..35a57b98a
--- /dev/null
+++ b/kernel/x86_64/cgemm_kernel_8x2_skylakex.c
@@ -0,0 +1,352 @@
+#include <stdint.h>
+#include "common.h"
+
+#if  defined(NN) || defined(NT) || defined(TN) || defined(TT)
+  #define CGEMM_SKX_MODE 0 //not to do conjugation on a_block and b_block
+#endif
+#if  defined(RN) || defined(RT) || defined(CN) || defined(CT)
+  #define CGEMM_SKX_MODE 1 //do conjugation on a_block, not b_block
+#endif
+#if  defined(NR) || defined(NC) || defined(TR) || defined(TC)
+  #define CGEMM_SKX_MODE 2 //do conjugation on b_block, not a_block
+#endif
+#if  defined(RR) || defined(RC) || defined(CR) || defined(CC)
+  #define CGEMM_SKX_MODE 3 //do conjugation on a_block and b_block
+#endif
+
+// recommended settings: GEMM_DEFAULT_Q = 192, GEMM_DEFAULT_P = 384
+/* %0=a_pointer, %1=b_pointer, %2=c_pointer, %3=c_store, %4=ldc(bytes), %5=&constval, %6 = k_counter, %7 = m_counter, %8 = b_pref */
+// const float constval[4] = {alpha_r, alpha_i, -1, 1};
+/* r11 = m; r12 = k * 16; r13 = k; r14 = b_head; r15 = %1 + r12 * 3; */
+#define GENERAL_INIT "movq %7,%%r11; movq %1,%%r14; movq %6,%%r13; movq %6,%%r12; salq $4,%%r12;"
+#define GENERAL_RECOVER "movq %%r11,%7; movq %%r13,%6; movq %%r14,%1;"
+#define CONSTZMM_INIT "vbroadcastss (%5),%%zmm0; vbroadcastss 4(%5),%%zmm1; vbroadcastsd 8(%5),%%zmm2;"
+#define COMPUTE_INIT "movq %%r13,%6; movq %%r14,%1; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"
+
+/* m=8, zmm0=alpha_r, zmm1=alpha_i, zmm2={-1,1,...,-1,1}, zmm3-zmm7 for temporary use, zmm8-zmm31 for accumulators */
+#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 2 //not to do conjugation on a_block
+  #define unit_kernel_k1m8n1(a_r,a_i,b_off,c_le,c_ri,...) \
+    "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%zmm3; vfmadd231ps "#a_r",%%zmm3,"#c_le"; vfmadd231ps "#a_i",%%zmm3,"#c_ri";"
+#else //do conjugation on a_block
+  #define unit_kernel_k1m8n1(a_r,a_i,b_off,c_le,c_ri,...) \
+    "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%zmm3; vfmadd231ps "#a_r",%%zmm3,"#c_le"; vfnmadd231ps "#a_i",%%zmm3,"#c_ri";"
+#endif
+#define KERNEL_h_k1m8n1 \
+    "vmovsldup (%0),%%zmm4; vmovshdup (%0),%%zmm5; prefetcht0 512(%0); addq $64,%0;"\
+    unit_kernel_k1m8n1(%%zmm4,%%zmm5,0,%%zmm8,%%zmm9,%1)
+#define KERNEL_t_k1m8n1 KERNEL_h_k1m8n1 "addq $8,%1;"
+#define KERNEL_h_k1m8n2 KERNEL_h_k1m8n1 unit_kernel_k1m8n1(%%zmm4,%%zmm5,8,%%zmm10,%%zmm11,%1)
+#define KERNEL_t_k1m8n2 KERNEL_h_k1m8n2 "addq $16,%1;"
+#define unit_kernel_k1m8n2(c1le,c1ri,c2le,c2ri,...) \
+    unit_kernel_k1m8n1(%%zmm4,%%zmm5,0,c1le,c1ri,__VA_ARGS__)\
+    unit_kernel_k1m8n1(%%zmm4,%%zmm5,8,c2le,c2ri,__VA_ARGS__)
+#define KERNEL_h_k1m8n4 KERNEL_h_k1m8n2 unit_kernel_k1m8n2(%%zmm12,%%zmm13,%%zmm14,%%zmm15,%1,%%r12,1)
+#define KERNEL_t_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;"
+#define KERNEL_t_k1m8n6 KERNEL_h_k1m8n4 unit_kernel_k1m8n2(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%1,%%r12,2) "addq $16,%1;"
+#define KERNEL_h_k1m8n8 KERNEL_t_k1m8n6 unit_kernel_k1m8n2(%%zmm20,%%zmm21,%%zmm22,%%zmm23,%%r15)
+#define KERNEL_t_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%%r15;"
+#define KERNEL_h_k1m8n10 KERNEL_h_k1m8n8 unit_kernel_k1m8n2(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%r15,%%r12,1)
+#define KERNEL_t_k1m8n10 KERNEL_h_k1m8n10 "addq $16,%%r15;"
+#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n10 unit_kernel_k1m8n2(%%zmm28,%%zmm29,%%zmm30,%%zmm31,%%r15,%%r12,2)
+#define KERNEL_t_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%%r15;"
+#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 1 //not to do conjugation on b_block
+  #define unit_save_m8n1(c_le,c_ri,...) \
+    "vpermilps $177,"#c_ri","#c_ri"; vfmadd231ps "#c_ri",%%zmm2,"#c_le"; vpermilps $177,"#c_le",%%zmm4;"\
+    "vfmaddsub213ps ("#__VA_ARGS__"),%%zmm1,%%zmm4; vfmaddsub213ps %%zmm4,%%zmm0,"#c_le"; vmovups "#c_le",("#__VA_ARGS__");"
+#else //do conjugation on b_block
+  #define unit_save_m8n1(c_le,c_ri,...) \
+    "vpermilps $177,"#c_ri","#c_ri"; vfnmadd231ps "#c_ri",%%zmm2,"#c_le"; vpermilps $177,"#c_le",%%zmm4;"\
+    "vfmsubadd213ps ("#__VA_ARGS__"),%%zmm0,"#c_le"; vfmsubadd231ps %%zmm4,%%zmm1,"#c_le"; vmovups "#c_le",("#__VA_ARGS__");"
+#endif
+#define SAVE_SETUP_m8 "movq %2,%3; addq $64,%2;"
+#define SAVE_m8n1 SAVE_SETUP_m8 unit_save_m8n1(%%zmm8,%%zmm9,%3)
+#define SAVE_m8n2 SAVE_m8n1 unit_save_m8n1(%%zmm10,%%zmm11,%3,%4,1)
+#define unit_save_m8n2(c1le,c1ri,c2le,c2ri) \
+    "leaq (%3,%4,2),%3;" unit_save_m8n1(c1le,c1ri,%3) unit_save_m8n1(c2le,c2ri,%3,%4,1)
+#define SAVE_m8n4 SAVE_m8n2 unit_save_m8n2(%%zmm12,%%zmm13,%%zmm14,%%zmm15)
+#define SAVE_m8n6 SAVE_m8n4 unit_save_m8n2(%%zmm16,%%zmm17,%%zmm18,%%zmm19)
+#define SAVE_m8n8 SAVE_m8n6 unit_save_m8n2(%%zmm20,%%zmm21,%%zmm22,%%zmm23)
+#define SAVE_m8n10 SAVE_m8n8 unit_save_m8n2(%%zmm24,%%zmm25,%%zmm26,%%zmm27)
+#define SAVE_m8n12 SAVE_m8n10 unit_save_m8n2(%%zmm28,%%zmm29,%%zmm30,%%zmm31)
+#define unit_init_m8n1(c_le,c_ri) "vpxorq "#c_le","#c_le","#c_le"; vpxorq "#c_ri","#c_ri","#c_ri";"
+#define INIT_m8n1 unit_init_m8n1(%%zmm8,%%zmm9)
+#define INIT_m8n2 INIT_m8n1 unit_init_m8n1(%%zmm10,%%zmm11)
+#define INIT_m8n4 INIT_m8n2 unit_init_m8n1(%%zmm12,%%zmm13) unit_init_m8n1(%%zmm14,%%zmm15)
+#define INIT_m8n6 INIT_m8n4 unit_init_m8n1(%%zmm16,%%zmm17) unit_init_m8n1(%%zmm18,%%zmm19)
+#define INIT_m8n8 INIT_m8n6 unit_init_m8n1(%%zmm20,%%zmm21) unit_init_m8n1(%%zmm22,%%zmm23)
+#define INIT_m8n10 INIT_m8n8 unit_init_m8n1(%%zmm24,%%zmm25) unit_init_m8n1(%%zmm26,%%zmm27)
+#define INIT_m8n12 INIT_m8n10 unit_init_m8n1(%%zmm28,%%zmm29) unit_init_m8n1(%%zmm30,%%zmm31)
+#define COMPUTE_m8(ndim) \
+    INIT_m8n##ndim\
+    COMPUTE_INIT "movq %2,%3;"\
+    "cmpq $18,%6; jb "#ndim"88880f;"\
+    #ndim"88889:\n\t"\
+    KERNEL_t_k1m8n##ndim\
+    KERNEL_t_k1m8n##ndim\
+    KERNEL_t_k1m8n##ndim\
+    "prefetcht1 (%3); prefetcht1 63(%3); addq %4,%3;"\
+    KERNEL_t_k1m8n##ndim\
+    KERNEL_t_k1m8n##ndim\
+    KERNEL_t_k1m8n##ndim\
+    "prefetcht1 (%8); addq $40,%8;"\
+    "subq $6,%6; cmpq $18,%6; jnb "#ndim"88889b;"\
+    "movq %2,%3;"\
+    #ndim"88880:\n\t"\
+    "testq %6,%6; jz "#ndim"88881f;"\
+    "prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\
+    KERNEL_t_k1m8n##ndim\
+    "decq %6; jmp "#ndim"88880b;"\
+    #ndim"88881:\n\t"\
+    SAVE_m8n##ndim
+
+/* m=4, ymm0-ymm3 for temporary use, ymm4-ymm15 for accumulators */
+#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 3 //conjg_a == conjg_b; ap = permilps($177,a0)
+  #define unit_kernel_k1m4n1(a0,ap,b_off_r,b_off_i,c1,...) \
+    "vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmaddsub231ps "#ap",%%ymm2,"#c1";"\
+    "vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmaddsub231ps "#a0",%%ymm2,"#c1";"
+#else //conjg_a != conjg_b
+  #define unit_kernel_k1m4n1(a0,ap,b_off_r,b_off_i,c1,...) \
+    "vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmsubadd231ps "#ap",%%ymm2,"#c1";"\
+    "vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmsubadd231ps "#a0",%%ymm2,"#c1";"
+#endif
+#define KERNEL_h_k1m4n1 \
+    "vmovups (%0),%%ymm0; vpermilps $177,%%ymm0,%%ymm1; addq $32,%0;"\
+    unit_kernel_k1m4n1(%%ymm0,%%ymm1,0,4,%%ymm4,%1)
+#define KERNEL_t_k1m4n1 KERNEL_h_k1m4n1 "addq $8,%1;"
+#define KERNEL_h_k1m4n2 KERNEL_h_k1m4n1 unit_kernel_k1m4n1(%%ymm0,%%ymm1,8,12,%%ymm5,%1)
+#define KERNEL_t_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;"
+#define unit_kernel_k1m4n2(c1,c2,...) \
+    unit_kernel_k1m4n1(%%ymm0,%%ymm1,0,4,c1,__VA_ARGS__)\
+    unit_kernel_k1m4n1(%%ymm0,%%ymm1,8,12,c2,__VA_ARGS__)
+#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 unit_kernel_k1m4n2(%%ymm6,%%ymm7,%1,%%r12,1)
+#define KERNEL_t_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;"
+#define KERNEL_t_k1m4n6 KERNEL_h_k1m4n4 unit_kernel_k1m4n2(%%ymm8,%%ymm9,%1,%%r12,2) "addq $16,%1;"
+#define KERNEL_h_k1m4n8 KERNEL_t_k1m4n6 unit_kernel_k1m4n2(%%ymm10,%%ymm11,%%r15)
+#define KERNEL_t_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%%r15;"
+#define KERNEL_h_k1m4n10 KERNEL_h_k1m4n8 unit_kernel_k1m4n2(%%ymm12,%%ymm13,%%r15,%%r12,1)
+#define KERNEL_t_k1m4n10 KERNEL_h_k1m4n10 "addq $16,%%r15;"
+#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n10 unit_kernel_k1m4n2(%%ymm14,%%ymm15,%%r15,%%r12,2)
+#define KERNEL_t_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;"
+#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 2 //not to do conjugation on a_block
+  #define unit_save_m4n1(alp_r,alp_i,c1,...) \
+    "vpermilps $177,"#c1",%%ymm3; vfmaddsub213ps ("#__VA_ARGS__"),"#alp_i",%%ymm3;"\
+    "vfmaddsub213ps %%ymm3,"#alp_r","#c1";vmovups "#c1",("#__VA_ARGS__");"
+#else //do conjugation on a_block
+  #define unit_save_m4n1(alp_r,alp_i,c1,...) \
+    "vpermilps $177,"#c1",%%ymm3; vfmsubadd213ps ("#__VA_ARGS__"),"#alp_r","#c1";"\
+    "vfmsubadd231ps %%ymm3,"#alp_i","#c1";vmovups "#c1",("#__VA_ARGS__");"
+#endif
+#define SAVE_SETUP_m4 "movq %2,%3; addq $32,%2; vbroadcastss (%5),%%ymm0; vbroadcastss 4(%5),%%ymm1;"
+#define SAVE_m4n1 SAVE_SETUP_m4 unit_save_m4n1(%%ymm0,%%ymm1,%%ymm4,%3)
+#define SAVE_m4n2 SAVE_m4n1 unit_save_m4n1(%%ymm0,%%ymm1,%%ymm5,%3,%4,1)
+#define unit_save_m4n2(c1,c2) \
+    "leaq (%3,%4,2),%3;" unit_save_m4n1(%%ymm0,%%ymm1,c1,%3) unit_save_m4n1(%%ymm0,%%ymm1,c2,%3,%4,1)
+#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%ymm6,%%ymm7)
+#define SAVE_m4n6 SAVE_m4n4 unit_save_m4n2(%%ymm8,%%ymm9)
+#define SAVE_m4n8 SAVE_m4n6 unit_save_m4n2(%%ymm10,%%ymm11)
+#define SAVE_m4n10 SAVE_m4n8 unit_save_m4n2(%%ymm12,%%ymm13)
+#define SAVE_m4n12 SAVE_m4n10 unit_save_m4n2(%%ymm14,%%ymm15)
+#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
+#define unit_init_m4n2(c1,c2) "vpxor "#c1","#c1","#c1"; vpxor "#c2","#c2","#c2";"
+#define INIT_m4n2 unit_init_m4n2(%%ymm4,%%ymm5)
+#define INIT_m4n4 INIT_m4n2 unit_init_m4n2(%%ymm6,%%ymm7)
+#define INIT_m4n6 INIT_m4n4 unit_init_m4n2(%%ymm8,%%ymm9)
+#define INIT_m4n8 INIT_m4n6 unit_init_m4n2(%%ymm10,%%ymm11)
+#define INIT_m4n10 INIT_m4n8 unit_init_m4n2(%%ymm12,%%ymm13)
+#define INIT_m4n12 INIT_m4n10 unit_init_m4n2(%%ymm14,%%ymm15)
+#define COMPUTE_m4(ndim) \
+    INIT_m4n##ndim\
+    COMPUTE_INIT\
+    #ndim"88440:\n\t"\
+    "testq %6,%6; jz "#ndim"88441f;"\
+    KERNEL_t_k1m4n##ndim\
+    "decq %6; jmp "#ndim"88440b;"\
+    #ndim"88441:\n\t"\
+    SAVE_m4n##ndim
+
+/* m=2, xmm0-xmm3 for temporary use, xmm4-xmm15 for accumulators */
+#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 3 //conjg_a == conjg_b;
+  #define unit_kernel_k1m2n1(a0,ap,b_off_r,b_off_i,c1,...) \
+    "vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#ap",%%xmm2,"#c1";"\
+    "vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#a0",%%xmm2,"#c1";"
+#else //conjg_a != conjg_b
+  #define unit_kernel_k1m2n1(a0,ap,b_off_r,b_off_i,c1,...) \
+    "vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#ap",%%xmm2,"#c1";"\
+    "vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#a0",%%xmm2,"#c1";"
+#endif
+#define KERNEL_h_k1m2n1 \
+    "vmovups (%0),%%xmm0; vpermilps $177,%%xmm0,%%xmm1; addq $16,%0;"\
+    unit_kernel_k1m2n1(%%xmm0,%%xmm1,0,4,%%xmm4,%1)
+#define KERNEL_t_k1m2n1 KERNEL_h_k1m2n1 "addq $8,%1;"
+#define KERNEL_h_k1m2n2 KERNEL_h_k1m2n1 unit_kernel_k1m2n1(%%xmm0,%%xmm1,8,12,%%xmm5,%1)
+#define KERNEL_t_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;"
+#define unit_kernel_k1m2n2(c1,c2,...) \
+    unit_kernel_k1m2n1(%%xmm0,%%xmm1,0,4,c1,__VA_ARGS__)\
+    unit_kernel_k1m2n1(%%xmm0,%%xmm1,8,12,c2,__VA_ARGS__)
+#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 unit_kernel_k1m2n2(%%xmm6,%%xmm7,%1,%%r12,1)
+#define KERNEL_t_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;"
+#define KERNEL_t_k1m2n6 KERNEL_h_k1m2n4 unit_kernel_k1m2n2(%%xmm8,%%xmm9,%1,%%r12,2) "addq $16,%1;"
+#define KERNEL_h_k1m2n8 KERNEL_t_k1m2n6 unit_kernel_k1m2n2(%%xmm10,%%xmm11,%%r15)
+#define KERNEL_t_k1m2n8 KERNEL_h_k1m2n8 "addq $16,%%r15;"
+#define KERNEL_h_k1m2n10 KERNEL_h_k1m2n8 unit_kernel_k1m2n2(%%xmm12,%%xmm13,%%r15,%%r12,1)
+#define KERNEL_t_k1m2n10 KERNEL_h_k1m2n10 "addq $16,%%r15;"
+#define KERNEL_h_k1m2n12 KERNEL_h_k1m2n10 unit_kernel_k1m2n2(%%xmm14,%%xmm15,%%r15,%%r12,2)
+#define KERNEL_t_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;"
+#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 2 //not to do conjugation on a_block
+  #define unit_save_m2n1(alp_r,alp_i,c1,...) \
+    "vpermilps $177,"#c1",%%xmm3; vfmaddsub213ps ("#__VA_ARGS__"),"#alp_i",%%xmm3;"\
+    "vfmaddsub213ps %%xmm3,"#alp_r","#c1";vmovups "#c1",("#__VA_ARGS__");"
+#else //do conjugation on a_block
+  #define unit_save_m2n1(alp_r,alp_i,c1,...) \
+    "vpermilps $177,"#c1",%%xmm3; vfmsubadd213ps ("#__VA_ARGS__"),"#alp_r","#c1";"\
+    "vfmsubadd231ps %%xmm3,"#alp_i","#c1";vmovups "#c1",("#__VA_ARGS__");"
+#endif
+#define SAVE_SETUP_m2 "movq %2,%3; addq $16,%2; vbroadcastss (%5),%%xmm0; vbroadcastss 4(%5),%%xmm1;"
+#define SAVE_m2n1 SAVE_SETUP_m2 unit_save_m2n1(%%xmm0,%%xmm1,%%xmm4,%3)
+#define SAVE_m2n2 SAVE_m2n1 unit_save_m2n1(%%xmm0,%%xmm1,%%xmm5,%3,%4,1)
+#define unit_save_m2n2(c1,c2) \
+    "leaq (%3,%4,2),%3;" unit_save_m2n1(%%xmm0,%%xmm1,c1,%3) unit_save_m2n1(%%xmm0,%%xmm1,c2,%3,%4,1)
+#define SAVE_m2n4 SAVE_m2n2 unit_save_m2n2(%%xmm6,%%xmm7)
+#define SAVE_m2n6 SAVE_m2n4 unit_save_m2n2(%%xmm8,%%xmm9)
+#define SAVE_m2n8 SAVE_m2n6 unit_save_m2n2(%%xmm10,%%xmm11)
+#define SAVE_m2n10 SAVE_m2n8 unit_save_m2n2(%%xmm12,%%xmm13)
+#define SAVE_m2n12 SAVE_m2n10 unit_save_m2n2(%%xmm14,%%xmm15)
+#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
+#define unit_init_m2n2(c1,c2) "vpxor "#c1","#c1","#c1"; vpxor "#c2","#c2","#c2";"
+#define INIT_m2n2 unit_init_m2n2(%%xmm4,%%xmm5)
+#define INIT_m2n4 INIT_m2n2 unit_init_m2n2(%%xmm6,%%xmm7)
+#define INIT_m2n6 INIT_m2n4 unit_init_m2n2(%%xmm8,%%xmm9)
+#define INIT_m2n8 INIT_m2n6 unit_init_m2n2(%%xmm10,%%xmm11)
+#define INIT_m2n10 INIT_m2n8 unit_init_m2n2(%%xmm12,%%xmm13)
+#define INIT_m2n12 INIT_m2n10 unit_init_m2n2(%%xmm14,%%xmm15)
+#define COMPUTE_m2(ndim) \
+    INIT_m2n##ndim\
+    COMPUTE_INIT\
+    #ndim"88220:\n\t"\
+    "testq %6,%6; jz "#ndim"88221f;"\
+    KERNEL_t_k1m2n##ndim\
+    "decq %6; jmp "#ndim"88220b;"\
+    #ndim"88221:\n\t"\
+    SAVE_m2n##ndim
+
+/* m=1, xmm0-xmm3 and xmm10-xmm15 for temporary use, xmm4-xmm9 for accumulators */
+#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 3 //conjg_a == conjg_b; ap = permilps($177,a0)
+  #define unit_kernel_k1m1n1(a0,ap,b_off_r,b_off_i,c1,...) \
+    "vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#ap",%%xmm2,"#c1";"\
+    "vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#a0",%%xmm2,"#c1";"
+  #define unit_kernel_k1m1n2(a0,ap,c1,...) \
+    "vmovshdup ("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#ap",%%xmm2,"#c1";"\
+    "vmovsldup ("#__VA_ARGS__"),%%xmm2; vfmaddsub231ps "#a0",%%xmm2,"#c1";"
+#else //conjg_a != conjg_b
+  #define unit_kernel_k1m1n1(a0,ap,b_off_r,b_off_i,c1,...) \
+    "vbroadcastss "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#ap",%%xmm2,"#c1";"\
+    "vbroadcastss "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#a0",%%xmm2,"#c1";"
+  #define unit_kernel_k1m1n2(a0,ap,c1,...) \
+    "vmovshdup ("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#ap",%%xmm2,"#c1";"\
+    "vmovsldup ("#__VA_ARGS__"),%%xmm2; vfmsubadd231ps "#a0",%%xmm2,"#c1";"
+#endif
+#define KERNEL_h_k1m1n1 \
+    "vmovsd (%0),%%xmm0; vpermilps $177,%%xmm0,%%xmm1; addq $8,%0;"\
+    unit_kernel_k1m1n1(%%xmm0,%%xmm1,0,4,%%xmm4,%1)
+#define KERNEL_t_k1m1n1 KERNEL_h_k1m1n1 "addq $8,%1;"
+#define KERNEL_h_k1m1n2 \
+    "vmovddup (%0),%%xmm0; vpermilps $177,%%xmm0,%%xmm1; addq $8,%0;"\
+    unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm4,%1)
+#define KERNEL_t_k1m1n2 KERNEL_h_k1m1n2 "addq $16,%1;"
+#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm5,%1,%%r12,1)
+#define KERNEL_t_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;"
+#define KERNEL_t_k1m1n6 KERNEL_h_k1m1n4 unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm6,%1,%%r12,2) "addq $16,%1;"
+#define KERNEL_h_k1m1n8 KERNEL_t_k1m1n6 unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm7,%%r15)
+#define KERNEL_t_k1m1n8 KERNEL_h_k1m1n8 "addq $16,%%r15;"
+#define KERNEL_h_k1m1n10 KERNEL_h_k1m1n8 unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm8,%%r15,%%r12,1)
+#define KERNEL_t_k1m1n10 KERNEL_h_k1m1n10 "addq $16,%%r15;"
+#define KERNEL_h_k1m1n12 KERNEL_h_k1m1n10 unit_kernel_k1m1n2(%%xmm0,%%xmm1,%%xmm9,%%r15,%%r12,2)
+#define KERNEL_t_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;"
+#if CGEMM_SKX_MODE == 0 || CGEMM_SKX_MODE == 2 //not to do conjugation on a_block
+  #define unit_save_m1n1(alp_r,alp_i,c1,...) \
+    "vpermilps $177,"#c1",%%xmm3; vmovsd ("#__VA_ARGS__"),%%xmm2; vfmaddsub213ps %%xmm2,"#alp_i",%%xmm3;"\
+    "vfmaddsub213ps %%xmm3,"#alp_r","#c1";vmovsd "#c1",("#__VA_ARGS__");"
+  #define unit_save_m1n2(alp_r,alp_i,c1) \
+    "vpermilps $177,"#c1",%%xmm3; vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2;"\
+    "vfmaddsub213ps %%xmm2,"#alp_i",%%xmm3; vfmaddsub231ps "#c1","#alp_r",%%xmm3;"\
+    "vmovsd %%xmm3,(%3); vmovhpd %%xmm3,(%3,%4,1); leaq (%3,%4,2),%3;"
+#else //do conjugation on a_block
+  #define unit_save_m1n1(alp_r,alp_i,c1,...) \
+    "vpermilps $177,"#c1",%%xmm3; vmovsd ("#__VA_ARGS__"),%%xmm2; vfmsubadd213ps %%xmm2,"#alp_r","#c1";"\
+    "vfmsubadd231ps %%xmm3,"#alp_i","#c1";vmovsd "#c1",("#__VA_ARGS__");"
+  #define unit_save_m1n2(alp_r,alp_i,c1) \
+    "vpermilps $177,"#c1",%%xmm3; vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2;"\
+    "vfmsubadd213ps %%xmm2,"#alp_r","#c1"; vfmsubadd213ps "#c1","#alp_i",%%xmm3;"\
+    "vmovsd %%xmm3,(%3); vmovhpd %%xmm3,(%3,%4,1); leaq (%3,%4,2),%3;"
+#endif
+#define SAVE_SETUP_m1 "movq %2,%3; addq $8,%2; vbroadcastss (%5),%%xmm0; vbroadcastss 4(%5),%%xmm1;"
+#define SAVE_m1n1 SAVE_SETUP_m1 unit_save_m1n1(%%xmm0,%%xmm1,%%xmm4,%3)
+#define SAVE_m1n2 SAVE_SETUP_m1 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm4)
+#define SAVE_m1n4 SAVE_m1n2 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm5)
+#define SAVE_m1n6 SAVE_m1n4 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm6)
+#define SAVE_m1n8 SAVE_m1n6 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm7)
+#define SAVE_m1n10 SAVE_m1n8 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm8)
+#define SAVE_m1n12 SAVE_m1n10 unit_save_m1n2(%%xmm0,%%xmm1,%%xmm9)
+#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
+#define INIT_m1n2 INIT_m2n1
+#define INIT_m1n4 INIT_m1n2 "vpxor %%xmm5,%%xmm5,%%xmm5;"
+#define INIT_m1n6 INIT_m1n4 "vpxor %%xmm6,%%xmm6,%%xmm6;"
+#define INIT_m1n8 INIT_m1n6 "vpxor %%xmm7,%%xmm7,%%xmm7;"
+#define INIT_m1n10 INIT_m1n8 "vpxor %%xmm8,%%xmm8,%%xmm8;"
+#define INIT_m1n12 INIT_m1n10 "vpxor %%xmm9,%%xmm9,%%xmm9;"
+#define COMPUTE_m1(ndim) \
+    INIT_m1n##ndim\
+    COMPUTE_INIT\
+    #ndim"88110:\n\t"\
+    "testq %6,%6; jz "#ndim"88111f;"\
+    KERNEL_t_k1m1n##ndim\
+    "decq %6; jmp "#ndim"88110b;"\
+    #ndim"88111:\n\t"\
+    SAVE_m1n##ndim
+
+#define COMPUTE(ndim) {\
+    b_pref = b_pointer + ndim * K * 2;\
+    __asm__ __volatile__(\
+    GENERAL_INIT\
+    CONSTZMM_INIT\
+    "cmpq $8,%7;jb 33101"#ndim"f;"\
+    "33109"#ndim":\n\t"\
+    COMPUTE_m8(ndim)\
+    "subq $8,%7;cmpq $8,%7;jnb 33109"#ndim"b;"\
+    "33101"#ndim":\n\t"\
+    "cmpq $4,%7;jb 33102"#ndim"f;"\
+    COMPUTE_m4(ndim)\
+    "subq $4,%7;"\
+    "33102"#ndim":\n\t"\
+    "cmpq $2,%7;jb 33103"#ndim"f;"\
+    COMPUTE_m2(ndim)\
+    "subq $2,%7;"\
+    "33103"#ndim":\n\t"\
+    "testq %7,%7;jz 33104"#ndim"f;"\
+    COMPUTE_m1(ndim)\
+    "33104"#ndim":\n\t"\
+    GENERAL_RECOVER\
+    :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(c_store),"+r"(ldc_in_bytes),"+r"(constval),"+r"(K),"+r"(M),"+r"(b_pref)\
+    ::"r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\
+    "zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\
+    "cc","memory");\
+    a_pointer -= M * K * 2; b_pointer += ndim * K * 2; c_pointer += (LDC * ndim - M) * 2;\
+}
+
+int __attribute__ ((noinline))
+CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC)
+{
+    if(m==0||n==0||k==0) return 0;
+    int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2; float const_val[4] = {alphar, alphai, -1, 1};
+    int64_t M = (int64_t)m, K = (int64_t)k;
+    BLASLONG n_count = n;
+    float *a_pointer = A,*b_pointer = B,*c_pointer = C,*c_store = C,*constval = const_val,*b_pref = B;
+    for(;n_count>11;n_count-=12) COMPUTE(12)
+    for(;n_count>9;n_count-=10) COMPUTE(10)
+    for(;n_count>7;n_count-=8) COMPUTE(8)
+    for(;n_count>5;n_count-=6) COMPUTE(6)
+    for(;n_count>3;n_count-=4) COMPUTE(4)
+    for(;n_count>1;n_count-=2) COMPUTE(2)
+    if(n_count>0) COMPUTE(1)
+    return 0;
+}
diff --git a/kernel/x86_64/zgemm_kernel_4x2_skylakex.c b/kernel/x86_64/zgemm_kernel_4x2_skylakex.c
new file mode 100644
index 000000000..0606a3f7c
--- /dev/null
+++ b/kernel/x86_64/zgemm_kernel_4x2_skylakex.c
@@ -0,0 +1,283 @@
+#include "common.h"
+#include <stdint.h>
+
+#if  defined(NN) || defined(NT) || defined(TN) || defined(TT)
+  #define ZGEMM_SKX_MODE 0 //not to do conjugation on a_block and b_block
+#endif
+#if  defined(RN) || defined(RT) || defined(CN) || defined(CT)
+  #define ZGEMM_SKX_MODE 1 //do conjugation on a_block, not b_block
+#endif
+#if  defined(NR) || defined(NC) || defined(TR) || defined(TC)
+  #define ZGEMM_SKX_MODE 2 //do conjugation on b_block, not a_block
+#endif
+#if  defined(RR) || defined(RC) || defined(CR) || defined(CC)
+  #define ZGEMM_SKX_MODE 3 //do conjugation on a_block and b_block
+#endif
+
+// recommended settings: GEMM_DEFAULT_Q = 128, GEMM_DEFAULT_P = 256
+/* %0=a_pointer, %1=b_pointer, %2=c_pointer, %3=c_store, %4=ldc(bytes), %5=&constval, %6 = k_counter, %7 = m_counter, %8 = b_pref */
+// const double constval[4] = {alpha_r, alpha_i, -1, 1};
+/* r11 = m; r12 = k * 32; r13 = k; r14 = b_head; r15 = %1 + r12 * 3; */
+#define GENERAL_INIT "movq %7,%%r11; movq %1,%%r14; movq %6,%%r13; movq %6,%%r12; salq $5,%%r12;"
+#define GENERAL_RECOVER "movq %%r11,%7; movq %%r13,%6; movq %%r14,%1;"
+#define CONSTZMM_INIT "vbroadcastsd (%5),%%zmm0; vbroadcastsd 8(%5),%%zmm1; vbroadcastf32x4 16(%5),%%zmm2;"
+#define COMPUTE_INIT "movq %%r13,%6; movq %%r14,%1; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"
+
+/* m=4, zmm0=alpha_r, zmm1=alpha_i, zmm2={-1,1,...,-1,1}, zmm3-zmm7 for temporary use, zmm8-zmm31 for accumulators */
+#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 2 //not to do conjugation on a_block
+  #define unit_kernel_k1m4n1(a_r,a_i,b_off,c_le,c_ri,...) \
+    "vbroadcastf32x4 "#b_off"("#__VA_ARGS__"),%%zmm3; vfmadd231pd "#a_r",%%zmm3,"#c_le"; vfmadd231pd "#a_i",%%zmm3,"#c_ri";"
+#else //do conjugation on a_block
+  #define unit_kernel_k1m4n1(a_r,a_i,b_off,c_le,c_ri,...) \
+    "vbroadcastf32x4 "#b_off"("#__VA_ARGS__"),%%zmm3; vfmadd231pd "#a_r",%%zmm3,"#c_le"; vfnmadd231pd "#a_i",%%zmm3,"#c_ri";"
+#endif
+#define KERNEL_h_k1m4n1 \
+    "vmovddup (%0),%%zmm4; vmovddup 8(%0),%%zmm5; prefetcht0 512(%0); addq $64,%0;"\
+    unit_kernel_k1m4n1(%%zmm4,%%zmm5,0,%%zmm8,%%zmm9,%1)
+#define KERNEL_t_k1m4n1 KERNEL_h_k1m4n1 "addq $16,%1;"
+#define KERNEL_h_k1m4n2 KERNEL_h_k1m4n1 unit_kernel_k1m4n1(%%zmm4,%%zmm5,16,%%zmm10,%%zmm11,%1)
+#define KERNEL_t_k1m4n2 KERNEL_h_k1m4n2 "addq $32,%1;"
+#define unit_kernel_k1m4n2(c1le,c1ri,c2le,c2ri,...) \
+    unit_kernel_k1m4n1(%%zmm4,%%zmm5,0,c1le,c1ri,__VA_ARGS__)\
+    unit_kernel_k1m4n1(%%zmm4,%%zmm5,16,c2le,c2ri,__VA_ARGS__)
+#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 unit_kernel_k1m4n2(%%zmm12,%%zmm13,%%zmm14,%%zmm15,%1,%%r12,1)
+#define KERNEL_t_k1m4n4 KERNEL_h_k1m4n4 "addq $32,%1;"
+#define KERNEL_t_k1m4n6 KERNEL_h_k1m4n4 unit_kernel_k1m4n2(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%1,%%r12,2) "addq $32,%1;"
+#define KERNEL_h_k1m4n8 KERNEL_t_k1m4n6 unit_kernel_k1m4n2(%%zmm20,%%zmm21,%%zmm22,%%zmm23,%%r15)
+#define KERNEL_t_k1m4n8 KERNEL_h_k1m4n8 "addq $32,%%r15;"
+#define KERNEL_h_k1m4n10 KERNEL_h_k1m4n8 unit_kernel_k1m4n2(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%r15,%%r12,1)
+#define KERNEL_t_k1m4n10 KERNEL_h_k1m4n10 "addq $32,%%r15;"
+#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n10 unit_kernel_k1m4n2(%%zmm28,%%zmm29,%%zmm30,%%zmm31,%%r15,%%r12,2)
+#define KERNEL_t_k1m4n12 KERNEL_h_k1m4n12 "addq $32,%%r15;"
+#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 1 //not to do conjugation on b_block
+  #define unit_save_m4n1(c_le,c_ri,...) \
+    "vpermilpd $85,"#c_ri","#c_ri"; vfmadd231pd "#c_ri",%%zmm2,"#c_le"; vpermilpd $85,"#c_le",%%zmm4;"\
+    "vfmaddsub213pd ("#__VA_ARGS__"),%%zmm1,%%zmm4; vfmaddsub213pd %%zmm4,%%zmm0,"#c_le"; vmovupd "#c_le",("#__VA_ARGS__");"
+#else //do conjugation on b_block
+  #define unit_save_m4n1(c_le,c_ri,...) \
+    "vpermilpd $85,"#c_ri","#c_ri"; vfnmadd231pd "#c_ri",%%zmm2,"#c_le"; vpermilpd $85,"#c_le",%%zmm4;"\
+    "vfmsubadd213pd ("#__VA_ARGS__"),%%zmm0,"#c_le"; vfmsubadd231pd %%zmm4,%%zmm1,"#c_le"; vmovupd "#c_le",("#__VA_ARGS__");"
+#endif
+#define SAVE_SETUP_m4 "movq %2,%3; addq $64,%2;"
+#define SAVE_m4n1 SAVE_SETUP_m4 unit_save_m4n1(%%zmm8,%%zmm9,%3)
+#define SAVE_m4n2 SAVE_m4n1 unit_save_m4n1(%%zmm10,%%zmm11,%3,%4,1)
+#define unit_save_m4n2(c1le,c1ri,c2le,c2ri) \
+    "leaq (%3,%4,2),%3;" unit_save_m4n1(c1le,c1ri,%3) unit_save_m4n1(c2le,c2ri,%3,%4,1)
+#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(%%zmm12,%%zmm13,%%zmm14,%%zmm15)
+#define SAVE_m4n6 SAVE_m4n4 unit_save_m4n2(%%zmm16,%%zmm17,%%zmm18,%%zmm19)
+#define SAVE_m4n8 SAVE_m4n6 unit_save_m4n2(%%zmm20,%%zmm21,%%zmm22,%%zmm23)
+#define SAVE_m4n10 SAVE_m4n8 unit_save_m4n2(%%zmm24,%%zmm25,%%zmm26,%%zmm27)
+#define SAVE_m4n12 SAVE_m4n10 unit_save_m4n2(%%zmm28,%%zmm29,%%zmm30,%%zmm31)
+#define unit_init_m4n1(c_le,c_ri) "vpxorq "#c_le","#c_le","#c_le"; vpxorq "#c_ri","#c_ri","#c_ri";"
+#define INIT_m4n1 unit_init_m4n1(%%zmm8,%%zmm9)
+#define INIT_m4n2 INIT_m4n1 unit_init_m4n1(%%zmm10,%%zmm11)
+#define INIT_m4n4 INIT_m4n2 unit_init_m4n1(%%zmm12,%%zmm13) unit_init_m4n1(%%zmm14,%%zmm15)
+#define INIT_m4n6 INIT_m4n4 unit_init_m4n1(%%zmm16,%%zmm17) unit_init_m4n1(%%zmm18,%%zmm19)
+#define INIT_m4n8 INIT_m4n6 unit_init_m4n1(%%zmm20,%%zmm21) unit_init_m4n1(%%zmm22,%%zmm23)
+#define INIT_m4n10 INIT_m4n8 unit_init_m4n1(%%zmm24,%%zmm25) unit_init_m4n1(%%zmm26,%%zmm27)
+#define INIT_m4n12 INIT_m4n10 unit_init_m4n1(%%zmm28,%%zmm29) unit_init_m4n1(%%zmm30,%%zmm31)
+#define COMPUTE_m4(ndim) \
+    INIT_m4n##ndim\
+    COMPUTE_INIT "movq %2,%3;"\
+    "cmpq $20,%6; jb "#ndim"88440f;"\
+    #ndim"88449:\n\t"\
+    KERNEL_t_k1m4n##ndim\
+    KERNEL_t_k1m4n##ndim\
+    KERNEL_t_k1m4n##ndim\
+    "prefetcht1 (%3); prefetcht1 63(%3); addq %4,%3;"\
+    KERNEL_t_k1m4n##ndim\
+    KERNEL_t_k1m4n##ndim\
+    KERNEL_t_k1m4n##ndim\
+    "prefetcht1 (%8); addq $24,%8;"\
+    "subq $6,%6; cmpq $20,%6; jnb "#ndim"88449b;"\
+    "movq %2,%3;"\
+    #ndim"88440:\n\t"\
+    "testq %6,%6; jz "#ndim"88441f;"\
+    "prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\
+    KERNEL_t_k1m4n##ndim\
+    "decq %6; jmp "#ndim"88440b;"\
+    #ndim"88441:\n\t"\
+    SAVE_m4n##ndim
+
+/* m=2, ymm0-ymm3 for temporary use, ymm4-ymm15 for accumulators */
+#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 3 //conjg_a == conjg_b; ap = permilpd($5,a0)
+  #define unit_kernel_k1m2n1(a0,ap,b_off_r,b_off_i,c1,...) \
+    "vbroadcastsd "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmaddsub231pd "#ap",%%ymm2,"#c1";"\
+    "vbroadcastsd "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmaddsub231pd "#a0",%%ymm2,"#c1";"
+#else //conjg_a != conjg_b
+  #define unit_kernel_k1m2n1(a0,ap,b_off_r,b_off_i,c1,...) \
+    "vbroadcastsd "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmsubadd231pd "#ap",%%ymm2,"#c1";"\
+    "vbroadcastsd "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmsubadd231pd "#a0",%%ymm2,"#c1";"
+#endif
+#define KERNEL_h_k1m2n1 \
+    "vmovupd (%0),%%ymm0; vpermilpd $5,%%ymm0,%%ymm1; addq $32,%0;"\
+    unit_kernel_k1m2n1(%%ymm0,%%ymm1,0,8,%%ymm4,%1)
+#define KERNEL_t_k1m2n1 KERNEL_h_k1m2n1 "addq $16,%1;"
+#define KERNEL_h_k1m2n2 KERNEL_h_k1m2n1 unit_kernel_k1m2n1(%%ymm0,%%ymm1,16,24,%%ymm5,%1)
+#define KERNEL_t_k1m2n2 KERNEL_h_k1m2n2 "addq $32,%1;"
+#define unit_kernel_k1m2n2(c1,c2,...) \
+    unit_kernel_k1m2n1(%%ymm0,%%ymm1,0,8,c1,__VA_ARGS__)\
+    unit_kernel_k1m2n1(%%ymm0,%%ymm1,16,24,c2,__VA_ARGS__)
+#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 unit_kernel_k1m2n2(%%ymm6,%%ymm7,%1,%%r12,1)
+#define KERNEL_t_k1m2n4 KERNEL_h_k1m2n4 "addq $32,%1;"
+#define KERNEL_t_k1m2n6 KERNEL_h_k1m2n4 unit_kernel_k1m2n2(%%ymm8,%%ymm9,%1,%%r12,2) "addq $32,%1;"
+#define KERNEL_h_k1m2n8 KERNEL_t_k1m2n6 unit_kernel_k1m2n2(%%ymm10,%%ymm11,%%r15)
+#define KERNEL_t_k1m2n8 KERNEL_h_k1m2n8 "addq $32,%%r15;"
+#define KERNEL_h_k1m2n10 KERNEL_h_k1m2n8 unit_kernel_k1m2n2(%%ymm12,%%ymm13,%%r15,%%r12,1)
+#define KERNEL_t_k1m2n10 KERNEL_h_k1m2n10 "addq $32,%%r15;"
+#define KERNEL_h_k1m2n12 KERNEL_h_k1m2n10 unit_kernel_k1m2n2(%%ymm14,%%ymm15,%%r15,%%r12,2)
+#define KERNEL_t_k1m2n12 KERNEL_h_k1m2n12 "addq $32,%%r15;"
+#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 2 //not to do conjugation on a_block
+  #define unit_save_m2n1(alp_r,alp_i,c1,...) \
+    "vpermilpd $5,"#c1",%%ymm3; vfmaddsub213pd ("#__VA_ARGS__"),"#alp_i",%%ymm3;"\
+    "vfmaddsub213pd %%ymm3,"#alp_r","#c1";vmovupd "#c1",("#__VA_ARGS__");"
+#else //do conjugation on a_block
+  #define unit_save_m2n1(alp_r,alp_i,c1,...) \
+    "vpermilpd $5,"#c1",%%ymm3; vfmsubadd213pd ("#__VA_ARGS__"),"#alp_r","#c1";"\
+    "vfmsubadd231pd %%ymm3,"#alp_i","#c1";vmovupd "#c1",("#__VA_ARGS__");"
+#endif
+#define SAVE_SETUP_m2 "movq %2,%3; addq $32,%2; vbroadcastsd (%5),%%ymm0; vbroadcastsd 8(%5),%%ymm1;"
+#define SAVE_m2n1 SAVE_SETUP_m2 unit_save_m2n1(%%ymm0,%%ymm1,%%ymm4,%3)
+#define SAVE_m2n2 SAVE_m2n1 unit_save_m2n1(%%ymm0,%%ymm1,%%ymm5,%3,%4,1)
+#define unit_save_m2n2(c1,c2) \
+    "leaq (%3,%4,2),%3;" unit_save_m2n1(%%ymm0,%%ymm1,c1,%3) unit_save_m2n1(%%ymm0,%%ymm1,c2,%3,%4,1)
+#define SAVE_m2n4 SAVE_m2n2 unit_save_m2n2(%%ymm6,%%ymm7)
+#define SAVE_m2n6 SAVE_m2n4 unit_save_m2n2(%%ymm8,%%ymm9)
+#define SAVE_m2n8 SAVE_m2n6 unit_save_m2n2(%%ymm10,%%ymm11)
+#define SAVE_m2n10 SAVE_m2n8 unit_save_m2n2(%%ymm12,%%ymm13)
+#define SAVE_m2n12 SAVE_m2n10 unit_save_m2n2(%%ymm14,%%ymm15)
+#define INIT_m2n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
+#define unit_init_m2n2(c1,c2) "vpxor "#c1","#c1","#c1"; vpxor "#c2","#c2","#c2";"
+#define INIT_m2n2 unit_init_m2n2(%%ymm4,%%ymm5)
+#define INIT_m2n4 INIT_m2n2 unit_init_m2n2(%%ymm6,%%ymm7)
+#define INIT_m2n6 INIT_m2n4 unit_init_m2n2(%%ymm8,%%ymm9)
+#define INIT_m2n8 INIT_m2n6 unit_init_m2n2(%%ymm10,%%ymm11)
+#define INIT_m2n10 INIT_m2n8 unit_init_m2n2(%%ymm12,%%ymm13)
+#define INIT_m2n12 INIT_m2n10 unit_init_m2n2(%%ymm14,%%ymm15)
+#define COMPUTE_m2(ndim) \
+    INIT_m2n##ndim\
+    COMPUTE_INIT\
+    #ndim"88220:\n\t"\
+    "testq %6,%6; jz "#ndim"88221f;"\
+    KERNEL_t_k1m2n##ndim\
+    "decq %6; jmp "#ndim"88220b;"\
+    #ndim"88221:\n\t"\
+    SAVE_m2n##ndim
+
+/* m=1, ymm0-ymm3 and ymm10-ymm15 for temporary use, ymm4-ymm9 for accumulators */
+#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 3 //conjg_a == conjg_b; ap = permilpd($5,a0)
+  #define unit_kernel_k1m1n1(a0,ap,b_off_r,b_off_i,c1,...) \
+    "vmovddup "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmaddsub231pd "#ap",%%xmm2,"#c1";"\
+    "vmovddup "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmaddsub231pd "#a0",%%xmm2,"#c1";"
+  #define unit_kernel_k1m1n2(a0,ap,b_off_r,b_off_i,c1,...) \
+    "vmovddup "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmaddsub231pd "#ap",%%ymm2,"#c1";"\
+    "vmovddup "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmaddsub231pd "#a0",%%ymm2,"#c1";"
+#else //conjg_a != conjg_b
+  #define unit_kernel_k1m1n1(a0,ap,b_off_r,b_off_i,c1,...) \
+    "vmovddup "#b_off_i"("#__VA_ARGS__"),%%xmm2; vfmsubadd231pd "#ap",%%xmm2,"#c1";"\
+    "vmovddup "#b_off_r"("#__VA_ARGS__"),%%xmm2; vfmsubadd231pd "#a0",%%xmm2,"#c1";"
+  #define unit_kernel_k1m1n2(a0,ap,b_off_r,b_off_i,c1,...) \
+    "vmovddup "#b_off_i"("#__VA_ARGS__"),%%ymm2; vfmsubadd231pd "#ap",%%ymm2,"#c1";"\
+    "vmovddup "#b_off_r"("#__VA_ARGS__"),%%ymm2; vfmsubadd231pd "#a0",%%ymm2,"#c1";"
+#endif
+#define KERNEL_h_k1m1n1 \
+    "vmovupd (%0),%%xmm0; vpermilpd $5,%%xmm0,%%xmm1; addq $16,%0;"\
+    unit_kernel_k1m1n1(%%xmm0,%%xmm1,0,8,%%xmm4,%1)
+#define KERNEL_t_k1m1n1 KERNEL_h_k1m1n1 "addq $16,%1;"
+#define KERNEL_h_k1m1n2 \
+    "vbroadcastf128 (%0),%%ymm0; vpermilpd $5,%%ymm0,%%ymm1; addq $16,%0;"\
+    unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm4,%1)
+#define KERNEL_t_k1m1n2 KERNEL_h_k1m1n2 "addq $32,%1;"
+#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm5,%1,%%r12,1)
+#define KERNEL_t_k1m1n4 KERNEL_h_k1m1n4 "addq $32,%1;"
+#define KERNEL_t_k1m1n6 KERNEL_h_k1m1n4 unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm6,%1,%%r12,2) "addq $32,%1;"
+#define KERNEL_h_k1m1n8 KERNEL_t_k1m1n6 unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm7,%%r15)
+#define KERNEL_t_k1m1n8 KERNEL_h_k1m1n8 "addq $32,%%r15;"
+#define KERNEL_h_k1m1n10 KERNEL_h_k1m1n8 unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm8,%%r15,%%r12,1)
+#define KERNEL_t_k1m1n10 KERNEL_h_k1m1n10 "addq $32,%%r15;"
+#define KERNEL_h_k1m1n12 KERNEL_h_k1m1n10 unit_kernel_k1m1n2(%%ymm0,%%ymm1,0,8,%%ymm9,%%r15,%%r12,2)
+#define KERNEL_t_k1m1n12 KERNEL_h_k1m1n12 "addq $32,%%r15;"
+#if ZGEMM_SKX_MODE == 0 || ZGEMM_SKX_MODE == 2 //not to do conjugation on a_block
+  #define unit_save_m1n1(alp_r,alp_i,c1,...) \
+    "vpermilpd $5,"#c1",%%xmm3; vfmaddsub213pd ("#__VA_ARGS__"),"#alp_i",%%xmm3;"\
+    "vfmaddsub213pd %%xmm3,"#alp_r","#c1";vmovupd "#c1",("#__VA_ARGS__");"
+  #define unit_save_m1n2(alp_r,alp_i,c1) \
+    "vpermilpd $5,"#c1",%%ymm3; vmovupd (%3),%%xmm2; vinsertf128 $1,(%3,%4,1),%%ymm2,%%ymm2;"\
+    "vfmaddsub213pd %%ymm2,"#alp_i",%%ymm3; vfmaddsub231pd "#c1","#alp_r",%%ymm3;"\
+    "vmovupd %%xmm3,(%3); vextractf128 $1,%%ymm3,(%3,%4,1); leaq (%3,%4,2),%3;"
+#else //do conjugation on a_block
+  #define unit_save_m1n1(alp_r,alp_i,c1,...) \
+    "vpermilpd $5,"#c1",%%xmm3; vfmsubadd213pd ("#__VA_ARGS__"),"#alp_r","#c1";"\
+    "vfmsubadd231pd %%xmm3,"#alp_i","#c1";vmovupd "#c1",("#__VA_ARGS__");"
+  #define unit_save_m1n2(alp_r,alp_i,c1) \
+    "vpermilpd $5,"#c1",%%ymm3; vmovupd (%3),%%xmm2; vinsertf128 $1,(%3,%4,1),%%ymm2,%%ymm2;"\
+    "vfmsubadd213pd %%ymm2,"#alp_r","#c1"; vfmsubadd213pd "#c1","#alp_i",%%ymm3;"\
+    "vmovupd %%xmm3,(%3); vextractf128 $1,%%ymm3,(%3,%4,1); leaq (%3,%4,2),%3;"
+#endif
+#define SAVE_SETUP_m1 "movq %2,%3; addq $16,%2; vbroadcastsd (%5),%%ymm0; vbroadcastsd 8(%5),%%ymm1;"
+#define SAVE_m1n1 SAVE_SETUP_m1 unit_save_m1n1(%%xmm0,%%xmm1,%%xmm4,%3)
+#define SAVE_m1n2 SAVE_SETUP_m1 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm4)
+#define SAVE_m1n4 SAVE_m1n2 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm5)
+#define SAVE_m1n6 SAVE_m1n4 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm6)
+#define SAVE_m1n8 SAVE_m1n6 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm7)
+#define SAVE_m1n10 SAVE_m1n8 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm8)
+#define SAVE_m1n12 SAVE_m1n10 unit_save_m1n2(%%ymm0,%%ymm1,%%ymm9)
+#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
+#define INIT_m1n2 INIT_m2n1
+#define INIT_m1n4 INIT_m1n2 "vpxor %%ymm5,%%ymm5,%%ymm5;"
+#define INIT_m1n6 INIT_m1n4 "vpxor %%ymm6,%%ymm6,%%ymm6;"
+#define INIT_m1n8 INIT_m1n6 "vpxor %%ymm7,%%ymm7,%%ymm7;"
+#define INIT_m1n10 INIT_m1n8 "vpxor %%ymm8,%%ymm8,%%ymm8;"
+#define INIT_m1n12 INIT_m1n10 "vpxor %%ymm9,%%ymm9,%%ymm9;"
+#define COMPUTE_m1(ndim) \
+    INIT_m1n##ndim\
+    COMPUTE_INIT\
+    #ndim"88110:\n\t"\
+    "testq %6,%6; jz "#ndim"88111f;"\
+    KERNEL_t_k1m1n##ndim\
+    "decq %6; jmp "#ndim"88110b;"\
+    #ndim"88111:\n\t"\
+    SAVE_m1n##ndim
+
+#define COMPUTE(ndim) {\
+    b_pref = b_pointer + ndim * K * 2;\
+    __asm__ __volatile__(\
+    GENERAL_INIT\
+    CONSTZMM_INIT\
+    "cmpq $4,%7;jb 33101"#ndim"f;"\
+    "33109"#ndim":\n\t"\
+    COMPUTE_m4(ndim)\
+    "subq $4,%7;cmpq $4,%7;jnb 33109"#ndim"b;"\
+    "33101"#ndim":\n\t"\
+    "cmpq $2,%7;jb 33102"#ndim"f;"\
+    COMPUTE_m2(ndim)\
+    "subq $2,%7;"\
+    "33102"#ndim":\n\t"\
+    "testq %7,%7;jz 33103"#ndim"f;"\
+    COMPUTE_m1(ndim)\
+    "33103"#ndim":\n\t"\
+    GENERAL_RECOVER\
+    :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(c_store),"+r"(ldc_in_bytes),"+r"(constval),"+r"(K),"+r"(M),"+r"(b_pref)\
+    ::"r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\
+    "zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\
+    "cc","memory");\
+    a_pointer -= M * K * 2; b_pointer += ndim * K * 2; c_pointer += (LDC * ndim - M) * 2;\
+}
+
+int __attribute__ ((noinline))
+CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alphar, double alphai, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG LDC)
+{
+    if(m==0||n==0||k==0) return 0;
+    int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double) * 2; double const_val[4] = {alphar, alphai, -1, 1};
+    int64_t M = (int64_t)m, K = (int64_t)k;
+    BLASLONG n_count = n;
+    double *a_pointer = A,*b_pointer = B,*c_pointer = C,*c_store = C,*constval = const_val,*b_pref = B;
+    for(;n_count>11;n_count-=12) COMPUTE(12)
+    for(;n_count>9;n_count-=10) COMPUTE(10)
+    for(;n_count>7;n_count-=8) COMPUTE(8)
+    for(;n_count>5;n_count-=6) COMPUTE(6)
+    for(;n_count>3;n_count-=4) COMPUTE(4)
+    for(;n_count>1;n_count-=2) COMPUTE(2)
+    if(n_count>0) COMPUTE(1)
+    return 0;
+}

From bf73aa141b334ce2d870fe1b7ab340e0c7df5e8d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 15 Nov 2019 00:19:24 +0100
Subject: [PATCH 095/210] Fix potential spurious failure from uninitialized
 variable

---
 ctest/c_cblat3.f | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ctest/c_cblat3.f b/ctest/c_cblat3.f
index 96f190352..74293ce53 100644
--- a/ctest/c_cblat3.f
+++ b/ctest/c_cblat3.f
@@ -1503,6 +1503,8 @@ C     $      '      .' )
       NC = 0
       RESET = .TRUE.
       ERRMAX = RZERO
+      RALS = RONE
+      RBETS = RONE
 *
       DO 100 IN = 1, NIDIM
          N = IDIM( IN )

From 351d12b94e2213b6d819fd1cb5c28f64f7deafc9 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 15 Nov 2019 00:20:36 +0100
Subject: [PATCH 096/210] Fix potential spurious failure from uninitialized
 variable

---
 ctest/c_zblat3.f | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ctest/c_zblat3.f b/ctest/c_zblat3.f
index 5df834b2e..cc109d651 100644
--- a/ctest/c_zblat3.f
+++ b/ctest/c_zblat3.f
@@ -1504,6 +1504,8 @@ C     $      '      .' )
       NC = 0
       RESET = .TRUE.
       ERRMAX = RZERO
+      RALS = RONE
+      RBETS = RONE
 *
       DO 100 IN = 1, NIDIM
          N = IDIM( IN )

From 6082e556cd990fc4d13e89d83db403b79d771e52 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 17 Nov 2019 15:10:26 +0100
Subject: [PATCH 097/210] Use "generic" S/CGEMM unroll M on big-endian PPC970

as the respective PPC970 "altivec" kernels give wrong results when compiled for big endian
---
 param.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/param.h b/param.h
index 1cf4137d6..9dc94c420 100644
--- a/param.h
+++ b/param.h
@@ -1990,11 +1990,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B 3072
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
+#if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define SGEMM_DEFAULT_UNROLL_M 4
+#else
 #define SGEMM_DEFAULT_UNROLL_M 16
+#endif
 #define SGEMM_DEFAULT_UNROLL_N 4
 #define DGEMM_DEFAULT_UNROLL_M 4
 #define DGEMM_DEFAULT_UNROLL_N 4
+#if defined(__BYTE_ORDER__)&&(__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define CGEMM_DEFAULT_UNROLL_M 2
+#else
 #define CGEMM_DEFAULT_UNROLL_M 8
+#endif
 #define CGEMM_DEFAULT_UNROLL_N 2
 #define ZGEMM_DEFAULT_UNROLL_M 2
 #define ZGEMM_DEFAULT_UNROLL_N 2

From b3ac6ee2227c1c62c6f0b93d8e9985423fabdc9d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 17 Nov 2019 15:19:39 +0100
Subject: [PATCH 098/210] Define alternate kernels for big-endian PPC970

The altivec versions of SGEMM and CGEMM fail most test in LAPACK-TESTING when compiled for big endian, STRSM/CTRSM even cause segfaults. The rot kernels either fail the corresponding utest or lead to failures in LAPACK-TESTING.
---
 kernel/power/KERNEL.PPC970 | 55 +++++++++++++++++++++++++++++++-------
 1 file changed, 45 insertions(+), 10 deletions(-)

diff --git a/kernel/power/KERNEL.PPC970 b/kernel/power/KERNEL.PPC970
index 7431a7788..de30977de 100644
--- a/kernel/power/KERNEL.PPC970
+++ b/kernel/power/KERNEL.PPC970
@@ -1,3 +1,14 @@
+ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
+SGEMMKERNEL    =  gemm_kernel.S
+SGEMMINCOPY    =  
+SGEMMITCOPY    =  
+SGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
+SGEMMINCOPYOBJ =  
+SGEMMITCOPYOBJ =  
+SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+else
 SGEMMKERNEL    =  gemm_kernel_altivec.S
 SGEMMINCOPY    =  ../generic/gemm_ncopy_16.c
 SGEMMITCOPY    =  ../generic/gemm_tcopy_16.c
@@ -7,6 +18,8 @@ SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+endif
+
 DGEMMKERNEL    =  gemm_kernel.S
 DGEMMINCOPY    =
 DGEMMITCOPY    =
@@ -16,6 +29,18 @@ DGEMMINCOPYOBJ =
 DGEMMITCOPYOBJ =
 DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
 DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
+CGEMMKERNEL    =  zgemm_kernel.S
+CGEMMINCOPY    =
+CGEMMITCOPY    =
+CGEMMONCOPY    =  ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY    =  ../generic/zgemm_tcopy_2.c
+CGEMMINCOPYOBJ =
+CGEMMITCOPYOBJ =
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+else
 CGEMMKERNEL    =  zgemm_kernel_altivec.S
 CGEMMINCOPY    =  ../generic/zgemm_ncopy_8.c
 CGEMMITCOPY    =  ../generic/zgemm_tcopy_8.c
@@ -25,6 +50,8 @@ CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
 CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
 CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
 CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+endif
+
 ZGEMMKERNEL    =  zgemm_kernel.S
 ZGEMMINCOPY    =
 ZGEMMITCOPY    =
@@ -35,22 +62,30 @@ ZGEMMITCOPYOBJ =
 ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
 ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
-#STRSMKERNEL_LN	=  trsm_kernel_LN.S
-#STRSMKERNEL_LT	=  trsm_kernel_LT.S
-#STRSMKERNEL_RN	=  trsm_kernel_LT.S
-#STRSMKERNEL_RT	=  trsm_kernel_RT.S
-
 DTRSMKERNEL_LN	=  trsm_kernel_LN.S
 DTRSMKERNEL_LT	=  trsm_kernel_LT.S
 DTRSMKERNEL_RN	=  trsm_kernel_LT.S
 DTRSMKERNEL_RT	=  trsm_kernel_RT.S
 
-#CTRSMKERNEL_LN	=  ztrsm_kernel_LN.S
-#CTRSMKERNEL_LT	=  ztrsm_kernel_LT.S
-#CTRSMKERNEL_RN	=  ztrsm_kernel_LT.S
-#CTRSMKERNEL_RT	=  ztrsm_kernel_RT.S
-
 ZTRSMKERNEL_LN	=  ztrsm_kernel_LN.S
 ZTRSMKERNEL_LT	=  ztrsm_kernel_LT.S
 ZTRSMKERNEL_RN	=  ztrsm_kernel_LT.S
 ZTRSMKERNEL_RT	=  ztrsm_kernel_RT.S
+
+ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
+STRSMKERNEL_LN	=  trsm_kernel_LN.S
+STRSMKERNEL_LT	=  trsm_kernel_LT.S
+STRSMKERNEL_RN	=  trsm_kernel_LT.S
+STRSMKERNEL_RT	=  trsm_kernel_RT.S
+
+CTRSMKERNEL_LN	=  ztrsm_kernel_LN.S
+CTRSMKERNEL_LT	=  ztrsm_kernel_LT.S
+CTRSMKERNEL_RN	=  ztrsm_kernel_LT.S
+CTRSMKERNEL_RT	=  ztrsm_kernel_RT.S
+
+
+SROTKERNEL   = ../arm/rot.c
+DROTKERNEL   = ../arm/rot.c
+CROTKERNEL   = ../arm/zrot.c
+ZROTKERNEL   = ../arm/zrot.c
+endif

From 82b75f97e509aaef4bc5d95c4b54b29b77c50ede Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 17 Nov 2019 19:22:04 +0100
Subject: [PATCH 099/210] Disable the old QCDOC qalloc by default and copy
 utility functions from memory.c

1. qalloc() appears to have been a special routine written for the PPC440-based QCDOC supercomputer(s) from around 2005, its source does not seem to be readily available. So switch the #if 1 in the code to rely on standard malloc() by default.
2. Utility functions like get_num_procs, get_num_threads that were added to the "normally" used memory.c in the meantime were still missing here.
---
 driver/others/memory_qalloc.c | 321 ++++++++++++++++++++++++++++++++--
 1 file changed, 311 insertions(+), 10 deletions(-)

diff --git a/driver/others/memory_qalloc.c b/driver/others/memory_qalloc.c
index 17b7f5d60..6174d9b75 100644
--- a/driver/others/memory_qalloc.c
+++ b/driver/others/memory_qalloc.c
@@ -38,21 +38,29 @@
 
 #include <stdio.h>
 #include "common.h"
+#ifdef OS_LINUX
+#include <sys/sysinfo.h>
+#include <sched.h>
+#include <errno.h>
+#include <linux/unistd.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#endif
 
-#ifndef SMP
-#define blas_cpu_number 1
-#else
-
-int blas_cpu_number = 1;
-
-int blas_get_cpu_number(void){
+#ifdef OS_HAIKU
+#include <unistd.h>
+#endif
 
-  return blas_cpu_number;
-}
+#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
+#include <sys/sysctl.h>
+#include <sys/resource.h>
 #endif
 
+
 #define FIXED_PAGESIZE 4096
 
+
 void *sa = NULL;
 void *sb = NULL;
 static double static_buffer[BUFFER_SIZE/sizeof(double)];
@@ -60,7 +68,7 @@ static double static_buffer[BUFFER_SIZE/sizeof(double)];
 void *blas_memory_alloc(int numproc){
 
   if (sa == NULL){
-#if 1
+#if 0
     sa = (void *)qalloc(QFAST, BUFFER_SIZE);
 #else
     sa = (void *)malloc(BUFFER_SIZE);
@@ -75,3 +83,296 @@ void blas_memory_free(void *free_area){
   return;
 }
 
+
+
+extern void openblas_warning(int verbose, const char * msg);
+
+#ifndef SMP
+
+#define blas_cpu_number 1
+#define blas_num_threads 1
+
+/* Dummy Function */
+int  goto_get_num_procs  (void) { return 1;};
+void goto_set_num_threads(int num_threads) {};
+
+#else
+
+#if defined(OS_LINUX) || defined(OS_SUNOS)
+#ifndef NO_AFFINITY
+int get_num_procs(void);
+#else
+int get_num_procs(void) {
+
+  static int nums = 0;
+  cpu_set_t cpuset,*cpusetp;
+  size_t size;
+  int ret;
+
+#if defined(__GLIBC_PREREQ)
+#if !__GLIBC_PREREQ(2, 7)
+  int i;
+#if !__GLIBC_PREREQ(2, 6)
+  int n;
+#endif
+#endif
+#endif
+
+  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
+#if !defined(OS_LINUX)
+  return nums;
+#endif
+
+/*
+#if !defined(__GLIBC_PREREQ)
+  return nums;
+#else
+ #if !__GLIBC_PREREQ(2, 3)
+  return nums;
+ #endif
+
+ #if !__GLIBC_PREREQ(2, 7)
+  ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
+  if (ret!=0) return nums;
+  n=0;
+  #if !__GLIBC_PREREQ(2, 6)
+  for (i=0;i<nums;i++)
+     if (CPU_ISSET(i,&cpuset)) n++;
+  nums=n;
+  #else
+  nums = CPU_COUNT(sizeof(cpuset),&cpuset);
+  #endif
+  return nums;
+ #else
+  if (nums >= CPU_SETSIZE) {
+    cpusetp = CPU_ALLOC(nums);
+      if (cpusetp == NULL) {
+        return nums;
+      }
+    size = CPU_ALLOC_SIZE(nums);
+    ret = sched_getaffinity(0,size,cpusetp);
+    if (ret!=0) {
+      CPU_FREE(cpusetp);
+      return nums;
+    }
+    ret = CPU_COUNT_S(size,cpusetp);
+    if (ret > 0 && ret < nums) nums = ret;	
+    CPU_FREE(cpusetp);
+    return nums;
+  } else {
+    ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
+    if (ret!=0) {
+      return nums;
+    }
+    ret = CPU_COUNT(&cpuset);
+    if (ret > 0 && ret < nums) nums = ret;	
+    return nums;
+  }
+ #endif
+#endif
+*/
+   return 1;
+}
+#endif
+#endif
+
+#ifdef OS_ANDROID
+int get_num_procs(void) {
+  static int nums = 0;
+  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
+  return nums;
+}
+#endif
+
+#ifdef OS_HAIKU
+int get_num_procs(void) {
+  static int nums = 0;
+  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
+  return nums;
+}
+#endif
+
+#ifdef OS_AIX
+int get_num_procs(void) {
+  static int nums = 0;
+  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
+  return nums;
+}
+#endif
+
+#ifdef OS_WINDOWS
+
+int get_num_procs(void) {
+
+  static int nums = 0;
+
+  if (nums == 0) {
+
+    SYSTEM_INFO sysinfo;
+
+    GetSystemInfo(&sysinfo);
+
+    nums = sysinfo.dwNumberOfProcessors;
+  }
+
+  return nums;
+}
+
+#endif
+
+#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
+
+int get_num_procs(void) {
+
+  static int nums = 0;
+
+  int m[2];
+  size_t len;
+
+  if (nums == 0) {
+    m[0] = CTL_HW;
+    m[1] = HW_NCPU;
+    len = sizeof(int);
+    sysctl(m, 2, &nums, &len, NULL, 0);
+  }
+
+  return nums;
+}
+
+#endif
+
+#if defined(OS_DARWIN)
+int get_num_procs(void) {
+  static int nums = 0;
+  size_t len;
+  if (nums == 0){
+    len = sizeof(int);
+    sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0);
+  }
+  return nums;
+}
+/*
+void set_stack_limit(int limitMB){
+  int result=0;
+  struct rlimit rl;
+  rlim_t StackSize;
+
+  StackSize=limitMB*1024*1024;
+  result=getrlimit(RLIMIT_STACK, &rl);
+  if(result==0){
+    if(rl.rlim_cur < StackSize){
+      rl.rlim_cur=StackSize;
+      result=setrlimit(RLIMIT_STACK, &rl);
+      if(result !=0){
+        fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
+      }
+    }
+  }
+}
+*/
+#endif
+
+
+/*
+OpenBLAS uses the numbers of CPU cores in multithreading.
+It can be set by openblas_set_num_threads(int num_threads);
+*/
+int blas_cpu_number  = 0;
+/*
+The numbers of threads in the thread pool.
+This value is equal or large than blas_cpu_number. This means some threads are sleep.
+*/
+int blas_num_threads = 0;
+
+int  goto_get_num_procs  (void) {
+  return blas_cpu_number;
+}
+
+void openblas_fork_handler()
+{
+  // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
+  // built with "make USE_OPENMP=0".
+  // Hanging can still happen when OpenBLAS is built against the libgomp
+  // implementation of OpenMP. The problem is tracked at:
+  //   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
+  // In the mean time build with USE_OPENMP=0 or link against another
+  // implementation of OpenMP.
+#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
+  int err;
+  err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
+  if(err != 0)
+    openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
+#endif
+}
+
+extern int openblas_num_threads_env();
+extern int openblas_goto_num_threads_env();
+extern int openblas_omp_num_threads_env();
+
+int blas_get_cpu_number(void){
+#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
+  int max_num;
+#endif
+  int blas_goto_num   = 0;
+  int blas_omp_num    = 0;
+
+  if (blas_num_threads) return blas_num_threads;
+
+#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
+  max_num = get_num_procs();
+#endif
+
+  // blas_goto_num = 0;
+#ifndef USE_OPENMP
+  blas_goto_num=openblas_num_threads_env();
+  if (blas_goto_num < 0) blas_goto_num = 0;
+
+  if (blas_goto_num == 0) {
+    blas_goto_num=openblas_goto_num_threads_env();
+    if (blas_goto_num < 0) blas_goto_num = 0;
+  }
+
+#endif
+
+  // blas_omp_num = 0;
+  blas_omp_num=openblas_omp_num_threads_env();
+  if (blas_omp_num < 0) blas_omp_num = 0;
+
+  if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
+  else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
+  else blas_num_threads = MAX_CPU_NUMBER;
+
+#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
+  if (blas_num_threads > max_num) blas_num_threads = max_num;
+#endif
+
+  if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
+
+#ifdef DEBUG
+  printf( "Adjusted number of threads : %3d\n", blas_num_threads);
+#endif
+
+  blas_cpu_number = blas_num_threads;
+
+  return blas_num_threads;
+}
+#endif
+
+
+int openblas_get_num_procs(void) {
+#ifndef SMP
+  return 1;
+#else
+  return get_num_procs();
+#endif
+}
+
+int openblas_get_num_threads(void) {
+#ifndef SMP
+  return 1;
+#else
+  // init blas_cpu_number if needed
+  blas_get_cpu_number();
+  return blas_cpu_number;
+#endif
+}

From 0c07c356c1fd402d6466a79742df9cdbd3f5d62a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 17 Nov 2019 19:25:08 +0100
Subject: [PATCH 100/210] Define alternate kernels for big-endian PPC440

---
 kernel/power/KERNEL.PPC440 | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/kernel/power/KERNEL.PPC440 b/kernel/power/KERNEL.PPC440
index 988a4b701..a0696b548 100644
--- a/kernel/power/KERNEL.PPC440
+++ b/kernel/power/KERNEL.PPC440
@@ -15,13 +15,23 @@ ZASUMKERNEL	= zasum_ppc440.S
 
 SAXPYKERNEL	=  axpy_ppc440.S
 DAXPYKERNEL	=  axpy_ppc440.S
+ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
+CAXPYKERNEL	= ../arm/zaxpy.c
+ZAXPYKERNEL	= ../arm/zaxpy.c
+else
 CAXPYKERNEL	= zaxpy_ppc440.S
 ZAXPYKERNEL	= zaxpy_ppc440.S
+endif
 
 SDOTKERNEL	=  dot_ppc440.S
 DDOTKERNEL	=  dot_ppc440.S
+ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
 CDOTKERNEL	= zdot_ppc440.S
 ZDOTKERNEL	= zdot_ppc440.S
+else
+CDOTKERNEL	= ../arm/zdot.c
+ZDOTKERNEL	= ../arm/zdot.c
+endif
 
 ISAMAXKERNEL	=  iamax_ppc440.S
 IDAMAXKERNEL	=  iamax_ppc440.S
@@ -52,8 +62,13 @@ ZNRM2KERNEL	= znrm2_ppc440.S
 
 SROTKERNEL	=  rot_ppc440.S
 DROTKERNEL	=  rot_ppc440.S
+ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
 CROTKERNEL	= zrot_ppc440.S
 ZROTKERNEL	= zrot_ppc440.S
+else
+CROTKERNEL	= ../arm/zrot.c
+ZROTKERNEL	= ../arm/zrot.c
+endif
 
 SSCALKERNEL	=  scal_ppc440.S
 DSCALKERNEL	=  scal_ppc440.S
@@ -116,3 +131,15 @@ ZTRSMKERNEL_LN	=  ztrsm_kernel_ppc440_LN.S
 ZTRSMKERNEL_LT	=  ztrsm_kernel_ppc440_LT.S
 ZTRSMKERNEL_RN	=  ztrsm_kernel_ppc440_LT.S
 ZTRSMKERNEL_RT	=  ztrsm_kernel_ppc440_RT.S
+
+ifeq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
+SGEMVNKERNEL = ../arm/gemv_n.c
+DGEMVNKERNEL = ../arm/gemv_n.c
+SGEMVTKERNEL = ../arm/gemv_t.c
+DGEMVTKERNEL = ../arm/gemv_t.c
+CGEMVNKERNEL = ../arm/zgemv_n.c
+ZGEMVNKERNEL = ../arm/zgemv_n.c
+CGEMVTKERNEL = ../arm/zgemv_t.c
+ZGEMVTKERNEL = ../arm/zgemv_t.c
+endif
+

From eba0aeb7cde7f59bca1e631512128f3e380588fe Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 17 Nov 2019 22:58:32 +0100
Subject: [PATCH 101/210] Fix compilation for big-endian POWER8

---
 kernel/power/caxpy_power8.S  | 5 ++++-
 kernel/power/icamin_power8.S | 5 ++++-
 kernel/power/idamax.c        | 6 +++---
 kernel/power/idamin.c        | 3 +--
 kernel/power/isamax_power8.S | 5 ++++-
 kernel/power/isamin_power8.S | 5 ++++-
 kernel/power/izamin.c        | 4 ++--
 7 files changed, 22 insertions(+), 11 deletions(-)

diff --git a/kernel/power/caxpy_power8.S b/kernel/power/caxpy_power8.S
index 0ce61ca3b..b5f841d2e 100644
--- a/kernel/power/caxpy_power8.S
+++ b/kernel/power/caxpy_power8.S
@@ -12,11 +12,12 @@
 
 	PROLOGUE
 
-caxpy_k:
 .LCF0:
 0:	addis 2,12,.TOC.-.LCF0@ha
 	addi 2,2,.TOC.-.LCF0@l
+#if _CALL_ELF ==2
 	.localentry	caxpy_k,.-caxpy_k
+#endif
 	mr. 7,3
 	ble 0,.L33
 	cmpdi 7,9,1
@@ -515,7 +516,9 @@ caxpy_k:
 	b .L13
 	.long 0
 	.byte 0,0,0,0,0,4,0,0
+#if _CALL_ELF ==2
 	.size	caxpy_k,.-caxpy_k
+#endif
 	.section	.rodata
 	.align 4
 	.set	.LANCHOR0,. + 0
diff --git a/kernel/power/icamin_power8.S b/kernel/power/icamin_power8.S
index e3d66798e..f2993e83e 100644
--- a/kernel/power/icamin_power8.S
+++ b/kernel/power/icamin_power8.S
@@ -11,11 +11,12 @@
 
 	PROLOGUE
 
-icamin_k:
 .LCF0:
 0:	addis 2,12,.TOC.-.LCF0@ha
 	addi 2,2,.TOC.-.LCF0@l
+#if _CALL_ELF ==2
 	.localentry	icamin_k,.-icamin_k
+#endif
 	mr. 9,3
 	ble 0,.L25
 	cmpdi 7,5,0
@@ -388,7 +389,9 @@ icamin_k:
 	b .L21
 	.long 0
 	.byte 0,0,0,0,0,1,0,0
+#if _CALL_ELF ==2
 	.size	icamin_k,.-icamin_k
+#endif
 	.section	.rodata.cst16,"aM",@progbits,16
 	.align 4
 .LC2:
diff --git a/kernel/power/idamax.c b/kernel/power/idamax.c
index 337fa54f8..95aa592c7 100644
--- a/kernel/power/idamax.c
+++ b/kernel/power/idamax.c
@@ -324,15 +324,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
 
     if (inc_x == 1) {
 
-#if defined(_CALL_ELF) && (_CALL_ELF == 2)
         BLASLONG n1 = n & -32;
-        if (n1 > 0) {
+#if defined(_CALL_ELF) && (_CALL_ELF == 2)
+	if (n1 > 0) {
 
             max = diamax_kernel_32(n1, x, &maxf);
 
             i = n1;
         }
-#endif
+#endif	
         while (i < n) {
             if (ABS(x[i]) > maxf) {
                 max = i;
diff --git a/kernel/power/idamin.c b/kernel/power/idamin.c
index 85dd49ac1..323f9987e 100644
--- a/kernel/power/idamin.c
+++ b/kernel/power/idamin.c
@@ -328,13 +328,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
 
 #if defined(_CALL_ELF) && (_CALL_ELF == 2)
         BLASLONG n1 = n & -32;
-        if (n1 > 0) {
+	if (n1 > 0) {
 
             min = diamin_kernel_32(n1, x, &minf);
             i = n1;
         }
 #endif
- 
         while (i < n) {
             if (ABS(x[i]) < minf) {
                 min = i;
diff --git a/kernel/power/isamax_power8.S b/kernel/power/isamax_power8.S
index c8fcaecc3..fa5433333 100644
--- a/kernel/power/isamax_power8.S
+++ b/kernel/power/isamax_power8.S
@@ -12,11 +12,12 @@
 
 	PROLOGUE
 
-isamax_k:
 .LCF0:
 0:	addis 2,12,.TOC.-.LCF0@ha
 	addi 2,2,.TOC.-.LCF0@l
+#if _CALL_ELF ==2
 	.localentry	isamax_k,.-isamax_k
+#endif
 	mr. 11,3
 	ble 0,.L36
 	cmpdi 7,5,0
@@ -397,7 +398,9 @@ isamax_k:
 	b .L61
 	.long 0
 	.byte 0,0,0,0,0,1,0,0
+#if _CALL_ELF ==2
 	.size	isamax_k,.-isamax_k
+#endif
 	.section	.rodata.cst16,"aM",@progbits,16
 	.align 4
 .LC2:
diff --git a/kernel/power/isamin_power8.S b/kernel/power/isamin_power8.S
index 3873e879b..c9b6acb85 100644
--- a/kernel/power/isamin_power8.S
+++ b/kernel/power/isamin_power8.S
@@ -11,11 +11,12 @@
 
 	PROLOGUE
 
-isamin_k:
 .LCF0:
 0:	addis 2,12,.TOC.-.LCF0@ha
 	addi 2,2,.TOC.-.LCF0@l
+#if _CALL_ELF ==2
 	.localentry	isamin_k,.-isamin_k
+#endif
 	mr. 11,3
 	ble 0,.L36
 	cmpdi 7,5,0
@@ -380,7 +381,9 @@ isamin_k:
 	b .L35
 	.long 0
 	.byte 0,0,0,0,0,1,0,0
+#if _CALL_ELF ==2
 	.size	isamin_k,.-isamin_k
+#endif
 	.section	.rodata.cst16,"aM",@progbits,16
 	.align 4
 .LC2:
diff --git a/kernel/power/izamin.c b/kernel/power/izamin.c
index 8da2189c6..06a5537d8 100644
--- a/kernel/power/izamin.c
+++ b/kernel/power/izamin.c
@@ -316,14 +316,14 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
         minf = CABS1(x,0); //index will not be incremented
 
 #if defined(_CALL_ELF) && (_CALL_ELF == 2)
-        BLASLONG n1 = n & -16;
+	BLASLONG n1 = n & -16;
         if (n1 > 0) {
 
             min = ziamin_kernel_16_TUNED(n1, x, &minf);
             i = n1;
             ix = n1 << 1;
         }
-#endif      
+#endif 
 
         while(i < n)
         {

From cad0d150db22663ff60feff0b6764e7a97235bd0 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 17 Nov 2019 23:12:10 +0100
Subject: [PATCH 102/210] Define alternate kernels for big-endian POWER8

---
 kernel/power/KERNEL.POWER8 | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8
index c08f3fb00..fb9452a35 100644
--- a/kernel/power/KERNEL.POWER8
+++ b/kernel/power/KERNEL.POWER8
@@ -89,14 +89,30 @@ ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
 #SMINKERNEL   = ../arm/min.c
 #DMINKERNEL   = ../arm/min.c
 #
+ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
 ISAMAXKERNEL = isamax_power8.S
+else
+ISAMAXKERNEL = isamax.c
+endif
 IDAMAXKERNEL = idamax.c
+ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
 ICAMAXKERNEL = icamax_power8.S
+else
+ICAMAXKERNEL = icamax.c
+endif
 IZAMAXKERNEL = izamax.c
 #
+ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
 ISAMINKERNEL = isamin_power8.S
+else
+ISAMINKERNEL = isamin.c
+endif
 IDAMINKERNEL = idamin.c
+ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
 ICAMINKERNEL = icamin_power8.S
+else
+ICAMINKERNEL = icamin.c
+endif
 IZAMINKERNEL = izamin.c
 #
 #ISMAXKERNEL  = ../arm/imax.c
@@ -112,7 +128,11 @@ ZASUMKERNEL  = zasum.c
 #
 SAXPYKERNEL  = saxpy.c
 DAXPYKERNEL  = daxpy.c
+ifneq ($(__BYTE_ORDER__),$(__ORDER_BIG_ENDIAN__))
 CAXPYKERNEL  = caxpy_power8.S
+else
+CAXPYKERNEL  = caxpy.c
+endif
 ZAXPYKERNEL  = zaxpy.c
 #
 SCOPYKERNEL  = scopy.c

From 0caf1434c928d39373499ffc02abe645945485d8 Mon Sep 17 00:00:00 2001
From: "Wang, Long" <long1.wang@intel.com>
Date: Wed, 20 Nov 2019 11:50:37 +0800
Subject: [PATCH 103/210] Fix the integer overflow issue for large matrix size

For large matrix, e.g. M=N=K, and M>1290, int mnk=M*N*K will overflow.
This will lead to wrong branching to single-threading. The performance
is downgraded significantly.

Signed-off-by: Wang, Long <long1.wang@intel.com>
---
 kernel/x86_64/sgemm_kernel_16x4_skylakex.c   | 2 +-
 kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
index 3246e681f..31d82e3bf 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
@@ -1215,7 +1215,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, flo
 
 int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
 {
-	int mnk = M * N * K;
+	unsigned long mnk = M * N * K;
 	/* large matrixes -> not performant */
 	if (mnk >= 28 * 512 * 512)
 		return 0;
diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
index 5d491237b..95963c0ac 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
@@ -452,7 +452,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
 
 int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
 {
-	int mnk = M * N * K;
+	unsigned long mnk = M * N * K;
 	/* large matrixes -> not performant */
 	if (mnk >= 28 * 512 * 512)
 		return 0;

From 1f6071590d5b4fa3b52aa1456a9648ae354b2c7a Mon Sep 17 00:00:00 2001
From: Jehan <jehan@girinstud.io>
Date: Wed, 20 Nov 2019 12:21:35 +0100
Subject: [PATCH 104/210] Fix usage of TerminateThread() causing critical
 section corruption.

This patch was submitted to the GIMP project by a publisher wishing to
keep confidentiality (hence anonymously). I just pass along the patch.
Here is the patch explanation which came with:

First they remind us what Microsoft documentation says about
TerminateThread:
> TerminateThread is a dangerous function that should only be used in
> the most extreme cases. You should call TerminateThread only if you
> know exactly what the target thread is doing, and you control all of
> the code that the target thread could possibly be running at the time
> of the termination.
(https://docs.microsoft.com/en-us/windows/win32/api/processthreadsapi/nf-processthreadsapi-terminatethread)

Then they say that 5 milliseconds time-out might not be long enough for
the thread to exit gracefully. They propose to set it to a much higher
value (for instance here 5 seconds).

And finally you should always check the return value of
WaitForSingleObject(). In particular you want to run TerminateThread()
only if WaitForSingleObject() failed, not on success case.
---
 driver/others/blas_server_win32.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index bace54a23..e27725baf 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -462,11 +462,15 @@ int BLASFUNC(blas_thread_shutdown)(void){
 
     for(i = 0; i < blas_num_threads - 1; i++){
       // Could also just use WaitForMultipleObjects
-      WaitForSingleObject(blas_threads[i], 5);  //INFINITE);
+      DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 5000);
+
 #ifndef OS_WINDOWSSTORE
-// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
-      TerminateThread(blas_threads[i],0);
+      // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
+      if (WAIT_OBJECT_0 != wait_thread_value) {
+        TerminateThread(blas_threads[i],0);
+      }
 #endif
+
       CloseHandle(blas_threads[i]);
     }
 

From 1191db1a49b237e7c636616cb51ca0879d01c128 Mon Sep 17 00:00:00 2001
From: "Wang, Long" <long1.wang@intel.com>
Date: Wed, 20 Nov 2019 21:30:16 +0800
Subject: [PATCH 105/210] For the sake of windows compatible, used "unsigned
 long long" to ensure 64-bit length

Signed-off-by: Wang, Long <long1.wang@intel.com>
---
 kernel/x86_64/sgemm_kernel_16x4_skylakex.c   | 4 ++--
 kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
index 31d82e3bf..4177ae2dc 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
@@ -762,7 +762,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 int __attribute__ ((noinline))
 CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, float * __restrict B, float * __restrict C, BLASLONG ldc)
 {
-	unsigned long M = m, N = n, K = k;
+	unsigned long long M = m, N = n, K = k;
 	if (M == 0)
 		return 0;
 	if (N == 0)
@@ -1639,4 +1639,4 @@ void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict
 			STORE_SCALAR(0, 0);
 		}
 	}
-}
\ No newline at end of file
+}
diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
index 95963c0ac..ee3417505 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
@@ -452,7 +452,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
 
 int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
 {
-	unsigned long mnk = M * N * K;
+	unsigned long long mnk = M * N * K;
 	/* large matrixes -> not performant */
 	if (mnk >= 28 * 512 * 512)
 		return 0;

From bfb5fbdb4d6660d539c5f1cf42c44aadb446b3e6 Mon Sep 17 00:00:00 2001
From: "Wang, Long" <long1.wang@intel.com>
Date: Thu, 21 Nov 2019 10:19:40 +0800
Subject: [PATCH 106/210] revised fix windows compatible for #2313

Signed-off-by: Wang, Long <long1.wang@intel.com>
---
 kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
index 4177ae2dc..76b82e65b 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
@@ -1215,7 +1215,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, flo
 
 int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
 {
-	unsigned long mnk = M * N * K;
+	unsigned long long mnk = M * N * K;
 	/* large matrixes -> not performant */
 	if (mnk >= 28 * 512 * 512)
 		return 0;

From 883c39773a178048be81dac9f9110dd602e562f9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dan=20Hor=C3=A1k?= <dan@danny.cz>
Date: Thu, 21 Nov 2019 12:49:54 +0100
Subject: [PATCH 107/210] zarch: treat z15 as z14 instead of generic
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Dan Horák <dan@danny.cz>
---
 cpuid_zarch.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/cpuid_zarch.c b/cpuid_zarch.c
index 896ed94f5..872d846e1 100644
--- a/cpuid_zarch.c
+++ b/cpuid_zarch.c
@@ -30,17 +30,20 @@
 #define CPU_GENERIC     0
 #define CPU_Z13         1
 #define CPU_Z14         2
+#define CPU_Z15         3
 
 static char *cpuname[] = {
   "ZARCH_GENERIC",
   "Z13",
-  "Z14"
+  "Z14",
+  "Z15"
 };
 
 static char *cpuname_lower[] = {
   "zarch_generic",
   "z13",
-  "z14"
+  "z14",
+  "z15"
 };
 
 int detect(void)
@@ -66,6 +69,7 @@ int detect(void)
   if (strstr(p, "2965")) return CPU_Z13;
   if (strstr(p, "3906")) return CPU_Z14;
   if (strstr(p, "3907")) return CPU_Z14;
+  if (strstr(p, "8561")) return CPU_Z14;        // fallback z15 to z14
 
   return CPU_GENERIC;
 }

From d117dfd5059c4c1e21a89f23412ca05bb1536dab Mon Sep 17 00:00:00 2001
From: Andreas Arnez <arnez@linux.ibm.com>
Date: Fri, 20 Sep 2019 18:32:47 +0200
Subject: [PATCH 108/210] Change bad usage of "asum" to "sum" in ZARCH versions
 of ?sum

The ZARCH implementations of ?sum contain a cut & paste-error: An inline
assembly argument is named "sum", but the assembly references "asum"
instead.  The mismatch causes a build error.  This is fixed.
---
 kernel/zarch/csum.c | 2 +-
 kernel/zarch/dsum.c | 2 +-
 kernel/zarch/ssum.c | 2 +-
 kernel/zarch/zsum.c | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/zarch/csum.c b/kernel/zarch/csum.c
index c0b8c6371..e9413da8e 100644
--- a/kernel/zarch/csum.c
+++ b/kernel/zarch/csum.c
@@ -88,7 +88,7 @@ static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) {
     "vfasb   %%v24,%%v24,%%v25\n\t"
     "vrepf   %%v25,%%v24,2\n\t"
     "vfasb   %%v24,%%v24,%%v25\n\t"
-    "vstef   %%v24,%[asum],0"
+    "vstef   %%v24,%[sum],0"
     : [sum] "=Q"(sum),[n] "+&r"(n)
     : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
     : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
diff --git a/kernel/zarch/dsum.c b/kernel/zarch/dsum.c
index 178bc3462..8d44873c0 100644
--- a/kernel/zarch/dsum.c
+++ b/kernel/zarch/dsum.c
@@ -86,7 +86,7 @@ static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) {
     "vfadb   %%v24,%%v24,%%v31\n\t"
     "vrepg   %%v25,%%v24,1\n\t"
     "vfadb   %%v24,%%v24,%%v25\n\t"
-    "vsteg   %%v24,%[asum],0"
+    "vsteg   %%v24,%[sum],0"
     : [sum] "=Q"(sum),[n] "+&r"(n)
     : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
     : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
diff --git a/kernel/zarch/ssum.c b/kernel/zarch/ssum.c
index a433ab592..3f3f46a85 100644
--- a/kernel/zarch/ssum.c
+++ b/kernel/zarch/ssum.c
@@ -89,7 +89,7 @@ static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) {
     "vfasb   %%v24,%%v24,%%v25\n\t"
     "vrepf   %%v25,%%v24,2\n\t"
     "vfasb   %%v24,%%v24,%%v25\n\t"
-    "vstef   %%v24,%[asum],0"
+    "vstef   %%v24,%[sum],0"
     : [sum] "=Q"(sum),[n] "+&r"(n)
     : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x)
     : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
diff --git a/kernel/zarch/zsum.c b/kernel/zarch/zsum.c
index 7cfc1f17f..e0f978d87 100644
--- a/kernel/zarch/zsum.c
+++ b/kernel/zarch/zsum.c
@@ -87,7 +87,7 @@ static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) {
     "vfadb   %%v24,%%v24,%%v31\n\t"
     "vrepg   %%v25,%%v24,1\n\t"
     "vfadb   %%v24,%%v24,%%v25\n\t"
-    "vsteg   %%v24,%[asum],0"
+    "vsteg   %%v24,%[sum],0"
     : [sum] "=Q"(sum),[n] "+&r"(n)
     : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x)
     : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",

From 04226f1e97dd30b2e757893d230a5b1d67017b0d Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 21 Nov 2019 18:14:29 +0100
Subject: [PATCH 109/210] Add the cpuid of the business/rackmount version of
 z15 as well

---
 cpuid_zarch.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpuid_zarch.c b/cpuid_zarch.c
index 872d846e1..df3b7898f 100644
--- a/cpuid_zarch.c
+++ b/cpuid_zarch.c
@@ -70,6 +70,7 @@ int detect(void)
   if (strstr(p, "3906")) return CPU_Z14;
   if (strstr(p, "3907")) return CPU_Z14;
   if (strstr(p, "8561")) return CPU_Z14;        // fallback z15 to z14
+  if (strstr(p, "8562")) return CPU_Z14;        // fallback z15 to z14
 
   return CPU_GENERIC;
 }

From f3065a0eedb4d0e6a48d4009cbc447f127fbb5b8 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 23 Nov 2019 19:54:56 +0100
Subject: [PATCH 110/210] Fix race conditions in multithreaded GEMM3M

by adding barriers (and a mutex lock for the non-OpenMP case) like it was already done for GEMM in level3_thread.c some time ago
---
 driver/level3/level3_gemm3m_thread.c | 53 ++++++++++++++++++++++------
 1 file changed, 42 insertions(+), 11 deletions(-)

diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c
index 4903aa5bd..21d431b60 100644
--- a/driver/level3/level3_gemm3m_thread.c
+++ b/driver/level3/level3_gemm3m_thread.c
@@ -408,7 +408,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
       /* Make sure if no one is using another buffer */
       for (i = 0; i < args -> nthreads; i++)
-	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
+	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
 
       STOP_RPCC(waiting1);
 
@@ -441,7 +441,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
       for (i = 0; i < args -> nthreads; i++)
 	job[mypos].working[i][CACHE_LINE_SIZE * bufferside] = (BLASLONG)buffer[bufferside];
-      }
+      WMB;
+	}
 
     current = mypos;
 
@@ -458,7 +459,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 	  START_RPCC();
 
 	  /* thread has to wait */
-	  while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
+	  while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
 
 	  STOP_RPCC(waiting2);
 
@@ -477,6 +478,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
 	if (m_to - m_from == min_i) {
 	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
+	WMB;
 	}
       }
     } while (current != mypos);
@@ -517,6 +519,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 	if (is + min_i >= m_to) {
 	  /* Thread doesn't need this buffer any more */
 	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
+	WMB;
 	}
 	}
 
@@ -541,7 +544,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
       /* Make sure if no one is using another buffer */
       for (i = 0; i < args -> nthreads; i++)
-	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
+	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
 
       STOP_RPCC(waiting1);
 
@@ -595,7 +598,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 	  START_RPCC();
 
 	  /* thread has to wait */
-	  while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
+	  while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
 
 	  STOP_RPCC(waiting2);
 
@@ -613,6 +616,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
 	if (m_to - m_from == min_i) {
 	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
+	WMB;
 	}
       }
     } while (current != mypos);
@@ -677,7 +681,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
       /* Make sure if no one is using another buffer */
       for (i = 0; i < args -> nthreads; i++)
-	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;};
+	while (job[mypos].working[i][CACHE_LINE_SIZE * bufferside]) {YIELDING;MB;};
 
       STOP_RPCC(waiting1);
 
@@ -731,7 +735,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 	  START_RPCC();
 
 	  /* thread has to wait */
-	  while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;};
+	  while(job[current].working[mypos][CACHE_LINE_SIZE * bufferside] == 0) {YIELDING;MB;};
 
 	  STOP_RPCC(waiting2);
 
@@ -748,8 +752,9 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 	}
 
 	if (m_to - m_from == min_i) {
-	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
-	}
+	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
+	WMB;
+}
       }
     } while (current != mypos);
 
@@ -787,7 +792,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 #endif
 	if (is + min_i >= m_to) {
 	  /* Thread doesn't need this buffer any more */
-	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] = 0;
+	  job[current].working[mypos][CACHE_LINE_SIZE * bufferside] &= 0;
+	  WMB;
 	}
 	}
 
@@ -804,7 +810,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
   for (i = 0; i < args -> nthreads; i++) {
     for (xxx = 0; xxx < DIVIDE_RATE; xxx++) {
-      while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;};
+      while (job[mypos].working[i][CACHE_LINE_SIZE * xxx] ) {YIELDING;MB;};
     }
   }
 
@@ -840,6 +846,15 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
 		       *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){
 
+#ifndef USE_OPENMP
+#ifndef OS_WINDOWS
+static pthread_mutex_t  level3_lock    = PTHREAD_MUTEX_INITIALIZER;
+#else
+CRITICAL_SECTION level3_lock;
+InitializeCriticalSection((PCRITICAL_SECTION)&level3_lock);
+#endif
+#endif
+
   blas_arg_t newarg;
 
   blas_queue_t queue[MAX_CPU_NUMBER];
@@ -869,6 +884,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
   mode  =  BLAS_SINGLE  | BLAS_REAL | BLAS_NODE;
 #endif
 
+#ifndef USE_OPENMP
+#ifndef OS_WINDOWS
+pthread_mutex_lock(&level3_lock);
+#else
+EnterCriticalSection((PCRITICAL_SECTION)&level3_lock);
+#endif
+#endif
+
   newarg.m        = args -> m;
   newarg.n        = args -> n;
   newarg.k        = args -> k;
@@ -973,6 +996,14 @@ static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG
   free(job);
 #endif
 
+#ifndef USE_OPENMP
+#ifndef OS_WINDOWS
+  pthread_mutex_unlock(&level3_lock);
+#else
+  LeaveCriticalSection((PCRITICAL_SECTION)&level3_lock);
+#endif
+#endif
+
   return 0;
 }
 

From f95989cbc1d0c8dedbded1183103569cd64584fd Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 23 Nov 2019 22:38:07 +0100
Subject: [PATCH 111/210] Fix AVX512 capability test (always returning zero)

from #2322
---
 driver/others/dynamic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index f1cd3c6e6..a4ff0e086 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -329,7 +329,7 @@ int support_avx512(){
   if (!support_avx())
     return 0;
   cpuid(7, &eax, &ebx, &ecx, &edx);
-  if((ebx & (1<<7)) != 1){
+  if((ebx & (1<<7)) == 0){
       ret=0;  //OS does not even support AVX2
   }
   if((ebx & (1<<31)) != 0){

From eb1e9c8c928f710170532ece33c4f183aea87ea2 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Tue, 26 Nov 2019 14:12:20 +0800
Subject: [PATCH 112/210] some optimizations

---
 kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c | 160 +++++++++-----------
 1 file changed, 75 insertions(+), 85 deletions(-)

diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
index 72878acfd..51b0b94fa 100644
--- a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
+++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
@@ -2,7 +2,8 @@
 #include <stdint.h>
 #include <immintrin.h>
 
-//register usage: zmm3 for alpha, zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
+//register usage: zmm3 for alpha, zmm0-zmm2 and zmm4-zmm7 for temporary use, zmm8-zmm31 for accumulators.
+
 /* row-major c_block */
 #define INNER_KERNEL_k1m1n8 \
     "prefetcht0 384(%1);"\
@@ -13,18 +14,6 @@
     INNER_KERNEL_k1m1n8\
     "vbroadcastsd  8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm9;"
 
-#define INNER_KERNEL_k1m4n8 \
-    INNER_KERNEL_k1m2n8\
-    "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;"\
-    "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;"
-
-#define INNER_KERNEL_k1m8n8 \
-    INNER_KERNEL_k1m4n8\
-    "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;"\
-    "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm13;"\
-    "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;"\
-    "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm15;"
-
 #define INNER_KERNEL_k1m1n16 \
     "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2);"\
     "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; addq $64,%1;"\
@@ -34,18 +23,6 @@
     INNER_KERNEL_k1m1n16\
     "vbroadcastsd  8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm10;vfmadd231pd %%zmm6,%%zmm4,%%zmm11;"
 
-#define INNER_KERNEL_k1m4n16 \
-    INNER_KERNEL_k1m2n16\
-    "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm12;vfmadd231pd %%zmm6,%%zmm4,%%zmm13;"\
-    "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;"
-
-#define INNER_KERNEL_k1m8n16 \
-    INNER_KERNEL_k1m4n16\
-    "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm16;vfmadd231pd %%zmm6,%%zmm4,%%zmm17;"\
-    "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm18;vfmadd231pd %%zmm6,%%zmm4,%%zmm19;"\
-    "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;"\
-    "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm22;vfmadd231pd %%zmm6,%%zmm4,%%zmm23;"
-
 #define INNER_KERNEL_k1m1n24 \
     "prefetcht0 128(%1); prefetcht0 128(%1,%%r12,2); prefetcht0 128(%1,%%r12,4);"\
     "vmovupd (%1),%%zmm5; vmovupd (%1,%%r12,2),%%zmm6; vmovupd (%1,%%r12,4),%%zmm7; addq $64,%1;"\
@@ -55,18 +32,48 @@
     INNER_KERNEL_k1m1n24\
     "vbroadcastsd  8(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm11;vfmadd231pd %%zmm6,%%zmm4,%%zmm12;vfmadd231pd %%zmm7,%%zmm4,%%zmm13;"
 
+/* row-major z-partition c_block */
+#define INNER_KERNEL_k1m4n8 \
+    "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5; addq $32,%0;"\
+    "vmovddup (%1),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm8; vfmadd231pd %%zmm5,%%zmm6,%%zmm10;"\
+    "vmovddup 8(%1),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm9; vfmadd231pd %%zmm5,%%zmm7,%%zmm11;"
+
+#define INNER_KERNEL_k1m4n16 \
+    INNER_KERNEL_k1m4n8\
+    "vmovddup (%1,%%r12,2),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm12; vfmadd231pd %%zmm5,%%zmm6,%%zmm14;"\
+    "vmovddup 8(%1,%%r12,2),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm13; vfmadd231pd %%zmm5,%%zmm7,%%zmm15;"
+
 #define INNER_KERNEL_k1m4n24 \
-    INNER_KERNEL_k1m2n24\
-    "vbroadcastsd 16(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm14;vfmadd231pd %%zmm6,%%zmm4,%%zmm15;vfmadd231pd %%zmm7,%%zmm4,%%zmm16;"\
-    "vbroadcastsd 24(%0),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm17;vfmadd231pd %%zmm6,%%zmm4,%%zmm18;vfmadd231pd %%zmm7,%%zmm4,%%zmm19;"
+    INNER_KERNEL_k1m4n16\
+    "vmovddup (%1,%%r12,4),%%zmm6; vfmadd231pd %%zmm4,%%zmm6,%%zmm16; vfmadd231pd %%zmm5,%%zmm6,%%zmm18;"\
+    "vmovddup 8(%1,%%r12,4),%%zmm7; vfmadd231pd %%zmm4,%%zmm7,%%zmm17; vfmadd231pd %%zmm5,%%zmm7,%%zmm19;"
 
-#define INNER_KERNEL_k1m8n24 \
-    INNER_KERNEL_k1m4n24\
-    "vbroadcastsd (%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm20;vfmadd231pd %%zmm6,%%zmm4,%%zmm21;vfmadd231pd %%zmm7,%%zmm4,%%zmm22;"\
-    "vbroadcastsd 8(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm23;vfmadd231pd %%zmm6,%%zmm4,%%zmm24;vfmadd231pd %%zmm7,%%zmm4,%%zmm25;"\
-    "vbroadcastsd 16(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm26;vfmadd231pd %%zmm6,%%zmm4,%%zmm27;vfmadd231pd %%zmm7,%%zmm4,%%zmm28;"\
-    "vbroadcastsd 24(%0,%%r12,1),%%zmm4;vfmadd231pd %%zmm5,%%zmm4,%%zmm29;vfmadd231pd %%zmm6,%%zmm4,%%zmm30;vfmadd231pd %%zmm7,%%zmm4,%%zmm31;"
+#define INNER_KERNEL_k1m8n8 \
+    "vbroadcastf32x4 (%0),%%zmm4; vbroadcastf32x4 16(%0),%%zmm5;"\
+    "vbroadcastf32x4 (%0,%%r12,1),%%zmm6; vbroadcastf32x4 16(%0,%%r12,1),%%zmm7; addq $32,%0;"\
+    "prefetcht0 128(%1);"\
+    "vmovddup (%1),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm8; vfmadd231pd %%zmm5,%%zmm2,%%zmm10;"\
+    "vfmadd231pd %%zmm6,%%zmm2,%%zmm12; vfmadd231pd %%zmm7,%%zmm2,%%zmm14;"\
+    "vmovddup 8(%1),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm9; vfmadd231pd %%zmm5,%%zmm1,%%zmm11;"\
+    "vfmadd231pd %%zmm6,%%zmm1,%%zmm13; vfmadd231pd %%zmm7,%%zmm1,%%zmm15;"
+
+#define INNER_KERNEL_k1m8n16 \
+    INNER_KERNEL_k1m8n8\
+    "prefetcht0 128(%1,%%r12,2);"\
+    "vmovddup (%1,%%r12,2),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm16; vfmadd231pd %%zmm5,%%zmm2,%%zmm18;"\
+    "vfmadd231pd %%zmm6,%%zmm2,%%zmm20; vfmadd231pd %%zmm7,%%zmm2,%%zmm22;"\
+    "vmovddup 8(%1,%%r12,2),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm17; vfmadd231pd %%zmm5,%%zmm1,%%zmm19;"\
+    "vfmadd231pd %%zmm6,%%zmm1,%%zmm21; vfmadd231pd %%zmm7,%%zmm1,%%zmm23;"
 
+#define INNER_KERNEL_k1m8n24 \
+    INNER_KERNEL_k1m8n16\
+    "prefetcht0 128(%1,%%r12,4);"\
+    "vmovddup (%1,%%r12,4),%%zmm2; vfmadd231pd %%zmm4,%%zmm2,%%zmm24; vfmadd231pd %%zmm5,%%zmm2,%%zmm26;"\
+    "vfmadd231pd %%zmm6,%%zmm2,%%zmm28; vfmadd231pd %%zmm7,%%zmm2,%%zmm30;"\
+    "vmovddup 8(%1,%%r12,4),%%zmm1; vfmadd231pd %%zmm4,%%zmm1,%%zmm25; vfmadd231pd %%zmm5,%%zmm1,%%zmm27;"\
+    "vfmadd231pd %%zmm6,%%zmm1,%%zmm29; vfmadd231pd %%zmm7,%%zmm1,%%zmm31;"
+
+/* micro kernels */
 #define INNER_KERNELm1(nn) \
     "cmpq $1,%2;jb "#nn"3f;"\
     #nn"4:\n\t"\
@@ -84,7 +91,7 @@
 #define INNER_KERNELm4(nn) \
     "cmpq $1,%2;jb "#nn"00f;"\
     #nn"01:\n\t"\
-    INNER_KERNEL_k1m4n##nn "addq $32,%0;"\
+    INNER_KERNEL_k1m4n##nn "addq $64,%1;"\
     "decq %2;cmpq $1,%2;jnb "#nn"01b;"\
     #nn"00:\n\t"
 
@@ -92,18 +99,18 @@
 #define INNER_KERNELm8(nn) \
     "movq %3,%10;cmpq $16,%2;jb "#nn"001f;"\
     #nn"008:\n\t"\
-    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
-    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
     "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\
-    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
-    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
     "prefetcht1 (%11); addq $16,%11;"\
     "subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\
     "movq %3,%10;"\
     #nn"001:\n\t"\
     "cmpq $1,%2;jb "#nn"000f;"\
     "prefetcht0 (%10); prefetcht0 63(%10); prefetcht0 (%10,%4,1); prefetcht0 63(%10,%4,1); leaq (%10,%4,2),%10;"\
-    INNER_KERNEL_k1m8n##nn "addq $32,%0;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
     "decq %2;jmp "#nn"001b;"\
     ""#nn"000:\n\t"
 
@@ -207,24 +214,19 @@
     INNER_STORE_m1n8(%%zmm13,8)
 
 #define INNER_TRANS_4x8(c1,c2,c3,c4) \
-    "vunpcklpd "#c2","#c1",%%zmm4;vunpckhpd "#c2","#c1",%%zmm5;vunpcklpd "#c4","#c3",%%zmm6;vunpckhpd "#c4","#c3",%%zmm7;"\
-    "vblendmpd %%zmm6,%%zmm4,"#c1"%{%6%};vblendmpd %%zmm7,%%zmm5,"#c3"%{%6%};"\
-    "vshuff64x2 $0xb1,"#c1","#c1","#c1";vshuff64x2 $0xb1,"#c3","#c3","#c3";"\
-    "vblendmpd %%zmm4,"#c1",%%zmm4%{%6%};vblendmpd %%zmm5,"#c3","#c2"%{%6%};"\
-    "vblendmpd "#c1",%%zmm6,%%zmm6%{%6%};vblendmpd "#c3",%%zmm7,"#c4"%{%6%};"\
-    "vmovapd %%zmm4,"#c1"; vmovapd %%zmm6,"#c3";"
+    "vblendmpd "#c3","#c1",%%zmm4%{%6%}; vblendmpd "#c4","#c2",%%zmm6%{%6%};"\
+    "vshuff64x2 $177,%%zmm4,%%zmm4,%%zmm4; vshuff64x2 $177,%%zmm6,%%zmm6,%%zmm6;"\
+    "vblendmpd "#c1",%%zmm4,"#c1"%{%6%}; vblendmpd "#c2",%%zmm6,"#c2"%{%6%};"\
+    "vblendmpd %%zmm4,"#c3","#c3"%{%6%}; vblendmpd %%zmm6,"#c4","#c4"%{%6%};"\
+
+#define INNER_TRANS_f128_4x4(c1,c2,c3,c4) \
+    "vshuff64x2 $68,"#c3","#c1",%%zmm4; vshuff64x2 $17,"#c4","#c2",%%zmm5;"\
+    "vshuff64x2 $238,"#c3","#c1",%%zmm6; vshuff64x2 $187,"#c4","#c2",%%zmm7;"\
+    "vblendmpd %%zmm5,%%zmm4,"#c2"%{%6%}; vshuff64x2 $177,"#c2","#c2","#c2"; vblendmpd %%zmm4,%%zmm5,"#c1"%{%6%};"\
+    "vblendmpd %%zmm7,%%zmm6,"#c4"%{%6%}; vshuff64x2 $177,"#c4","#c4","#c4"; vblendmpd %%zmm6,%%zmm7,"#c3"%{%6%};"
 
 #define INNER_TRANS_8x8(c1,c2,c3,c4,c5,c6,c7,c8) \
-    INNER_TRANS_4x8(c1,c2,c3,c4)\
-    INNER_TRANS_4x8(c5,c6,c7,c8)\
-    "vblendmpd "#c5","#c1",%%zmm4%{%5%};vshuff64x2 $0x4e,%%zmm4,%%zmm4,%%zmm4;"\
-    "vblendmpd "#c1",%%zmm4,"#c1"%{%5%};vblendmpd  %%zmm4,"#c5","#c5"%{%5%};"\
-    "vblendmpd "#c6","#c2",%%zmm5%{%5%};vshuff64x2 $0x4e,%%zmm5,%%zmm5,%%zmm5;"\
-    "vblendmpd "#c2",%%zmm5,"#c2"%{%5%};vblendmpd  %%zmm5,"#c6","#c6"%{%5%};"\
-    "vblendmpd "#c7","#c3",%%zmm6%{%5%};vshuff64x2 $0x4e,%%zmm6,%%zmm6,%%zmm6;"\
-    "vblendmpd "#c3",%%zmm6,"#c3"%{%5%};vblendmpd  %%zmm6,"#c7","#c7"%{%5%};"\
-    "vblendmpd "#c8","#c4",%%zmm7%{%5%};vshuff64x2 $0x4e,%%zmm7,%%zmm7,%%zmm7;"\
-    "vblendmpd "#c4",%%zmm7,"#c4"%{%5%};vblendmpd  %%zmm7,"#c8","#c8"%{%5%};"
+    INNER_TRANS_f128_4x4(c1,c3,c5,c7) INNER_TRANS_f128_4x4(c2,c4,c6,c8)
 
 //%7 for k01(input) only when m=4
 #define INNER_STORE_4x8(c1,c2,c3,c4) \
@@ -250,20 +252,14 @@
     INNER_STORE_4x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11)
 
 #define INNER_SAVE_m4n16 \
-    "movq %3,%10;"\
-    INNER_TRANS_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
-    INNER_STORE_4x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14)\
-    INNER_TRANS_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)\
-    INNER_STORE_4x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15)
+    INNER_SAVE_m4n8\
+    INNER_TRANS_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)\
+    INNER_STORE_4x8(%%zmm12,%%zmm13,%%zmm14,%%zmm15)
 
 #define INNER_SAVE_m4n24 \
-    "movq %3,%10;"\
-    INNER_TRANS_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
-    INNER_STORE_4x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17)\
-    INNER_TRANS_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
-    INNER_STORE_4x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18)\
-    INNER_TRANS_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)\
-    INNER_STORE_4x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19)
+    INNER_SAVE_m4n16\
+    INNER_TRANS_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)\
+    INNER_STORE_4x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19)
 
 #define INNER_SAVE_m8n8 \
     "movq %3,%10;"\
@@ -271,20 +267,14 @@
     INNER_STORE_8x8(%%zmm8,%%zmm9,%%zmm10,%%zmm11,%%zmm12,%%zmm13,%%zmm14,%%zmm15)
 
 #define INNER_SAVE_m8n16 \
-    "movq %3,%10;"\
-    INNER_TRANS_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
-    INNER_STORE_8x8(%%zmm8,%%zmm10,%%zmm12,%%zmm14,%%zmm16,%%zmm18,%%zmm20,%%zmm22)\
-    INNER_TRANS_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)\
-    INNER_STORE_8x8(%%zmm9,%%zmm11,%%zmm13,%%zmm15,%%zmm17,%%zmm19,%%zmm21,%%zmm23)
+    INNER_SAVE_m8n8\
+    INNER_TRANS_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)\
+    INNER_STORE_8x8(%%zmm16,%%zmm17,%%zmm18,%%zmm19,%%zmm20,%%zmm21,%%zmm22,%%zmm23)
 
 #define INNER_SAVE_m8n24 \
-    "movq %3,%10;"\
-    INNER_TRANS_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
-    INNER_STORE_8x8(%%zmm8,%%zmm11,%%zmm14,%%zmm17,%%zmm20,%%zmm23,%%zmm26,%%zmm29)\
-    INNER_TRANS_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
-    INNER_STORE_8x8(%%zmm9,%%zmm12,%%zmm15,%%zmm18,%%zmm21,%%zmm24,%%zmm27,%%zmm30)\
-    INNER_TRANS_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)\
-    INNER_STORE_8x8(%%zmm10,%%zmm13,%%zmm16,%%zmm19,%%zmm22,%%zmm25,%%zmm28,%%zmm31)
+    INNER_SAVE_m8n16\
+    INNER_TRANS_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)\
+    INNER_STORE_8x8(%%zmm24,%%zmm25,%%zmm26,%%zmm27,%%zmm28,%%zmm29,%%zmm30,%%zmm31)
 
 #define COMPUTE_n8 {\
     b_pref = packed_b_pointer + 8 * K;\
@@ -327,7 +317,7 @@
     "shlq $3,%4;addq %4,%3;shrq $3,%4;"\
     :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
     "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
-    ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
+    ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
 }
 #define COMPUTE_n16 {\
@@ -372,7 +362,7 @@
     "leaq (%1,%%r12,4),%1;"\
     :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
     "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
-    ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
+    ::"zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17",\
     "zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
 }
@@ -417,9 +407,9 @@
     "shlq $3,%4;addq %4,%3;shlq $1,%4;addq %4,%3;shrq $4,%4;"\
     "leaq (%1,%%r12,4),%1; leaq (%1,%%r12,2),%1;"\
     :"+r"(a_block_pointer),"+r"(packed_b_pointer),"+r"(K),"+r"(c_pointer),"+r"(ldc_in_bytes),"+Yk"(k02),"+Yk"(k03),"+Yk"(k01),\
-    "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)\
-    ::"zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18","zmm19",\
-    "zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
+    "+r"(M),"+r"(alpha),"+r"(c_store),"+r"(b_pref)::\
+    "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15","zmm16","zmm17","zmm18",\
+    "zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31","cc","memory","k1","r12","r13","r14");\
     a_block_pointer -= M * K;\
 }
 static void KERNEL_MAIN(double *packed_a, double *packed_b, BLASLONG m, BLASLONG ndiv8, BLASLONG k, BLASLONG LDC, double *c,double *alpha){//icopy=4,ocopy=8

From cf2a8e410cc095b40d3b357e74a5f77af83ce602 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@linux.ibm.com>
Date: Tue, 26 Nov 2019 21:55:04 -0700
Subject: [PATCH 113/210] Fix SEGV in cdot_power9

We were corrupting r2 because the local entry wasn't being
setup correctly.
---
 kernel/power/cdot_power9.S | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/kernel/power/cdot_power9.S b/kernel/power/cdot_power9.S
index 9ec7cdd85..6ca7a02a5 100644
--- a/kernel/power/cdot_power9.S
+++ b/kernel/power/cdot_power9.S
@@ -13,10 +13,7 @@
 	
 cdot_k:
 .LCF0:
-0:	addis 2,12,.TOC.-.LCF0@ha
-	addi 2,2,.TOC.-.LCF0@l
-	.localentry	cdot_k,.-cdot_k
-	mr. 9,3
+0:	mr. 9,3
 	ble 0,.L10
 	cmpdi 7,5,1
 	beq 7,.L18

From 6bc487de356f3b40412b0bca3ad950d4b5da38b4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 27 Nov 2019 15:10:57 +0100
Subject: [PATCH 114/210] Cleanup IOS build and disable FORTRAN on 32bit and
 ios builds for now

Travis recently appears unable to find a matching homebrew package for 32bit gfortran,
and the IOS crossbuild suffered from excessive output due to the known problem with "ASMNAME redefined"
warnings when CFLAGS is set in the environment
---
 .travis.yml | 48 +++++++++++++++---------------------------------
 1 file changed, 15 insertions(+), 33 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 6016ec1fe..fb6006474 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,10 +4,11 @@ dist: precise
 sudo: true
 language: c
 
-matrix:
+jobs:
   include:
     - &test-ubuntu
       os: linux
+      stage: test
       compiler: gcc
       addons:
         apt:
@@ -17,7 +18,7 @@ matrix:
         - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
       script:
         - set -e
-        - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
+        - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
         - make -C test $COMMON_FLAGS $BTYPE
         - make -C ctest $COMMON_FLAGS $BTYPE
         - make -C utest $COMMON_FLAGS $BTYPE
@@ -25,15 +26,6 @@ matrix:
         - TARGET_BOX=LINUX64
         - BTYPE="BINARY=64"
 
-    - <<: *test-ubuntu
-      os: linux-ppc64le
-      before_script:
-        - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
-      env:
-        # for matrix annotation only
-        - TARGET_BOX=PPC64LE_LINUX
-        - BTYPE="BINARY=64 USE_OPENMP=1"
-
     - <<: *test-ubuntu
       env:
         - TARGET_BOX=LINUX64
@@ -67,6 +59,7 @@ matrix:
         - BTYPE="BINARY=32"
 
     - os: linux
+      stage: test
       compiler: gcc
       addons:
         apt:
@@ -87,12 +80,13 @@ matrix:
     # that don't require sudo.
     - &test-alpine
       os: linux
+      stage: test
       dist: trusty
       sudo: true
       language: minimal
       before_install:
-        - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
-          && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251  alpine-chroot-install' | sha1sum -c || exit 1"
+        - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \
+              && echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e  alpine-chroot-install' | sha1sum -c || exit 1"
         - alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
       install:
         - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
@@ -126,10 +120,11 @@ matrix:
     - <<: *test-alpine
       env:
         - TARGET_BOX=LINUX64_MUSL
-        - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2"
+        - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2"
 
     - &test-cmake
       os: linux
+      stage: test
       compiler: clang
       addons:
         apt:
@@ -156,30 +151,17 @@ matrix:
       env:
         - CMAKE=1
 
-    - &test-macos
+    - osx-gcc
       os: osx
-      osx_image: xcode10.1
-      before_script:
-        - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
+      stage: test
+      osx_image: xcode8
+      before_script: *common-before
         - brew update
-        - brew install gcc@8 # for gfortran
+        - brew install gcc # for gfortran
       script:
         - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
       env:
-        - BTYPE="BINARY=64 INTERFACE64=1 FC=gfortran-8"
-
-    - <<: *test-macos
-      osx_image: xcode8.3
-      env:
-        - BTYPE="BINARY=32 FC=gfortran-8"
-
-    - <<: *test-macos
-      osx_image: xcode10.1
-      env:
-        - COMMON_FLAGS="NUM_THREADS=32"
-        - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk"
-        - CFLAGS="-O2 -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
-        - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang"
+        - BTYPE="BINARY=64 INTERFACE64=1"
 
 # whitelist
 branches:

From 83dae28ae25b05d3e607f4f1112c369ce2f2e653 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 28 Nov 2019 00:09:06 +0100
Subject: [PATCH 115/210] Revert "Cleanup Travis IOS xbuild and disable FORTRAN
 on 32bit and ios builds for now"

---
 .travis.yml | 48 +++++++++++++++++++++++++++++++++---------------
 1 file changed, 33 insertions(+), 15 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index fb6006474..6016ec1fe 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,11 +4,10 @@ dist: precise
 sudo: true
 language: c
 
-jobs:
+matrix:
   include:
     - &test-ubuntu
       os: linux
-      stage: test
       compiler: gcc
       addons:
         apt:
@@ -18,7 +17,7 @@ jobs:
         - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
       script:
         - set -e
-        - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
+        - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
         - make -C test $COMMON_FLAGS $BTYPE
         - make -C ctest $COMMON_FLAGS $BTYPE
         - make -C utest $COMMON_FLAGS $BTYPE
@@ -26,6 +25,15 @@ jobs:
         - TARGET_BOX=LINUX64
         - BTYPE="BINARY=64"
 
+    - <<: *test-ubuntu
+      os: linux-ppc64le
+      before_script:
+        - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32"
+      env:
+        # for matrix annotation only
+        - TARGET_BOX=PPC64LE_LINUX
+        - BTYPE="BINARY=64 USE_OPENMP=1"
+
     - <<: *test-ubuntu
       env:
         - TARGET_BOX=LINUX64
@@ -59,7 +67,6 @@ jobs:
         - BTYPE="BINARY=32"
 
     - os: linux
-      stage: test
       compiler: gcc
       addons:
         apt:
@@ -80,13 +87,12 @@ jobs:
     # that don't require sudo.
     - &test-alpine
       os: linux
-      stage: test
       dist: trusty
       sudo: true
       language: minimal
       before_install:
-        - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.6.0/alpine-chroot-install' \
-              && echo 'a827a4ba3d0817e7c88bae17fe34e50204983d1e  alpine-chroot-install' | sha1sum -c || exit 1"
+        - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \
+          && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251  alpine-chroot-install' | sha1sum -c || exit 1"
         - alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
       install:
         - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers'
@@ -120,11 +126,10 @@ jobs:
     - <<: *test-alpine
       env:
         - TARGET_BOX=LINUX64_MUSL
-        - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=core2"
+        - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2"
 
     - &test-cmake
       os: linux
-      stage: test
       compiler: clang
       addons:
         apt:
@@ -151,17 +156,30 @@ jobs:
       env:
         - CMAKE=1
 
-    - osx-gcc
+    - &test-macos
       os: osx
-      stage: test
-      osx_image: xcode8
-      before_script: *common-before
+      osx_image: xcode10.1
+      before_script:
+        - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
         - brew update
-        - brew install gcc # for gfortran
+        - brew install gcc@8 # for gfortran
       script:
         - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
       env:
-        - BTYPE="BINARY=64 INTERFACE64=1"
+        - BTYPE="BINARY=64 INTERFACE64=1 FC=gfortran-8"
+
+    - <<: *test-macos
+      osx_image: xcode8.3
+      env:
+        - BTYPE="BINARY=32 FC=gfortran-8"
+
+    - <<: *test-macos
+      osx_image: xcode10.1
+      env:
+        - COMMON_FLAGS="NUM_THREADS=32"
+        - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk"
+        - CFLAGS="-O2 -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
+        - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang"
 
 # whitelist
 branches:

From ae2a0995ccaefc20a3476a4b232b37d7f37ad794 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 28 Nov 2019 00:15:36 +0100
Subject: [PATCH 116/210] Cleanup IOS build and disable FORTRAN on 32bit and
 ios builds for now

 Travis recently appears unable to find a matching homebrew package for 32bit gfortran,
and the IOS crossbuild suffered from excessive output due to the known problem with "ASMNAME redefined"
warnings when CFLAGS is set in the environment
---
 .travis.yml | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 6016ec1fe..9e18412e8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -160,26 +160,25 @@ matrix:
       os: osx
       osx_image: xcode10.1
       before_script:
-        - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
+        - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32"
         - brew update
         - brew install gcc@8 # for gfortran
       script:
         - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE
       env:
-        - BTYPE="BINARY=64 INTERFACE64=1 FC=gfortran-8"
+        - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8"
 
     - <<: *test-macos
-      osx_image: xcode8.3
+      osx_image: xcode10.0
       env:
-        - BTYPE="BINARY=32 FC=gfortran-8"
+        - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1"
 
     - <<: *test-macos
       osx_image: xcode10.1
       env:
-        - COMMON_FLAGS="NUM_THREADS=32"
         - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk"
-        - CFLAGS="-O2 -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
-        - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang"
+        - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0"
+        - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1"
 
 # whitelist
 branches:

From 934e601e934f5cf930382dfd9d7e92b937d1d2ed Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Thu, 28 Nov 2019 19:56:35 +0800
Subject: [PATCH 117/210] Update dgemm_kernel_4x8_skylakex_2.c

---
 kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
index 51b0b94fa..90a4c2b1d 100644
--- a/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
+++ b/kernel/x86_64/dgemm_kernel_4x8_skylakex_2.c
@@ -97,15 +97,17 @@
 
 /* %10 for prefetch of C elements before storage; %4 = ldc(in bytes),%11 for prefetch of next B block */
 #define INNER_KERNELm8(nn) \
-    "movq %3,%10;cmpq $16,%2;jb "#nn"001f;"\
+    "movq %3,%10;cmpq $18,%2;jb "#nn"001f;"\
     #nn"008:\n\t"\
     INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
     INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
     "prefetcht1 (%10); prefetcht1 63(%10); addq %4,%10;"\
     INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
     INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
-    "prefetcht1 (%11); addq $16,%11;"\
-    "subq $4,%2;cmpq $16,%2;jnb "#nn"008b;"\
+    INNER_KERNEL_k1m8n##nn "addq $64,%1;"\
+    "prefetcht1 (%11); addq $32,%11;"\
+    "subq $6,%2;cmpq $18,%2;jnb "#nn"008b;"\
     "movq %3,%10;"\
     #nn"001:\n\t"\
     "cmpq $1,%2;jb "#nn"000f;"\

From e20709e976ba668141f07d47bf1152a6e948b729 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Thu, 28 Nov 2019 19:57:50 +0800
Subject: [PATCH 118/210] Update param.h

---
 param.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/param.h b/param.h
index 9dc94c420..d39fc4a1d 100644
--- a/param.h
+++ b/param.h
@@ -1691,16 +1691,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 
 #define SGEMM_DEFAULT_P 768
-#define DGEMM_DEFAULT_P 512
+#define DGEMM_DEFAULT_P 384
 #define CGEMM_DEFAULT_P 384
 #define ZGEMM_DEFAULT_P 256
 
 #ifdef WINDOWS_ABI
 #define SGEMM_DEFAULT_Q 192
-#define DGEMM_DEFAULT_Q 128
+#define DGEMM_DEFAULT_Q 168
 #else
 #define SGEMM_DEFAULT_Q 192
-#define DGEMM_DEFAULT_Q 128
+#define DGEMM_DEFAULT_Q 168
 #endif
 #define CGEMM_DEFAULT_Q 192
 #define ZGEMM_DEFAULT_Q 128

From 97762234f9517f1ae90fc97a4456cd0923c30319 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 29 Nov 2019 23:47:23 +0100
Subject: [PATCH 119/210] Add variable for gcc >=9 test

used in KERNEL.POWER9
---
 kernel/Makefile | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/Makefile b/kernel/Makefile
index e81225075..9b468a6af 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -5,6 +5,11 @@ endif
 TOPDIR	= ..
 include $(TOPDIR)/Makefile.system
 
+
+ifeq ($(C_COMPILER), GCC)
+GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
+endif
+
 AVX2OPT = 
 ifeq ($(C_COMPILER), GCC)
 # AVX2 support was added in 4.7.0

From a9b62c03f852a38cc2171a652b93a673591c483b Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 29 Nov 2019 23:49:50 +0100
Subject: [PATCH 120/210] Substitute precompiled gcc7 codes only when gcc is
 older than 9.x

---
 kernel/power/KERNEL.POWER9 | 392 ++++++++++++++++++++-----------------
 1 file changed, 208 insertions(+), 184 deletions(-)

diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9
index 2ed843fff..4bfa017e1 100644
--- a/kernel/power/KERNEL.POWER9
+++ b/kernel/power/KERNEL.POWER9
@@ -1,184 +1,208 @@
-#SGEMM_BETA = ../generic/gemm_beta.c
-#DGEMM_BETA = ../generic/gemm_beta.c
-#CGEMM_BETA = ../generic/zgemm_beta.c
-#ZGEMM_BETA = ../generic/zgemm_beta.c
-
-STRMMKERNEL	= sgemm_kernel_power9.S
-DTRMMKERNEL	= dgemm_kernel_power9.S
-CTRMMKERNEL	= cgemm_kernel_power9.S
-ZTRMMKERNEL	= zgemm_kernel_power9.S
-
-SGEMMKERNEL    =  sgemm_kernel_power9.S
-SGEMMINCOPY    = ../generic/gemm_ncopy_16.c
-SGEMMITCOPY    = sgemm_tcopy_16_power8.S
-SGEMMONCOPY    =  ../generic/gemm_ncopy_8.c
-SGEMMOTCOPY    = sgemm_tcopy_8_power8.S 
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DGEMMKERNEL    =  dgemm_kernel_power9.S
-DGEMMINCOPY    = ../generic/gemm_ncopy_16.c
-DGEMMITCOPY    =  dgemm_tcopy_16_power8.S
-DGEMMONCOPY    =  dgemm_ncopy_4_power8.S
-DGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CGEMMKERNEL    = cgemm_kernel_power9.S
-CGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
-CGEMMITCOPY    = ../generic/zgemm_tcopy_8.c
-CGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
-CGEMMOTCOPY    = ../generic/zgemm_tcopy_4.c
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-
-ZGEMMKERNEL    = zgemm_kernel_power9.S
-ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
-ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
-ZGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
-ZGEMMITCOPY    = zgemm_tcopy_8_power8.S
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-
-STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
-
-DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT	= dtrsm_kernel_LT_16x4_power8.S
-DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-#Todo: CGEMM3MKERNEL should be 4x4 blocksizes.
-#CGEMM3MKERNEL    =  zgemm3m_kernel_8x4_sse3.S
-#ZGEMM3MKERNEL    =  zgemm3m_kernel_4x4_sse3.S
-
-#Pure C for other kernels
-#SAMAXKERNEL  = ../arm/amax.c
-#DAMAXKERNEL  = ../arm/amax.c
-#CAMAXKERNEL  = ../arm/zamax.c
-#ZAMAXKERNEL  = ../arm/zamax.c
-#
-#SAMINKERNEL  = ../arm/amin.c
-#DAMINKERNEL  = ../arm/amin.c
-#CAMINKERNEL  = ../arm/zamin.c
-#ZAMINKERNEL  = ../arm/zamin.c
-#
-#SMAXKERNEL   = ../arm/max.c
-#DMAXKERNEL   = ../arm/max.c
-#
-#SMINKERNEL   = ../arm/min.c
-#DMINKERNEL   = ../arm/min.c
-#
-ISAMAXKERNEL = isamax_power9.S
-IDAMAXKERNEL = idamax.c
-ICAMAXKERNEL = icamax_power9.S
-IZAMAXKERNEL = izamax.c
-#
-ISAMINKERNEL = isamin_power9.S
-IDAMINKERNEL = idamin.c
-ICAMINKERNEL = icamin_power9.S
-IZAMINKERNEL = izamin.c
-#
-#ISMAXKERNEL  = ../arm/imax.c
-#IDMAXKERNEL  = ../arm/imax.c
-#
-#ISMINKERNEL  = ../arm/imin.c
-#IDMINKERNEL  = ../arm/imin.c
-#
-SASUMKERNEL  = sasum.c
-DASUMKERNEL  = dasum.c
-CASUMKERNEL  = casum.c
-ZASUMKERNEL  = zasum.c
-#
-SAXPYKERNEL  = saxpy.c
-DAXPYKERNEL  = daxpy.c
-CAXPYKERNEL  = caxpy_power9.S
-ZAXPYKERNEL  = zaxpy.c
-#
-SCOPYKERNEL  = scopy.c
-DCOPYKERNEL  = dcopy.c
-CCOPYKERNEL  = ccopy.c
-ZCOPYKERNEL  = zcopy.c
-#
-SDOTKERNEL   =  sdot.c
-DDOTKERNEL   =  ddot.c
-DSDOTKERNEL  =  sdot.c
-CDOTKERNEL   =  cdot_power9.S
-ZDOTKERNEL   =  zdot.c
-#
-SNRM2KERNEL  = ../arm/nrm2.c
-DNRM2KERNEL  = ../arm/nrm2.c
-CNRM2KERNEL  = ../arm/znrm2.c
-ZNRM2KERNEL  = ../arm/znrm2.c
-#
-SROTKERNEL   = srot.c
-DROTKERNEL   = drot.c
-CROTKERNEL   = crot.c
-ZROTKERNEL   = zrot.c
-#
-SSCALKERNEL  = sscal.c
-DSCALKERNEL  = dscal.c
-CSCALKERNEL  = zscal.c
-ZSCALKERNEL  = zscal.c
-#
-SSWAPKERNEL  = sswap.c
-DSWAPKERNEL  = dswap.c
-CSWAPKERNEL  = cswap.c
-ZSWAPKERNEL  = zswap.c
-#
-
-SGEMVNKERNEL = sgemv_n.c
-DGEMVNKERNEL = dgemv_n.c
-CGEMVNKERNEL = cgemv_n.c
-ZGEMVNKERNEL = zgemv_n_4.c
-#
-SGEMVTKERNEL = sgemv_t.c
-DGEMVTKERNEL = dgemv_t.c
-CGEMVTKERNEL = cgemv_t.c
-ZGEMVTKERNEL = zgemv_t_4.c
-
-
-#SSYMV_U_KERNEL =  ../generic/symv_k.c
-#SSYMV_L_KERNEL =  ../generic/symv_k.c
-#DSYMV_U_KERNEL =  ../generic/symv_k.c
-#DSYMV_L_KERNEL =  ../generic/symv_k.c
-#QSYMV_U_KERNEL =  ../generic/symv_k.c
-#QSYMV_L_KERNEL =  ../generic/symv_k.c
-#CSYMV_U_KERNEL =  ../generic/zsymv_k.c
-#CSYMV_L_KERNEL =  ../generic/zsymv_k.c
-#ZSYMV_U_KERNEL =  ../generic/zsymv_k.c
-#ZSYMV_L_KERNEL =  ../generic/zsymv_k.c
-#XSYMV_U_KERNEL =  ../generic/zsymv_k.c
-#XSYMV_L_KERNEL =  ../generic/zsymv_k.c
-
-#ZHEMV_U_KERNEL =  ../generic/zhemv_k.c
-#ZHEMV_L_KERNEL =  ../generic/zhemv_k.c
-
-LSAME_KERNEL = ../generic/lsame.c
-SCABS_KERNEL	= ../generic/cabs.c
-DCABS_KERNEL	= ../generic/cabs.c
-QCABS_KERNEL	= ../generic/cabs.c
-
-#Dump kernel
-CGEMM3MKERNEL    = ../generic/zgemm3mkernel_dump.c
-ZGEMM3MKERNEL    = ../generic/zgemm3mkernel_dump.c
+#SGEMM_BETA = ../generic/gemm_beta.c
+#DGEMM_BETA = ../generic/gemm_beta.c
+#CGEMM_BETA = ../generic/zgemm_beta.c
+#ZGEMM_BETA = ../generic/zgemm_beta.c
+
+STRMMKERNEL	= sgemm_kernel_power9.S
+DTRMMKERNEL	= dgemm_kernel_power9.S
+CTRMMKERNEL	= cgemm_kernel_power9.S
+ZTRMMKERNEL	= zgemm_kernel_power9.S
+
+SGEMMKERNEL    =  sgemm_kernel_power9.S
+SGEMMINCOPY    = ../generic/gemm_ncopy_16.c
+SGEMMITCOPY    = sgemm_tcopy_16_power8.S
+SGEMMONCOPY    =  ../generic/gemm_ncopy_8.c
+SGEMMOTCOPY    = sgemm_tcopy_8_power8.S 
+SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+DGEMMKERNEL    =  dgemm_kernel_power9.S
+DGEMMINCOPY    = ../generic/gemm_ncopy_16.c
+DGEMMITCOPY    =  dgemm_tcopy_16_power8.S
+DGEMMONCOPY    =  dgemm_ncopy_4_power8.S
+DGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
+DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+CGEMMKERNEL    = cgemm_kernel_power9.S
+CGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
+CGEMMITCOPY    = ../generic/zgemm_tcopy_8.c
+CGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
+CGEMMOTCOPY    = ../generic/zgemm_tcopy_4.c
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+
+ZGEMMKERNEL    = zgemm_kernel_power9.S
+ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+ZGEMMINCOPY    = ../generic/zgemm_ncopy_8.c
+ZGEMMITCOPY    = zgemm_tcopy_8_power8.S
+ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+
+STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT	= dtrsm_kernel_LT_16x4_power8.S
+DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+#Todo: CGEMM3MKERNEL should be 4x4 blocksizes.
+#CGEMM3MKERNEL    =  zgemm3m_kernel_8x4_sse3.S
+#ZGEMM3MKERNEL    =  zgemm3m_kernel_4x4_sse3.S
+
+#Pure C for other kernels
+#SAMAXKERNEL  = ../arm/amax.c
+#DAMAXKERNEL  = ../arm/amax.c
+#CAMAXKERNEL  = ../arm/zamax.c
+#ZAMAXKERNEL  = ../arm/zamax.c
+#
+#SAMINKERNEL  = ../arm/amin.c
+#DAMINKERNEL  = ../arm/amin.c
+#CAMINKERNEL  = ../arm/zamin.c
+#ZAMINKERNEL  = ../arm/zamin.c
+#
+#SMAXKERNEL   = ../arm/max.c
+#DMAXKERNEL   = ../arm/max.c
+#
+#SMINKERNEL   = ../arm/min.c
+#DMINKERNEL   = ../arm/min.c
+#
+ifneq ($(GCCVERSIONGTEQ9),1)
+ISAMAXKERNEL = isamax_power9.S
+else
+ISAMAXKERNEL = isamax.c
+endif
+IDAMAXKERNEL = idamax.c
+ifneq ($(GCCVERSIONGTEQ9),1)
+ICAMAXKERNEL = icamax_power9.S
+else
+ICAMAXKERNEL = icamax.c
+endif
+IZAMAXKERNEL = izamax.c
+#
+ifneq ($(GCCVERSIONGTEQ9),1)
+ISAMINKERNEL = isamin_power9.S
+else
+ISAMINKERNEL = isamin.c
+endif
+IDAMINKERNEL = idamin.c
+ifneq ($(GCCVERSIONGTEQ9),1)
+ICAMINKERNEL = icamin_power9.S
+else
+ICAMINKERNEL = icamin.c
+endif
+IZAMINKERNEL = izamin.c
+#
+#ISMAXKERNEL  = ../arm/imax.c
+#IDMAXKERNEL  = ../arm/imax.c
+#
+#ISMINKERNEL  = ../arm/imin.c
+#IDMINKERNEL  = ../arm/imin.c
+#
+SASUMKERNEL  = sasum.c
+DASUMKERNEL  = dasum.c
+CASUMKERNEL  = casum.c
+ZASUMKERNEL  = zasum.c
+#
+SAXPYKERNEL  = saxpy.c
+DAXPYKERNEL  = daxpy.c
+ifneq ($(GCCVERSIONGTEQ9),1)
+CAXPYKERNEL  = caxpy_power9.S
+else
+CAXPYKERNEL  = caxpy.c
+endif
+ZAXPYKERNEL  = zaxpy.c
+#
+SCOPYKERNEL  = scopy.c
+DCOPYKERNEL  = dcopy.c
+CCOPYKERNEL  = ccopy.c
+ZCOPYKERNEL  = zcopy.c
+#
+SDOTKERNEL   =  sdot.c
+DDOTKERNEL   =  ddot.c
+DSDOTKERNEL  =  sdot.c
+ifneq ($(GCCVERSIONGTEQ9),1)
+CDOTKERNEL   =  cdot_power9.S
+else
+CDOTKERNEL   =  cdot.c
+endif
+ZDOTKERNEL   =  zdot.c
+#
+SNRM2KERNEL  = ../arm/nrm2.c
+DNRM2KERNEL  = ../arm/nrm2.c
+CNRM2KERNEL  = ../arm/znrm2.c
+ZNRM2KERNEL  = ../arm/znrm2.c
+#
+SROTKERNEL   = srot.c
+DROTKERNEL   = drot.c
+CROTKERNEL   = crot.c
+ZROTKERNEL   = zrot.c
+#
+SSCALKERNEL  = sscal.c
+DSCALKERNEL  = dscal.c
+CSCALKERNEL  = zscal.c
+ZSCALKERNEL  = zscal.c
+#
+SSWAPKERNEL  = sswap.c
+DSWAPKERNEL  = dswap.c
+CSWAPKERNEL  = cswap.c
+ZSWAPKERNEL  = zswap.c
+#
+
+SGEMVNKERNEL = sgemv_n.c
+DGEMVNKERNEL = dgemv_n.c
+CGEMVNKERNEL = cgemv_n.c
+ZGEMVNKERNEL = zgemv_n_4.c
+#
+SGEMVTKERNEL = sgemv_t.c
+DGEMVTKERNEL = dgemv_t.c
+CGEMVTKERNEL = cgemv_t.c
+ZGEMVTKERNEL = zgemv_t_4.c
+
+
+#SSYMV_U_KERNEL =  ../generic/symv_k.c
+#SSYMV_L_KERNEL =  ../generic/symv_k.c
+#DSYMV_U_KERNEL =  ../generic/symv_k.c
+#DSYMV_L_KERNEL =  ../generic/symv_k.c
+#QSYMV_U_KERNEL =  ../generic/symv_k.c
+#QSYMV_L_KERNEL =  ../generic/symv_k.c
+#CSYMV_U_KERNEL =  ../generic/zsymv_k.c
+#CSYMV_L_KERNEL =  ../generic/zsymv_k.c
+#ZSYMV_U_KERNEL =  ../generic/zsymv_k.c
+#ZSYMV_L_KERNEL =  ../generic/zsymv_k.c
+#XSYMV_U_KERNEL =  ../generic/zsymv_k.c
+#XSYMV_L_KERNEL =  ../generic/zsymv_k.c
+
+#ZHEMV_U_KERNEL =  ../generic/zhemv_k.c
+#ZHEMV_L_KERNEL =  ../generic/zhemv_k.c
+
+LSAME_KERNEL = ../generic/lsame.c
+SCABS_KERNEL	= ../generic/cabs.c
+DCABS_KERNEL	= ../generic/cabs.c
+QCABS_KERNEL	= ../generic/cabs.c
+
+#Dump kernel
+CGEMM3MKERNEL    = ../generic/zgemm3mkernel_dump.c
+ZGEMM3MKERNEL    = ../generic/zgemm3mkernel_dump.c

From 2181fb7047f87f66ae1584c8af4e66e766b31b53 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 29 Nov 2019 23:54:15 +0100
Subject: [PATCH 121/210] Fix caxpy/caxpyc naming in localentry

---
 kernel/power/caxpy_power8.S | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/power/caxpy_power8.S b/kernel/power/caxpy_power8.S
index b5f841d2e..294a1d24d 100644
--- a/kernel/power/caxpy_power8.S
+++ b/kernel/power/caxpy_power8.S
@@ -16,7 +16,11 @@
 0:	addis 2,12,.TOC.-.LCF0@ha
 	addi 2,2,.TOC.-.LCF0@l
 #if _CALL_ELF ==2
+#ifdef CONJ
+	.localentry	caxpyc_k,.-caxpyc_k
+#else
 	.localentry	caxpy_k,.-caxpy_k
+#endif
 #endif
 	mr. 7,3
 	ble 0,.L33
@@ -517,7 +521,11 @@
 	.long 0
 	.byte 0,0,0,0,0,4,0,0
 #if _CALL_ELF ==2
+#ifdef CONJ
+	.size	caxpyc_k,.-caxpyc_k
+#else
 	.size	caxpy_k,.-caxpy_k
+#endif
 #endif
 	.section	.rodata
 	.align 4

From dedd822d1aeb2315e44e47e97167ae8a02c9c9ff Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 29 Nov 2019 23:56:57 +0100
Subject: [PATCH 122/210] Fix caxpy/caxpyc naming in localentry

---
 kernel/power/caxpy_power9.S | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/kernel/power/caxpy_power9.S b/kernel/power/caxpy_power9.S
index 48e6e5ba3..844cacd50 100644
--- a/kernel/power/caxpy_power9.S
+++ b/kernel/power/caxpy_power9.S
@@ -17,7 +17,11 @@ caxpy_k:
 .LCF0:
 0:	addis 2,12,.TOC.-.LCF0@ha
 	addi 2,2,.TOC.-.LCF0@l
+#ifdef CONJ
+	.localentry	caxpyc_k,.-caxpyc_k
+#else
 	.localentry	caxpy_k,.-caxpy_k
+#endif
 	mr. 7,3
 	ble 0,.L33
 	cmpdi 7,9,1
@@ -474,7 +478,11 @@ caxpy_k:
 	b .L13
 	.long 0
 	.byte 0,0,0,0,0,1,0,0
+#ifdef CONJ
+	.size	caxpyc_k,.-caxpyc_k
+#else
 	.size	caxpy_k,.-caxpy_k
+#endif
 	.section	.rodata
 	.align 4
 	.set	.LANCHOR0,. + 0

From b863b32ac5598e96b76d5783ae3a96c2b58e1712 Mon Sep 17 00:00:00 2001
From: Isuru Fernando <isuruf@gmail.com>
Date: Sun, 1 Dec 2019 11:55:49 -0600
Subject: [PATCH 123/210] Workaround an ICE in clang 9.0.0

This bug is not there in 8.x nor in the 9.0 daily snapshot.
---
 kernel/x86_64/dsymv_L_microk_skylakex-2.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/kernel/x86_64/dsymv_L_microk_skylakex-2.c b/kernel/x86_64/dsymv_L_microk_skylakex-2.c
index 8244dffa1..bdcd914fb 100644
--- a/kernel/x86_64/dsymv_L_microk_skylakex-2.c
+++ b/kernel/x86_64/dsymv_L_microk_skylakex-2.c
@@ -33,6 +33,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define HAVE_KERNEL_4x4 1
 
+#if defined(__clang_patchlevel__) && __clang_major__ == 9 && __clang_minor__ == 0 && __clang_patchlevel__ == 0
+#pragma clang optimize off
+#endif
+
 static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FLOAT *y, FLOAT *temp1, FLOAT *temp2)
 {
 
@@ -155,7 +159,12 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 	temp2[1] += half_accum1[0];
 	temp2[2] += half_accum2[0];
 	temp2[3] += half_accum3[0];
-} 
+}
+
+#if defined(__clang_patchlevel__) && __clang_major__ == 9 && __clang_minor__ == 0 && __clang_patchlevel__ == 0
+#pragma clang optimize on
+#endif
+
 #else
 #include "dsymv_L_microk_haswell-2.c"
-#endif
\ No newline at end of file
+#endif

From 715f4650d9874badfede90e4bd09451ac8ea1886 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 3 Dec 2019 08:24:10 +0100
Subject: [PATCH 124/210] Delete stray copy of dynamic.c from PR 2228

---
 dynamic.c | 897 ------------------------------------------------------
 1 file changed, 897 deletions(-)
 delete mode 100644 dynamic.c

diff --git a/dynamic.c b/dynamic.c
deleted file mode 100644
index aa2b87621..000000000
--- a/dynamic.c
+++ /dev/null
@@ -1,897 +0,0 @@
-/*********************************************************************/
-/* Copyright 2009, 2010 The University of Texas at Austin.           */
-/* All rights reserved.                                              */
-/*                                                                   */
-/* Redistribution and use in source and binary forms, with or        */
-/* without modification, are permitted provided that the following   */
-/* conditions are met:                                               */
-/*                                                                   */
-/*   1. Redistributions of source code must retain the above         */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer.                                                  */
-/*                                                                   */
-/*   2. Redistributions in binary form must reproduce the above      */
-/*      copyright notice, this list of conditions and the following  */
-/*      disclaimer in the documentation and/or other materials       */
-/*      provided with the distribution.                              */
-/*                                                                   */
-/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
-/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
-/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
-/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
-/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
-/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
-/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
-/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
-/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
-/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
-/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
-/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
-/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
-/*    POSSIBILITY OF SUCH DAMAGE.                                    */
-/*                                                                   */
-/* The views and conclusions contained in the software and           */
-/* documentation are those of the authors and should not be          */
-/* interpreted as representing official policies, either expressed   */
-/* or implied, of The University of Texas at Austin.                 */
-/*********************************************************************/
-
-#include "common.h"
-
-#ifdef _MSC_VER
-#define strncasecmp _strnicmp
-#define strcasecmp _stricmp
-#endif
-
-#ifdef ARCH_X86
-#define EXTERN extern
-#else
-#define EXTERN
-#endif
-
-#ifdef DYNAMIC_LIST
-extern gotoblas_t gotoblas_PRESCOTT;
-
-#ifdef DYN_ATHLON
-extern gotoblas_t gotoblas_ATHLON;
-#else
-#define gotoblas_ATHLON gotoblas_PRESCOTT
-#endif
-#ifdef DYN_KATMAI
-extern gotoblas_t gotoblas_KATMAI;
-#else
-#define gotoblas_KATMAI gotoblas_PRESCOTT
-#endif
-#ifdef DYN_BANIAS
-extern gotoblas_t gotoblas_BANIAS;
-#else
-#define gotoblas_BANIAS gotoblas_PRESCOTT
-#endif
-#ifdef DYN_COPPERMINE
-extern gotoblas_t gotoblas_COPPERMINE;
-#else
-#define gotoblas_COPPERMINE gotoblas_PRESCOTT
-#endif
-#ifdef DYN_NORTHWOOD
-extern gotoblas_t gotoblas_NORTHWOOD;
-#else
-#define gotoblas_NORTHWOOD gotoblas_PRESCOTT
-#endif
-#ifdef DYN_CORE2
-extern gotoblas_t gotoblas_CORE2;
-#else
-#define gotoblas_CORE2 gotoblas_PRESCOTT
-#endif
-#ifdef DYN_NEHALEM
-extern gotoblas_t gotoblas_NEHALEM;
-#else
-#define gotoblas_NEHALEM gotoblas_PRESCOTT
-#endif
-#ifdef DYN_BARCELONA
-extern gotoblas_t gotoblas_BARCELONA;
-#elif defined(DYN_NEHALEM)
-#define gotoblas_BARCELONA gotoblas_NEHALEM
-#else
-#define gotoblas_BARCELONA gotoblas_PRESCOTT
-#endif
-#ifdef DYN_ATOM
-extern gotoblas_t gotoblas_ATOM;
-elif defined(DYN_NEHALEM)
-#define gotoblas_ATOM gotoblas_NEHALEM
-#else
-#define gotoblas_ATOM gotoblas_PRESCOTT
-#endif
-#ifdef DYN_NANO
-extern gotoblas_t gotoblas_NANO;
-#else
-#define gotoblas_NANO gotoblas_PRESCOTT
-#endif
-#ifdef DYN_PENRYN
-extern gotoblas_t gotoblas_PENRYN;
-#else
-#define gotoblas_PENRYN gotoblas_PRESCOTT
-#endif
-#ifdef DYN_DUNNINGTON
-extern gotoblas_t gotoblas_DUNNINGTON;
-#else
-#define gotoblas_DUNNINGTON gotoblas_PRESCOTT
-#endif
-#ifdef DYN_OPTERON
-extern gotoblas_t gotoblas_OPTERON;
-#else
-#define gotoblas_OPTERON gotoblas_PRESCOTT
-#endif
-#ifdef DYN_OPTERON_SSE3
-extern gotoblas_t gotoblas_OPTERON_SSE3;
-#else
-#define gotoblas_OPTERON_SSE3 gotoblas_PRESCOTT
-#endif
-#ifdef DYN_BOBCAT
-extern gotoblas_t gotoblas_BOBCAT;
-#elif defined(DYN_NEHALEM)
-#define gotoblas_BOBCAT gotoblas_NEHALEM
-#else
-#define gotoblas_BOBCAT gotoblas_PRESCOTT
-#endif
-#ifdef DYN_SANDYBRIDGE
-extern gotoblas_t gotoblas_SANDYBRIDGE;
-#elif defined(DYN_NEHALEM)
-#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
-#else
-#define gotoblas_SANDYBRIDGE gotoblas_PRESCOTT
-#endif
-#ifdef DYN_BULLDOZER
-extern gotoblas_t gotoblas_BULLDOZER;
-#elif defined(DYN_SANDYBRIDGE)
-#define gotoblas_BULLDOZER gotoblas_SANDYBRIDGE
-#elif defined(DYN_NEHALEM)
-#define gotoblas_BULLDOZER gotoblas_NEHALEM
-#else
-#define gotoblas_BULLDOZER gotoblas_PRESCOTT
-#endif
-#ifdef DYN_PILEDRIVER
-extern gotoblas_t gotoblas_PILEDRIVER;
-#elif defined(DYN_SANDYBRIDGE)
-#define gotoblas_PILEDRIVER gotoblas_SANDYBRIDGE
-#elif defined(DYN_NEHALEM)
-#define gotoblas_PILEDRIVER gotoblas_NEHALEM
-#else
-#define gotoblas_PILEDRIVER gotoblas_PRESCOTT
-#endif
-#ifdef DYN_STEAMROLLER
-extern gotoblas_t gotoblas_STEAMROLLER;
-#elif defined(DYN_SANDYBRIDGE)
-#define gotoblas_STEAMROLLER gotoblas_SANDYBRIDGE
-#elif defined(DYN_NEHALEM)
-#define gotoblas_STEAMROLLER gotoblas_NEHALEM
-#else
-#define gotoblas_STEAMROLLER gotoblas_PRESCOTT
-#endif
-#ifdef DYN_EXCAVATOR
-extern gotoblas_t gotoblas_EXCAVATOR;
-#elif defined(DYN_SANDYBRIDGE)
-#define gotoblas_EXCAVATOR gotoblas_SANDYBRIDGE
-#elif defined(DYN_NEHALEM)
-#define gotoblas_EXCAVATOR gotoblas_NEHALEM
-#else
-#define gotoblas_EXCAVATOR gotoblas_PRESCOTT
-#endif
-#ifdef DYN_HASWELL
-extern gotoblas_t gotoblas_HASWELL;
-#elif defined(DYN_SANDYBRIDGE)
-#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
-#elif defined(DYN_NEHALEM)
-#define gotoblas_HASWELL gotoblas_NEHALEM
-#else
-#define gotoblas_HASWELL gotoblas_PRESCOTT
-#endif
-#ifdef DYN_ZEN
-extern gotoblas_t gotoblas_ZEN;
-#elif defined(DYN_HASWELL)
-#define gotoblas_ZEN gotoblas_HASWELL
-#elif defined(DYN_SANDYBRIDGE)
-#define gotoblas_ZEN gotoblas_SANDYBRIDGE
-#elif defined(DYN_NEHALEM)
-#define gotoblas_ZEN gotoblas_NEHALEM
-#else
-#define gotoblas_ZEN gotoblas_PRESCOTT
-#endif
-#ifdef DYN_SKYLAKEX
-extern gotoblas_t gotoblas_SKYLAKEX;
-#elif defined(DYN_HASWELL)
-#define gotoblas_SKYLAKEX gotoblas_HASWELL
-#elif defined(DYN_SANDYBRIDGE)
-#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
-#elif defined(DYN_NEHALEM)
-#define gotoblas_SKYLAKEX gotoblas_NEHALEM
-#else
-#define gotoblas_SKYLAKEX gotoblas_PRESCOTT
-#endif
-
-
-#else // not DYNAMIC_LIST
-EXTERN gotoblas_t  gotoblas_KATMAI;
-EXTERN gotoblas_t  gotoblas_COPPERMINE;
-EXTERN gotoblas_t  gotoblas_NORTHWOOD;
-EXTERN gotoblas_t  gotoblas_BANIAS;
-EXTERN gotoblas_t  gotoblas_ATHLON;
-
-extern gotoblas_t  gotoblas_PRESCOTT;
-extern gotoblas_t  gotoblas_CORE2;
-extern gotoblas_t  gotoblas_NEHALEM;
-extern gotoblas_t  gotoblas_BARCELONA;
-#ifdef DYNAMIC_OLDER
-extern gotoblas_t  gotoblas_ATOM;
-extern gotoblas_t  gotoblas_NANO;
-extern gotoblas_t  gotoblas_PENRYN;
-extern gotoblas_t  gotoblas_DUNNINGTON;
-extern gotoblas_t  gotoblas_OPTERON;
-extern gotoblas_t  gotoblas_OPTERON_SSE3;
-extern gotoblas_t  gotoblas_BOBCAT;
-#else
-#define gotoblas_ATOM gotoblas_NEHALEM
-#define gotoblas_NANO gotoblas_NEHALEM
-#define gotoblas_PENRYN gotoblas_CORE2
-#define gotoblas_DUNNINGTON gotoblas_CORE2
-#define gotoblas_OPTERON gotoblas_CORE2
-#define gotoblas_OPTERON_SSE3 gotoblas_CORE2
-#define gotoblas_BOBCAT gotoblas_CORE2
-#endif
-
-#ifndef NO_AVX
-extern gotoblas_t  gotoblas_SANDYBRIDGE;
-extern gotoblas_t  gotoblas_BULLDOZER;
-extern gotoblas_t  gotoblas_PILEDRIVER;
-extern gotoblas_t  gotoblas_STEAMROLLER;
-extern gotoblas_t  gotoblas_EXCAVATOR;
-#ifdef NO_AVX2
-#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
-#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
-#define gotoblas_ZEN gotoblas_SANDYBRIDGE
-#else
-extern gotoblas_t  gotoblas_HASWELL;
-extern gotoblas_t  gotoblas_ZEN;
-#ifndef NO_AVX512
-extern gotoblas_t  gotoblas_SKYLAKEX;
-#else
-#define gotoblas_SKYLAKEX gotoblas_HASWELL
-#endif
-#endif
-#else
-//Use NEHALEM kernels for sandy bridge
-#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
-#define gotoblas_HASWELL gotoblas_NEHALEM
-#define gotoblas_SKYLAKEX gotoblas_NEHALEM
-#define gotoblas_BULLDOZER gotoblas_BARCELONA
-#define gotoblas_PILEDRIVER gotoblas_BARCELONA
-#define gotoblas_STEAMROLLER gotoblas_BARCELONA
-#define gotoblas_EXCAVATOR gotoblas_BARCELONA
-#define gotoblas_ZEN gotoblas_BARCELONA
-#endif
-
-#endif // DYNAMIC_LIST
-
-#define VENDOR_INTEL      1
-#define VENDOR_AMD        2
-#define VENDOR_CENTAUR    3
-#define VENDOR_HYGON	  4
-#define VENDOR_UNKNOWN   99
-
-#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
-
-#ifndef NO_AVX
-static inline void xgetbv(int op, int * eax, int * edx){
-  //Use binary code for xgetbv
-  __asm__ __volatile__
-    (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc");
-}
-#endif
-
-int support_avx(){
-#ifndef NO_AVX
-  int eax, ebx, ecx, edx;
-  int ret=0;
-
-  cpuid(1, &eax, &ebx, &ecx, &edx);
-  if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
-    xgetbv(0, &eax, &edx);
-    if((eax & 6) == 6){
-      ret=1;  //OS support AVX
-    }
-  }
-  return ret;
-#else
-  return 0;
-#endif
-}
-
-int support_avx2(){
-#ifndef NO_AVX2
-  int eax, ebx, ecx=0, edx;
-  int ret=0;
-
-  if (!support_avx())
-    return 0;
-  cpuid(7, &eax, &ebx, &ecx, &edx);
-  if((ebx & (1<<7)) != 0)
-      ret=1;  //OS supports AVX2
-  return ret;
-#else
-  return 0;
-#endif
-}
-
-int support_avx512(){
-#if !defined(NO_AVX) && !defined(NO_AVX512)
-  int eax, ebx, ecx, edx;
-  int ret=0;
-
-  if (!support_avx())
-    return 0;
-  cpuid(7, &eax, &ebx, &ecx, &edx);
-  if((ebx & (1<<7)) != 1){
-      ret=0;  //OS does not even support AVX2
-  }
-  if((ebx & (1<<31)) != 0){
-    xgetbv(0, &eax, &edx);
-    if((eax & 0xe0) == 0xe0)
-      ret=1;  //OS supports AVX512VL
-  }
-  return ret;
-#else
-  return 0;
-#endif
-}
-
-extern void openblas_warning(int verbose, const char * msg);
-#define FALLBACK_VERBOSE 1
-#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
-#define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n"
-#define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n"
-#define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"
-
-static int get_vendor(void){
-  int eax, ebx, ecx, edx;
-
-  union
-  {
-        char vchar[16];
-        int  vint[4];
-  } vendor;
-
-  cpuid(0, &eax, &ebx, &ecx, &edx);
-
-  *(&vendor.vint[0]) = ebx;
-  *(&vendor.vint[1]) = edx;
-  *(&vendor.vint[2]) = ecx;
-
-  vendor.vchar[12] = '\0';
-
-  if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL;
-  if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD;
-  if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR;
-  if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON;
-
-  if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL;
-
-  return VENDOR_UNKNOWN;
-}
-
-static gotoblas_t *get_coretype(void){
-
-  int eax, ebx, ecx, edx;
-  int family, exfamily, model, vendor, exmodel;
-
-  cpuid(1, &eax, &ebx, &ecx, &edx);
-
-  family   = BITMASK(eax,  8, 0x0f);
-  exfamily = BITMASK(eax, 20, 0xff);
-  model    = BITMASK(eax,  4, 0x0f);
-  exmodel  = BITMASK(eax, 16, 0x0f);
-
-  vendor = get_vendor();
-
-  if (vendor == VENDOR_INTEL){
-    switch (family) {
-    case 0x6:
-      switch (exmodel) {
-      case 0:
-	if (model <= 0x7) return &gotoblas_KATMAI;
-	if ((model == 0x8) || (model == 0xa) || (model == 0xb)) return &gotoblas_COPPERMINE;
-	if ((model == 0x9) || (model == 0xd)) return &gotoblas_BANIAS;
-	if (model == 14) return &gotoblas_BANIAS;
-	if (model == 15) return &gotoblas_CORE2;
-	return NULL;
-
-      case 1:
-	if (model == 6) return &gotoblas_CORE2;
-	if (model == 7) return &gotoblas_PENRYN;
-	if (model == 13) return &gotoblas_DUNNINGTON;
-	if ((model == 10) || (model == 11) || (model == 14) || (model == 15)) return &gotoblas_NEHALEM;
-	if (model == 12) return &gotoblas_ATOM;
-	return NULL;
-
-      case 2:
-	//Intel Core (Clarkdale) / Core (Arrandale)
-	// Pentium (Clarkdale) / Pentium Mobile (Arrandale)
-	// Xeon (Clarkdale), 32nm
-	if (model ==  5) return &gotoblas_NEHALEM;
-
-	//Intel Xeon Processor 5600 (Westmere-EP)
-	//Xeon Processor E7 (Westmere-EX)
-	//Xeon E7540
-	if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM;
-
-	//Intel Core i5-2000 /i7-2000 (Sandy Bridge)
-	//Intel Core i7-3000 / Xeon E5
-	if (model == 10 || model == 13) {
-	  if(support_avx())
-	    return &gotoblas_SANDYBRIDGE;
-	  else{
-	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
-	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
-	  }
-	}
-	return NULL;
-      case 3:
-	//Intel Sandy Bridge 22nm (Ivy Bridge?)
-	if (model == 10 || model == 14) {
-	  if(support_avx())
-	    return &gotoblas_SANDYBRIDGE;
-	  else{
-	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
-	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
-	  }
-	}
-	//Intel Haswell
-	if (model == 12 || model == 15) {
-	  if(support_avx2())
-	    return &gotoblas_HASWELL;
-	  if(support_avx()) {
-	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
-	    return &gotoblas_SANDYBRIDGE;
-	  } else {
-	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
-	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
-	  }
-	}
-	//Intel Broadwell
-	if (model == 13) {
-	  if(support_avx2())
-	    return &gotoblas_HASWELL;
-	  if(support_avx()) {
-	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
-	    return &gotoblas_SANDYBRIDGE;
-	  } else {
-	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
-	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
-	  }
-	}
-	if (model == 7) return &gotoblas_ATOM; //Bay Trail	
-	return NULL;
-      case 4:
-		//Intel Haswell
-	if (model == 5 || model == 6) {
-	  if(support_avx2())
-	    return &gotoblas_HASWELL;
-	  if(support_avx()) {
-	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
-	    return &gotoblas_SANDYBRIDGE;
-	  } else {
-	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
-	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
-	  }
-	}
-	//Intel Broadwell
-	if (model == 7 || model == 15) {
-	  if(support_avx2())
-	    return &gotoblas_HASWELL;
-	  if(support_avx()) {
-	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
-	    return &gotoblas_SANDYBRIDGE;
-	  } else {
-	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
-	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
-	  }
-	}
-	//Intel Skylake
-	if (model == 14) {
-	  if(support_avx2())
-	    return &gotoblas_HASWELL;
-	  if(support_avx()) {
-	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
-	    return &gotoblas_SANDYBRIDGE;
-	  } else {
-	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
-	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
-	  }
-	}
-	//Intel Braswell / Avoton
-	if (model == 12 || model == 13) { 
-	  return &gotoblas_NEHALEM;
-	}	
-	return NULL;
-      case 5:
-	//Intel Broadwell
-	if (model == 6) {
-	  if(support_avx2())
-	    return &gotoblas_HASWELL;
-	  if(support_avx()) {
-	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
-	    return &gotoblas_SANDYBRIDGE;
-	  } else {
-	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
-	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
-	  }
-	}
-	if (model == 5) {	
-	// Intel Skylake X
-          if (support_avx512()) 
-	    return &gotoblas_SKYLAKEX;
-	  if(support_avx2()){
-	    openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
-	    return &gotoblas_HASWELL;
-          }
-	  if(support_avx()) {
-	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
-	    return &gotoblas_SANDYBRIDGE;
-	  } else {
-          openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
-          return &gotoblas_NEHALEM;
-          }
-	}
-	//Intel Skylake
-	if (model == 14) {
-	  if(support_avx2())
-	    return &gotoblas_HASWELL;
-	  if(support_avx()) {
-	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
-	    return &gotoblas_SANDYBRIDGE;
-	  } else {
-	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
-	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
-	  }
-	}
-	//Intel Phi Knights Landing
-	if (model == 7) {
-	  if(support_avx2()){
-	    openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
-	    return &gotoblas_HASWELL;
-	  }  
-	  if(support_avx()) {
-	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
-	    return &gotoblas_SANDYBRIDGE;
-	  } else {
-	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
-	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
-	  }
-	}
-	//Apollo Lake or Denverton
-	if (model == 12 || model == 15) { 
-	  return &gotoblas_NEHALEM;
-	}	
-	return NULL;
-      case 6:
-        if (model == 6) {
-          // Cannon Lake
-	  if(support_avx2())
-	    return &gotoblas_HASWELL;
-	  if(support_avx()) {
-	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
-	    return &gotoblas_SANDYBRIDGE;
-	  } else {
-	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
-	    return &gotoblas_NEHALEM;
-	  }
-        }
-        return NULL;  
-      case 7:
-        if (model == 10) // Goldmont plus 
-           return &gotoblas_NEHALEM;
-        if (model == 14) {
-	// Ice Lake
-          if (support_avx512()) 
-	    return &gotoblas_SKYLAKEX;
-	  if(support_avx2()){
-	    openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK);
-	    return &gotoblas_HASWELL;
-          }
-	  if(support_avx()) {
-	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
-	    return &gotoblas_SANDYBRIDGE;
-	  } else {
-          openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
-          return &gotoblas_NEHALEM;
-          }
-        }
-        return NULL;  
-      case 9:
-      case 8:
-	if (model == 14 ) { // Kaby Lake, Coffee Lake
-	  if(support_avx2())
-	    return &gotoblas_HASWELL;
-	  if(support_avx()) {
-	    openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK);
-	    return &gotoblas_SANDYBRIDGE;
-	  } else {
-	    openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK);
-	    return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
-	  }
-	}
-	return NULL;
-      }
-      case 0xf:
-      if (model <= 0x2) return &gotoblas_NORTHWOOD;
-      return &gotoblas_PRESCOTT;
-    }
-  }
-
-  if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){
-    if (family <= 0xe) {
-        // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon
-        cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
-        if ( (eax & 0xffff)  >= 0x01) {
-            cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
-            if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0)
-              return NULL;
-          }
-        else
-          return NULL;
-
-        return &gotoblas_ATHLON;
-      }
-    if (family == 0xf){
-      if ((exfamily == 0) || (exfamily == 2)) {
-	if (ecx & (1 <<  0)) return &gotoblas_OPTERON_SSE3;
-	else return &gotoblas_OPTERON;
-      }  else if (exfamily == 5) {
-	return &gotoblas_BOBCAT;
-      } else if (exfamily == 6) {
-	if(model == 1){
-	  //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
-	  if(support_avx())
-	    return &gotoblas_BULLDOZER;
-	  else{
-	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
-	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
-	  }
-	}else if(model == 2 || model == 3){
-	  //AMD Bulldozer Opteron 6300 / Opteron 4300 / Opteron 3300
-	  if(support_avx())
-	    return &gotoblas_PILEDRIVER;
-	  else{
-	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
-	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
-	  }
-	}else if(model == 5){
-	  if(support_avx())
-	    return &gotoblas_EXCAVATOR;
-	  else{
-	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
-	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
-	  }
-	}else if(model == 0 || model == 8){
-	  if (exmodel == 1) {
-	    //AMD Trinity
-	    if(support_avx())
-	      return &gotoblas_PILEDRIVER;
-	    else{
-	      openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
-	      return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
-	    }
-	   }else if (exmodel == 3) {
-	    //AMD STEAMROLLER
-	    if(support_avx())
-	      return &gotoblas_STEAMROLLER;
-	    else{
-	      openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
-	      return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
-	    }
-	  }else if (exmodel == 6) {
-	    if(support_avx())
-	      return &gotoblas_EXCAVATOR;
-	    else{
-	      openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
-	      return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
-	    }
-
-	  }
-	}
-      } else if (exfamily == 8) {
-	if (model == 1 || model == 8) {
-	  if(support_avx())
-	    return &gotoblas_ZEN;
-	  else{
-	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
-	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
-	  }
-	}
-      } else if (exfamily == 9) {
-	  if(support_avx())
-	    return &gotoblas_ZEN;
-	  else{
-	    openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK);
-	    return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
-        }
-      }else {
-	return &gotoblas_BARCELONA;
-      }
-    }
-  }
-
-  if (vendor == VENDOR_CENTAUR) {
-    switch (family) {
-    case 0x6:
-      return &gotoblas_NANO;
-    }
-  }
-
-  return NULL;
-}
-
-static char *corename[] = {
-    "Unknown",
-    "Katmai",
-    "Coppermine",
-    "Northwood",
-    "Prescott",
-    "Banias",
-    "Atom",
-    "Core2",
-    "Penryn",
-    "Dunnington",
-    "Nehalem",
-    "Athlon",
-    "Opteron",
-    "Opteron_SSE3",
-    "Barcelona",
-    "Nano",
-    "Sandybridge",
-    "Bobcat",
-    "Bulldozer",
-    "Piledriver",
-    "Haswell",
-    "Steamroller",
-    "Excavator",
-    "Zen",
-    "SkylakeX"	
-};
-
-char *gotoblas_corename(void) {
-
-  if (gotoblas == &gotoblas_KATMAI)       return corename[ 1];
-  if (gotoblas == &gotoblas_COPPERMINE)   return corename[ 2];
-  if (gotoblas == &gotoblas_NORTHWOOD)    return corename[ 3];
-  if (gotoblas == &gotoblas_PRESCOTT)     return corename[ 4];
-  if (gotoblas == &gotoblas_BANIAS)       return corename[ 5];
-  if (gotoblas == &gotoblas_ATOM)         return corename[ 6];
-  if (gotoblas == &gotoblas_CORE2)        return corename[ 7];
-  if (gotoblas == &gotoblas_PENRYN)       return corename[ 8];
-  if (gotoblas == &gotoblas_DUNNINGTON)   return corename[ 9];
-  if (gotoblas == &gotoblas_NEHALEM)      return corename[10];
-  if (gotoblas == &gotoblas_ATHLON)       return corename[11];
-  if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12];
-  if (gotoblas == &gotoblas_OPTERON)      return corename[13];
-  if (gotoblas == &gotoblas_BARCELONA)    return corename[14];
-  if (gotoblas == &gotoblas_NANO)         return corename[15];
-  if (gotoblas == &gotoblas_SANDYBRIDGE)  return corename[16];
-  if (gotoblas == &gotoblas_BOBCAT)       return corename[17];
-  if (gotoblas == &gotoblas_BULLDOZER)    return corename[18];
-  if (gotoblas == &gotoblas_PILEDRIVER)   return corename[19];
-  if (gotoblas == &gotoblas_HASWELL)      return corename[20];
-  if (gotoblas == &gotoblas_STEAMROLLER)  return corename[21];
-  if (gotoblas == &gotoblas_EXCAVATOR)    return corename[22];
-  if (gotoblas == &gotoblas_ZEN)          return corename[23];
-  if (gotoblas == &gotoblas_SKYLAKEX)     return corename[24];
-  return corename[0];
-}
-
-
-static gotoblas_t *force_coretype(char *coretype){
-
-	int i ;
-	int found = -1;
-	char message[128];
-	//char mname[20];
-
-	for ( i=1 ; i <= 24; i++)
-	{
-		if (!strncasecmp(coretype,corename[i],20))
-		{
-			found = i;
-			break;
-		}
-	}
-	if (found < 0)
-	{
-	        //strncpy(mname,coretype,20);
-	        snprintf(message, 128, "Core not found: %s\n",coretype);
-    		openblas_warning(1, message);
-		return(NULL);
-	}
-
-	switch (found)
-	{
-		case 24: return (&gotoblas_SKYLAKEX);	
-		case 23: return (&gotoblas_ZEN);
-		case 22: return (&gotoblas_EXCAVATOR);
-		case 21: return (&gotoblas_STEAMROLLER);
-		case 20: return (&gotoblas_HASWELL);
-		case 19: return (&gotoblas_PILEDRIVER);
-		case 18: return (&gotoblas_BULLDOZER);
-		case 17: return (&gotoblas_BOBCAT);
-		case 16: return (&gotoblas_SANDYBRIDGE);
-		case 15: return (&gotoblas_NANO);
-		case 14: return (&gotoblas_BARCELONA);
-		case 13: return (&gotoblas_OPTERON);
-		case 12: return (&gotoblas_OPTERON_SSE3);
-		case 11: return (&gotoblas_ATHLON);
-		case 10: return (&gotoblas_NEHALEM);
-		case  9: return (&gotoblas_DUNNINGTON);
-		case  8: return (&gotoblas_PENRYN);
-		case  7: return (&gotoblas_CORE2);
-		case  6: return (&gotoblas_ATOM);
-		case  5: return (&gotoblas_BANIAS);
-		case  4: return (&gotoblas_PRESCOTT);
-		case  3: return (&gotoblas_NORTHWOOD);
-		case  2: return (&gotoblas_COPPERMINE);
-		case  1: return (&gotoblas_KATMAI);
-	}
-	return(NULL);
-
-}
-
-
-
-
-void gotoblas_dynamic_init(void) {
-
-  char coremsg[128];
-  char coren[22];
-  char *p;
-
-
-  if (gotoblas) return;
-
-  p = getenv("OPENBLAS_CORETYPE");
-  if ( p )
-  {
-	gotoblas = force_coretype(p);
-  }
-  else
-  {
-  	gotoblas = get_coretype();
-  }
-
-#ifdef ARCH_X86
-  if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI;
-#else
-  if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT;
-  /* sanity check, if 64bit pointer we can't have a 32 bit cpu */
-  if (sizeof(void*) == 8) {
-      if (gotoblas == &gotoblas_KATMAI ||
-          gotoblas == &gotoblas_COPPERMINE ||
-          gotoblas == &gotoblas_NORTHWOOD ||
-          gotoblas == &gotoblas_BANIAS ||
-          gotoblas == &gotoblas_ATHLON)
-          gotoblas = &gotoblas_PRESCOTT;
-  }
-#endif
-
-  if (gotoblas && gotoblas -> init) {
-    strncpy(coren,gotoblas_corename(),20);
-    sprintf(coremsg, "Core: %s\n",coren);
-    openblas_warning(2, coremsg);
-    gotoblas -> init();
-  } else {
-    openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
-    exit(1);
-  }
-
-}
-
-void gotoblas_dynamic_quit(void) {
-
-  gotoblas = NULL;
-
-}

From 3518617f5b7118286db9ab86a85cc078c00d6046 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 3 Dec 2019 08:32:29 +0100
Subject: [PATCH 125/210] Add Intel Goldmont+ cpuid

was originally in #2228 but that PR had misplaced the file in the toplevel directory
---
 driver/others/dynamic.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c
index a4ff0e086..2e87e186a 100644
--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@@ -586,6 +586,8 @@ static gotoblas_t *get_coretype(void){
         }
         return NULL;  
       case 7:
+	if (model == 10) // Goldmont Plus
+	   return &gotoblas_NEHALEM;
         if (model == 14) {
 	// Ice Lake
           if (support_avx512()) 

From 3938e59569cd44634dce06c832f0db12968cd7fc Mon Sep 17 00:00:00 2001
From: Kavana Bhat <Kavana.bhat@in.ibm.com>
Date: Wed, 4 Dec 2019 00:23:46 -0600
Subject: [PATCH 126/210] AIX changes for Power8

---
 kernel/Makefile.L3 | 193 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 180 insertions(+), 13 deletions(-)

diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index ed8ae406f..4decfbd20 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -1,4 +1,5 @@
 USE_GEMM3M = 0
+OS := $(shell uname)
 
 ifeq ($(ARCH), x86)
 USE_GEMM3M = 1
@@ -434,10 +435,15 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
 
 $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s	
 	m4 sgemmotcopy.s > sgemmotcopy_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@
 	rm sgemmotcopy.s sgemmotcopy_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
+endif	
+
 
 ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
 
@@ -445,17 +451,26 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY)
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
 
 $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s	
 	m4 sgemmitcopy.s > sgemmitcopy_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@
 	rm sgemmitcopy.s sgemmitcopy_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ 
+endif
+	
 endif
 
 $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s
 	m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@
 	rm dgemm_ncopy.s dgemm_ncopy_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
+endif
 
 $(KDIR)$(DGEMMOTCOPYOBJ) : $(KERNELDIR)/$(DGEMMOTCOPY)
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
@@ -466,10 +481,14 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY)
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
 
 $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s
 	m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@
 	rm dgemm_itcopy.s dgemm_itcopy_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
+endif
 
 endif
 
@@ -494,16 +513,10 @@ endif
 endif
 
 $(KDIR)$(CGEMMONCOPYOBJ) : $(KERNELDIR)/$(CGEMMONCOPY)
-#	$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o cgemm_oncopy.s
-#	m4 cgemm_oncopy.s > cgemm_oncopy_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-#	rm cgemm_oncopy.s cgemm_oncopy_nomacros.s
 
 $(KDIR)$(CGEMMOTCOPYOBJ) : $(KERNELDIR)/$(CGEMMOTCOPY)
-#	$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX  $< -o cgemm_otcopy.s
-#	m4 cgemm_otcopy.s > cgemm_otcopy_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
-#	rm cgemm_otcopy.s cgemm_otcopy_nomacros.s
 
 ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
 
@@ -511,10 +524,14 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY)
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
 
 $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s
 	m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@
 	rm cgemm_itcopy.s cgemm_itcopy_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
+endif
 
 endif
 
@@ -530,10 +547,14 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY)
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
 
 $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s
 	m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@
 	rm zgemm_itcopy.s zgemm_itcopy_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
+endif
 
 endif
 
@@ -558,67 +579,107 @@ endif
 endif
 
 $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX  $< -o sgemm_kernel$(TSUFFIX).s
 	m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@
 	rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
+endif
 
 $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s
 	m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@
 	rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
+endif
 
 $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND)
 	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@
 
 $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s
 	m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@
 	rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
+endif
 
 $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s
 	m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@
 	rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
+endif
 
 $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC  $< -o cgemm_kernel_r.s
 	m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
 	rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
+endif
 
 $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s
 	m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s
 	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@
 	rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
+endif
 
 $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s
 	m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@
 	rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@
+endif
 
 $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s
 	m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@
 	rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@
+endif
 
 $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s
 	m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@
 	rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@
+endif
 
 $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s
 	m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s
 	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@
 	rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@
+endif
 
 $(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND)
 	$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@
@@ -635,56 +696,84 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD
 
 ifdef USE_TRMM
 $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s	
 	m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@
 	rm strmmkernel_ln.s strmmkernel_ln_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+endif
 
 $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s	
 	m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@
 	rm strmmkernel_lt.s strmmkernel_lt_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+endif
 
 $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s	
 	m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@
 	rm strmmkernel_rn.s strmmkernel_rn_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+endif
 
 $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s	
 	m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
 	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+endif
 
 $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s
-#	$(CC) $(CFLAGS) -E $< -o dtrmm_kernel_ln.s
 	m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@
 	rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
+endif
 
 $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s
-#	$(CC) $(CFLAGS) -E $< -o dtrmm_kernel_lt.s
 	m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@
 	rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
+endif
 
 $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s
-#	$(CC) $(CFLAGS) -E $< -o dtrmm_kernel_rn.s
 	m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@
 	rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
+endif
 
 $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s
-#	$(CC) $(CFLAGS) -E  $< -o dtrmm_kernel_rt.s
 	m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@
 	rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+endif
 
 $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@@ -699,100 +788,165 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
 
 $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN  $< -o ctrmm_kernel_ln.s
 	m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@
 	rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
+endif
 
 $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s
 	m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@
 	rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
+endif
 
 $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s
 	m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN  ctrmm_kernel_lr_nomacros.s -o $@
 	rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
+endif
 
 $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s
 	m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@
 	rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
+endif
 
 $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s
 	m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@
 	rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
+endif
 
 $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s
 	m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@
 	rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
+endif
 
 $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s
 	m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@
 	rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
+endif
 
 $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s
 	m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@
 	rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
+endif
 
 $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s
 	m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@
 	rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
+endif
 
 $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s
 	m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@
 	rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
+endif
 
 $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s
 	m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@
 	rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
+endif
 
 $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s
 	m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@
 	rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s 
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
+endif
 
 $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s
 	m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@
 	rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
+endif
 
 $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s
 	m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@
 	rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
+endif
 
 $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s
 	m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@
 	rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
+endif
 
 $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s
 	m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@
 	rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
+endif
+
 else
 $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@@ -804,10 +958,14 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
 
 $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s	
 	m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
 	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+endif
 
 $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@@ -931,16 +1089,17 @@ $(KDIR)strsm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRSMKERNEL_RT) $(ST
 	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -UDOUBLE -UUPPER -DRT -UCONJ $< -o $@
 
 $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DTRSMDEPEND)
-#	$(CC) $(CFLAGS) -E $< -o dtrsm_kernel_ln.s
-#	m4 dtrsm_kernel_ln.s > dtrsm_kernel_ln_nomacros.s
 	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DLN -UCONJ $< -o $@
-#	rm dtrsm_kernel_ln.s dtrsm_kernel_ln_nomacros.s
 
 $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s
 	m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s
 	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
 	rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
+else
+	$(CC) $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
+endif
 
 $(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND)
 	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -DUPPER -DRN -UCONJ $< -o $@
@@ -2180,10 +2339,14 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM
 	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@
 
 $(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
+ifeq ($(OS), AIX)
 	$(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s
 	m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s
 	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@
 	rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s 
+else
+	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@
+endif
 
 $(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND)
 	$(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@
@@ -2222,10 +2385,14 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
 	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
 
 $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
+ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s	
 	m4 strmmkernel_rn.s > strmm_kernel_rt_nomacros.s
 	$(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@
 	rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s
+else
+	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
+endif
 
 $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
 	$(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@

From a4896b5538e5a3299acd6857b055e58fc3cce398 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 4 Dec 2019 11:06:03 +0100
Subject: [PATCH 127/210] Update DYNAMIC_ARCH support for ARM64 and PPC (#2332)

* Update DYNAMIC_ARCH list of ARM64 targets for gmake
* Update arm64 cpu list for runtime detection
* Update DYNAMIC_ARCH list of ARM64 targets for cmake and add POWERPC targets
---
 Makefile.arm64                |  3 ++
 Makefile.system               |  6 +++
 cmake/arch.cmake              |  6 ++-
 cmake/prebuild.cmake          | 77 +++++++++++++++++++++++++++++++++++
 driver/others/dynamic_arm64.c | 56 +++++++++++++++++++++----
 5 files changed, 138 insertions(+), 10 deletions(-)

diff --git a/Makefile.arm64 b/Makefile.arm64
index 4d10ff684..c17ea7938 100644
--- a/Makefile.arm64
+++ b/Makefile.arm64
@@ -39,7 +39,10 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
 endif
 
+ifeq ($(GCCVERSIONGTEQ9), 1)
 ifeq ($(CORE), TSV110)
 CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
 FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
 endif
+endif
+
diff --git a/Makefile.system b/Makefile.system
index 4cb4dc954..ab2ffca52 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -326,6 +326,7 @@ ifeq ($(C_COMPILER), GCC)
 GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
 GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
 GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
+GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
 GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
 ifeq ($(GCCVERSIONGT4), 1)
 # GCC Major version > 4
@@ -547,9 +548,14 @@ endif
 
 ifeq ($(ARCH), arm64)
 DYNAMIC_CORE =  ARMV8
+DYNAMIC_CORE += CORTEXA53
 DYNAMIC_CORE += CORTEXA57
+DYNAMIC_CORE += CORTEXA72
+DYNAMIC_CORE += CORTEXA73
+DYNAMIC_CORE += FALKOR
 DYNAMIC_CORE += THUNDERX
 DYNAMIC_CORE += THUNDERX2T99
+DYNAMIC_CORE += TSV110
 endif
 
 ifeq ($(ARCH), power)
diff --git a/cmake/arch.cmake b/cmake/arch.cmake
index f3ae84fe0..8280d6274 100644
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -45,7 +45,11 @@ endif ()
 
 if (DYNAMIC_ARCH)
   if (ARM64)
-    set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99)
+    set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110)
+  endif ()
+  
+  if (POWER)
+    set(DYNAMIC_CORE POWER6 POWER8 POWER9)
   endif ()
   
   if (X86)
diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake
index 086df1943..c6d109356 100644
--- a/cmake/prebuild.cmake
+++ b/cmake/prebuild.cmake
@@ -309,6 +309,83 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS
     set(ZGEMM_UNROLL_M 4)
     set(ZGEMM_UNROLL_N 4)
     set(SYMV_P 16)
+  elseif ("${TCORE}" STREQUAL "TSV110")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define ARMV8\n"
+      "#define L1_CODE_SIZE\t65536\n"
+      "#define L1_CODE_LINESIZE\t64\n"
+      "#define L1_CODE_ASSOCIATIVE\t4\n"
+      "#define L1_DATA_SIZE\t65536\n"
+      "#define L1_DATA_LINESIZE\t64\n"
+      "#define L1_DATA_ASSOCIATIVE\t4\n"
+      "#define L2_SIZE\t524288\n"
+      "#define L2_LINESIZE\t64\n"
+      "#define L2_ASSOCIATIVE\t8\n"
+      "#define DTB_DEFAULT_ENTRIES\t64\n"
+      "#define DTB_SIZE\t4096\n")
+    set(SGEMM_UNROLL_M 16)
+    set(SGEMM_UNROLL_N 4)
+    set(DGEMM_UNROLL_M 8)
+    set(DGEMM_UNROLL_N 4)
+    set(CGEMM_UNROLL_M 8)
+    set(CGEMM_UNROLL_N 4)
+    set(ZGEMM_UNROLL_M 4)
+    set(ZGEMM_UNROLL_N 4)
+    set(SYMV_P 16)
+  elseif ("${TCORE}" STREQUAL "POWER6")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define L1_DATA_SIZE 32768\n"
+      "#define L1_DATA_LINESIZE 128\n"
+      "#define L2_SIZE 524288\n"
+      "#define L2_LINESIZE 128 \n"
+      "#define DTB_DEFAULT_ENTRIES 128\n"
+      "#define DTB_SIZE 4096\n"
+      "#define L2_ASSOCIATIVE 8\n")
+    set(SGEMM_UNROLL_M 4)
+    set(SGEMM_UNROLL_N 4)
+    set(DGEMM_UNROLL_M 4)
+    set(DGEMM_UNROLL_N 4)
+    set(CGEMM_UNROLL_M 2)
+    set(CGEMM_UNROLL_N 4)
+    set(ZGEMM_UNROLL_M 2)
+    set(ZGEMM_UNROLL_N 4)
+    set(SYMV_P 8)
+  elseif ("${TCORE}" STREQUAL "POWER8")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define L1_DATA_SIZE 32768\n"
+      "#define L1_DATA_LINESIZE 128\n"
+      "#define L2_SIZE 524288\n"
+      "#define L2_LINESIZE 128 \n"
+      "#define DTB_DEFAULT_ENTRIES 128\n"
+      "#define DTB_SIZE 4096\n"
+      "#define L2_ASSOCIATIVE 8\n")
+    set(SGEMM_UNROLL_M 16)
+    set(SGEMM_UNROLL_N 8)
+    set(DGEMM_UNROLL_M 16)
+    set(DGEMM_UNROLL_N 4)
+    set(CGEMM_UNROLL_M 8)
+    set(CGEMM_UNROLL_N 4)
+    set(ZGEMM_UNROLL_M 8)
+    set(ZGEMM_UNROLL_N 2)
+    set(SYMV_P 8)
+  elseif ("${TCORE}" STREQUAL "POWER9")
+    file(APPEND ${TARGET_CONF_TEMP}
+      "#define L1_DATA_SIZE 32768\n"
+      "#define L1_DATA_LINESIZE 128\n"
+      "#define L2_SIZE 524288\n"
+      "#define L2_LINESIZE 128 \n"
+      "#define DTB_DEFAULT_ENTRIES 128\n"
+      "#define DTB_SIZE 4096\n"
+      "#define L2_ASSOCIATIVE 8\n")
+    set(SGEMM_UNROLL_M 16)
+    set(SGEMM_UNROLL_N 8)
+    set(DGEMM_UNROLL_M 16)
+    set(DGEMM_UNROLL_N 4)
+    set(CGEMM_UNROLL_M 8)
+    set(CGEMM_UNROLL_N 4)
+    set(ZGEMM_UNROLL_M 8)
+    set(ZGEMM_UNROLL_N 2)
+    set(SYMV_P 8)
   endif()
 
   # Or should this actually be NUM_CORES?
diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c
index 9db9ba17d..72f5fcca2 100644
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@@ -43,13 +43,18 @@
 #endif
 
 extern gotoblas_t  gotoblas_ARMV8;
+extern gotoblas_t  gotoblas_CORTEXA53;
 extern gotoblas_t  gotoblas_CORTEXA57;
+extern gotoblas_t  gotoblas_CORTEXA72;
+extern gotoblas_t  gotoblas_CORTEXA73;
+extern gotoblas_t  gotoblas_FALKOR;
 extern gotoblas_t  gotoblas_THUNDERX;
 extern gotoblas_t  gotoblas_THUNDERX2T99;
+extern gotoblas_t  gotoblas_TSV110;
 
 extern void openblas_warning(int verbose, const char * msg);
 
-#define NUM_CORETYPES    4
+#define NUM_CORETYPES    9
 
 /*
  * In case asm/hwcap.h is outdated on the build system, make sure
@@ -65,17 +70,27 @@ extern void openblas_warning(int verbose, const char * msg);
 
 static char *corename[] = {
   "armv8",
+  "cortexa53",
   "cortexa57",
+  "cortexa72",
+  "cortexa73",
+  "falkor",
   "thunderx",
   "thunderx2t99",
+  "tsv110",
   "unknown"
 };
 
 char *gotoblas_corename(void) {
   if (gotoblas == &gotoblas_ARMV8)        return corename[ 0];
-  if (gotoblas == &gotoblas_CORTEXA57)    return corename[ 1];
-  if (gotoblas == &gotoblas_THUNDERX)     return corename[ 2];
-  if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 3];
+  if (gotoblas == &gotoblas_CORTEXA53)    return corename[ 1];
+  if (gotoblas == &gotoblas_CORTEXA57)    return corename[ 2];
+  if (gotoblas == &gotoblas_CORTEXA72)    return corename[ 3];
+  if (gotoblas == &gotoblas_CORTEXA73)    return corename[ 4];
+  if (gotoblas == &gotoblas_FALKOR)       return corename[ 5];
+  if (gotoblas == &gotoblas_THUNDERX)     return corename[ 6];
+  if (gotoblas == &gotoblas_THUNDERX2T99) return corename[ 7];
+  if (gotoblas == &gotoblas_TSV110)       return corename[ 8];
   return corename[NUM_CORETYPES];
 }
 
@@ -96,9 +111,14 @@ static gotoblas_t *force_coretype(char *coretype) {
   switch (found)
   {
     case  0: return (&gotoblas_ARMV8);
-    case  1: return (&gotoblas_CORTEXA57);
-    case  2: return (&gotoblas_THUNDERX);
-    case  3: return (&gotoblas_THUNDERX2T99);
+    case  1: return (&gotoblas_CORTEXA53);
+    case  2: return (&gotoblas_CORTEXA57);
+    case  3: return (&gotoblas_CORTEXA72);
+    case  4: return (&gotoblas_CORTEXA73);
+    case  5: return (&gotoblas_FALKOR);
+    case  6: return (&gotoblas_THUNDERX);
+    case  7: return (&gotoblas_THUNDERX2T99);
+    case  8: return (&gotoblas_TSV110);
   }
   snprintf(message, 128, "Core not found: %s\n", coretype);
   openblas_warning(1, message);
@@ -136,10 +156,14 @@ static gotoblas_t *get_coretype(void) {
     case 0x41: // ARM
       switch (part)
       {
-        case 0xd07: // Cortex A57
-        case 0xd08: // Cortex A72
         case 0xd03: // Cortex A53
+          return &gotoblas_CORTEXA53;
+        case 0xd07: // Cortex A57
           return &gotoblas_CORTEXA57;
+        case 0xd08: // Cortex A72
+          return &gotoblas_CORTEXA72;
+        case 0xd09: // Cortex A73
+          return &gotoblas_CORTEXA73;
       }
       break;
     case 0x42: // Broadcom
@@ -158,6 +182,20 @@ static gotoblas_t *get_coretype(void) {
           return &gotoblas_THUNDERX2T99;
       }
       break;
+    case 0x48: // HiSilicon
+      switch (part)
+      {
+        case 0xd01: // tsv110
+          return &gotoblas_TSV110;
+      }
+      break;
+    case 0x51: // Qualcomm
+      switch (part)
+      {
+        case 0xc00: // Falkor
+          return &gotoblas_FALKOR;
+      }
+      break;
   }
   return NULL;
 }

From 6baa9b07d7e88f93ef42db4e96fa3d2be035c3d4 Mon Sep 17 00:00:00 2001
From: Kavana Bhat <Kavana.bhat@in.ibm.com>
Date: Fri, 6 Dec 2019 04:33:32 -0600
Subject: [PATCH 128/210] AIX changes for Power8

---
 common_power.h     | 8 ++++----
 kernel/Makefile.L3 | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/common_power.h b/common_power.h
index 76b9f0f32..9df398266 100644
--- a/common_power.h
+++ b/common_power.h
@@ -60,10 +60,10 @@
 #define XXSWAPD(T,A)	xxswapd	T, A
 #define XVMOVDP(T,A)	xvmovdp	T, A
 
-#define XXSPLTD_S(T,A,z)	"xxspltd	T, A, z	\n\t"
-#define XXMRGHD_S(T,A,B)	"xxmrghd	T, A, B	\n\t"
-#define XXMRGLD_S(T,A,B)	"xxmrgld	T, A, B	\n\t"
-#define XXSWAPD_S(T,A)	"xxswapd	T, A"
+#define XXSPLTD_S(T,A,z)	"xxspltd	" str(T) ", " str(A) ", " str(z)"	\n\t"
+#define XXMRGHD_S(T,A,B)	"xxmrghd	" str(T) ", " str(A) ", " str(B)"	\n\t"
+#define XXMRGLD_S(T,A,B)	"xxmrgld	" str(T) ", " str(A) ", " str(B)"	\n\t"
+#define XXSWAPD_S(T,A)	"xxswapd	" str(T) ", " str(A) "	\n\t"
 
 #endif
 
diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index 4decfbd20..c36a44f20 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -1098,7 +1098,7 @@ ifeq ($(OS), AIX)
 	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@
 	rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s
 else
-	$(CC) $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
+	$(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o $@
 endif
 
 $(KDIR)dtrsm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_RN) $(DTRSMDEPEND)

From b28db31429d9b3b6a57a182d79e63aafdd2843f1 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 6 Dec 2019 21:23:56 +0100
Subject: [PATCH 129/210] Support two-digit version numbers in gcc version
 check

fixes #2336 (non-recognition of gcc 10) with patch provided by JeffreyALaw.
---
 f_check | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/f_check b/f_check
index 993ad9a35..79b24e2dc 100644
--- a/f_check
+++ b/f_check
@@ -71,7 +71,7 @@ if ($compiler eq "") {
 
 	if ($data =~ /GNU/) {
 
-	    $data =~ /(\d)\.(\d).(\d)/;
+	    $data =~ /(\d+)\.(\d+).(\d+)/;
 	    $major = $1;
 	    $minor = $2;
 

From 13226e310195c7dd5e051c791c4f0839f2f606c4 Mon Sep 17 00:00:00 2001
From: Jehan <jehan@girinstud.io>
Date: Wed, 11 Dec 2019 17:51:42 +0100
Subject: [PATCH 130/210] driver: more reasonable thread wait timeout on
 Windows.

It used to be 5ms, which might not be long enough in some cases for the
thread to exit well, but then when set to 5000 (5s), it would slow down
any program depending on OpenBlas.

Let's just set it to 50ms, which is at least 10 times longer than
originally, but still reasonable in case of failed thread termination.
---
 driver/others/blas_server_win32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index e27725baf..5ecc4428b 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -462,7 +462,7 @@ int BLASFUNC(blas_thread_shutdown)(void){
 
     for(i = 0; i < blas_num_threads - 1; i++){
       // Could also just use WaitForMultipleObjects
-      DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 5000);
+      DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
 
 #ifndef OS_WINDOWSSTORE
       // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP

From aeef942c4f2a17099d82307d482abdec53bd3fbd Mon Sep 17 00:00:00 2001
From: w00421467 <wuzeyi1@huawei.com>
Date: Tue, 17 Dec 2019 10:00:13 +0800
Subject: [PATCH 131/210] use arm neon instructions to optimize gemm beta
 operation

---
 kernel/arm64/KERNEL       |   2 +-
 kernel/arm64/dgemm_beta.S | 178 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 179 insertions(+), 1 deletion(-)
 create mode 100644 kernel/arm64/dgemm_beta.S

diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL
index f936cdf47..440257196 100644
--- a/kernel/arm64/KERNEL
+++ b/kernel/arm64/KERNEL
@@ -34,7 +34,7 @@ ifndef SGEMM_BETA
 SGEMM_BETA = ../generic/gemm_beta.c
 endif
 ifndef DGEMM_BETA
-DGEMM_BETA = ../generic/gemm_beta.c
+DGEMM_BETA = ../arm64/dgemm_beta.S
 endif
 ifndef CGEMM_BETA
 CGEMM_BETA = ../generic/zgemm_beta.c
diff --git a/kernel/arm64/dgemm_beta.S b/kernel/arm64/dgemm_beta.S
new file mode 100644
index 000000000..636954695
--- /dev/null
+++ b/kernel/arm64/dgemm_beta.S
@@ -0,0 +1,178 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define	M	x0
+#define	N	x1
+#define	BETA    d0
+#define	LDC     x6
+#define	C00	x7
+
+#define	A01	x8
+#define	A02	x9
+#define	A03	x10
+#define	A04	x11
+
+#define beta0		d11
+#define betaV0		v11.d[0]
+#define I	x16
+
+#define size 128
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro SAVE_REGS
+	add	sp, sp, #-(11 * 16)
+	stp	d8, d9, [sp, #(0 * 16)]
+	stp	d10, d11, [sp, #(1 * 16)]
+	stp	d12, d13, [sp, #(2 * 16)]
+	stp	d14, d15, [sp, #(3 * 16)]
+	stp	d16, d17, [sp, #(4 * 16)]
+	stp	x18, x19, [sp, #(5 * 16)]
+	stp	x20, x21, [sp, #(6 * 16)]
+	stp	x22, x23, [sp, #(7 * 16)]
+	stp	x24, x25, [sp, #(8 * 16)]
+	stp	x26, x27, [sp, #(9 * 16)]
+	str	x28, [sp, #(10 * 16)]
+.endm
+
+.macro RESTORE_REGS
+	ldp	d8, d9, [sp, #(0 * 16)]
+	ldp	d10, d11, [sp, #(1 * 16)]
+	ldp	d12, d13, [sp, #(2 * 16)]
+	ldp	d14, d15, [sp, #(3 * 16)]
+	ldp	d16, d17, [sp, #(4 * 16)]
+	ldp	x18, x19, [sp, #(5 * 16)]
+	ldp	x20, x21, [sp, #(6 * 16)]
+	ldp	x22, x23, [sp, #(7 * 16)]
+	ldp	x24, x25, [sp, #(8 * 16)]
+	ldp	x26, x27, [sp, #(9 * 16)]
+	ldr	x28, [sp, #(10 * 16)]
+	add	sp, sp, #(11*16)
+.endm
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+
+	ldr LDC, [sp]
+	SAVE_REGS
+
+.Lgemm_beta_BEGIN:
+
+	fmov beta0, BETA
+	cmp  N, #0
+	ble	.Lgemm_beta_L999
+
+.Lgemm_beta_01:
+
+	lsl LDC, LDC, #3
+
+	.align 5
+.Lgemm_beta_02:
+
+	mov	A01, C00
+	add     C00, C00, LDC
+	asr	I, M, #4
+	cmp     I, #0
+	ble .Lgemm_beta_04
+	add	A02, A01, #32
+	add	A03, A02, #32
+	add	A04, A03, #32
+
+	.align 5
+.Lgemm_beta_03:
+
+	ldp	q0, q1, [A01]
+	ldp	q2, q3, [A02]
+	ldp	q4, q5, [A03]
+	ldp	q6, q7, [A04]
+  
+	fmul v0.2d, v0.2d, betaV0
+	fmul v1.2d, v1.2d, betaV0
+	
+	fmul v2.2d, v2.2d, betaV0
+	fmul v3.2d, v3.2d, betaV0
+	
+	fmul v4.2d, v4.2d, betaV0
+	fmul v5.2d, v5.2d, betaV0
+	
+	fmul v6.2d, v6.2d, betaV0
+	fmul v7.2d, v7.2d, betaV0
+
+	st1	{v0.2d, v1.2d}, [A01]
+	add	A01, A01, size
+	st1	{v2.2d, v3.2d}, [A02]
+	add	A02, A02, size
+	st1	{v4.2d, v5.2d}, [A03]
+	add	A03, A03, size
+	st1	{v6.2d, v7.2d}, [A04]
+	add	A04, A04, size
+
+	subs    I , I , #1
+	bne	.Lgemm_beta_03
+
+	.align 5
+.Lgemm_beta_04:
+
+	and	I, M , #15	// M%16
+	cmp	I, #0
+	ble	.Lgemm_beta_06
+
+	.align 5
+.Lgemm_beta_05:
+
+	ldr	d12, [A01]
+	fmul	d12, d12, beta0
+	str	d12, [A01]
+	add	A01, A01, #8
+
+	subs	I , I , #1
+	bne	.Lgemm_beta_05
+
+	.align 5
+.Lgemm_beta_06:
+
+	subs	N , N, #1	// N--
+	bne	.Lgemm_beta_02
+
+	.align 5
+.Lgemm_beta_L999:
+
+	mov	x0, #0
+	RESTORE_REGS
+	ret
+
+	EPILOGUE

From b7cc69ee622fed9039ab755b87eee9279d27d541 Mon Sep 17 00:00:00 2001
From: w00421467 <wuzeyi1@huawei.com>
Date: Fri, 20 Dec 2019 10:11:50 +0800
Subject: [PATCH 132/210] declare DGEMM_BETA in KERNEL.ARMV8 rather than the
 generic KERNEL

---
 kernel/arm64/KERNEL       | 2 +-
 kernel/arm64/KERNEL.ARMV8 | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/arm64/KERNEL b/kernel/arm64/KERNEL
index 440257196..f936cdf47 100644
--- a/kernel/arm64/KERNEL
+++ b/kernel/arm64/KERNEL
@@ -34,7 +34,7 @@ ifndef SGEMM_BETA
 SGEMM_BETA = ../generic/gemm_beta.c
 endif
 ifndef DGEMM_BETA
-DGEMM_BETA = ../arm64/dgemm_beta.S
+DGEMM_BETA = ../generic/gemm_beta.c
 endif
 ifndef CGEMM_BETA
 CGEMM_BETA = ../generic/zgemm_beta.c
diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8
index efc1ec8bc..b90dd228b 100644
--- a/kernel/arm64/KERNEL.ARMV8
+++ b/kernel/arm64/KERNEL.ARMV8
@@ -102,6 +102,8 @@ CDOTKERNEL     = zdot.S
 ZDOTKERNEL     = zdot.S
 DSDOTKERNEL    = dot.S
 
+DGEMM_BETA     = dgemm_beta.S
+
 SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))

From d573d24de7cda411edbf0675c7c2e2dd8cdb896f Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Sat, 21 Dec 2019 14:35:15 +0800
Subject: [PATCH 133/210] Fast Haswell ZGEMM kernel

---
 kernel/x86_64/zgemm_kernel_4x2_haswell.c | 240 +++++++++++++++++++++++
 1 file changed, 240 insertions(+)
 create mode 100644 kernel/x86_64/zgemm_kernel_4x2_haswell.c

diff --git a/kernel/x86_64/zgemm_kernel_4x2_haswell.c b/kernel/x86_64/zgemm_kernel_4x2_haswell.c
new file mode 100644
index 000000000..3279b8b8c
--- /dev/null
+++ b/kernel/x86_64/zgemm_kernel_4x2_haswell.c
@@ -0,0 +1,240 @@
+#include "common.h"
+#include <stdint.h>
+
+/* recommended settings: GEMM_P = 192, GEMM_Q = 192 */
+
+#if  defined(NN) || defined(NT) || defined(TN) || defined(TT)
+  #define A_CONJ 0
+  #define B_CONJ 0
+#endif
+#if  defined(RN) || defined(RT) || defined(CN) || defined(CT)
+  #define A_CONJ 1
+  #define B_CONJ 0
+#endif
+#if  defined(NR) || defined(NC) || defined(TR) || defined(TC)
+  #define A_CONJ 0
+  #define B_CONJ 1
+#endif
+#if  defined(RR) || defined(RC) || defined(CR) || defined(CC)
+  #define A_CONJ 1
+  #define B_CONJ 1
+#endif
+
+/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref */
+/* r11 = m, r12 = k << 5, r13 = k, r14 = b_head, r15 = temp */
+
+/* m=4, ymm 0-3 temp, ymm 4-15 acc */
+#if A_CONJ == B_CONJ
+  #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfmadd231pd %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
+  #define acc_m4n1_con(ua,la,b1,uc,lc) "vfmaddsub231pd %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmaddsub231pd %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
+#else
+  #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfnmadd231pd %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
+  #define acc_m4n1_con(ua,la,b1,uc,lc) "vfmsubadd231pd %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmsubadd231pd %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
+#endif
+/* expanded accumulators for m4n1 and m4n2 */
+#define KERNEL_k1m4n1 \
+  "vbroadcastf128 (%1),%%ymm0; addq $16,%1;"\
+  "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2;" acc_m2n1_exp(1,2,0,4,5)\
+  "vmovddup 32(%0),%%ymm1; vmovddup 40(%0),%%ymm2;" acc_m2n1_exp(1,2,0,6,7)\
+  "addq $64,%0;"
+#define KERNEL_k1m4n2 \
+  "vbroadcastf128 (%1),%%ymm0; vbroadcastf128 16(%1),%%ymm1; addq $32,%1;"\
+  "vmovddup (%0),%%ymm2; vmovddup 8(%0),%%ymm3;" acc_m2n1_exp(2,3,0,4,5) acc_m2n1_exp(2,3,1,8,9)\
+  "vmovddup 32(%0),%%ymm2; vmovddup 40(%0),%%ymm3;" acc_m2n1_exp(2,3,0,6,7) acc_m2n1_exp(2,3,1,10,11)\
+  "addq $64,%0;"
+/* contracted accumulators for m4n4 and m4n6 */
+#define acc_m4n2_con(ua,la,luc,llc,ruc,rlc,lboff,rboff,...) \
+  "vbroadcastsd "#lboff"("#__VA_ARGS__"),%%ymm2;" acc_m4n1_con(ua,la,2,luc,llc)\
+  "vbroadcastsd "#rboff"("#__VA_ARGS__"),%%ymm3;" acc_m4n1_con(ua,la,3,ruc,rlc)
+#define KERNEL_1_k1m4n4 \
+  "vmovupd (%0),%%ymm0; vmovupd 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\
+  acc_m4n2_con(0,1,4,5,6,7,0,16,%1) acc_m4n2_con(0,1,8,9,10,11,0,16,%1,%%r12,1)
+#define KERNEL_2_k1m4n4 \
+  "vpermilpd $5,%%ymm0,%%ymm0; vpermilpd $5,%%ymm1,%%ymm1;"\
+  acc_m4n2_con(0,1,4,5,6,7,8,24,%1) acc_m4n2_con(0,1,8,9,10,11,8,24,%1,%%r12,1)
+#define KERNEL_1_k1m4n6 KERNEL_1_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,0,16,%1,%%r12,2)
+#define KERNEL_2_k1m4n6 KERNEL_2_k1m4n4 acc_m4n2_con(0,1,12,13,14,15,8,24,%1,%%r12,2)
+#define KERNEL_k1m4n4 KERNEL_1_k1m4n4 KERNEL_2_k1m4n4 "addq $32,%1;"
+#define KERNEL_k1m4n6 KERNEL_1_k1m4n6 KERNEL_2_k1m4n6 "addq $32,%1;"
+#define zero_4ymm(no1,no2,no3,no4) \
+  "vpxor %%ymm"#no1",%%ymm"#no1",%%ymm"#no1"; vpxor %%ymm"#no2",%%ymm"#no2",%%ymm"#no2";"\
+  "vpxor %%ymm"#no3",%%ymm"#no3",%%ymm"#no3"; vpxor %%ymm"#no4",%%ymm"#no4",%%ymm"#no4";"
+/* initialization and storage macros */
+#define INIT_m4n1 zero_4ymm(4,5,6,7)
+#define INIT_m4n2 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
+#define INIT_m4n4 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
+#define INIT_m4n6 INIT_m4n4 zero_4ymm(12,13,14,15)
+#if A_CONJ == B_CONJ
+  #define cont_expacc(cl,cr,dst) "vpermilpd $5,%%ymm"#cr",%%ymm"#cr"; vaddsubpd %%ymm"#cl",%%ymm"#cr",%%ymm"#dst";"
+#else
+  #define cont_expacc(cl,cr,dst) "vpermilpd $5,%%ymm"#cr",%%ymm"#cr"; vaddsubpd %%ymm"#cr",%%ymm"#cl",%%ymm"#dst";"
+#endif
+#if A_CONJ == 0
+  #define save_1ymm(c,tmp,off,alpr,alpi,...) \
+    "vpermilpd $5,%%ymm"#c",%%ymm"#tmp"; vfmsubadd213pd "#off"("#__VA_ARGS__"),%%ymm"#alpr",%%ymm"#c";"\
+    "vfmsubadd231pd %%ymm"#tmp",%%ymm"#alpi",%%ymm"#c"; vmovupd %%ymm"#c","#off"("#__VA_ARGS__");"
+#else
+  #define save_1ymm(c,tmp,off,alpr,alpi,...) \
+    "vpermilpd $5,%%ymm"#c",%%ymm"#tmp"; vfmaddsub213pd "#off"("#__VA_ARGS__"),%%ymm"#alpi",%%ymm"#tmp";"\
+    "vfmaddsub231pd %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp"; vmovupd %%ymm"#tmp","#off"("#__VA_ARGS__");"
+#endif
+#define save_init_m4 "movq %2,%3; addq $64,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;"
+#define SAVE_m4n1 save_init_m4 cont_expacc(4,5,4) cont_expacc(6,7,6) save_1ymm(4,2,0,0,1,%3) save_1ymm(6,3,32,0,1,%3)
+#define SAVE_m4n2 SAVE_m4n1\
+  cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3,%4,1) save_1ymm(10,3,32,0,1,%3,%4,1)
+#define SAVE_m4n4 save_init_m4\
+  save_1ymm(4,2,0,0,1,%3) save_1ymm(5,3,32,0,1,%3) save_1ymm(6,2,0,0,1,%3,%4,1) save_1ymm(7,3,32,0,1,%3,%4,1) "leaq (%3,%4,2),%3;"\
+  save_1ymm(8,2,0,0,1,%3) save_1ymm(9,3,32,0,1,%3) save_1ymm(10,2,0,0,1,%3,%4,1) save_1ymm(11,3,32,0,1,%3,%4,1)
+#define SAVE_m4n6 SAVE_m4n4 "leaq (%3,%4,2),%3;"\
+  save_1ymm(12,2,0,0,1,%3) save_1ymm(13,3,32,0,1,%3) save_1ymm(14,2,0,0,1,%3,%4,1) save_1ymm(15,3,32,0,1,%3,%4,1)
+#define COMPUTE_m4(ndim) \
+  "movq %%r14,%1;" INIT_m4n##ndim "movq %2,%3; movq %%r13,%5;"\
+  "testq %5,%5; jz "#ndim"4443f; cmpq $10,%5; jb "#ndim"4442f;"\
+  "movq $10,%5; movq $84,%%r15;"\
+  #ndim"4441:\n\t"\
+  "prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\
+  "prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
+  "testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\
+  "prefetcht0 96(%1); prefetcht0 96(%1,%%r12,1); prefetcht0 96(%1,%%r12,2);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
+  "addq $4,%5; cmpq %5,%%r13; jnb "#ndim"4441b;"\
+  "movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 15(%6);"\
+  #ndim"4442:\n\t"\
+  "prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\
+  KERNEL_k1m4n##ndim "decq %5; jnz "#ndim"4442b;"\
+  #ndim"4443:\n\t"\
+  "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m4n##ndim
+
+/* m=2, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */
+#define KERNEL_k1m2n1 \
+  "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2; addq $32,%0;"\
+  "vbroadcastf128 (%1),%%ymm0;" acc_m2n1_exp(1,2,0,4,5) "addq $16,%1;"
+#define acc_m2n2_exp(c1l,c1r,c2l,c2r,...) \
+  "vbroadcastf128 ("#__VA_ARGS__"),%%ymm2;" acc_m2n1_exp(0,1,2,c1l,c1r)\
+  "vbroadcastf128 16("#__VA_ARGS__"),%%ymm3;" acc_m2n1_exp(0,1,3,c2l,c2r)
+#define KERNEL_h_k1m2n2 \
+  "vmovddup (%0),%%ymm0; vmovddup 8(%0),%%ymm1; addq $32,%0;" acc_m2n2_exp(4,5,6,7,%1)
+#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 acc_m2n2_exp(8,9,10,11,%1,%%r12,1)
+#define KERNEL_h_k1m2n6 KERNEL_h_k1m2n4 acc_m2n2_exp(12,13,14,15,%1,%%r12,2)
+#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $32,%1;"
+#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $32,%1;"
+#define KERNEL_k1m2n6 KERNEL_h_k1m2n6 "addq $32,%1;"
+#define INIT_m2n1 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;"
+#define INIT_m2n2 zero_4ymm(4,5,6,7)
+#define INIT_m2n4 INIT_m2n2 zero_4ymm(8,9,10,11)
+#define INIT_m2n6 INIT_m2n4 zero_4ymm(12,13,14,15)
+#define save_init_m2 "movq %2,%3; addq $32,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;"
+#define SAVE_m2n1 save_init_m2 cont_expacc(4,5,4) save_1ymm(4,2,0,0,1,%3)
+#define SAVE_m2n2 SAVE_m2n1 cont_expacc(6,7,6) save_1ymm(6,3,0,0,1,%3,%4,1)
+#define SAVE_m2n4 SAVE_m2n2 "leaq (%3,%4,2),%3;"\
+  cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3) save_1ymm(10,3,0,0,1,%3,%4,1)
+#define SAVE_m2n6 SAVE_m2n4 "leaq (%3,%4,2),%3;"\
+  cont_expacc(12,13,12) cont_expacc(14,15,14) save_1ymm(12,2,0,0,1,%3) save_1ymm(14,3,0,0,1,%3,%4,1)
+#define COMPUTE_m2(ndim) \
+  "movq %%r14,%1;" INIT_m2n##ndim "movq %%r13,%5;"\
+  "testq %5,%5; jz "#ndim"2222f;"\
+  #ndim"2221:\n\t"\
+  KERNEL_k1m2n##ndim\
+  "decq %5; jnz "#ndim"2221b;"\
+  #ndim"2222:\n\t"\
+  SAVE_m2n##ndim
+
+/* m=1, vmm 0-3 temp, vmm 4-15 acc, expanded accumulators */
+#if A_CONJ == B_CONJ
+  #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231pd %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
+  #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231pd %%ymm"#arb",%%ymm"#b4",%%ymm"#cl"; vfmadd231pd %%ymm"#aib",%%ymm"#b4",%%ymm"#cr";"
+#else
+  #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231pd %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231pd %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
+  #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231pd %%ymm"#arb",%%ymm"#b4",%%ymm"#cl"; vfnmadd231pd %%ymm"#aib",%%ymm"#b4",%%ymm"#cr";"
+#endif
+#define KERNEL_k1m1n1 \
+  "vmovddup (%0),%%xmm0; vmovddup 8(%0),%%xmm1; addq $16,%0;"\
+  "vmovupd (%1),%%xmm2; addq $16,%1;" acc_m1n1_exp(0,1,2,4,5)
+#define KERNEL_h_k1m1n2 \
+  "vbroadcastsd (%0),%%ymm0; vbroadcastsd 8(%0),%%ymm1; addq $16,%0;"\
+  "vmovupd (%1),%%ymm2;" acc_m1n2_exp(0,1,2,4,5)
+#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vmovupd (%1,%%r12,1),%%ymm2;" acc_m1n2_exp(0,1,2,6,7)
+#define KERNEL_h_k1m1n6 KERNEL_h_k1m1n4 "vmovupd (%1,%%r12,2),%%ymm2;" acc_m1n2_exp(0,1,2,8,9)
+#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $32,%1;"
+#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $32,%1;"
+#define KERNEL_k1m1n6 KERNEL_h_k1m1n6 "addq $32,%1;"
+#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4; vpxor %%xmm5,%%xmm5,%%xmm5;"
+#define INIT_m1n2 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;"
+#define INIT_m1n4 INIT_m1n2 "vpxor %%ymm6,%%ymm6,%%ymm6; vpxor %%ymm7,%%ymm7,%%ymm7;"
+#define INIT_m1n6 INIT_m1n4 "vpxor %%ymm8,%%ymm8,%%ymm8; vpxor %%ymm9,%%ymm9,%%ymm9;"
+#if A_CONJ == B_CONJ
+  #define cont_expxmmacc(cl,cr,dst) "vpermilpd $5,%%xmm"#cr",%%xmm"#cr"; vaddsubpd %%xmm"#cl",%%xmm"#cr",%%xmm"#dst";"
+#else
+  #define cont_expxmmacc(cl,cr,dst) "vpermilpd $5,%%xmm"#cr",%%xmm"#cr"; vaddsubpd %%xmm"#cr",%%xmm"#cl",%%xmm"#dst";"
+#endif
+#if A_CONJ == 0
+  #define save_m1n1(c,tmp,alpr,alpi) \
+    "vpermilpd $5,%%xmm"#c",%%xmm"#tmp"; vfmsubadd213pd (%3),%%xmm"#alpr",%%xmm"#c";"\
+    "vfmsubadd231pd %%xmm"#tmp",%%xmm"#alpi",%%xmm"#c"; vmovupd %%xmm"#c",(%3);"
+  #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
+    "vpermilpd $5,%%ymm"#c",%%ymm"#tmp1"; vmovupd (%3),%%xmm"#tmp2"; vinsertf128 $1,(%3,%4,1),%%ymm"#tmp2",%%ymm"#tmp2";"\
+    "vfmsubadd213pd %%ymm"#tmp2",%%ymm"#alpr",%%ymm"#c"; vfmsubadd231pd %%ymm"#tmp1",%%ymm"#alpi",%%ymm"#c";"\
+    "vmovupd %%xmm"#c",(%3); vextractf128 $1,%%ymm"#c",(%3,%4,1); leaq (%3,%4,2),%3;"
+#else
+  #define save_m1n1(c,tmp,alpr,alpi) \
+    "vpermilpd $5,%%xmm"#c",%%xmm"#tmp"; vfmaddsub213pd (%3),%%xmm"#alpi",%%xmm"#tmp";"\
+    "vfmaddsub231pd %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp"; vmovupd %%xmm"#tmp",(%3);"
+  #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
+    "vpermilpd $5,%%ymm"#c",%%ymm"#tmp1"; vmovupd (%3),%%xmm"#tmp2"; vinsertf128 $1,(%3,%4,1),%%ymm"#tmp2",%%ymm"#tmp2";"\
+    "vfmaddsub213pd %%ymm"#tmp2",%%ymm"#alpi",%%ymm"#tmp1"; vfmaddsub231pd %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp1";"\
+    "vmovupd %%xmm"#tmp1",(%3); vextractf128 $1,%%ymm"#tmp1",(%3,%4,1); leaq (%3,%4,2),%3;"
+#endif
+#define save_init_m1 "movq %2,%3; addq $16,%2; vbroadcastsd (%6),%%ymm0; vbroadcastsd 8(%6),%%ymm1;"
+#define SAVE_m1n1 save_init_m1 cont_expxmmacc(4,5,4) save_m1n1(4,2,0,1)
+#define SAVE_m1n2 save_init_m1 cont_expacc(4,5,4) save_m1n2(4,2,3,0,1)
+#define SAVE_m1n4 SAVE_m1n2 cont_expacc(6,7,6) save_m1n2(6,2,3,0,1)
+#define SAVE_m1n6 SAVE_m1n4 cont_expacc(8,9,8) save_m1n2(8,2,3,0,1)
+#define COMPUTE_m1(ndim) \
+  "movq %%r14,%1;" INIT_m1n##ndim "movq %%r13,%5;"\
+  "testq %5,%5; jz "#ndim"1112f;"\
+  #ndim"1111:\n\t"\
+  KERNEL_k1m1n##ndim\
+  "decq %5; jnz "#ndim"1111b;"\
+  #ndim"1112:\n\t"\
+  SAVE_m1n##ndim
+
+#define COMPUTE(ndim) {\
+  b_pref = b_ptr + ndim * K *2;\
+  __asm__ __volatile__ (\
+    "movq %1,%%r14; movq %5,%%r13; movq %5,%%r12; salq $5,%%r12; movq %7,%%r11;"\
+    "cmpq $4,%7; jb "#ndim"9992f;"\
+    #ndim"9991:\n\t"\
+    COMPUTE_m4(ndim)\
+    "subq $4,%7; cmpq $4,%7; jnb "#ndim"9991b;"\
+    #ndim"9992:\n\t"\
+    "cmpq $2,%7; jb "#ndim"9993f;"\
+    COMPUTE_m2(ndim) "subq $2,%7;"\
+    #ndim"9993:\n\t"\
+    "testq %7,%7; jz "#ndim"9994f;"\
+    COMPUTE_m1(ndim)\
+    #ndim"9994:\n\t"\
+    "movq %%r14,%1; movq %%r13,%5; movq %%r11,%7; vzeroupper;"\
+    :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(K),"+r"(alp),"+r"(M),"+r"(b_pref)\
+    ::"cc","memory","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5",\
+    "xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
+  a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\
+}
+
+int __attribute__ ((noinline))
+CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alphar, double alphai, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG LDC)
+{
+    if(m==0||n==0||k==0||(alphar==0.0 && alphai==0.0)) return 0;
+    int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double) * 2;
+#if A_CONJ == B_CONJ
+    double const_val[2] = {-alphar, -alphai};
+#else
+    double const_val[2] = {alphar, alphai};
+#endif
+    int64_t M = (int64_t)m, K = (int64_t)k;
+    BLASLONG n_count = n;
+    double *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*alp = const_val,*b_pref = B;
+    for(;n_count>5;n_count-=6) COMPUTE(6)
+    for(;n_count>3;n_count-=4) COMPUTE(4)
+    for(;n_count>1;n_count-=2) COMPUTE(2)
+    if(n_count>0) COMPUTE(1)
+    return 0;
+}

From f41d52665d589440dd5227b52025ea492bea4c6e Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Sat, 21 Dec 2019 14:37:06 +0800
Subject: [PATCH 134/210] Fast Haswell ZGEMM kernel

---
 kernel/x86_64/KERNEL.HASWELL | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL
index f98728a41..5c11ced1d 100644
--- a/kernel/x86_64/KERNEL.HASWELL
+++ b/kernel/x86_64/KERNEL.HASWELL
@@ -67,7 +67,7 @@ CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
 CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 ZTRMMKERNEL    =  zgemm_kernel_4x2_haswell.S
-ZGEMMKERNEL    =  zgemm_kernel_4x2_haswell.S
+ZGEMMKERNEL    =  zgemm_kernel_4x2_haswell.c
 ZGEMMINCOPY    =  ../generic/zgemm_ncopy_4.c
 ZGEMMITCOPY    =  ../generic/zgemm_tcopy_4.c
 ZGEMMONCOPY    =  ../generic/zgemm_ncopy_2.c

From 105e26e12ac2283ec2bee50d03d02d77a2c92780 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Sat, 21 Dec 2019 14:38:51 +0800
Subject: [PATCH 135/210] Adjust Haswell ZGEMM blocking parameters

---
 param.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/param.h b/param.h
index d39fc4a1d..5fb0868b2 100644
--- a/param.h
+++ b/param.h
@@ -1572,7 +1572,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_P 768
 #define DGEMM_DEFAULT_P 512
 #define CGEMM_DEFAULT_P 384
-#define ZGEMM_DEFAULT_P 256
+#define ZGEMM_DEFAULT_P 192
 
 #ifdef WINDOWS_ABI
 #define SGEMM_DEFAULT_Q 320
@@ -1582,7 +1582,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define DGEMM_DEFAULT_Q 256
 #endif
 #define CGEMM_DEFAULT_Q 192
-#define ZGEMM_DEFAULT_Q 128
+#define ZGEMM_DEFAULT_Q 192
 
 #define SGEMM_DEFAULT_R sgemm_r
 #define DGEMM_DEFAULT_R 13824

From 025741f16aeaafe0080b9065dbf2315762b286e4 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 23 Dec 2019 23:40:03 +0800
Subject: [PATCH 136/210] Fast Haswell CGEMM kernel

---
 kernel/x86_64/cgemm_kernel_8x2_haswell.c | 287 +++++++++++++++++++++++
 1 file changed, 287 insertions(+)
 create mode 100644 kernel/x86_64/cgemm_kernel_8x2_haswell.c

diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.c b/kernel/x86_64/cgemm_kernel_8x2_haswell.c
new file mode 100644
index 000000000..49fef90db
--- /dev/null
+++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.c
@@ -0,0 +1,287 @@
+#include "common.h"
+#include <stdint.h>
+
+/* recommended settings: GEMM_P = 256, GEMM_Q = 256 */
+
+#if  defined(NN) || defined(NT) || defined(TN) || defined(TT)
+  #define A_CONJ 0
+  #define B_CONJ 0
+#endif
+#if  defined(RN) || defined(RT) || defined(CN) || defined(CT)
+  #define A_CONJ 1
+  #define B_CONJ 0
+#endif
+#if  defined(NR) || defined(NC) || defined(TR) || defined(TC)
+  #define A_CONJ 0
+  #define B_CONJ 1
+#endif
+#if  defined(RR) || defined(RC) || defined(CR) || defined(CC)
+  #define A_CONJ 1
+  #define B_CONJ 1
+#endif
+
+/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = &alpha, %7 = m_counter, %8 = b_pref */
+/* r11 = m, r12 = k << 4, r13 = k, r14 = b_head, r15 = temp */
+
+/* m=8, ymm 0-3 temp, ymm 4-15 acc */
+#if A_CONJ == B_CONJ
+  #define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
+  #define acc_m8n1_con(ua,la,b1,uc,lc) "vfmaddsub231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmaddsub231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
+#else
+  #define acc_m4n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%ymm"#ar",%%ymm"#b2",%%ymm"#cl"; vfnmadd231ps %%ymm"#ai",%%ymm"#b2",%%ymm"#cr";"
+  #define acc_m8n1_con(ua,la,b1,uc,lc) "vfmsubadd231ps %%ymm"#ua",%%ymm"#b1",%%ymm"#uc"; vfmsubadd231ps %%ymm"#la",%%ymm"#b1",%%ymm"#lc";"
+#endif
+/* expanded accumulators for m8n1 and m8n2 */
+#define KERNEL_k1m8n1 \
+  "vbroadcastsd (%1),%%ymm0; addq $8,%1;"\
+  "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;" acc_m4n1_exp(1,2,0,4,5)\
+  "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2;" acc_m4n1_exp(1,2,0,6,7)\
+  "addq $64,%0;"
+#define KERNEL_k1m8n2 \
+  "vbroadcastsd (%1),%%ymm0; vbroadcastsd 8(%1),%%ymm1; addq $16,%1;"\
+  "vmovsldup (%0),%%ymm2; vmovshdup (%0),%%ymm3;" acc_m4n1_exp(2,3,0,4,5) acc_m4n1_exp(2,3,1,8,9)\
+  "vmovsldup 32(%0),%%ymm2; vmovshdup 32(%0),%%ymm3;" acc_m4n1_exp(2,3,0,6,7) acc_m4n1_exp(2,3,1,10,11)\
+  "addq $64,%0;"
+/* contracted accumulators for m8n4 and m8n6 */
+#define acc_m8n2_con(ua,la,luc,llc,ruc,rlc,lboff,rboff,...) \
+  "vbroadcastss "#lboff"("#__VA_ARGS__"),%%ymm2;" acc_m8n1_con(ua,la,2,luc,llc)\
+  "vbroadcastss "#rboff"("#__VA_ARGS__"),%%ymm3;" acc_m8n1_con(ua,la,3,ruc,rlc)
+#define KERNEL_1_k1m8n4 \
+  "vmovups (%0),%%ymm0; vmovups 32(%0),%%ymm1; prefetcht0 512(%0); addq $64,%0;"\
+  acc_m8n2_con(0,1,4,5,6,7,0,8,%1) acc_m8n2_con(0,1,8,9,10,11,0,8,%1,%%r12,1)
+#define KERNEL_2_k1m8n4 \
+  "vpermilps $177,%%ymm0,%%ymm0; vpermilps $177,%%ymm1,%%ymm1;"\
+  acc_m8n2_con(0,1,4,5,6,7,4,12,%1) acc_m8n2_con(0,1,8,9,10,11,4,12,%1,%%r12,1)
+#define KERNEL_1_k1m8n6 KERNEL_1_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,0,8,%1,%%r12,2)
+#define KERNEL_2_k1m8n6 KERNEL_2_k1m8n4 acc_m8n2_con(0,1,12,13,14,15,4,12,%1,%%r12,2)
+#define KERNEL_k1m8n4 KERNEL_1_k1m8n4 KERNEL_2_k1m8n4 "addq $16,%1;"
+#define KERNEL_k1m8n6 KERNEL_1_k1m8n6 KERNEL_2_k1m8n6 "addq $16,%1;"
+#define zero_4ymm(no1,no2,no3,no4) \
+  "vpxor %%ymm"#no1",%%ymm"#no1",%%ymm"#no1"; vpxor %%ymm"#no2",%%ymm"#no2",%%ymm"#no2";"\
+  "vpxor %%ymm"#no3",%%ymm"#no3",%%ymm"#no3"; vpxor %%ymm"#no4",%%ymm"#no4",%%ymm"#no4";"
+/* initialization and storage macros */
+#define INIT_m8n1 zero_4ymm(4,5,6,7)
+#define INIT_m8n2 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
+#define INIT_m8n4 zero_4ymm(4,5,6,7) zero_4ymm(8,9,10,11)
+#define INIT_m8n6 INIT_m8n4 zero_4ymm(12,13,14,15)
+#if A_CONJ == B_CONJ
+  #define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cl",%%ymm"#cr",%%ymm"#dst";"
+#else
+  #define cont_expacc(cl,cr,dst) "vpermilps $177,%%ymm"#cr",%%ymm"#cr"; vaddsubps %%ymm"#cr",%%ymm"#cl",%%ymm"#dst";"
+#endif
+#if A_CONJ == 0
+  #define save_1ymm(c,tmp,off,alpr,alpi,...) \
+    "vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmsubadd213ps "#off"("#__VA_ARGS__"),%%ymm"#alpr",%%ymm"#c";"\
+    "vfmsubadd231ps %%ymm"#tmp",%%ymm"#alpi",%%ymm"#c"; vmovups %%ymm"#c","#off"("#__VA_ARGS__");"
+#else
+  #define save_1ymm(c,tmp,off,alpr,alpi,...) \
+    "vpermilps $177,%%ymm"#c",%%ymm"#tmp"; vfmaddsub213ps "#off"("#__VA_ARGS__"),%%ymm"#alpi",%%ymm"#tmp";"\
+    "vfmaddsub231ps %%ymm"#c",%%ymm"#alpr",%%ymm"#tmp"; vmovups %%ymm"#tmp","#off"("#__VA_ARGS__");"
+#endif
+#define save_init_m8 "movq %2,%3; addq $64,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;"
+#define SAVE_m8n1 save_init_m8 cont_expacc(4,5,4) cont_expacc(6,7,6) save_1ymm(4,2,0,0,1,%3) save_1ymm(6,3,32,0,1,%3)
+#define SAVE_m8n2 SAVE_m8n1\
+  cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3,%4,1) save_1ymm(10,3,32,0,1,%3,%4,1)
+#define SAVE_m8n4 save_init_m8\
+  save_1ymm(4,2,0,0,1,%3) save_1ymm(5,3,32,0,1,%3) save_1ymm(6,2,0,0,1,%3,%4,1) save_1ymm(7,3,32,0,1,%3,%4,1) "leaq (%3,%4,2),%3;"\
+  save_1ymm(8,2,0,0,1,%3) save_1ymm(9,3,32,0,1,%3) save_1ymm(10,2,0,0,1,%3,%4,1) save_1ymm(11,3,32,0,1,%3,%4,1)
+#define SAVE_m8n6 SAVE_m8n4 "leaq (%3,%4,2),%3;"\
+  save_1ymm(12,2,0,0,1,%3) save_1ymm(13,3,32,0,1,%3) save_1ymm(14,2,0,0,1,%3,%4,1) save_1ymm(15,3,32,0,1,%3,%4,1)
+#define COMPUTE_m8(ndim) \
+  "movq %%r14,%1;" INIT_m8n##ndim "movq %2,%3; movq %%r13,%5;"\
+  "testq %5,%5; jz "#ndim"8883f; cmpq $10,%5; jb "#ndim"8882f;"\
+  "movq $10,%5; movq $84,%%r15;"\
+  #ndim"8881:\n\t"\
+  "prefetcht1 (%3); subq $63,%3; addq %%r15,%3;"\
+  "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\
+  KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
+  "testq $12,%5; movq $84,%%r15; cmovz %4,%%r15; prefetcht1 (%8); addq $16,%8;"\
+  KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
+  "addq $4,%5; cmpq %5,%%r13; jnb "#ndim"8881b;"\
+  "movq %2,%3; negq %5; leaq 10(%%r13,%5,1),%5; prefetcht0 (%6); prefetcht0 7(%6);"\
+  #ndim"8882:\n\t"\
+  "prefetcht0 (%3); prefetcht0 63(%3); addq %4,%3;"\
+  KERNEL_k1m8n##ndim "decq %5; jnz "#ndim"8882b;"\
+  #ndim"8883:\n\t"\
+  "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m8n##ndim
+/* m=4, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */
+#define KERNEL_k1m4n1 \
+  "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\
+  "vbroadcastsd (%1),%%ymm0;" acc_m4n1_exp(1,2,0,4,5) "addq $8,%1;"
+#define acc_m4n2_exp(c1l,c1r,c2l,c2r,...) \
+  "vbroadcastsd ("#__VA_ARGS__"),%%ymm2;" acc_m4n1_exp(0,1,2,c1l,c1r)\
+  "vbroadcastsd 8("#__VA_ARGS__"),%%ymm3;" acc_m4n1_exp(0,1,3,c2l,c2r)
+#define KERNEL_h_k1m4n2 \
+  "vmovsldup (%0),%%ymm0; vmovshdup (%0),%%ymm1; addq $32,%0;" acc_m4n2_exp(4,5,6,7,%1)
+#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 acc_m4n2_exp(8,9,10,11,%1,%%r12,1)
+#define KERNEL_h_k1m4n6 KERNEL_h_k1m4n4 acc_m4n2_exp(12,13,14,15,%1,%%r12,2)
+#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;"
+#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;"
+#define KERNEL_k1m4n6 KERNEL_h_k1m4n6 "addq $16,%1;"
+#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4; vpxor %%ymm5,%%ymm5,%%ymm5;"
+#define INIT_m4n2 zero_4ymm(4,5,6,7)
+#define INIT_m4n4 INIT_m4n2 zero_4ymm(8,9,10,11)
+#define INIT_m4n6 INIT_m4n4 zero_4ymm(12,13,14,15)
+#define save_init_m4 "movq %2,%3; addq $32,%2; vbroadcastss (%6),%%ymm0; vbroadcastss 4(%6),%%ymm1;"
+#define SAVE_m4n1 save_init_m4 cont_expacc(4,5,4) save_1ymm(4,2,0,0,1,%3)
+#define SAVE_m4n2 SAVE_m4n1 cont_expacc(6,7,6) save_1ymm(6,3,0,0,1,%3,%4,1)
+#define SAVE_m4n4 SAVE_m4n2 "leaq (%3,%4,2),%3;"\
+  cont_expacc(8,9,8) cont_expacc(10,11,10) save_1ymm(8,2,0,0,1,%3) save_1ymm(10,3,0,0,1,%3,%4,1)
+#define SAVE_m4n6 SAVE_m4n4 "leaq (%3,%4,2),%3;"\
+  cont_expacc(12,13,12) cont_expacc(14,15,14) save_1ymm(12,2,0,0,1,%3) save_1ymm(14,3,0,0,1,%3,%4,1)
+#define COMPUTE_m4(ndim) \
+  "movq %%r14,%1;" INIT_m4n##ndim "movq %%r13,%5;"\
+  "testq %5,%5; jz "#ndim"4442f;"\
+  #ndim"4441:\n\t"\
+  KERNEL_k1m4n##ndim\
+  "decq %5; jnz "#ndim"4441b;"\
+  #ndim"4442:\n\t"\
+  SAVE_m4n##ndim
+/* m=2, xmm 0-3 temp, xmm 4-15 acc, expanded accumulators */
+#if A_CONJ == B_CONJ
+  #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
+#else
+  #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
+#endif
+#define KERNEL_h_k1m2n1 \
+  "vmovsldup (%0),%%xmm0; vmovshdup (%0),%%xmm1; addq $16,%0;"\
+  "vmovddup (%1),%%xmm2;" acc_m2n1_exp(0,1,2,4,5)
+#define KERNEL_h_k1m2n2 KERNEL_h_k1m2n1\
+  "vmovddup 8(%1),%%xmm3;" acc_m2n1_exp(0,1,3,6,7)
+#define acc_m2n2_exp(c1,c2,c3,c4,...)\
+  "vmovddup ("#__VA_ARGS__"),%%xmm2;" acc_m2n1_exp(0,1,2,c1,c2)\
+  "vmovddup 8("#__VA_ARGS__"),%%xmm3;" acc_m2n1_exp(0,1,3,c3,c4)
+#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 acc_m2n2_exp(8,9,10,11,%1,%%r12,1)
+#define KERNEL_h_k1m2n6 KERNEL_h_k1m2n4 acc_m2n2_exp(12,13,14,15,%1,%%r12,2)
+#define KERNEL_k1m2n1 KERNEL_h_k1m2n1 "addq $8,%1;"
+#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;"
+#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;"
+#define KERNEL_k1m2n6 KERNEL_h_k1m2n6 "addq $16,%1;"
+#define zero_2xmm(no1,no2) "vpxor %%xmm"#no1",%%xmm"#no1",%%xmm"#no1"; vpxor %%xmm"#no2",%%xmm"#no2",%%xmm"#no2";"
+#define INIT_m2n1 zero_2xmm(4,5)
+#define INIT_m2n2 INIT_m2n1 zero_2xmm(6,7)
+#define INIT_m2n4 INIT_m2n2 zero_2xmm(8,9) zero_2xmm(10,11)
+#define INIT_m2n6 INIT_m2n4 zero_2xmm(12,13) zero_2xmm(14,15)
+#if A_CONJ == B_CONJ
+  #define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cl",%%xmm"#cr",%%xmm"#dst";"
+#else
+  #define cont_expxmmacc(cl,cr,dst) "vpermilps $177,%%xmm"#cr",%%xmm"#cr"; vaddsubps %%xmm"#cr",%%xmm"#cl",%%xmm"#dst";"
+#endif
+#if A_CONJ == 0
+  #define save_1xmm(c,tmp,alpr,alpi) \
+    "vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmsubadd213ps (%3),%%xmm"#alpr",%%xmm"#c";"\
+    "vfmsubadd231ps %%xmm"#tmp",%%xmm"#alpi",%%xmm"#c"; vmovups %%xmm"#c",(%3); addq %4,%3;"
+#else
+  #define save_1xmm(c,tmp,alpr,alpi) \
+    "vpermilps $177,%%xmm"#c",%%xmm"#tmp"; vfmaddsub213ps (%3),%%xmm"#alpi",%%xmm"#tmp";"\
+    "vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp"; vmovups %%xmm"#tmp",(%3); addq %4,%3;"
+#endif
+#define save_init_m2 "movq %2,%3; addq $16,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;"
+#define SAVE_m2n1 save_init_m2 cont_expxmmacc(4,5,4) save_1xmm(4,2,0,1)
+#define SAVE_m2n2 SAVE_m2n1 cont_expacc(6,7,6) save_1xmm(6,3,0,1)
+#define SAVE_m2n4 SAVE_m2n2 cont_expacc(8,9,8) save_1xmm(8,2,0,1) cont_expacc(10,11,10) save_1xmm(10,3,0,1)
+#define SAVE_m2n6 SAVE_m2n4 cont_expacc(12,13,12) save_1xmm(12,2,0,1) cont_expacc(14,15,14) save_1xmm(14,3,0,1)
+#define COMPUTE_m2(ndim) \
+  "movq %%r14,%1;" INIT_m2n##ndim "movq %%r13,%5;"\
+  "testq %5,%5; jz "#ndim"2222f;"\
+  #ndim"2221:\n\t"\
+  KERNEL_k1m2n##ndim\
+  "decq %5; jnz "#ndim"2221b;"\
+  #ndim"2222:\n\t"\
+  SAVE_m2n##ndim
+/* m=1, xmm 0-3 temp, xmm 4-9 acc, expanded accumulators */
+#if A_CONJ == B_CONJ
+  #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
+  #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";"
+#else
+  #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfnmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
+  #define acc_m1n2_exp(arb,aib,b4,cl,cr) "vfmadd231ps %%xmm"#arb",%%xmm"#b4",%%xmm"#cl"; vfnmadd231ps %%xmm"#aib",%%xmm"#b4",%%xmm"#cr";"
+#endif
+#define KERNEL_k1m1n1 \
+  "vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\
+  "vmovsd (%1),%%xmm2; addq $8,%1;" acc_m1n1_exp(0,1,2,4,5)
+#define KERNEL_h_k1m1n2 \
+  "vbroadcastss (%0),%%xmm0; vbroadcastss 4(%0),%%xmm1; addq $8,%0;"\
+  "vmovups (%1),%%xmm2;" acc_m1n2_exp(0,1,2,4,5)
+#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vmovups (%1,%%r12,1),%%xmm2;" acc_m1n2_exp(0,1,2,6,7)
+#define KERNEL_h_k1m1n6 KERNEL_h_k1m1n4 "vmovups (%1,%%r12,2),%%xmm2;" acc_m1n2_exp(0,1,2,8,9)
+#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $16,%1;"
+#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;"
+#define KERNEL_k1m1n6 KERNEL_h_k1m1n6 "addq $16,%1;"
+#define INIT_m1n1 zero_2xmm(4,5)
+#define INIT_m1n2 zero_2xmm(4,5)
+#define INIT_m1n4 INIT_m1n2 zero_2xmm(6,7)
+#define INIT_m1n6 INIT_m1n4 zero_2xmm(8,9)
+#if A_CONJ == 0
+  #define save_m1n1(c,tmp1,tmp2,alpr,alpi) \
+    "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c";"\
+    "vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c"; vmovsd %%xmm"#c",(%3);"
+  #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
+    "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\
+    "vfmsubadd213ps %%xmm"#tmp2",%%xmm"#alpr",%%xmm"#c"; vfmsubadd231ps %%xmm"#tmp1",%%xmm"#alpi",%%xmm"#c";"\
+    "vmovsd %%xmm"#c",(%3); vmovhpd %%xmm"#c",(%3,%4,1); leaq (%3,%4,2),%3;"
+#else
+  #define save_m1n1(c,tmp1,tmp2,alpr,alpi) \
+    "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1";"\
+    "vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1"; vmovsd %%xmm"#tmp1",(%3);"
+  #define save_m1n2(c,tmp1,tmp2,alpr,alpi) \
+    "vpermilps $177,%%xmm"#c",%%xmm"#tmp1"; vmovsd (%3),%%xmm"#tmp2"; vmovhpd (%3,%4,1),%%xmm"#tmp2",%%xmm"#tmp2";"\
+    "vfmaddsub213ps %%xmm"#tmp2",%%xmm"#alpi",%%xmm"#tmp1"; vfmaddsub231ps %%xmm"#c",%%xmm"#alpr",%%xmm"#tmp1";"\
+    "vmovsd %%xmm"#tmp1",(%3); vmovhpd %%xmm"#tmp1",(%3,%4,1); leaq (%3,%4,2),%3;"
+#endif
+#define save_init_m1 "movq %2,%3; addq $8,%2; vbroadcastss (%6),%%xmm0; vbroadcastss 4(%6),%%xmm1;"
+#define SAVE_m1n1 save_init_m1 cont_expxmmacc(4,5,4) save_m1n1(4,2,3,0,1)
+#define SAVE_m1n2 save_init_m1 cont_expxmmacc(4,5,4) save_m1n2(4,2,3,0,1)
+#define SAVE_m1n4 SAVE_m1n2 cont_expxmmacc(6,7,6) save_m1n2(6,2,3,0,1)
+#define SAVE_m1n6 SAVE_m1n4 cont_expxmmacc(8,9,8) save_m1n2(8,2,3,0,1)
+#define COMPUTE_m1(ndim) \
+  "movq %%r14,%1;" INIT_m1n##ndim "movq %%r13,%5;"\
+  "testq %5,%5; jz "#ndim"1112f;"\
+  #ndim"1111:\n\t"\
+  KERNEL_k1m1n##ndim\
+  "decq %5; jnz "#ndim"1111b;"\
+  #ndim"1112:\n\t"\
+  SAVE_m1n##ndim
+#define COMPUTE(ndim) {\
+  b_pref = b_ptr + ndim * K *2;\
+  __asm__ __volatile__ (\
+    "movq %1,%%r14; movq %5,%%r13; movq %5,%%r12; salq $4,%%r12; movq %7,%%r11;"\
+    "cmpq $8,%7; jb "#ndim"9992f;"\
+    #ndim"9991:\n\t"\
+    COMPUTE_m8(ndim)\
+    "subq $8,%7; cmpq $8,%7; jnb "#ndim"9991b;"\
+    #ndim"9992:\n\t"\
+    "cmpq $4,%7; jb "#ndim"9993f;"\
+    COMPUTE_m4(ndim) "subq $4,%7;"\
+    #ndim"9993:\n\t"\
+    "cmpq $2,%7; jb "#ndim"9994f;"\
+    COMPUTE_m2(ndim) "subq $2,%7;"\
+    #ndim"9994:\n\t"\
+    "testq %7,%7; jz "#ndim"9995f;"\
+    COMPUTE_m1(ndim)\
+    #ndim"9995:\n\t"\
+    "movq %%r14,%1; movq %%r13,%5; movq %%r11,%7; vzeroupper;"\
+    :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(K),"+r"(alp),"+r"(M),"+r"(b_pref)\
+    ::"cc","memory","r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5",\
+    "xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
+  a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\
+}
+int __attribute__ ((noinline))
+CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC)
+{
+    if(m==0||n==0||k==0||(alphar==0.0 && alphai==0.0)) return 0;
+    int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2;
+#if A_CONJ == B_CONJ
+    float const_val[2] = {-alphar, -alphai};
+#else
+    float const_val[2] = {alphar, alphai};
+#endif
+    int64_t M = (int64_t)m, K = (int64_t)k;
+    BLASLONG n_count = n;
+    float *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*alp = const_val,*b_pref = B;
+    for(;n_count>5;n_count-=6) COMPUTE(6)
+    for(;n_count>3;n_count-=4) COMPUTE(4)
+    for(;n_count>1;n_count-=2) COMPUTE(2)
+    if(n_count>0) COMPUTE(1)
+    return 0;
+}

From c418c81224b56e2a99b5f3e7a159b30bfd8f8d8b Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 23 Dec 2019 23:41:44 +0800
Subject: [PATCH 137/210] Update KERNEL.HASWELL

---
 kernel/x86_64/KERNEL.HASWELL | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL
index 5c11ced1d..9bd34f1e3 100644
--- a/kernel/x86_64/KERNEL.HASWELL
+++ b/kernel/x86_64/KERNEL.HASWELL
@@ -56,7 +56,7 @@ DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
 DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 CTRMMKERNEL    =  cgemm_kernel_8x2_haswell.S
-CGEMMKERNEL    =  cgemm_kernel_8x2_haswell.S
+CGEMMKERNEL    =  cgemm_kernel_8x2_haswell.c
 CGEMMINCOPY    =  ../generic/zgemm_ncopy_8.c
 CGEMMITCOPY    =  ../generic/zgemm_tcopy_8.c
 CGEMMONCOPY    =  ../generic/zgemm_ncopy_2.c

From 2cd9306bb5138f8ec796964fa578b2ea1b73e921 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 23 Dec 2019 23:42:30 +0800
Subject: [PATCH 138/210] Update KERNEL.ZEN

---
 kernel/x86_64/KERNEL.ZEN | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN
index be4503d47..aa4ba4834 100644
--- a/kernel/x86_64/KERNEL.ZEN
+++ b/kernel/x86_64/KERNEL.ZEN
@@ -53,7 +53,7 @@ DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
 DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 CTRMMKERNEL    =  cgemm_kernel_8x2_haswell.S
-CGEMMKERNEL    =  cgemm_kernel_8x2_haswell.S
+CGEMMKERNEL    =  cgemm_kernel_8x2_haswell.c
 CGEMMINCOPY    =  ../generic/zgemm_ncopy_8.c
 CGEMMITCOPY    =  ../generic/zgemm_tcopy_8.c
 CGEMMONCOPY    =  ../generic/zgemm_ncopy_2.c
@@ -64,7 +64,7 @@ CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
 CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 ZTRMMKERNEL    =  zgemm_kernel_4x2_haswell.S
-ZGEMMKERNEL    =  zgemm_kernel_4x2_haswell.S
+ZGEMMKERNEL    =  zgemm_kernel_4x2_haswell.c
 ZGEMMINCOPY    =  ../generic/zgemm_ncopy_4.c
 ZGEMMITCOPY    =  ../generic/zgemm_tcopy_4.c
 ZGEMMONCOPY    =  ../generic/zgemm_ncopy_2.c

From 611445c7f8d136ce66bb8a825b3383fc8eb028bd Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 23 Dec 2019 23:44:55 +0800
Subject: [PATCH 139/210] Update param.h

---
 param.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/param.h b/param.h
index 5fb0868b2..d80bbf4f2 100644
--- a/param.h
+++ b/param.h
@@ -668,8 +668,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define SGEMM_DEFAULT_P 768
 #define DGEMM_DEFAULT_P 512
-#define CGEMM_DEFAULT_P 384
-#define ZGEMM_DEFAULT_P 256
+#define CGEMM_DEFAULT_P 256
+#define ZGEMM_DEFAULT_P 192
 
 #ifdef WINDOWS_ABI
 #define SGEMM_DEFAULT_Q 320
@@ -678,8 +678,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_Q 384
 #define DGEMM_DEFAULT_Q 256
 #endif
-#define CGEMM_DEFAULT_Q 192
-#define ZGEMM_DEFAULT_Q 128
+#define CGEMM_DEFAULT_Q 256
+#define ZGEMM_DEFAULT_Q 192
 
 #define SGEMM_DEFAULT_R sgemm_r
 #define DGEMM_DEFAULT_R 13824
@@ -1571,7 +1571,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define SGEMM_DEFAULT_P 768
 #define DGEMM_DEFAULT_P 512
-#define CGEMM_DEFAULT_P 384
+#define CGEMM_DEFAULT_P 256
 #define ZGEMM_DEFAULT_P 192
 
 #ifdef WINDOWS_ABI
@@ -1581,7 +1581,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_Q 384
 #define DGEMM_DEFAULT_Q 256
 #endif
-#define CGEMM_DEFAULT_Q 192
+#define CGEMM_DEFAULT_Q 256
 #define ZGEMM_DEFAULT_Q 192
 
 #define SGEMM_DEFAULT_R sgemm_r

From 6fbe51072bed086b71d18ed77ee7b8cc79e63dd6 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Tue, 24 Dec 2019 00:24:40 +0800
Subject: [PATCH 140/210] Update CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 3859a9c19..99f82df9d 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -171,3 +171,9 @@ In chronological order:
   * [2019-02-01] added missing Blas Level-1,2 (single precision)  simd codes
   * [2019-03-14] power9 dgemm/dtrmm kernel
   * [2019-04-29] power9 sgemm/strmm kernel 
+
+* Jiachen Wang <https://github.com/wjc404>
+  * [2018.07] optimize AVX2 DGEMM
+  * [2018-11] optimize AVX512 SGEMM and DGEMM
+  * [2018-11] AVX512 CGEMM & ZGEMM kernels
+  * [2018-12] optimize AVX2 CGEMM and ZGEMM

From 3ce6bcdb5f61fad716703b9facf26087aade7ae2 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Tue, 24 Dec 2019 00:30:16 +0800
Subject: [PATCH 141/210] Update CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 99f82df9d..6d30ee942 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -173,7 +173,8 @@ In chronological order:
   * [2019-04-29] power9 sgemm/strmm kernel 
 
 * Jiachen Wang <https://github.com/wjc404>
-  * [2018.07] optimize AVX2 DGEMM
-  * [2018-11] optimize AVX512 SGEMM and DGEMM
-  * [2018-11] AVX512 CGEMM & ZGEMM kernels
-  * [2018-12] optimize AVX2 CGEMM and ZGEMM
+  * [2019-07-29] optimize AVX2 DGEMM
+  * [2019-10-20] AVX512 DGEMM kernel (4x8)
+  * [2019-11-06] optimize AVX512 SGEMM
+  * [2019-11-12] AVX512 CGEMM & ZGEMM kernels
+  * [2019-12-23] optimize AVX2 CGEMM and ZGEMM

From eeecd623d85e90c75172b610e4ecb11f4c04650e Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Tue, 24 Dec 2019 00:40:16 +0800
Subject: [PATCH 142/210] Update cgemm_kernel_8x2_haswell.c

---
 kernel/x86_64/cgemm_kernel_8x2_haswell.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/kernel/x86_64/cgemm_kernel_8x2_haswell.c b/kernel/x86_64/cgemm_kernel_8x2_haswell.c
index 49fef90db..eab8c9ea5 100644
--- a/kernel/x86_64/cgemm_kernel_8x2_haswell.c
+++ b/kernel/x86_64/cgemm_kernel_8x2_haswell.c
@@ -104,6 +104,7 @@
   KERNEL_k1m8n##ndim "decq %5; jnz "#ndim"8882b;"\
   #ndim"8883:\n\t"\
   "prefetcht0 (%%r14); prefetcht0 64(%%r14);" SAVE_m8n##ndim
+
 /* m=4, ymm 0-3 temp, ymm 4-15 acc, expanded accumulators */
 #define KERNEL_k1m4n1 \
   "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\
@@ -137,6 +138,7 @@
   "decq %5; jnz "#ndim"4441b;"\
   #ndim"4442:\n\t"\
   SAVE_m4n##ndim
+
 /* m=2, xmm 0-3 temp, xmm 4-15 acc, expanded accumulators */
 #if A_CONJ == B_CONJ
   #define acc_m2n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
@@ -189,6 +191,7 @@
   "decq %5; jnz "#ndim"2221b;"\
   #ndim"2222:\n\t"\
   SAVE_m2n##ndim
+
 /* m=1, xmm 0-3 temp, xmm 4-9 acc, expanded accumulators */
 #if A_CONJ == B_CONJ
   #define acc_m1n1_exp(ar,ai,b2,cl,cr) "vfmadd231ps %%xmm"#ar",%%xmm"#b2",%%xmm"#cl"; vfmadd231ps %%xmm"#ai",%%xmm"#b2",%%xmm"#cr";"
@@ -242,6 +245,7 @@
   "decq %5; jnz "#ndim"1111b;"\
   #ndim"1112:\n\t"\
   SAVE_m1n##ndim
+
 #define COMPUTE(ndim) {\
   b_pref = b_ptr + ndim * K *2;\
   __asm__ __volatile__ (\
@@ -266,6 +270,7 @@
     "xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\
   a_ptr -= M * K *2; b_ptr += ndim * K *2; c_ptr += (ndim * LDC - M) * 2;\
 }
+
 int __attribute__ ((noinline))
 CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC)
 {

From 5fd1edead95b86df0e92fd2be1e0435d746af56d Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Fri, 27 Dec 2019 18:00:55 +0800
Subject: [PATCH 143/210] Create cgemm3m_kernel_8x4_haswell.c

---
 kernel/x86_64/cgemm3m_kernel_8x4_haswell.c | 279 +++++++++++++++++++++
 1 file changed, 279 insertions(+)
 create mode 100644 kernel/x86_64/cgemm3m_kernel_8x4_haswell.c

diff --git a/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c b/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c
new file mode 100644
index 000000000..831f25483
--- /dev/null
+++ b/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c
@@ -0,0 +1,279 @@
+/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */
+/* r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = tmp */
+
+#include "common.h"
+#include <stdint.h>
+
+//recommended settings: GEMM_P = 320, GEMM_Q = 320.
+
+/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */
+#define KERNEL_k1m8n1 \
+    "vmovups (%0),%%ymm1; addq $32,%0;"\
+    "vbroadcastss (%1),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\
+    "addq $4,%1;"
+#define KERNEL_h_k1m8n2 \
+    "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\
+    "vbroadcastsd (%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"
+#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $8,%1;"
+#define KERNEL_h_k1m8n4 \
+    KERNEL_h_k1m8n2 "vbroadcastsd 8(%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"
+#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;"
+#define unit_kernel_k1m8n4(c1,c2,c3,c4,...) \
+    "vbroadcastsd  ("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\
+    "vbroadcastsd 8("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";"
+#define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,%1,%%r12,1)
+#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;"
+#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,%1,%%r12,2)
+#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%1;"
+#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
+#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;"
+#define INIT_m8n4 INIT_m8n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;"
+#define unit_init_m8n4(c1,c2,c3,c4) \
+    "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
+#define INIT_m8n8  INIT_m8n4 unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11)
+#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15)
+#define SAVE_m8n1 \
+    "vunpcklps %%ymm4,%%ymm4,%%ymm2; vunpckhps %%ymm4,%%ymm4,%%ymm3;"\
+    "vperm2f128 $2,%%ymm2,%%ymm3,%%ymm1; vperm2f128 $19,%%ymm2,%%ymm3,%%ymm2;"\
+    "vfmadd213ps (%2),%%ymm0,%%ymm1; vfmadd213ps 32(%2),%%ymm0,%%ymm2; vmovups %%ymm1,(%2); vmovups %%ymm2,32(%2);"
+#define unit_save_m8n2(c1,c2) \
+    "vunpcklpd "#c2","#c1",%%ymm2; vunpckhpd "#c2","#c1",%%ymm3;"\
+    "vperm2f128 $2,%%ymm2,%%ymm3,"#c1"; vperm2f128 $19,%%ymm2,%%ymm3,"#c2";"\
+    "vmovsldup "#c1",%%ymm2; vmovsldup "#c2",%%ymm3;"\
+    "vfmadd213ps (%5),%%ymm0,%%ymm2; vfmadd213ps 32(%5),%%ymm0,%%ymm3; vmovups %%ymm2,(%5); vmovups %%ymm3,32(%5);"\
+    "vmovshdup "#c1",%%ymm2; vmovshdup "#c2",%%ymm3;"\
+    "vfmadd213ps (%5,%3,1),%%ymm0,%%ymm2; vfmadd213ps 32(%5,%3,1),%%ymm0,%%ymm3; vmovups %%ymm2,(%5,%3,1); vmovups %%ymm3,32(%5,%3,1);"\
+    "leaq (%5,%3,2),%5;"
+#define SAVE_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5)
+#define SAVE_m8n4  SAVE_m8n2  unit_save_m8n2(%%ymm6,%%ymm7)
+#define SAVE_m8n8  SAVE_m8n4  unit_save_m8n2(%%ymm8,%%ymm9)   unit_save_m8n2(%%ymm10,%%ymm11)
+#define SAVE_m8n12 SAVE_m8n8  unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15)
+#define COMPUTE_m8(ndim) \
+    INIT_m8n##ndim\
+    "movq %%r13,%4; movq %%r14,%1; movq %2,%5; xorq %%r15,%%r15;"\
+    "cmpq $24,%4; jb "#ndim"882f;"\
+    #ndim"881:\n\t"\
+    "cmpq $126,%%r15; movq $126,%%r15; cmoveq %3,%%r15;"\
+    "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\
+    "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
+    "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
+    "prefetcht1 (%5); leaq -63(%5,%%r15,1),%5;"\
+    "prefetcht0 64(%1); prefetcht0 64(%1,%%r12,1); prefetcht0 64(%1,%%r12,2);"\
+    "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
+    "prefetcht0 512(%0);" KERNEL_k1m8n##ndim KERNEL_k1m8n##ndim\
+    "prefetcht1 (%8); addq $16,%8;"\
+    "subq $8,%4; cmpq $24,%4; jnb "#ndim"881b;"\
+    "movq %2,%5;"\
+    #ndim"882:\n\t"\
+    "testq %4,%4; jz "#ndim"883f;"\
+    "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5;"\
+    KERNEL_k1m8n##ndim\
+    "decq %4; jmp "#ndim"882b;"\
+    #ndim"883:\n\t"\
+    "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\
+    SAVE_m8n##ndim "addq $64,%2;"
+
+/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */
+#define KERNEL_k1m4n1 \
+    "vmovups (%0),%%xmm1; addq $16,%0;"\
+    "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
+    "addq $4,%1;"
+#define KERNEL_h_k1m4n2 \
+    "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\
+    "vmovddup (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;"
+#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $8,%1;"
+#define KERNEL_h_k1m4n4 \
+    KERNEL_h_k1m4n2 "vmovddup 8(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;"
+#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;"
+#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \
+    "vmovddup  ("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\
+    "vmovddup 8("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";"
+#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,%1,%%r12,1)
+#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;"
+#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,%1,%%r12,2)
+#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;"
+#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
+#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
+#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;"
+#define unit_init_m4n4(c1,c2,c3,c4) \
+    "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
+#define INIT_m4n8  INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11)
+#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15)
+#define SAVE_m4n1 \
+    "vunpcklps %%xmm4,%%xmm4,%%xmm2; vunpckhps %%xmm4,%%xmm4,%%xmm3;"\
+    "vfmadd213ps (%2),%%xmm0,%%xmm2; vfmadd213ps 16(%2),%%xmm0,%%xmm3; vmovups %%xmm2,(%2); vmovups %%xmm3,16(%2);"
+#define unit_save_m4n2(c1,c2) \
+    "vunpcklpd "#c2","#c1",%%xmm2; vunpckhpd "#c2","#c1","#c2"; vmovapd %%xmm2,"#c1";"\
+    "vmovsldup "#c1",%%xmm2; vmovsldup "#c2",%%xmm3;"\
+    "vfmadd213ps (%5),%%xmm0,%%xmm2; vfmadd213ps 16(%5),%%xmm0,%%xmm3; vmovups %%xmm2,(%5); vmovups %%xmm3,16(%5);"\
+    "vmovshdup "#c1",%%xmm2; vmovshdup "#c2",%%xmm3;"\
+    "vfmadd213ps (%5,%3,1),%%xmm0,%%xmm2; vfmadd213ps 16(%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm2,(%5,%3,1); vmovups %%xmm3,16(%5,%3,1);"\
+    "leaq (%5,%3,2),%5;"
+#define SAVE_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5)
+#define SAVE_m4n4  SAVE_m4n2  unit_save_m4n2(%%xmm6,%%xmm7)
+#define SAVE_m4n8  SAVE_m4n4  unit_save_m4n2(%%xmm8,%%xmm9)   unit_save_m4n2(%%xmm10,%%xmm11)
+#define SAVE_m4n12 SAVE_m4n8  unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15)
+#define COMPUTE_m4(ndim) \
+    INIT_m4n##ndim\
+    "movq %%r13,%4; movq %%r14,%1;"\
+    #ndim"442:\n\t"\
+    "testq %4,%4; jz "#ndim"443f;"\
+    KERNEL_k1m4n##ndim\
+    "decq %4; jmp "#ndim"442b;"\
+    #ndim"443:\n\t"\
+    SAVE_m4n##ndim "addq $32,%2;"
+
+/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */
+#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
+#define KERNEL_k1m2n1 \
+    "vmovsd (%0),%%xmm1; addq $8,%0;"\
+    "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
+    "addq $4,%1;"
+#define SAVE_m2n1 \
+    "vunpcklps %%xmm4,%%xmm4,%%xmm1; vfmadd213ps (%2),%%xmm0,%%xmm1; vmovups %%xmm1,(%2);"
+#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
+#define KERNEL_k1m2n2 \
+    "vmovsd (%0),%%xmm1; addq $8,%0;"\
+    "vbroadcastss  (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
+    "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\
+    "addq $8,%1;"
+#define SAVE_m2n2 SAVE_m2n1 \
+    "vunpcklps %%xmm5,%%xmm5,%%xmm1; vfmadd213ps (%2,%3,1),%%xmm0,%%xmm1; vmovups %%xmm1,(%2,%3,1);"
+#define INIT_m2n4  INIT_m2n2
+#define INIT_m2n8  INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;"
+#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;"
+#define KERNEL_k1m2n4 \
+    "vmovups (%1),%%xmm3; addq $16,%1;"\
+    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
+    "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\
+    "addq $8,%0;"
+#define KERNEL_k1m2n8 \
+    "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; addq $16,%1;"\
+    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\
+    "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\
+    "addq $8,%0;"
+#define KERNEL_k1m2n12 \
+    "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; vmovups (%1,%%r12,2),%%xmm1; addq $16,%1;"\
+    "vbroadcastss  (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\
+    "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\
+    "addq $8,%0;"
+#define unit_save_m2n4(c1,c2) \
+    "vunpcklpd "#c2","#c1",%%xmm1; vunpckhpd "#c2","#c1",%%xmm2;"\
+    "vmovsldup %%xmm1,%%xmm3; vfmadd213ps (%5),%%xmm0,%%xmm3; vmovups %%xmm3,(%5);"\
+    "vmovshdup %%xmm1,%%xmm3; vfmadd213ps (%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm3,(%5,%3,1);"\
+    "leaq (%5,%3,2),%5;"\
+    "vmovsldup %%xmm2,%%xmm3; vfmadd213ps (%5),%%xmm0,%%xmm3; vmovups %%xmm3,(%5);"\
+    "vmovshdup %%xmm2,%%xmm3; vfmadd213ps (%5,%3,1),%%xmm0,%%xmm3; vmovups %%xmm3,(%5,%3,1);"\
+    "leaq (%5,%3,2),%5;"
+#define SAVE_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5)
+#define SAVE_m2n8   SAVE_m2n4   unit_save_m2n4(%%xmm6,%%xmm7)
+#define SAVE_m2n12  SAVE_m2n8   unit_save_m2n4(%%xmm8,%%xmm9)
+#define COMPUTE_m2(ndim) \
+    INIT_m2n##ndim\
+    "movq %%r13,%4; movq %%r14,%1;"\
+    #ndim"222:\n\t"\
+    "testq %4,%4; jz "#ndim"223f;"\
+    KERNEL_k1m2n##ndim\
+    "decq %4; jmp "#ndim"222b;"\
+    #ndim"223:\n\t"\
+    SAVE_m2n##ndim "addq $16,%2;"
+
+/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */
+#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
+#define KERNEL_k1m1n1 \
+    "vmovss (%1),%%xmm3; addq $4,%1;"\
+    "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\
+    "addq $4,%0;"
+#define SAVE_m1n1 \
+    "vunpcklps %%xmm4,%%xmm4,%%xmm4; vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);"
+#define INIT_m1n2 INIT_m1n1
+#define KERNEL_k1m1n2 \
+    "vmovsd (%1),%%xmm3; addq $8,%1;"\
+    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
+    "addq $4,%0;"
+#define SAVE_m1n2 \
+    "vunpcklps %%xmm4,%%xmm4,%%xmm4; vmovsd (%2),%%xmm3; vmovhpd (%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\
+    "vmovsd %%xmm4,(%2); vmovhpd %%xmm4,(%2,%3,1);"
+#define INIT_m1n4  INIT_m1n2
+#define INIT_m1n8  INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;"
+#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;"
+#define KERNEL_k1m1n4 \
+    "vmovups (%1),%%xmm3; addq $16,%1;"\
+    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
+    "addq $4,%0;"
+#define KERNEL_k1m1n8 \
+    "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; addq $16,%1;"\
+    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\
+    "addq $4,%0;"
+#define KERNEL_k1m1n12 \
+    "vmovups (%1),%%xmm3; vmovups (%1,%%r12,1),%%xmm2; vmovups (%1,%%r12,2),%%xmm1; addq $16,%1;"\
+    "vbroadcastss  (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\
+    "addq $4,%0;"
+#define unit_save_m1n4(c1) \
+    "vunpcklps "#c1","#c1",%%xmm1; vunpckhps "#c1","#c1",%%xmm2;"\
+    "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\
+    "vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\
+    "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\
+    "vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"
+#define SAVE_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4)
+#define SAVE_m1n8  SAVE_m1n4    unit_save_m1n4(%%xmm5)
+#define SAVE_m1n12 SAVE_m1n8    unit_save_m1n4(%%xmm6)
+#define COMPUTE_m1(ndim) \
+    INIT_m1n##ndim\
+    "movq %%r13,%4; movq %%r14,%1;"\
+    #ndim"112:\n\t"\
+    "testq %4,%4; jz "#ndim"113f;"\
+    KERNEL_k1m1n##ndim\
+    "decq %4; jmp "#ndim"112b;"\
+    #ndim"113:\n\t"\
+    SAVE_m1n##ndim "addq $8,%2;"
+
+/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */
+/* %6 = "+r"(&alpha), %7 = "+r"(M), %8 = "+r"(next_b) */
+/* r11 = m(const), r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const),r15 = tmp */
+
+#define COMPUTE(ndim) {\
+    next_b = b_pointer + ndim * K;\
+    __asm__ __volatile__(\
+    "vbroadcastsd (%6),%%ymm0;"\
+    "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %7,%%r11;"\
+    "cmpq $8,%7;jb 33101"#ndim"f;"\
+    "33109"#ndim":\n\t"\
+    COMPUTE_m8(ndim)\
+    "subq $8,%7;cmpq $8,%7;jnb 33109"#ndim"b;"\
+    "33101"#ndim":\n\t"\
+    "cmpq $4,%7;jb 33103"#ndim"f;"\
+    COMPUTE_m4(ndim)\
+    "subq $4,%7;"\
+    "33103"#ndim":\n\t"\
+    "cmpq $2,%7;jb 33104"#ndim"f;"\
+    COMPUTE_m2(ndim)\
+    "subq $2,%7;"\
+    "33104"#ndim":\n\t"\
+    "testq %7,%7;jz 33105"#ndim"f;"\
+    COMPUTE_m1(ndim)\
+    "33105"#ndim":\n\t"\
+    "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\
+    :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(M),"+r"(next_b)\
+    ::"r11","r12","r13","r14","r15"\
+    "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\
+    a_pointer -= M * K; b_pointer += ndim * K; c_pointer += 2*(LDC * ndim - M);\
+}
+
+int __attribute__ ((noinline))
+CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alphar, float alphai, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC)
+{
+    if(m==0||n==0||k==0) return 0;
+    int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float) * 2;
+    float constval[2]; constval[0] = alphar; constval[1] = alphai;
+    float *const_val=constval;
+    int64_t M = (int64_t)m, K = (int64_t)k;
+    BLASLONG n_count = n;
+    float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B;
+    for(;n_count>11;n_count-=12) COMPUTE(12)
+    for(;n_count>7;n_count-=8) COMPUTE(8)
+    for(;n_count>3;n_count-=4) COMPUTE(4)
+    for(;n_count>1;n_count-=2) COMPUTE(2)
+    if(n_count>0) COMPUTE(1)
+    return 0;
+}

From ed9af2f7dae61a23a18aab11025e1b4e586f5a51 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Fri, 27 Dec 2019 18:01:38 +0800
Subject: [PATCH 144/210] Update KERNEL.HASWELL

---
 kernel/x86_64/KERNEL.HASWELL | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL
index 9bd34f1e3..bdebd22b9 100644
--- a/kernel/x86_64/KERNEL.HASWELL
+++ b/kernel/x86_64/KERNEL.HASWELL
@@ -97,6 +97,6 @@ ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
 ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
 ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
 
-CGEMM3MKERNEL    =  zgemm3m_kernel_4x8_nehalem.S
+CGEMM3MKERNEL    =  cgemm3m_kernel_8x4_haswell.c
 ZGEMM3MKERNEL    =  zgemm3m_kernel_2x8_nehalem.S
 

From 4c35b8dbaacfe23f76c48517674da8cf01cd2828 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Fri, 27 Dec 2019 18:03:01 +0800
Subject: [PATCH 145/210] Update gemm3m_level3.c

---
 driver/level3/gemm3m_level3.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/driver/level3/gemm3m_level3.c b/driver/level3/gemm3m_level3.c
index bbde7e5d1..d037e72cd 100644
--- a/driver/level3/gemm3m_level3.c
+++ b/driver/level3/gemm3m_level3.c
@@ -338,7 +338,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
       for(jjs = js; jjs < js + min_j; jjs += min_jj){
 	min_jj = min_j + js - jjs;
-	if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
+	if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
 
 	START_RPCC();
 
@@ -398,7 +398,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
       for(jjs = js; jjs < js + min_j; jjs += min_jj){
 	min_jj = min_j + js - jjs;
-	if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
+	if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
 
 	START_RPCC();
 
@@ -463,7 +463,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
       for(jjs = js; jjs < js + min_j; jjs += min_jj){
 	min_jj = min_j + js - jjs;
-	if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
+	if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
 
 	START_RPCC();
 

From 3a66c8cac18dbbc172fc703feab22f53755a52c9 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Fri, 27 Dec 2019 18:04:08 +0800
Subject: [PATCH 146/210] Update KERNEL.ZEN

---
 kernel/x86_64/KERNEL.ZEN | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN
index aa4ba4834..025db515e 100644
--- a/kernel/x86_64/KERNEL.ZEN
+++ b/kernel/x86_64/KERNEL.ZEN
@@ -94,6 +94,6 @@ ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
 ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
 ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
 
-CGEMM3MKERNEL    =  zgemm3m_kernel_4x8_nehalem.S
+CGEMM3MKERNEL    =  cgemm3m_kernel_8x4_haswell.c
 ZGEMM3MKERNEL    =  zgemm3m_kernel_2x8_nehalem.S
 

From 64639f440f7e9cf630100e4b03999e9321018876 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Fri, 27 Dec 2019 18:06:42 +0800
Subject: [PATCH 147/210] Update param.h

---
 param.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/param.h b/param.h
index d80bbf4f2..4084c781d 100644
--- a/param.h
+++ b/param.h
@@ -693,15 +693,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define XGEMM_DEFAULT_R xgemm_r
 #define XGEMM_DEFAULT_Q 128
 
-#define CGEMM3M_DEFAULT_UNROLL_N 8
-#define CGEMM3M_DEFAULT_UNROLL_M 4
+#define CGEMM3M_DEFAULT_UNROLL_N 4
+#define CGEMM3M_DEFAULT_UNROLL_M 8
 #define ZGEMM3M_DEFAULT_UNROLL_N 8
 #define ZGEMM3M_DEFAULT_UNROLL_M 2
 
-#define CGEMM3M_DEFAULT_P 448
+#define CGEMM3M_DEFAULT_P 320
 #define ZGEMM3M_DEFAULT_P 224
 #define XGEMM3M_DEFAULT_P 112
-#define CGEMM3M_DEFAULT_Q 224
+#define CGEMM3M_DEFAULT_Q 320
 #define ZGEMM3M_DEFAULT_Q 224
 #define XGEMM3M_DEFAULT_Q 224
 #define CGEMM3M_DEFAULT_R 12288
@@ -1596,15 +1596,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define XGEMM_DEFAULT_R xgemm_r
 #define XGEMM_DEFAULT_Q 128
 
-#define CGEMM3M_DEFAULT_UNROLL_N 8
-#define CGEMM3M_DEFAULT_UNROLL_M 4
+#define CGEMM3M_DEFAULT_UNROLL_N 4
+#define CGEMM3M_DEFAULT_UNROLL_M 8
 #define ZGEMM3M_DEFAULT_UNROLL_N 8
 #define ZGEMM3M_DEFAULT_UNROLL_M 2
 
-#define CGEMM3M_DEFAULT_P 448
+#define CGEMM3M_DEFAULT_P 320
 #define ZGEMM3M_DEFAULT_P 224
 #define XGEMM3M_DEFAULT_P 112
-#define CGEMM3M_DEFAULT_Q 224
+#define CGEMM3M_DEFAULT_Q 320
 #define ZGEMM3M_DEFAULT_Q 224
 #define XGEMM3M_DEFAULT_Q 224
 #define CGEMM3M_DEFAULT_R 12288

From cd765f094b52bc010091f4782e232706a854ea90 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Fri, 27 Dec 2019 18:23:29 +0800
Subject: [PATCH 148/210] Update cgemm3m_kernel_8x4_haswell.c

---
 kernel/x86_64/cgemm3m_kernel_8x4_haswell.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c b/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c
index 831f25483..01fbf3064 100644
--- a/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c
+++ b/kernel/x86_64/cgemm3m_kernel_8x4_haswell.c
@@ -255,7 +255,7 @@
     "33105"#ndim":\n\t"\
     "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\
     :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(M),"+r"(next_b)\
-    ::"r11","r12","r13","r14","r15"\
+    ::"r11","r12","r13","r14","r15",\
     "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\
     a_pointer -= M * K; b_pointer += ndim * K; c_pointer += 2*(LDC * ndim - M);\
 }

From 312060d0d6b720eeb9bafbddaeedf3ba968a3732 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Fri, 27 Dec 2019 23:36:13 +0800
Subject: [PATCH 149/210] Update CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 6d30ee942..fd759913d 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -178,3 +178,4 @@ In chronological order:
   * [2019-11-06] optimize AVX512 SGEMM
   * [2019-11-12] AVX512 CGEMM & ZGEMM kernels
   * [2019-12-23] optimize AVX2 CGEMM and ZGEMM
+  * [2019-12-27] AVX2 CGEMM3M kernel

From 454847588e700991a24b93d71665d3386385024a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 29 Dec 2019 21:27:18 +0100
Subject: [PATCH 150/210] Update LAPACK to 3.9.0

---
 lapack-netlib/SRC/VARIANTS/Makefile | 22 +++++++++++-----------
 lapack-netlib/SRC/VARIANTS/README   | 12 ++++++------
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/lapack-netlib/SRC/VARIANTS/Makefile b/lapack-netlib/SRC/VARIANTS/Makefile
index 9f1410755..25d8ee175 100644
--- a/lapack-netlib/SRC/VARIANTS/Makefile
+++ b/lapack-netlib/SRC/VARIANTS/Makefile
@@ -1,5 +1,3 @@
-include ../../make.inc
-
 #######################################################################
 #  This is the makefile to create a the variants libraries for LAPACK.
 #  The files are organized as follows:
@@ -17,6 +15,9 @@ include ../../make.inc
 #  1065-1081. http://dx.doi.org/10.1137/S0895479896297744
 #######################################################################
 
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc
+
 CHOLRL = cholesky/RL/cpotrf.o cholesky/RL/dpotrf.o cholesky/RL/spotrf.o cholesky/RL/zpotrf.o
 
 CHOLTOP = cholesky/TOP/cpotrf.o cholesky/TOP/dpotrf.o cholesky/TOP/spotrf.o cholesky/TOP/zpotrf.o
@@ -30,37 +31,36 @@ LUREC = lu/REC/cgetrf.o lu/REC/dgetrf.o lu/REC/sgetrf.o lu/REC/zgetrf.o
 QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o qr/LL/sceil.o
 
 
+.PHONY: all
 all: cholrl.a choltop.a lucr.a lull.a lurec.a qrll.a
 
 cholrl.a: $(CHOLRL)
-	$(ARCH) $(ARCHFLAGS) $@ $^
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@
 
 choltop.a: $(CHOLTOP)
-	$(ARCH) $(ARCHFLAGS) $@ $^
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@
 
 lucr.a: $(LUCR)
-	$(ARCH) $(ARCHFLAGS) $@ $^
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@
 
 lull.a: $(LULL)
-	$(ARCH) $(ARCHFLAGS) $@ $^
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@
 
 lurec.a: $(LUREC)
-	$(ARCH) $(ARCHFLAGS) $@ $^
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@
 
 qrll.a: $(QRLL)
-	$(ARCH) $(ARCHFLAGS) $@ $^
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@
 
+.PHONY: clean cleanobj cleanlib
 clean: cleanobj cleanlib
 cleanobj:
 	rm -f $(CHOLRL) $(CHOLTOP) $(LUCR) $(LULL) $(LUREC) $(QRLL)
 cleanlib:
 	rm -f *.a
-
-.f.o:
-	$(FORTRAN) $(OPTS) -c -o $@ $<
diff --git a/lapack-netlib/SRC/VARIANTS/README b/lapack-netlib/SRC/VARIANTS/README
index 4d301cc6e..ef7626deb 100644
--- a/lapack-netlib/SRC/VARIANTS/README
+++ b/lapack-netlib/SRC/VARIANTS/README
@@ -34,7 +34,7 @@ References:For a more detailed description please refer to
 =========
 
 These variants are compiled by default in the build process but they are not tested by default.
-The build process creates one new library per variants in the four arithmetics (single real/double real/single complex/double complex).
+The build process creates one new library per variants in the four arithmetic (single real/double real/single complex/double complex).
 The libraries are in the SRC/VARIANTS directory.
 
 Corresponding libraries created in SRC/VARIANTS:
@@ -64,16 +64,16 @@ You should then see the following files in the TESTING directory:
 = LINKING YOUR PROGRAM =
 ========================
 
-You just need to add the variants methods library in your linking sequence before your lapack libary.
+You just need to add the variants methods library in your linking sequence before your lapack library.
 Here is a quick example for LU
 
 Default using LU Right Looking version:
- $(FORTRAN) -c myprog.f
- $(FORTRAN) -o myexe myprog.o $(LAPACKLIB) $(BLASLIB)
+ $(FC) $(FFLAGS) -c myprog.f
+ $(FC) $(FFLAGS) $(LDFLAGS) -o myexe myprog.o $(LAPACKLIB) $(BLASLIB)
 
 Using LU Left Looking version:
- $(FORTRAN) -c myprog.f
- $(FORTRAN) -o myexe myprog.o $(PATH TO LAPACK/SRC/VARIANTS)/lull.a $(LAPACKLIB) $(BLASLIB)
+ $(FC) $(FFLAGS) -c myprog.f
+ $(FC) $(FFLAGS) $(LDFLAGS) -o myexe myprog.o $(PATH TO LAPACK/SRC/VARIANTS)/lull.a $(LAPACKLIB) $(BLASLIB)
 
 ===========
 = SUPPORT =

From 3ccf8885acbb3a536c67fa8a23b54ba000507a21 Mon Sep 17 00:00:00 2001
From: w00421467 <wuzeyi1@huawei.com>
Date: Mon, 30 Dec 2019 11:45:49 +0800
Subject: [PATCH 151/210] prefetching for dgemm_beta

---
 kernel/arm64/dgemm_beta.S | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/kernel/arm64/dgemm_beta.S b/kernel/arm64/dgemm_beta.S
index 636954695..1ce452212 100644
--- a/kernel/arm64/dgemm_beta.S
+++ b/kernel/arm64/dgemm_beta.S
@@ -43,7 +43,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define betaV0		v11.d[0]
 #define I	x16
 
-#define size 128
+#define prfm_size 640
+#define calc_size 128
 
 /**************************************************************************************
 * Macro definitions
@@ -119,27 +120,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	ldp	q2, q3, [A02]
 	ldp	q4, q5, [A03]
 	ldp	q6, q7, [A04]
-  
+ 
 	fmul v0.2d, v0.2d, betaV0
 	fmul v1.2d, v1.2d, betaV0
-	
+
 	fmul v2.2d, v2.2d, betaV0
 	fmul v3.2d, v3.2d, betaV0
-	
+
+	prfm    PLDL1KEEP, [A01, prfm_size]
+
 	fmul v4.2d, v4.2d, betaV0
 	fmul v5.2d, v5.2d, betaV0
-	
+
+	prfm    PLDL1KEEP, [A03, prfm_size]
+
 	fmul v6.2d, v6.2d, betaV0
 	fmul v7.2d, v7.2d, betaV0
 
 	st1	{v0.2d, v1.2d}, [A01]
-	add	A01, A01, size
+	add	A01, A01, calc_size
 	st1	{v2.2d, v3.2d}, [A02]
-	add	A02, A02, size
+	add	A02, A02, calc_size
 	st1	{v4.2d, v5.2d}, [A03]
-	add	A03, A03, size
+	add	A03, A03, calc_size
 	st1	{v6.2d, v7.2d}, [A04]
-	add	A04, A04, size
+	add	A04, A04, calc_size
 
 	subs    I , I , #1
 	bne	.Lgemm_beta_03

From ae1579be13bb864309c494d64a0696fbbc79d819 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 30 Dec 2019 16:02:51 +0800
Subject: [PATCH 152/210] Create zgemm3m_kernel_4x4_haswell.c

---
 kernel/x86_64/zgemm3m_kernel_4x4_haswell.c | 212 +++++++++++++++++++++
 1 file changed, 212 insertions(+)
 create mode 100644 kernel/x86_64/zgemm3m_kernel_4x4_haswell.c

diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c b/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c
new file mode 100644
index 000000000..7b5b835c8
--- /dev/null
+++ b/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c
@@ -0,0 +1,212 @@
+/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */
+/* r12 = k << 5(const), r13 = k(const), r14 = b_head_pos(const), r15 = tmp */
+
+#include "common.h"
+#include <stdint.h>
+
+//recommended settings: GEMM_Q=256, GEMM_P=256
+
+/* m = 4 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */
+#define KERNEL_k1m4n1 \
+    "vmovupd (%0),%%ymm1; addq $32,%0;"\
+    "vbroadcastsd (%1),%%ymm2; vfmadd231pd %%ymm1,%%ymm2,%%ymm4;"\
+    "addq $8,%1;"
+#define KERNEL_h_k1m4n2 \
+    "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2; addq $32,%0;"\
+    "vbroadcastf128 (%1),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,%%ymm4; vfmadd231pd %%ymm2,%%ymm3,%%ymm5;"
+#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;"
+#define KERNEL_h_k1m4n4 \
+    KERNEL_h_k1m4n2 "vbroadcastf128 16(%1),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,%%ymm6; vfmadd231pd %%ymm2,%%ymm3,%%ymm7;"
+#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $32,%1;"
+#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \
+    "vbroadcastf128  ("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c1"; vfmadd231pd %%ymm2,%%ymm3,"#c2";"\
+    "vbroadcastf128 16("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c3"; vfmadd231pd %%ymm2,%%ymm3,"#c4";"
+#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,%1,%%r12,1)
+#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $32,%1;"
+#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,%1,%%r12,2)
+#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $32,%1;"
+#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
+#define INIT_m4n2 INIT_m4n1 "vpxor %%ymm5,%%ymm5,%%ymm5;"
+#define INIT_m4n4 INIT_m4n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;"
+#define unit_init_m4n4(c1,c2,c3,c4) \
+    "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
+#define INIT_m4n8  INIT_m4n4 unit_init_m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11)
+#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15)
+#define SAVE_h_m4n1 \
+    "vpermpd $216,%%ymm4,%%ymm3; vunpcklpd %%ymm3,%%ymm3,%%ymm1; vunpckhpd %%ymm3,%%ymm3,%%ymm2;"\
+    "vfmadd213pd (%2),%%ymm0,%%ymm1; vfmadd213pd 32(%2),%%ymm0,%%ymm2; vmovupd %%ymm1,(%2); vmovupd %%ymm2,32(%2);"
+#define unit_save_m4n2(c1,c2) \
+    "vperm2f128 $2,"#c1","#c2",%%ymm2; vperm2f128 $19,"#c1","#c2","#c2"; vmovapd %%ymm2,"#c1";"\
+    "vunpcklpd "#c1","#c1",%%ymm2; vunpcklpd "#c2","#c2",%%ymm3;"\
+    "vfmadd213pd (%5),%%ymm0,%%ymm2; vfmadd213pd 32(%5),%%ymm0,%%ymm3; vmovupd %%ymm2,(%5); vmovupd %%ymm3,32(%5);"\
+    "vunpckhpd "#c1","#c1",%%ymm2; vunpckhpd "#c2","#c2",%%ymm3;"\
+    "vfmadd213pd (%5,%3,1),%%ymm0,%%ymm2; vfmadd213pd 32(%5,%3,1),%%ymm0,%%ymm3; vmovupd %%ymm2,(%5,%3,1); vmovupd %%ymm3,32(%5,%3,1);"\
+    "leaq (%5,%3,2),%5;"
+#define SAVE_h_m4n2 "movq %2,%5;" unit_save_m4n2(%%ymm4,%%ymm5)
+#define SAVE_h_m4n4  SAVE_h_m4n2  unit_save_m4n2(%%ymm6,%%ymm7)
+#define SAVE_h_m4n8  SAVE_h_m4n4  unit_save_m4n2(%%ymm8,%%ymm9)   unit_save_m4n2(%%ymm10,%%ymm11)
+#define SAVE_h_m4n12 SAVE_h_m4n8  unit_save_m4n2(%%ymm12,%%ymm13) unit_save_m4n2(%%ymm14,%%ymm15)
+#define SAVE_m4(ndim) SAVE_h_m4n##ndim "addq $64,%2;"
+#define COMPUTE_m4(ndim) \
+    INIT_m4n##ndim\
+    "movq %%r13,%4; movq %%r14,%1; movq %2,%5; xorq %%r15,%%r15;"\
+    "cmpq $24,%4; jb "#ndim"004042f;"\
+    #ndim"004041:\n\t"\
+    "cmpq $126,%%r15; movq $126,%%r15; cmoveq %3,%%r15;"\
+    "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
+    "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
+    "prefetcht1 (%5); leaq -63(%5,%%r15,1),%5;"\
+    "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
+    "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
+    "prefetcht1 (%8); addq $32,%8;"\
+    "subq $8,%4; cmpq $24,%4; jnb "#ndim"004041b;"\
+    "movq %2,%5;"\
+    #ndim"004042:\n\t"\
+    "testq %4,%4; jz "#ndim"004043f;"\
+    "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5;"\
+    KERNEL_k1m4n##ndim\
+    "decq %4; jmp "#ndim"004042b;"\
+    #ndim"004043:\n\t"\
+    "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\
+    SAVE_m4(ndim)
+
+/* m = 2 *//* vmm0 for alpha, vmm1-vmm3 for temporary use, vmm4-vmm9 for accumulators */
+#define KERNEL_k1m2n1 \
+    "vmovupd (%0),%%xmm1; addq $16,%0;"\
+    "vmovddup (%1),%%xmm2; vfmadd231pd %%xmm1,%%xmm2,%%xmm4;"\
+    "addq $8,%1;"
+#define KERNEL_h_k1m2n2 \
+    "vmovddup (%0),%%xmm1; vmovddup 8(%0),%%xmm2; addq $16,%0;"\
+    "vmovupd (%1),%%xmm3; vfmadd231pd %%xmm1,%%xmm3,%%xmm4; vfmadd231pd %%xmm2,%%xmm3,%%xmm5;"
+#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;"
+#define unit_kernel_k1m2n4(c1,c2,...) \
+    "vmovupd ("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c1"; vfmadd231pd %%ymm2,%%ymm3,"#c2";"
+#define KERNEL_h_k1m2n4 \
+    "vbroadcastsd (%0),%%ymm1; vbroadcastsd 8(%0),%%ymm2; addq $16,%0;"\
+    unit_kernel_k1m2n4(%%ymm4,%%ymm5,%1)
+#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $32,%1;"
+#define KERNEL_h_k1m2n8 KERNEL_h_k1m2n4 \
+    unit_kernel_k1m2n4(%%ymm6,%%ymm7,%1,%%r12,1)
+#define KERNEL_k1m2n8 KERNEL_h_k1m2n8 "addq $32,%1;"
+#define KERNEL_h_k1m2n12 KERNEL_h_k1m2n8 \
+    unit_kernel_k1m2n4(%%ymm8,%%ymm9,%1,%%r12,2)
+#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $32,%1;"
+#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
+#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
+#define unit_init_m2n4(c1,c2) "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";"
+#define INIT_m2n4 unit_init_m2n4(%%ymm4,%%ymm5)
+#define INIT_m2n8 INIT_m2n4 unit_init_m2n4(%%ymm6,%%ymm7)
+#define INIT_m2n12 INIT_m2n8 unit_init_m2n4(%%ymm8,%%ymm9)
+#define SAVE_h_m2n1 \
+    "vinsertf128 $1,%%xmm4,%%ymm4,%%ymm4; vpermilpd $12,%%ymm4,%%ymm4; vfmadd213pd (%2),%%ymm0,%%ymm4; vmovupd %%ymm4,(%2);"
+#define SAVE_h_m2n2 \
+    "vinsertf128 $1,%%xmm5,%%ymm4,%%ymm4; vunpcklpd %%ymm4,%%ymm4,%%ymm1; vunpckhpd %%ymm4,%%ymm4,%%ymm2;"\
+    "vfmadd213pd (%2),%%ymm0,%%ymm1; vmovupd %%ymm1,(%2);"\
+    "vfmadd213pd (%2,%3,1),%%ymm0,%%ymm2; vmovupd %%ymm2,(%2,%3,1);"
+#define unit_save_m2n4(c1,c2) \
+    "vperm2f128 $2,"#c1","#c2",%%ymm1; vunpcklpd %%ymm1,%%ymm1,%%ymm2; vunpckhpd %%ymm1,%%ymm1,%%ymm3;"\
+    "vfmadd213pd (%5),%%ymm0,%%ymm2; vfmadd213pd (%5,%3,1),%%ymm0,%%ymm3; vmovupd %%ymm2,(%5); vmovupd %%ymm3,(%5,%3,1); leaq (%5,%3,2),%5;"\
+    "vperm2f128 $19,"#c1","#c2",%%ymm1; vunpcklpd %%ymm1,%%ymm1,%%ymm2; vunpckhpd %%ymm1,%%ymm1,%%ymm3;"\
+    "vfmadd213pd (%5),%%ymm0,%%ymm2; vfmadd213pd (%5,%3,1),%%ymm0,%%ymm3; vmovupd %%ymm2,(%5); vmovupd %%ymm3,(%5,%3,1); leaq (%5,%3,2),%5;"
+#define SAVE_h_m2n4 "movq %2,%5;" unit_save_m2n4(%%ymm4,%%ymm5)
+#define SAVE_h_m2n8 SAVE_h_m2n4 unit_save_m2n4(%%ymm6,%%ymm7)
+#define SAVE_h_m2n12 SAVE_h_m2n8 unit_save_m2n4(%%ymm8,%%ymm9)
+#define SAVE_m2(ndim) SAVE_h_m2n##ndim "addq $32,%2;"
+#define COMPUTE_m2(ndim) \
+    INIT_m2n##ndim\
+    "movq %%r13,%4; movq %%r14,%1;"\
+    #ndim"002022:\n\t"\
+    "testq %4,%4; jz "#ndim"002023f;"\
+    KERNEL_k1m2n##ndim\
+    "decq %4; jmp "#ndim"002022b;"\
+    #ndim"002023:\n\t"\
+    SAVE_m2(ndim)
+
+/* m = 1 *//* vmm0 for alpha, vmm1-vmm3 and vmm10-vmm15 for temporary use, vmm4-vmm6 for accumulators */
+#define KERNEL_k1m1n1 \
+    "vmovsd (%0),%%xmm1; addq $8,%0;"\
+    "vfmadd231sd (%1),%%xmm1,%%xmm4; addq $8,%1;"
+#define KERNEL_k1m1n2 \
+    "vmovddup (%0),%%xmm1; addq $8,%0;"\
+    "vfmadd231pd (%1),%%xmm1,%%xmm4; addq $16,%1;"
+#define unit_kernel_k1m1n4(c1,...) \
+    "vmovupd ("#__VA_ARGS__"),%%ymm2; vfmadd231pd %%ymm1,%%ymm2,"#c1";"
+#define KERNEL_h_k1m1n4 \
+    "vbroadcastsd (%0),%%ymm1; addq $8,%0;"\
+    unit_kernel_k1m1n4(%%ymm4,%1)
+#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $32,%1;"
+#define KERNEL_h_k1m1n8 KERNEL_h_k1m1n4 unit_kernel_k1m1n4(%%ymm5,%1,%%r12,1)
+#define KERNEL_k1m1n8 KERNEL_h_k1m1n8 "addq $32,%1;"
+#define KERNEL_h_k1m1n12 KERNEL_h_k1m1n8 unit_kernel_k1m1n4(%%ymm6,%1,%%r12,2)
+#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $32,%1;"
+#define INIT_m1n1 INIT_m2n1
+#define INIT_m1n2 INIT_m2n1
+#define INIT_m1n4 "vpxor %%ymm4,%%ymm4,%%ymm4;"
+#define INIT_m1n8 INIT_m1n4 "vpxor %%ymm5,%%ymm5,%%ymm5;"
+#define INIT_m1n12 INIT_m1n8 "vpxor %%ymm6,%%ymm6,%%ymm6;"
+#define SAVE_h_m1n1 \
+    "vmovddup %%xmm4,%%xmm4; vfmadd213pd (%2),%%xmm0,%%xmm4; vmovupd %%xmm4,(%2);"
+#define SAVE_h_m1n2 \
+    "vunpcklpd %%xmm4,%%xmm4,%%xmm1; vunpckhpd %%xmm4,%%xmm4,%%xmm2;"\
+    "vfmadd213pd (%2),%%xmm0,%%xmm1; vmovupd %%xmm1,(%2);"\
+    "vfmadd213pd (%2,%3,1),%%xmm0,%%xmm2; vmovupd %%xmm2,(%2,%3,1);"
+#define unit_save_m1n4(c1) \
+    "vunpcklpd "#c1","#c1",%%ymm1; vunpckhpd "#c1","#c1",%%ymm2;"\
+    "vmovupd (%5),%%xmm3; vinsertf128 $1,(%5,%3,2),%%ymm3,%%ymm3;"\
+    "vfmadd213pd %%ymm3,%%ymm0,%%ymm1; vmovupd %%xmm1,(%5); vextractf128 $1,%%ymm1,(%5,%3,2); addq %3,%5;"\
+    "vmovupd (%5),%%xmm3; vinsertf128 $1,(%5,%3,2),%%ymm3,%%ymm3;"\
+    "vfmadd213pd %%ymm3,%%ymm0,%%ymm2; vmovupd %%xmm2,(%5); vextractf128 $1,%%ymm2,(%5,%3,2); addq %3,%5; leaq (%5,%3,2),%5;"
+#define SAVE_h_m1n4 "movq %2,%5;" unit_save_m1n4(%%ymm4)
+#define SAVE_h_m1n8 SAVE_h_m1n4 unit_save_m1n4(%%ymm5)
+#define SAVE_h_m1n12 SAVE_h_m1n8 unit_save_m1n4(%%ymm6)
+#define SAVE_m1(ndim) SAVE_h_m1n##ndim "addq $16,%2;"
+#define COMPUTE_m1(ndim) \
+    INIT_m1n##ndim\
+    "movq %%r13,%4; movq %%r14,%1;"\
+    #ndim"001011:\n\t"\
+    "testq %4,%4; jz "#ndim"001012f;"\
+    KERNEL_k1m1n##ndim\
+    "decq %4; jmp "#ndim"001011b;"\
+    #ndim"001012:\n\t"\
+    SAVE_m1(ndim)
+
+#define COMPUTE(ndim) {\
+    next_b = b_pointer + ndim * K;\
+    __asm__ __volatile__(\
+    "vbroadcastf128 (%6),%%ymm0;"\
+    "movq %4,%%r13; movq %4,%%r12; salq $5,%%r12; movq %1,%%r14; movq %7,%%r11;"\
+    "cmpq $4,%7;jb 33101"#ndim"f;"\
+    "33109"#ndim":\n\t"\
+    COMPUTE_m4(ndim)\
+    "subq $4,%7;cmpq $4,%7;jnb 33109"#ndim"b;"\
+    "33101"#ndim":\n\t"\
+    "cmpq $2,%7;jb 33104"#ndim"f;"\
+    COMPUTE_m2(ndim)\
+    "subq $2,%7;"\
+    "33104"#ndim":\n\t"\
+    "testq %7,%7;jz 33105"#ndim"f;"\
+    COMPUTE_m1(ndim)\
+    "33105"#ndim":\n\t"\
+    "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\
+    :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(M),"+r"(next_b)\
+    ::"r11","r12","r13","r14","r15","ymm0","ymm1","ymm2","ymm3","ymm4","ymm5","ymm6","ymm7","ymm8","ymm9","ymm10","ymm11","ymm12","ymm13","ymm14",\
+    "ymm15","cc","memory");\
+    a_pointer -= M * K; b_pointer += ndim * K; c_pointer += 2*(LDC * ndim - M);\
+}
+int __attribute__ ((noinline))
+CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alphar, double alphai, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG LDC)
+{
+    if(m==0||n==0||k==0) return 0;
+    int64_t ldc_in_bytes = (int64_t)LDC * sizeof(double) * 2;
+    double constval[2]; constval[0] = alphar; constval[1] = alphai;
+    double *const_val=constval;
+    int64_t M = (int64_t)m, K = (int64_t)k;
+    BLASLONG n_count = n;
+    double *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B;
+    for(;n_count>11;n_count-=12) COMPUTE(12)
+    for(;n_count>7;n_count-=8) COMPUTE(8)
+    for(;n_count>3;n_count-=4) COMPUTE(4)
+    for(;n_count>1;n_count-=2) COMPUTE(2)
+    if(n_count>0) COMPUTE(1)
+    return 0;
+}

From 109e18cd96707e1fab40b2777bf35e8257f540d2 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 30 Dec 2019 16:03:24 +0800
Subject: [PATCH 153/210] Update KERNEL.HASWELL

---
 kernel/x86_64/KERNEL.HASWELL | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL
index bdebd22b9..9e30c12f2 100644
--- a/kernel/x86_64/KERNEL.HASWELL
+++ b/kernel/x86_64/KERNEL.HASWELL
@@ -98,5 +98,5 @@ ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
 ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
 
 CGEMM3MKERNEL    =  cgemm3m_kernel_8x4_haswell.c
-ZGEMM3MKERNEL    =  zgemm3m_kernel_2x8_nehalem.S
+ZGEMM3MKERNEL    =  zgemm3m_kernel_4x4_haswell.c
 

From f60840c4207f8ebdfedfd987505013ed3dcbb3d2 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 30 Dec 2019 16:04:23 +0800
Subject: [PATCH 154/210] Update KERNEL.ZEN

---
 kernel/x86_64/KERNEL.ZEN | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN
index 025db515e..98cd38dfa 100644
--- a/kernel/x86_64/KERNEL.ZEN
+++ b/kernel/x86_64/KERNEL.ZEN
@@ -95,5 +95,5 @@ ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
 ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
 
 CGEMM3MKERNEL    =  cgemm3m_kernel_8x4_haswell.c
-ZGEMM3MKERNEL    =  zgemm3m_kernel_2x8_nehalem.S
+ZGEMM3MKERNEL    =  zgemm3m_kernel_4x4_haswell.c
 

From 6362c34ee60490a10f11b0adba1c9c84579c682e Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 30 Dec 2019 16:08:19 +0800
Subject: [PATCH 155/210] Update param.h

---
 param.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/param.h b/param.h
index 4084c781d..d03e60fcb 100644
--- a/param.h
+++ b/param.h
@@ -695,14 +695,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define CGEMM3M_DEFAULT_UNROLL_N 4
 #define CGEMM3M_DEFAULT_UNROLL_M 8
-#define ZGEMM3M_DEFAULT_UNROLL_N 8
-#define ZGEMM3M_DEFAULT_UNROLL_M 2
+#define ZGEMM3M_DEFAULT_UNROLL_N 4
+#define ZGEMM3M_DEFAULT_UNROLL_M 4
 
 #define CGEMM3M_DEFAULT_P 320
-#define ZGEMM3M_DEFAULT_P 224
+#define ZGEMM3M_DEFAULT_P 256
 #define XGEMM3M_DEFAULT_P 112
 #define CGEMM3M_DEFAULT_Q 320
-#define ZGEMM3M_DEFAULT_Q 224
+#define ZGEMM3M_DEFAULT_Q 256
 #define XGEMM3M_DEFAULT_Q 224
 #define CGEMM3M_DEFAULT_R 12288
 #define ZGEMM3M_DEFAULT_R 12288
@@ -1598,14 +1598,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define CGEMM3M_DEFAULT_UNROLL_N 4
 #define CGEMM3M_DEFAULT_UNROLL_M 8
-#define ZGEMM3M_DEFAULT_UNROLL_N 8
-#define ZGEMM3M_DEFAULT_UNROLL_M 2
+#define ZGEMM3M_DEFAULT_UNROLL_N 4
+#define ZGEMM3M_DEFAULT_UNROLL_M 4
 
 #define CGEMM3M_DEFAULT_P 320
-#define ZGEMM3M_DEFAULT_P 224
+#define ZGEMM3M_DEFAULT_P 256
 #define XGEMM3M_DEFAULT_P 112
 #define CGEMM3M_DEFAULT_Q 320
-#define ZGEMM3M_DEFAULT_Q 224
+#define ZGEMM3M_DEFAULT_Q 256
 #define XGEMM3M_DEFAULT_Q 224
 #define CGEMM3M_DEFAULT_R 12288
 #define ZGEMM3M_DEFAULT_R 12288

From aae44d040db343ee2fab98fa600c192cc271e73a Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 30 Dec 2019 16:10:08 +0800
Subject: [PATCH 156/210] Update CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index fd759913d..3d7617f92 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -178,4 +178,4 @@ In chronological order:
   * [2019-11-06] optimize AVX512 SGEMM
   * [2019-11-12] AVX512 CGEMM & ZGEMM kernels
   * [2019-12-23] optimize AVX2 CGEMM and ZGEMM
-  * [2019-12-27] AVX2 CGEMM3M kernel
+  * [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernel

From bb2729c85521310f26ed08792711c2b9c8cf3299 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 30 Dec 2019 16:11:37 +0800
Subject: [PATCH 157/210] Update CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 3d7617f92..9829c31f9 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -178,4 +178,4 @@ In chronological order:
   * [2019-11-06] optimize AVX512 SGEMM
   * [2019-11-12] AVX512 CGEMM & ZGEMM kernels
   * [2019-12-23] optimize AVX2 CGEMM and ZGEMM
-  * [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernel
+  * [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels

From 700fe5b5ee4866b9ff791a5952f5e6ae8afa2d04 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 30 Dec 2019 17:18:59 +0800
Subject: [PATCH 158/210] Add files via upload

---
 kernel/x86_64/zgemm3m_kernel_4x4_haswell.c | 38 ++++++++++++++--------
 1 file changed, 25 insertions(+), 13 deletions(-)

diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c b/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c
index 7b5b835c8..c57dccd36 100644
--- a/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c
+++ b/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c
@@ -18,13 +18,26 @@
 #define KERNEL_h_k1m4n4 \
     KERNEL_h_k1m4n2 "vbroadcastf128 16(%1),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,%%ymm6; vfmadd231pd %%ymm2,%%ymm3,%%ymm7;"
 #define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $32,%1;"
-#define unit_kernel_k1m4n4(c1,c2,c3,c4,...) \
-    "vbroadcastf128  ("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c1"; vfmadd231pd %%ymm2,%%ymm3,"#c2";"\
-    "vbroadcastf128 16("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c3"; vfmadd231pd %%ymm2,%%ymm3,"#c4";"
-#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,%1,%%r12,1)
+#define unit_kernel_k1m4n4(c1,c2,c3,c4,off1,off2,...) \
+    "vbroadcastf128 "#off1"("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c1"; vfmadd231pd %%ymm2,%%ymm3,"#c2";"\
+    "vbroadcastf128 "#off2"("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,"#c3"; vfmadd231pd %%ymm2,%%ymm3,"#c4";"
+#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,16,%1,%%r12,1)
 #define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $32,%1;"
-#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,%1,%%r12,2)
+#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,16,%1,%%r12,2)
 #define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $32,%1;"
+#define KERNEL_k2m4n1 KERNEL_k1m4n1 KERNEL_k1m4n1
+#define KERNEL_k2m4n2 KERNEL_k1m4n2 KERNEL_k1m4n2
+#define KERNEL_k2m4n4 KERNEL_k1m4n4 KERNEL_k1m4n4
+#define KERNEL_k2m4n8 KERNEL_k1m4n8 KERNEL_k1m4n8
+#define KERNEL_k2m4n12 \
+    "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2;"\
+    unit_kernel_k1m4n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,0,16,%1)\
+    unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,16,%1,%%r12,1)\
+    unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,16,%1,%%r12,2)\
+    "vmovddup 32(%0),%%ymm1; vmovddup 40(%0),%%ymm2; prefetcht0 512(%0); addq $64,%0;"\
+    unit_kernel_k1m4n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,32,48,%1)\
+    unit_kernel_k1m4n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,32,48,%1,%%r12,1)\
+    unit_kernel_k1m4n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,32,48,%1,%%r12,2) "addq $64,%1;"
 #define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
 #define INIT_m4n2 INIT_m4n1 "vpxor %%ymm5,%%ymm5,%%ymm5;"
 #define INIT_m4n4 INIT_m4n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;"
@@ -53,18 +66,17 @@
     "cmpq $24,%4; jb "#ndim"004042f;"\
     #ndim"004041:\n\t"\
     "cmpq $126,%%r15; movq $126,%%r15; cmoveq %3,%%r15;"\
-    "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
-    "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
-    "prefetcht1 (%5); leaq -63(%5,%%r15,1),%5;"\
-    "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
-    "prefetcht0 512(%0);" KERNEL_k1m4n##ndim KERNEL_k1m4n##ndim\
-    "prefetcht1 (%8); addq $32,%8;"\
-    "subq $8,%4; cmpq $24,%4; jnb "#ndim"004041b;"\
+    KERNEL_k2m4n##ndim KERNEL_k2m4n##ndim\
+    "prefetcht1 (%5); subq $63,%5;"\
+    KERNEL_k2m4n##ndim KERNEL_k2m4n##ndim\
+    "addq %%r15,%5; prefetcht1 (%8); addq $32,%8;"\
+    "subq $8,%4; cmpq $16,%4; jnb "#ndim"004041b;"\
     "movq %2,%5;"\
     #ndim"004042:\n\t"\
     "testq %4,%4; jz "#ndim"004043f;"\
-    "prefetcht0 (%5); prefetcht0 63(%5); addq %3,%5;"\
+    "prefetcht0 (%5); prefetcht0 63(%5);"\
     KERNEL_k1m4n##ndim\
+    "prefetcht0 (%5,%3,4); prefetcht0 63(%5,%3,4); addq %3,%5;"\
     "decq %4; jmp "#ndim"004042b;"\
     #ndim"004043:\n\t"\
     "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\

From a0f0a802fcb895ae533f167f60667cc808e79b65 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 30 Dec 2019 17:33:42 +0800
Subject: [PATCH 159/210] Update zgemm3m_kernel_4x4_haswell.c

---
 kernel/x86_64/zgemm3m_kernel_4x4_haswell.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c b/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c
index c57dccd36..56bc06c5c 100644
--- a/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c
+++ b/kernel/x86_64/zgemm3m_kernel_4x4_haswell.c
@@ -201,8 +201,8 @@
     "33105"#ndim":\n\t"\
     "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\
     :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(M),"+r"(next_b)\
-    ::"r11","r12","r13","r14","r15","ymm0","ymm1","ymm2","ymm3","ymm4","ymm5","ymm6","ymm7","ymm8","ymm9","ymm10","ymm11","ymm12","ymm13","ymm14",\
-    "ymm15","cc","memory");\
+    ::"r11","r12","r13","r14","r15","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14",\
+    "xmm15","cc","memory");\
     a_pointer -= M * K; b_pointer += ndim * K; c_pointer += 2*(LDC * ndim - M);\
 }
 int __attribute__ ((noinline))

From 50f7fc1401b77c105c6681b3aefb8ce7555b7831 Mon Sep 17 00:00:00 2001
From: zq <zuoqian3@huawei.com>
Date: Tue, 31 Dec 2019 10:21:23 +0800
Subject: [PATCH 160/210] [WIP] Use arm neon instructions to optimize tcopy
 operation

---
 kernel/arm64/KERNEL.ARMV8     |   8 +
 kernel/arm64/KERNEL.TSV110    |   8 +
 kernel/arm64/sgemm_tcopy_16.S | 824 ++++++++++++++++++++++++++++++++++
 3 files changed, 840 insertions(+)
 create mode 100644 kernel/arm64/sgemm_tcopy_16.S

diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8
index b90dd228b..28eff773f 100644
--- a/kernel/arm64/KERNEL.ARMV8
+++ b/kernel/arm64/KERNEL.ARMV8
@@ -108,12 +108,20 @@ SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
 SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+ifeq ($(SGEMM_UNROLL_M), 16)
+SGEMMITCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_M).S
+else
 SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+endif
 SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 endif
 SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+ifeq ($(SGEMM_UNROLL_N), 16)
+SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
+else
 SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+endif
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
diff --git a/kernel/arm64/KERNEL.TSV110 b/kernel/arm64/KERNEL.TSV110
index 04d6940d7..8c31f83b1 100644
--- a/kernel/arm64/KERNEL.TSV110
+++ b/kernel/arm64/KERNEL.TSV110
@@ -110,12 +110,20 @@ SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
 SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+ifeq ($(SGEMM_UNROLL_M), 16)
+SGEMMITCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_M).S
+else
 SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
+endif
 SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 endif
 SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+ifeq ($(SGEMM_UNROLL_N), 16)
+SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
+else
 SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
+endif
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S
new file mode 100644
index 000000000..12b80bdca
--- /dev/null
+++ b/kernel/arm64/sgemm_tcopy_16.S
@@ -0,0 +1,824 @@
+/***************************************************************************
+Copyright (c) 2019, The OpenBLAS Project
+All rights reserved.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define	M		x0
+#define	N		x1
+#define	A		x2
+#define	LDA		x3
+#define	B		x4
+
+#define M8		x5
+
+#define	A01		x6
+#define	A02		x7
+#define	A03		x8
+#define	A04		x9
+#define	A05		x10
+#define	A06		x11
+#define	A07		x12
+#define	A08		x13
+
+#define	B01		x14
+#define	B02		x15
+#define	B03		x16
+#define	B04		x17
+#define	B00		x22
+
+
+#define I		x18
+#define	J		x19
+
+#define TEMP1		x20
+
+#define A_PREFETCH	256
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+.macro SAVE_REGS
+	add	sp, sp, #-(11 * 16)
+	stp	d8, d9, [sp, #(0 * 16)]
+	stp	d10, d11, [sp, #(1 * 16)]
+	stp	d12, d13, [sp, #(2 * 16)]
+	stp	d14, d15, [sp, #(3 * 16)]
+	stp	d16, d17, [sp, #(4 * 16)]
+	stp	x18, x19, [sp, #(5 * 16)]
+	stp	x20, x21, [sp, #(6 * 16)]
+	stp	x22, x23, [sp, #(7 * 16)]
+	stp	x24, x25, [sp, #(8 * 16)]
+	stp	x26, x27, [sp, #(9 * 16)]
+	str	x28, [sp, #(10 * 16)]
+.endm
+
+.macro RESTORE_REGS
+	ldp	d8, d9, [sp, #(0 * 16)]
+	ldp	d10, d11, [sp, #(1 * 16)]
+	ldp	d12, d13, [sp, #(2 * 16)]
+	ldp	d14, d15, [sp, #(3 * 16)]
+	ldp	d16, d17, [sp, #(4 * 16)]
+	ldp	x18, x19, [sp, #(5 * 16)]
+	ldp	x20, x21, [sp, #(6 * 16)]
+	ldp	x22, x23, [sp, #(7 * 16)]
+	ldp	x24, x25, [sp, #(8 * 16)]
+	ldp	x26, x27, [sp, #(9 * 16)]
+	ldr	x28, [sp, #(10 * 16)]
+	add	sp, sp, #(11*16)
+.endm
+
+/*************************************************************************************************************************/
+
+.macro COPY16x8
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A05, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A06, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A07, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A08, #A_PREFETCH]
+	//prfm	PSTL1KEEP, [B00, M8]
+	
+	ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [A01]
+	add  A01, A01, #64
+	
+	st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
+	add TEMP1, B00, #64
+
+	ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [A02]
+	add  A02, A02, #64
+	
+	st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1]
+	add TEMP1, TEMP1, #64
+
+	ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [A03]
+	add  A03, A03, #64
+	
+	st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1]
+	add TEMP1, TEMP1, #64
+
+	ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [A04]
+	add  A04, A04, #64
+	
+	st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1]
+	add TEMP1, TEMP1, #64
+
+	ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [A05]
+	add  A05, A05, #64
+	
+	st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [TEMP1]
+	add TEMP1, TEMP1, #64
+
+	ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [A06]
+	add  A06, A06, #64
+	
+	st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [TEMP1]
+	add TEMP1, TEMP1, #64
+
+	ld1 {v24.4s, v25.4s, v26.4s, v27.4s}, [A07]
+	add  A07, A07, #64
+	
+	st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [TEMP1]
+	add TEMP1, TEMP1, #64
+
+	ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [A08]
+	add  A08, A08, #64
+	
+	st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [TEMP1]
+	add TEMP1, TEMP1, #64
+	
+	add	B00, B00, M8
+
+.endm
+
+.macro COPY8x8
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A05, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A06, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A07, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A08, #A_PREFETCH]
+
+	ldp	q0, q1, [A01]
+	ldp	q2, q3, [A02]
+	add	A01, A01, #32
+	add	A02, A02, #32
+	
+	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B01]
+	add	B01, B01, #64
+	
+	ldp	q4, q5, [A03]
+	ldp	q6, q7, [A04]
+	add	A03, A03, #32
+	add	A04, A04, #32
+
+	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [B01]
+	add	B01, B01, #64
+
+	ldp	q8, q9, [A05]
+	ldp	q10, q11, [A06]
+	add	A05, A05, #32
+	add	A06, A06, #32
+
+	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [B01]
+	add	B01, B01, #64
+
+	ldp	q12, q13, [A07]
+	ldp	q14, q15, [A08]
+	add	A07, A07, #32
+	add	A08, A08, #32
+
+	st1	{v12.4s, v13.4s, v14.4s, v15.4s}, [B01]
+	add	B01, B01, #64
+.endm
+
+.macro COPY4x8
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A05, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A06, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A07, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A08, #A_PREFETCH]
+
+	ldr	q0, [A01]
+	ldr	q1, [A02]
+	ldr	q2, [A03]
+	ldr	q3, [A04]
+	add	A01, A01, #16
+	add	A02, A02, #16
+	add	A03, A03, #16
+	add	A04, A04, #16
+
+	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B02]
+	add	B02, B02, #64
+
+	ldr	q4, [A05]
+	ldr	q5, [A06]
+	ldr	q6, [A07]
+	ldr	q7, [A08]
+
+	add	A05, A05, #16
+	add	A06, A06, #16
+	add	A07, A07, #16
+	add	A08, A08, #16
+
+	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [B02]
+	add	B02, B02, #64
+.endm
+
+.macro COPY2x8
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A05, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A06, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A07, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A08, #A_PREFETCH]
+
+	ldr	d0, [A01]
+	ldr	d1, [A02]
+	ldr	d2, [A03]
+	ldr	d3, [A04]
+	
+	add	A01, A01, #8
+	add	A02, A02, #8
+	add	A03, A03, #8
+	add	A04, A04, #8
+
+	stp	d0, d1, [B03]
+	add	B03, B03, #16
+	stp	d2, d3, [B03]
+	add	B03, B03, #16
+
+	ldr	d4, [A05]
+	ldr	d5, [A06]
+	ldr	d6, [A07]
+	ldr	d7, [A08]
+	
+	add	A05, A05, #8
+	add	A06, A06, #8
+	add	A07, A07, #8
+	add	A08, A08, #8
+
+	stp	d4, d5, [B03]
+	add	B03, B03, #16
+	stp	d6, d7, [B03]
+	add	B03, B03, #16
+
+.endm
+
+.macro COPY1x8
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A05, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A06, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A07, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A08, #A_PREFETCH]
+
+	ldr	s0, [A01]
+	ldr	s1, [A02]
+	ldr	s2, [A03]
+	ldr	s3, [A04]
+	
+	add	A01, A01, #4
+	add	A02, A02, #4
+	add	A03, A03, #4
+	add	A04, A04, #4
+
+	stp	s0, s1, [B04]
+	add	B04, B04, #8
+	stp	s2, s3, [B04]
+	add	B04, B04, #8
+
+	ldr	s4, [A05]
+	ldr	s5, [A06]
+	ldr	s6, [A07]
+	ldr	s7, [A08]
+	
+	ldr	d4, [A05], #8
+	ldr	d5, [A06], #8
+	ldr	d6, [A07], #8
+	ldr	d7, [A08], #8
+
+	stp	s4, s5, [B04]
+	add	B04, B04, #8
+	stp	s6, s7, [B04]
+	add	B04, B04, #8
+
+.endm
+
+/*************************************************************************************************************************/
+.macro COPY16x4
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+
+	ld1	{v0.4s, v1.4s, v2.4s, v3.4s}, [A01]
+	add	A01, A01, #64
+
+	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
+	add	TEMP1, B00, #64
+
+	ld1	{v4.4s, v5.4s, v6.4s, v7.4s}, [A02]
+	add	A02, A02, #64
+
+	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1]
+	add	TEMP1, TEMP1, #64
+
+	ld1	{v8.4s, v9.4s, v10.4s, v11.4s}, [A03]
+	add	A03, A03, #64
+
+	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1]
+	add	TEMP1, TEMP1, #64
+
+	ld1	{v12.4s, v13.4s, v14.4s, v15.4s}, [A04]
+	add	A04, A04, #64
+
+	st1	{v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1]
+
+	add	B00, B00, M8
+.endm
+
+.macro COPY8x4
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+
+	ldp	q0, q1, [A01]
+	ldp	q2, q3, [A02]
+	add	A01, A01, #32
+	add	A02, A02, #32
+
+	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B01]
+	add	B01, B01, #64
+
+	ldp	q4, q5, [A03]
+	ldp	q6, q7, [A04]
+	add	A03, A03, #32
+	add	A04, A04, #32
+
+	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [B01]
+	add	B01, B01, #64
+.endm
+
+.macro COPY4x4
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+
+	ldr	q0, [A01]
+	ldr	q1, [A02]
+	ldr	q2, [A03]
+	ldr	q3, [A04]
+	add	A01, A01, #16
+	add	A02, A02, #16
+	add	A03, A03, #16
+	add	A04, A04, #16
+
+	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B02]
+
+	add	B02, B02, #64
+.endm
+
+.macro COPY2x4
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+
+	ldr	d0, [A01]
+	ldr	d1, [A02]
+	ldr	d2, [A03]
+	ldr	d3, [A04]
+	
+	add	A01, A01, #8
+	add	A02, A02, #8
+	add	A03, A03, #8
+	add	A04, A04, #8
+	
+	stp	d0, d1, [B03]
+	add	B03, B03, #16
+	stp	d2, d3, [B03]
+
+	add	B03, B03, #16
+.endm
+
+.macro COPY1x4
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+
+	ldr	s0, [A01]
+	ldr	s1, [A02]
+	ldr	s2, [A03]
+	ldr	s3, [A04]
+	
+	add	A01, A01, #4
+	add	A02, A02, #4
+	add	A03, A03, #4
+	add	A04, A04, #4
+
+	stp	s0, s1, [B04]
+	add	B04, B04, #8
+	stp	s2, s3, [B04]
+	add	B04, B04, #8
+
+.endm
+
+/*************************************************************************************************************************/
+
+.macro COPY16x2
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+
+	ld1	{v0.4s, v1.4s, v2.4s, v3.4s}, [A01]
+	add	A01, A01, #64
+	
+	ld1	{v4.4s, v5.4s, v6.4s, v7.4s}, [A02]
+	add	A02, A02, #64
+
+	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
+	add	TEMP1, B00, #64
+	st1	{v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1]
+	add	B00, B00, M8
+.endm
+
+.macro COPY8x2
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+
+	ld1	{v0.4s, v1.4s}, [A01]
+	ld1	{v2.4s, v3.4s}, [A02]
+	add	A01, A01, #32
+	add	A02, A02, #32
+
+	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B01]
+	add	B01, B01, #64
+.endm
+
+.macro COPY4x2
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+
+	ldr	q0, [A01]
+	ldr	q1, [A02]
+	add	A01, A01, #16
+	add	A02, A02, #16
+
+	stp	q0, q1, [B02]
+	add	B02, B02, #32
+.endm
+
+.macro COPY2x2
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+
+	ldr	d0, [A01]
+	ldr	d1, [A02]
+	
+	add	A01, A01, #8
+	add	A02, A02, #8
+	
+	stp	d0, d1, [B03]
+	add	B03, B03, #16
+.endm
+
+.macro COPY1x2
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	//prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+
+	ldr	s0, [A01]
+	ldr	s1, [A02]
+	
+	add	A01, A01, #4
+	add	A02, A02, #4
+
+	stp	s0, s1, [B04]
+
+	add	B04, B04, #8
+.endm
+
+/*************************************************************************************************************************/
+
+.macro COPY16x1
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+
+	ld1	{v0.4s, v1.4s, v2.4s, v3.4s}, [A01]
+	add	A01, A01, #64
+
+	st1	{v0.4s, v1.4s, v2.4s, v3.4s}, [B00]
+	add	B00, B00, M8
+.endm
+
+.macro COPY8x1
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	
+	ldp	q0, q1, [A01]
+	add	A01, A01, #32
+	stp	q0, q1, [B01]
+
+	add	B01, B01, #32
+.endm
+
+.macro COPY4x1
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+
+	ldr q0, [A01]
+	add	A01, A01, #16
+	str q0, [B02]
+
+	add	B02, B02, #16
+.endm
+
+.macro COPY2x1
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+
+	ldr	d0, [A01]
+	add	A01, A01, #8
+	str d0, [B03]
+
+	add	B03, B03, #8
+.endm
+
+.macro COPY1x1
+	//prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+
+	ldr	s0, [A01]
+	add	A01, A01, #4
+	str	s0, [B04]
+
+	add	B04, B04, #4
+.endm
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+
+	SAVE_REGS
+
+	lsl	LDA, LDA, #2					// LDA = LDA * SIZE
+
+	lsl	TEMP1, M, #2					// TEMP1 = M * SIZE
+
+	and	B01 , N , #-16
+	and	B02 , N , #-8
+	and	B03 , N , #-4
+	and	B04 , N , #-2
+
+	mul	B01, B01, TEMP1
+	mul	B02, B02, TEMP1
+	mul	B03, B03, TEMP1
+	mul	B04, B04, TEMP1
+
+	add	B01 , B01, B
+	add	B02 , B02, B
+	add	B03 , B03, B
+	add	B04 , B04, B
+
+	lsl	M8, M, #6					// M8 = M * 16 * SIZE
+
+.Lsgemm_tcopy_L8_BEGIN:
+	asr 	J, M, #3					// J = M / 8
+	cmp 	J, #0
+	ble	.Lsgemm_tcopy_L4_BEGIN
+
+	.align	5
+.Lsgemm_tcopy_L8_M16_BEGIN:
+
+	mov	A01, A
+	add	A02, A01, LDA
+	add	A03, A02, LDA
+	add	A04, A03, LDA
+	add	A05, A04, LDA
+	add	A06, A05, LDA
+	add	A07, A06, LDA
+	add	A08, A07, LDA
+	add	A, A08, LDA
+
+	mov	B00, B
+	add	B, B00, #512					// B = B + 8 * 16 * SIZE
+
+	asr	I, N, #4					// I = N / 16
+	cmp 	I, #0
+	ble	.Lsgemm_tcopy_L8_M16_40
+
+	.align	5
+.Lsgemm_tcopy_L8_M16_20:
+
+	COPY16x8
+
+	subs	I , I , #1
+	bne	.Lsgemm_tcopy_L8_M16_20
+
+.Lsgemm_tcopy_L8_M16_40:
+	tst	N , #8
+	ble	.Lsgemm_tcopy_L8_M16_60
+
+	COPY8x8
+	
+.Lsgemm_tcopy_L8_M16_60:
+	tst	N , #4
+	ble	.Lsgemm_tcopy_L8_M16_80
+
+	COPY4x8
+
+.Lsgemm_tcopy_L8_M16_80:
+
+	tst	N , #2
+	ble	.Lsgemm_tcopy_L8_M16_100
+
+	COPY2x8
+
+.Lsgemm_tcopy_L8_M16_100:
+
+	tst	N, #1
+	ble	.Lsgemm_tcopy_L8_M16_END
+
+	COPY1x8
+
+.Lsgemm_tcopy_L8_M16_END:
+
+	subs	J , J, #1						// j--
+	bne	.Lsgemm_tcopy_L8_M16_BEGIN
+
+/*********************************************************************************************/
+
+.Lsgemm_tcopy_L4_BEGIN:
+	tst	M, #7
+	ble	.Lsgemm_tcopy_L999
+
+	tst	M, #4
+	ble	.Lsgemm_tcopy_L2_BEGIN
+	
+.Lsgemm_tcopy_L4_M16_BEGIN:
+
+	mov	A01, A
+	add	A02, A01, LDA
+	add	A03, A02, LDA
+	add	A04, A03, LDA
+	add	A, A04, LDA
+
+	mov	B00, B
+	add	B, B00, #256					// B = B + 4 * 16 * SIZE
+
+	asr	I, N, #4					// I = N / 16
+	cmp 	I, #0
+	ble	.Lsgemm_tcopy_L4_M16_40
+
+	.align	5
+.Lsgemm_tcopy_L4_M16_20:
+
+	COPY16x4
+
+	subs	I , I , #1
+	bne	.Lsgemm_tcopy_L4_M16_20
+
+.Lsgemm_tcopy_L4_M16_40:
+	tst	N , #8
+	ble	.Lsgemm_tcopy_L4_M16_60
+
+	COPY8x4
+	
+.Lsgemm_tcopy_L4_M16_60:
+	tst	N , #4
+	ble	.Lsgemm_tcopy_L4_M16_80
+
+	COPY4x4
+
+.Lsgemm_tcopy_L4_M16_80:
+
+	tst	N , #2
+	ble	.Lsgemm_tcopy_L4_M16_100
+
+	COPY2x4
+
+
+.Lsgemm_tcopy_L4_M16_100:
+
+	tst	N, #1
+	ble	.Lsgemm_tcopy_L4_M16_END
+
+	COPY1x4
+
+
+.Lsgemm_tcopy_L4_M16_END:
+
+/*********************************************************************************************/
+
+.Lsgemm_tcopy_L2_BEGIN:
+
+	tst	M, #3
+	ble	.Lsgemm_tcopy_L999
+
+	tst	M, #2
+	ble	.Lsgemm_tcopy_L1_BEGIN
+
+.Lsgemm_tcopy_L2_M16_BEGIN:
+	mov	A01, A
+	add	A02, A01, LDA
+	add	A, A02, LDA
+
+	mov	B00, B
+	add	B, B00, #128					// B = B + 2 * 16 * SIZE
+
+	asr	I, N, #4					// I = N / 16
+	cmp 	I, #0
+	ble	.Lsgemm_tcopy_L2_M16_40
+
+	.align	5
+.Lsgemm_tcopy_L2_M16_20:
+
+	COPY16x2
+
+	subs	I , I , #1
+	bne	.Lsgemm_tcopy_L2_M16_20
+
+.Lsgemm_tcopy_L2_M16_40:
+	tst	N , #8
+	ble	.Lsgemm_tcopy_L2_M16_60
+
+	COPY8x2
+
+.Lsgemm_tcopy_L2_M16_60:
+	tst	N , #4
+	ble	.Lsgemm_tcopy_L2_M16_80
+
+	COPY4x2
+
+.Lsgemm_tcopy_L2_M16_80:
+
+	tst	N , #2
+	ble	.Lsgemm_tcopy_L2_M16_100
+
+	COPY2x2
+
+.Lsgemm_tcopy_L2_M16_100:
+
+	tst	N , #1
+	ble	.Lsgemm_tcopy_L2_M16_END
+
+	COPY1x2
+
+.Lsgemm_tcopy_L2_M16_END:
+
+/*********************************************************************************************/
+
+.Lsgemm_tcopy_L1_BEGIN:
+
+	tst	M, #1
+	ble	.Lsgemm_tcopy_L999
+
+
+.Lsgemm_tcopy_L1_M16_BEGIN:
+
+	mov	A01, A						// A01 = A
+	mov	B00, B
+
+	asr	I, N, #4					// I = M / 16
+	cmp 	I, #0
+	ble	.Lsgemm_tcopy_L1_M16_40
+
+	.align	5
+.Lsgemm_tcopy_L1_M16_20:
+
+	COPY16x1
+
+	subs	I , I , #1
+	bne	.Lsgemm_tcopy_L1_M16_20
+	
+.Lsgemm_tcopy_L1_M16_40:
+	tst	N , #8
+	ble	.Lsgemm_tcopy_L1_M16_60
+
+	COPY8x1
+
+.Lsgemm_tcopy_L1_M16_60:
+	tst	N , #4
+	ble	.Lsgemm_tcopy_L1_M16_80
+
+	COPY4x1
+
+.Lsgemm_tcopy_L1_M16_80:
+
+	tst	N , #2
+	ble	.Lsgemm_tcopy_L1_M16_100
+
+	COPY2x1
+
+.Lsgemm_tcopy_L1_M16_100:
+
+	tst	N , #1
+	ble	.Lsgemm_tcopy_L1_M16_END
+
+	COPY1x1
+
+
+.Lsgemm_tcopy_L1_M16_END:
+
+.Lsgemm_tcopy_L999:
+	mov	x0, #0						// set return value
+	RESTORE_REGS
+	ret
+
+	EPILOGUE
+
+

From 0833a4846ac77ee21a6e2100a51b4e31f3d0b9c7 Mon Sep 17 00:00:00 2001
From: w00421467 <wuzeyi1@huawei.com>
Date: Tue, 31 Dec 2019 10:31:07 +0800
Subject: [PATCH 161/210] Use arm neon instructions to optimize sgemm_beta
 operation

---
 kernel/arm64/KERNEL.ARMV8 |   1 +
 kernel/arm64/sgemm_beta.S | 259 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 260 insertions(+)
 create mode 100755 kernel/arm64/sgemm_beta.S

diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8
index b90dd228b..587ee25c6 100644
--- a/kernel/arm64/KERNEL.ARMV8
+++ b/kernel/arm64/KERNEL.ARMV8
@@ -103,6 +103,7 @@ ZDOTKERNEL     = zdot.S
 DSDOTKERNEL    = dot.S
 
 DGEMM_BETA     = dgemm_beta.S
+SGEMM_BETA     = sgemm_beta.S
 
 SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
diff --git a/kernel/arm64/sgemm_beta.S b/kernel/arm64/sgemm_beta.S
new file mode 100755
index 000000000..a3b97e231
--- /dev/null
+++ b/kernel/arm64/sgemm_beta.S
@@ -0,0 +1,259 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define	M	x0
+#define	N	x1
+#define	BETA    s0
+#define	LDC     x6
+#define	C00	x7
+
+#define	A01	x8
+#define	A02	x9
+#define	A03	x10
+#define	A04	x11
+#define I  	x12
+
+#define beta0		s11
+#define betaV0		v11.s[0]
+
+#define prfm_size 640
+#define calc_size 128
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro SAVE_REGS
+	add	sp, sp, #-(11 * 16)
+	stp	d8, d9, [sp, #(0 * 16)]
+	stp	d10, d11, [sp, #(1 * 16)]
+	stp	d12, d13, [sp, #(2 * 16)]
+	stp	d14, d15, [sp, #(3 * 16)]
+	stp	d16, d17, [sp, #(4 * 16)]
+	stp	x18, x19, [sp, #(5 * 16)]
+	stp	x20, x21, [sp, #(6 * 16)]
+	stp	x22, x23, [sp, #(7 * 16)]
+	stp	x24, x25, [sp, #(8 * 16)]
+	stp	x26, x27, [sp, #(9 * 16)]
+	str	x28, [sp, #(10 * 16)]
+.endm
+
+.macro RESTORE_REGS
+	ldp	d8, d9, [sp, #(0 * 16)]
+	ldp	d10, d11, [sp, #(1 * 16)]
+	ldp	d12, d13, [sp, #(2 * 16)]
+	ldp	d14, d15, [sp, #(3 * 16)]
+	ldp	d16, d17, [sp, #(4 * 16)]
+	ldp	x18, x19, [sp, #(5 * 16)]
+	ldp	x20, x21, [sp, #(6 * 16)]
+	ldp	x22, x23, [sp, #(7 * 16)]
+	ldp	x24, x25, [sp, #(8 * 16)]
+	ldp	x26, x27, [sp, #(9 * 16)]
+	ldr	x28, [sp, #(10 * 16)]
+	add	sp, sp, #(11*16)
+.endm
+
+.macro INIT_ZERO
+	fmul v0.4s, v0.4s, betaV0
+	fmul v1.4s, v1.4s, betaV0
+	fmul v2.4s, v2.4s, betaV0
+	fmul v3.4s, v3.4s, betaV0
+	fmul v4.4s, v4.4s, betaV0
+	fmul v5.4s, v5.4s, betaV0
+	fmul v6.4s, v6.4s, betaV0
+	fmul v7.4s, v7.4s, betaV0
+.endm
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+
+	ldr LDC, [sp]
+	SAVE_REGS
+
+.Lgemm_beta_BEGIN:
+
+	fmov beta0, BETA
+	cmp 	N, #0
+	ble	.Lgemm_beta_L999
+
+	fcmp BETA, #0.0
+	beq .Lgemm_beta_zero_01
+
+.Lgemm_beta_01:
+
+	lsl LDC, LDC, #2
+
+	.align 5
+.Lgemm_beta_02:
+
+	mov	A01, C00
+	add C00, C00, LDC
+	asr	I, M, #5
+	cmp 	I, #0
+	ble .Lgemm_beta_04
+	add	A02, A01, #32
+	add	A03, A02, #32
+	add	A04, A03, #32
+
+	.align 5
+.Lgemm_beta_03:
+
+	prfm	PLDL1KEEP, [A01, prfm_size]
+
+	ldp	q0, q1, [A01]
+	ldp	q2, q3, [A02]
+	ldp	q4, q5, [A03]
+	ldp	q6, q7, [A04]
+  
+	fmul v0.4s, v0.4s, betaV0
+	fmul v1.4s, v1.4s, betaV0
+	
+	fmul v2.4s, v2.4s, betaV0
+	fmul v3.4s, v3.4s, betaV0
+	
+	fmul v4.4s, v4.4s, betaV0
+	fmul v5.4s, v5.4s, betaV0
+	
+	fmul v6.4s, v6.4s, betaV0
+	fmul v7.4s, v7.4s, betaV0
+
+	prfm    PLDL1KEEP, [A01, prfm_size + 64]
+
+	st1	{v0.4s, v1.4s}, [A01]
+	add	A01, A01, calc_size
+	st1	{v2.4s, v3.4s}, [A02]
+	add	A02, A02, calc_size
+	st1	{v4.4s, v5.4s}, [A03]
+	add	A03, A03, calc_size
+	st1	{v6.4s, v7.4s}, [A04]
+	add	A04, A04, calc_size
+
+	subs	I , I , #1
+	bne	.Lgemm_beta_03
+
+	.align 5
+.Lgemm_beta_04:
+
+	and	I, M , #31
+	cmp	I, #0
+	ble	.Lgemm_beta_06
+
+	.align 5
+.Lgemm_beta_05:
+
+	ldr	s12, [A01]
+	fmul	s12, s12, beta0
+	str	s12, [A01]
+	add	A01, A01, #4
+
+	subs	I , I , #1
+	bne	.Lgemm_beta_05
+
+	.align 5
+.Lgemm_beta_06:
+
+	subs	N , N, #1			// N--
+	bne	.Lgemm_beta_02
+
+	.align 5
+.Lgemm_beta_L999:
+
+	mov	x0, #0
+	RESTORE_REGS
+	ret
+
+	.align 5
+.Lgemm_beta_zero_01:
+
+	INIT_ZERO
+	lsl LDC, LDC, #2
+
+	.align 5
+.Lgemm_beta_zero_02:
+
+	mov A01, C00
+	add C00, C00, LDC
+
+	asr I, M, #5
+	cmp I, #0
+	ble .Lgemm_beta_zero_04
+	add A02, A01, #32
+	add A03, A02, #32
+	add A04, A03, #32
+
+	.align 5
+.Lgemm_beta_zero_03:
+
+	st1	{v0.4s, v1.4s}, [A01]
+	add	 A01, A01, calc_size
+	st1	{v2.4s, v3.4s}, [A02]
+	add 	A02, A02, calc_size
+	st1	{v4.4s, v5.4s}, [A03]
+	add	A03, A03, calc_size
+	st1	{v6.4s, v7.4s}, [A04]
+	add	A04, A04, calc_size
+
+	subs I, I, #1
+	bne .Lgemm_beta_zero_03
+
+	.align 5
+.Lgemm_beta_zero_04:
+
+	and I, M, #31
+	cmp I, #0
+	ble .Lgemm_beta_zero_06
+
+	.align 5
+.Lgemm_beta_zero_05:
+
+	str beta0, [A01]
+	add A01, A01, #4
+
+	subs I, I, #1
+	bne .Lgemm_beta_zero_05
+
+	.align 5
+.Lgemm_beta_zero_06:
+
+	subs N, N, #1
+	bne .Lgemm_beta_zero_02
+
+	.align 5
+.Lgemm_beta_zero_L999:
+	mov x0, #0
+	RESTORE_REGS
+	ret
+
+	EPILOGUE

From 8729db117ced2094d688b38f85fc2cf1faf7fde7 Mon Sep 17 00:00:00 2001
From: shengyang <shengyang1@huawei.com>
Date: Tue, 31 Dec 2019 15:59:52 +0800
Subject: [PATCH 162/210] 	modified:   ctest/din3 	modified:   ctest/sin3

---
 ctest/din3 | 2 +-
 ctest/sin3 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ctest/din3 b/ctest/din3
index 23fedfe32..9919774ac 100644
--- a/ctest/din3
+++ b/ctest/din3
@@ -5,7 +5,7 @@ T        LOGICAL FLAG, T TO STOP ON FAILURES.
 T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 16.0     THRESHOLD VALUE OF TEST RATIO
-6                 NUMBER OF VALUES OF N
+7                 NUMBER OF VALUES OF N
 1 2 3 5 7 9 35    VALUES OF N
 3                 NUMBER OF VALUES OF ALPHA
 0.0 1.0 0.7       VALUES OF ALPHA
diff --git a/ctest/sin3 b/ctest/sin3
index 644083f22..b74206b70 100644
--- a/ctest/sin3
+++ b/ctest/sin3
@@ -5,7 +5,7 @@ T        LOGICAL FLAG, T TO STOP ON FAILURES.
 T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 16.0     THRESHOLD VALUE OF TEST RATIO
-6                 NUMBER OF VALUES OF N
+7                 NUMBER OF VALUES OF N
 0 1 2 3 5 9 35    VALUES OF N
 3                 NUMBER OF VALUES OF ALPHA
 0.0 1.0 0.7       VALUES OF ALPHA

From 8d844032056bc3533a396c16f6386df2b9173eb4 Mon Sep 17 00:00:00 2001
From: shengyang <shengyang1@huawei.com>
Date: Tue, 31 Dec 2019 17:06:35 +0800
Subject: [PATCH 163/210] 	Use arm neon instructions to optimize ncopy
 operation

	modified:   KERNEL.ARMV8
	modified:   KERNEL.TSV110
	new file:   sgemm_ncopy_4.S
---
 kernel/arm64/KERNEL.ARMV8    |   9 +
 kernel/arm64/KERNEL.TSV110   |   9 +
 kernel/arm64/sgemm_ncopy_4.S | 333 +++++++++++++++++++++++++++++++++++
 3 files changed, 351 insertions(+)
 create mode 100644 kernel/arm64/sgemm_ncopy_4.S

diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8
index b90dd228b..e73bed76e 100644
--- a/kernel/arm64/KERNEL.ARMV8
+++ b/kernel/arm64/KERNEL.ARMV8
@@ -107,12 +107,21 @@ DGEMM_BETA     = dgemm_beta.S
 SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+ifeq ($(SGEMM_UNROLL_N), 4)
+SGEMMINCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
+else
 SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+endif
 SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
 SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 endif
+
+ifeq ($(SGEMM_UNROLL_N), 4)
+SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
+else
 SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+endif
 SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
diff --git a/kernel/arm64/KERNEL.TSV110 b/kernel/arm64/KERNEL.TSV110
index 04d6940d7..0db068dcf 100644
--- a/kernel/arm64/KERNEL.TSV110
+++ b/kernel/arm64/KERNEL.TSV110
@@ -109,12 +109,21 @@ ZGEMVTKERNEL = zgemv_t.S
 SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
 ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
+ifeq ($(SGEMM_UNROLL_N), 4)
+SGEMMINCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
+else
 SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+endif
 SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
 SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 endif
+
+ifeq ($(SGEMM_UNROLL_N), 4)
+SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
+else
 SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+endif
 SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
diff --git a/kernel/arm64/sgemm_ncopy_4.S b/kernel/arm64/sgemm_ncopy_4.S
new file mode 100644
index 000000000..30450cc7d
--- /dev/null
+++ b/kernel/arm64/sgemm_ncopy_4.S
@@ -0,0 +1,333 @@
+/***************************************************************************
+Copyright (c) 2016, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define	M	x0
+#define	N	x1
+#define	A00	x2
+#define	LDA	x3
+#define	B00	x4
+
+#define	A01	x5
+#define	A02	x6
+#define	A03	x7
+#define	A04	x8
+
+#define I	x9
+#define	J	x10
+
+#define	TEMP1	x11
+#define	TEMP2	x12
+
+#define A_PREFETCH	2560
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro SAVE_REGS
+	add	sp, sp, #-(11 * 16)
+	stp	d8, d9, [sp, #(0 * 16)]
+	stp	d10, d11, [sp, #(1 * 16)]
+	stp	d12, d13, [sp, #(2 * 16)]
+	stp	d14, d15, [sp, #(3 * 16)]
+	stp	d16, d17, [sp, #(4 * 16)]
+	stp	x18, x19, [sp, #(5 * 16)]
+	stp	x20, x21, [sp, #(6 * 16)]
+	stp	x22, x23, [sp, #(7 * 16)]
+	stp	x24, x25, [sp, #(8 * 16)]
+	stp	x26, x27, [sp, #(9 * 16)]
+	str	x28, [sp, #(10 * 16)]
+.endm
+
+.macro RESTORE_REGS
+	ldp	d8, d9, [sp, #(0 * 16)]
+	ldp	d10, d11, [sp, #(1 * 16)]
+	ldp	d12, d13, [sp, #(2 * 16)]
+	ldp	d14, d15, [sp, #(3 * 16)]
+	ldp	d16, d17, [sp, #(4 * 16)]
+	ldp	x18, x19, [sp, #(5 * 16)]
+	ldp	x20, x21, [sp, #(6 * 16)]
+	ldp	x22, x23, [sp, #(7 * 16)]
+	ldp	x24, x25, [sp, #(8 * 16)]
+	ldp	x26, x27, [sp, #(9 * 16)]
+	ldr	x28, [sp, #(10 * 16)]
+	add	sp, sp, #(11*16)
+.endm
+
+.macro COPY4x4
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+
+	ldr	q0, [A01], #16
+	ins	v8.s[0], v0.s[0]
+	ins	v9.s[0], v0.s[1]
+	ins	v10.s[0], v0.s[2]
+	ins	v11.s[0], v0.s[3]
+
+	ldr	q1, [A02], #16
+	ins	v8.s[1], v1.s[0]
+	ins	v9.s[1], v1.s[1]
+	ins	v10.s[1], v1.s[2]
+	ins	v11.s[1], v1.s[3]
+
+	ldr	q2, [A03], #16
+	ins	v8.s[2], v2.s[0]
+	ins	v9.s[2], v2.s[1]
+	ins	v10.s[2], v2.s[2]
+	ins	v11.s[2], v2.s[3]
+
+	ldr	q3, [A04], #16
+	ins	v8.s[3], v3.s[0]
+	ins	v9.s[3], v3.s[1]
+	ins	v10.s[3], v3.s[2]
+	ins	v11.s[3], v3.s[3]
+
+	st1	{v8.4s, v9.4s, v10.4s, v11.4s}, [B00]
+	add	B00, B00, #64
+
+.endm
+
+.macro COPY1x4
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A03, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A04, #A_PREFETCH]
+
+	ldr	s0, [A01], #4
+	ldr	s1, [A02], #4
+	ldr	s2, [A03], #4
+	ldr	s3, [A04], #4
+
+	stp	s0, s1, [B00]
+	add	B00, B00, #8
+   	stp	s2, s3, [B00]
+	add	B00, B00, #8
+.endm
+
+.macro COPY4x2
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+
+	ldr	q0, [A01], #16
+	ins	v8.s[0], v0.s[0]
+	ins	v9.s[0], v0.s[1]
+	ins	v10.s[0], v0.s[2]
+	ins	v11.s[0], v0.s[3]
+
+	ldr	q1, [A02], #16
+	ins	v8.s[1], v1.s[0]
+	ins	v9.s[1], v1.s[1]
+	ins	v10.s[1], v1.s[2]
+	ins	v11.s[1], v1.s[3]
+
+	st1	{v8.2s, v9.2s, v10.2s, v11.2s}, [B00]
+	add	B00, B00, #32
+.endm
+
+
+.macro COPY1x2
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+	prfm	PLDL1KEEP, [A02, #A_PREFETCH]
+
+	ldr	s0, [A01], #4
+	ldr	s1, [A02], #4
+
+	stp	s0, s1, [B00]
+	add	B00, B00, #8
+.endm
+
+.macro COPY4x1
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+
+	ldr	q0, [A01], #16
+	str	q0, [B00], #16
+.endm
+
+
+.macro COPY1x1
+	prfm	PLDL1KEEP, [A01, #A_PREFETCH]
+
+	ldr	s0, [A01], #4
+	str	s0, [B00], #4
+.endm
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+
+	.align 5
+
+	SAVE_REGS
+
+	lsl	LDA, LDA, #2					// LDA = LDA * SIZE
+
+.Ldgemm_ncopy_L4_BEGIN:
+
+	asr	J, N, #2					// J = N / 4
+	cmp 	J, #0
+	ble	.Ldgemm_ncopy_L2_BEGIN
+
+	.align	5
+.Ldgemm_ncopy_L4_M4_BEGIN:
+
+	mov	A01, A00
+	add	A02, A01, LDA
+	add	A03, A02, LDA
+	add	A04, A03, LDA
+	add	A00, A04, LDA
+
+	asr	I, M, #2					// I = M / 4
+	cmp	I, #0
+	ble	.Ldgemm_ncopy_L4_M4_40
+
+	.align	5
+.Ldgemm_ncopy_L4_M4_20:
+
+	COPY4x4
+
+	subs	I , I , #1
+	bne	.Ldgemm_ncopy_L4_M4_20
+
+.Ldgemm_ncopy_L4_M4_40:
+
+	and	I, M , #3
+	cmp	I, #0
+	ble	.Ldgemm_ncopy_L4_M4_END
+
+	.align	5
+.Ldgemm_ncopy_L4_M4_60:
+
+	COPY1x4
+
+	subs	I , I , #1
+	bne	.Ldgemm_ncopy_L4_M4_60
+
+.Ldgemm_ncopy_L4_M4_END:
+
+	subs	J , J, #1						// j--
+	bne	.Ldgemm_ncopy_L4_M4_BEGIN
+
+/*********************************************************************************************/
+
+.Ldgemm_ncopy_L2_BEGIN:
+
+	tst	N, #3
+	ble	.Ldgemm_ncopy_L999
+
+	tst	N, #2
+	ble	.Ldgemm_ncopy_L1_BEGIN
+
+.Ldgemm_ncopy_L2_M4_BEGIN:
+	mov	A01, A00
+	add	A02, A01, LDA
+	add	A00, A02, LDA
+
+	asr	I, M, #2					// I = M / 4
+	cmp 	I, #0
+	ble	.Ldgemm_ncopy_L2_M4_40
+
+	.align	5
+.Ldgemm_ncopy_L2_M4_20:
+
+	COPY4x2
+
+	subs	I , I , #1
+	bne	.Ldgemm_ncopy_L2_M4_20
+
+.Ldgemm_ncopy_L2_M4_40:
+
+	and	I, M , #3
+	cmp	I, #0
+	ble	.Ldgemm_ncopy_L2_M4_END
+
+	.align	5
+.Ldgemm_ncopy_L2_M4_60:
+
+	COPY1x2
+
+	subs	I , I , #1
+	bne	.Ldgemm_ncopy_L2_M4_60
+
+.Ldgemm_ncopy_L2_M4_END:
+
+
+/*********************************************************************************************/
+
+.Ldgemm_ncopy_L1_BEGIN:
+
+	tst	N, #1
+	ble	.Ldgemm_ncopy_L999
+
+.Ldgemm_ncopy_L1_M4_BEGIN:
+
+	mov	A01, A00
+
+	asr	I, M, #2					// I = M / 4
+	cmp	I, #0
+	ble	.Ldgemm_ncopy_L1_M4_40
+
+	.align	5
+.Ldgemm_ncopy_L1_M4_20:
+
+	COPY4x1
+
+	subs	I , I , #1
+	bne	.Ldgemm_ncopy_L1_M4_20
+
+
+.Ldgemm_ncopy_L1_M4_40:
+
+	and	I, M , #3
+	cmp	I, #0
+	ble	.Ldgemm_ncopy_L1_M4_END
+
+	.align	5
+.Ldgemm_ncopy_L1_M4_60:
+
+	COPY1x1
+
+	subs	I , I , #1
+	bne	.Ldgemm_ncopy_L1_M4_60
+
+
+.Ldgemm_ncopy_L1_M4_END:
+
+.Ldgemm_ncopy_L999:
+
+	mov	x0, #0
+	RESTORE_REGS
+	ret
+
+	EPILOGUE
+

From 96ad5794284b54331afe0db07b6471913d96f2a9 Mon Sep 17 00:00:00 2001
From: int_13h <30789322+nk521@users.noreply.github.com>
Date: Tue, 31 Dec 2019 22:33:27 +0530
Subject: [PATCH 164/210] add in runtime cpu detection for zarch (#2349)

 add in runtime cpu detection for zarch
---
 Makefile.system                   |   7 ++
 driver/others/Makefile            |   8 ++
 driver/others/dynamic_zarch.c     | 131 ++++++++++++++++++++++++++++++
 kernel/setparam-ref.c             |  21 +++++
 kernel/zarch/KERNEL.Z13           |  20 ++---
 kernel/zarch/KERNEL.Z14           |  20 ++---
 kernel/zarch/KERNEL.ZARCH_GENERIC |  16 ++--
 7 files changed, 195 insertions(+), 28 deletions(-)
 create mode 100644 driver/others/dynamic_zarch.c

diff --git a/Makefile.system b/Makefile.system
index ab2ffca52..c0e45515f 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -25,6 +25,8 @@ else ifeq ($(ARCH), i386)
 override ARCH=x86
 else ifeq ($(ARCH), aarch64)
 override ARCH=arm64
+else ifeq ($(ARCH), zarch)
+override ARCH=zarch
 endif
 
 NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-netlib
@@ -558,6 +560,11 @@ DYNAMIC_CORE += THUNDERX2T99
 DYNAMIC_CORE += TSV110
 endif
 
+ifeq ($(ARCH), zarch)
+DYNAMIC_CORE = Z13
+DYNAMIC_CORE += Z14
+endif
+
 ifeq ($(ARCH), power)
 DYNAMIC_CORE = POWER6
 DYNAMIC_CORE += POWER8
diff --git a/driver/others/Makefile b/driver/others/Makefile
index d4b5c26d5..5653f3c25 100644
--- a/driver/others/Makefile
+++ b/driver/others/Makefile
@@ -21,9 +21,13 @@ else
 ifeq ($(ARCH),power)
 COMMONOBJS	+=  dynamic_power.$(SUFFIX)
 else
+ifeq ($(ARCH),zarch)
+COMMONOBJS += dynamic_zarch.$(SUFFIX)
+else
 COMMONOBJS	+=  dynamic.$(SUFFIX)
 endif
 endif
+endif
 else
 COMMONOBJS	+=  parameter.$(SUFFIX)
 endif
@@ -85,9 +89,13 @@ else
 ifeq ($(ARCH),power)
 HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX)
 else
+ifeq ($(ARCH),zarch)
+HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX)
+else
 HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
 endif
 endif
+endif
 else
 HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
 endif
diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c
new file mode 100644
index 000000000..1206bf870
--- /dev/null
+++ b/driver/others/dynamic_zarch.c
@@ -0,0 +1,131 @@
+
+#include "common.h"
+
+extern gotoblas_t gotoblas_Z13;
+extern gotoblas_t gotoblas_Z14;
+extern gotoblas_t gotoblas_Z15;
+//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
+//extern gotoblas_t gotoblas_Z14;
+//#endif
+
+#define NUM_CORETYPES 5
+
+extern void openblas_warning(int verbose, const char* msg);
+
+static char* corename[] = {
+	"unknown",
+	"Z13",
+	"Z14",
+	"Z15",
+	"ZARCH_GENERIC",
+};
+
+char* gotoblas_corename(void) {
+	if (gotoblas == &gotoblas_Z13)	return corename[1];
+	if (gotoblas == &gotoblas_Z14)	return corename[2];
+	if (gotoblas == &gotoblas_Z15)	return corename[3];
+//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
+//	if (gotoblas == &gotoblas_POWER9)	return corename[3];
+//#endif
+	return corename[0]; // try generic?
+}
+
+// __builtin_cpu_is is not supported by zarch
+static gotolabs_t* get_coretype(void) {
+	FILE* infile;
+	char buffer[512], * p;
+
+	p = (char*)NULL;
+	infile = fopen("/proc/sysinfo", "r");
+	while (fgets(buffer, sizeof(buffer), infile)) {
+		if (!strncmp("Type", buffer, 4)) {
+			p = strchr(buffer, ':') + 2;
+#if 0
+			fprintf(stderr, "%s\n", p);
+#endif
+			break;
+		}
+	}
+
+	fclose(infile);
+
+	if (strstr(p, "2964")) return &gotoblas_Z13;
+	if (strstr(p, "2965")) return &gotoblas_Z13;
+	if (strstr(p, "3906")) return &gotoblas_Z14;
+	if (strstr(p, "3907")) return &gotoblas_Z14;
+	if (strstr(p, "8561")) return &gotoblas_Z14;        // fallback z15 to z14
+	if (strstr(p, "8562")) return &gotoblas_Z14;        // fallback z15 to z14
+
+	return NULL; // should be ZARCH_GENERIC
+}
+
+static gotoblas_t* force_coretype(char* coretype) {
+
+	int i;
+	int found = -1;
+	char message[128];
+
+	for (i = 0; i < NUM_CORETYPES; i++)
+	{
+		if (!strncasecmp(coretype, corename[i], 20))
+		{
+			found = i;
+			break;
+		}
+	}
+
+	switch (found)
+	{
+	case  1: return (&gotoblas_Z13);
+	case  2: return (&gotoblas_Z14);
+	case  3: return (&gotoblas_Z15);
+//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
+//	case  3: return (&gotoblas_POWER9);
+//#endif
+	default: return NULL;
+	}
+	snprintf(message, 128, "Core not found: %s\n", coretype);
+	openblas_warning(1, message);
+}
+
+void gotoblas_dynamic_init(void) {
+
+	char coremsg[128];
+	char coren[22];
+	char* p;
+
+
+	if (gotoblas) return;
+
+	p = getenv("OPENBLAS_CORETYPE");
+	if (p)
+	{
+		gotoblas = force_coretype(p);
+	}
+	else
+	{
+		gotoblas = get_coretype();
+	}
+
+	if (gotoblas == NULL)
+	{
+		snprintf(coremsg, 128, "Falling back to Z14 core\n");
+		openblas_warning(1, coremsg);
+		gotoblas = &gotoblas_Z14;
+	}
+
+	if (gotoblas && gotoblas->init) {
+		strncpy(coren, gotoblas_corename(), 20);
+		sprintf(coremsg, "Core: %s\n", coren);
+		openblas_warning(2, coremsg);
+		gotoblas->init();
+	}
+	else {
+		openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
+		exit(1);
+	}
+}
+
+void gotoblas_dynamic_quit(void) {
+	gotoblas = NULL;
+}
diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c
index 8e8214e70..3c71c778e 100644
--- a/kernel/setparam-ref.c
+++ b/kernel/setparam-ref.c
@@ -739,6 +739,26 @@ static void init_parameter(void) {
 }
 #else //POWER
 
+#if defined(ARCH_ZARCH)
+static void init_parameter(void) {
+	TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
+	TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
+	TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
+	TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
+
+	TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
+	TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R;
+	TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R;
+	TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R;
+
+
+	TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
+	TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
+	TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q;
+	TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q;
+}
+#else //ZARCH
+
 #ifdef ARCH_X86
 static int get_l2_size_old(void){
   int i, eax, ebx, ecx, edx, cpuid_level;
@@ -1325,4 +1345,5 @@ static void init_parameter(void) {
 
 }
 #endif //POWER
+#endif //ZARCH
 #endif //defined(ARCH_ARM64)
diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13
index b1ffd3c54..3bcc32197 100644
--- a/kernel/zarch/KERNEL.Z13
+++ b/kernel/zarch/KERNEL.Z13
@@ -96,10 +96,10 @@ SGEMMINCOPY    = ../generic/gemm_ncopy_8.c
 SGEMMITCOPY    = ../generic/gemm_tcopy_8.c
 SGEMMONCOPY    = ../generic/gemm_ncopy_4.c
 SGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ = sgemm_incopy.o
-SGEMMITCOPYOBJ = sgemm_itcopy.o
-SGEMMONCOPYOBJ = sgemm_oncopy.o
-SGEMMOTCOPYOBJ = sgemm_otcopy.o
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 
  
@@ -108,16 +108,16 @@ DGEMMINCOPY    = ../generic/gemm_ncopy_8.c
 DGEMMITCOPY    = ../generic/gemm_tcopy_8.c
 DGEMMONCOPY    = ../generic/gemm_ncopy_4.c
 DGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
-DGEMMINCOPYOBJ = dgemm_incopy.o
-DGEMMITCOPYOBJ = dgemm_itcopy.o
-DGEMMONCOPYOBJ = dgemm_oncopy.o
-DGEMMOTCOPYOBJ = dgemm_otcopy.o
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 CGEMMKERNEL    = ctrmm4x4V.S
 CGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
 CGEMMOTCOPY    = ../generic/zgemm_tcopy_4.c
-CGEMMONCOPYOBJ =  cgemm_oncopy.o
-CGEMMOTCOPYOBJ =  cgemm_otcopy.o
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 ZGEMMKERNEL    = ztrmm4x4V.S
 ZGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14
index 971896c2d..f6e3bec23 100644
--- a/kernel/zarch/KERNEL.Z14
+++ b/kernel/zarch/KERNEL.Z14
@@ -96,10 +96,10 @@ SGEMMINCOPY    = ../generic/gemm_ncopy_8.c
 SGEMMITCOPY    = ../generic/gemm_tcopy_8.c
 SGEMMONCOPY    = ../generic/gemm_ncopy_4.c
 SGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
-SGEMMINCOPYOBJ = sgemm_incopy.o
-SGEMMITCOPYOBJ = sgemm_itcopy.o
-SGEMMONCOPYOBJ = sgemm_oncopy.o
-SGEMMOTCOPYOBJ = sgemm_otcopy.o
+SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
+SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 
  
@@ -108,16 +108,16 @@ DGEMMINCOPY    = ../generic/gemm_ncopy_8.c
 DGEMMITCOPY    = ../generic/gemm_tcopy_8.c
 DGEMMONCOPY    = ../generic/gemm_ncopy_4.c
 DGEMMOTCOPY    = ../generic/gemm_tcopy_4.c
-DGEMMINCOPYOBJ = dgemm_incopy.o
-DGEMMITCOPYOBJ = dgemm_itcopy.o
-DGEMMONCOPYOBJ = dgemm_oncopy.o
-DGEMMOTCOPYOBJ = dgemm_otcopy.o
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 CGEMMKERNEL    = ctrmm4x4V.S
 CGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
 CGEMMOTCOPY    = ../generic/zgemm_tcopy_4.c
-CGEMMONCOPYOBJ =  cgemm_oncopy.o
-CGEMMOTCOPYOBJ =  cgemm_otcopy.o
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 ZGEMMKERNEL    = ztrmm4x4V.S
 ZGEMMONCOPY    = ../generic/zgemm_ncopy_4.c
diff --git a/kernel/zarch/KERNEL.ZARCH_GENERIC b/kernel/zarch/KERNEL.ZARCH_GENERIC
index 3bbeb9155..33850d0f7 100644
--- a/kernel/zarch/KERNEL.ZARCH_GENERIC
+++ b/kernel/zarch/KERNEL.ZARCH_GENERIC
@@ -94,26 +94,26 @@ ZTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
 SGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
 SGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
 SGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
-SGEMMONCOPYOBJ =  sgemm_oncopy.o
-SGEMMOTCOPYOBJ =  sgemm_otcopy.o
+SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 DGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
 DGEMMONCOPY    = ../generic/gemm_ncopy_2.c
 DGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
-DGEMMONCOPYOBJ = dgemm_oncopy.o
-DGEMMOTCOPYOBJ = dgemm_otcopy.o
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
 CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
 CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
-CGEMMONCOPYOBJ =  cgemm_oncopy.o
-CGEMMOTCOPYOBJ =  cgemm_otcopy.o
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 ZGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
 ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
 ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
-ZGEMMONCOPYOBJ =  zgemm_oncopy.o
-ZGEMMOTCOPYOBJ =  zgemm_otcopy.o
+ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
 STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
 STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c

From 375b1875c8c1d1d59d1bb6f2c227e6da12563faf Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Wed, 1 Jan 2020 13:18:53 +0100
Subject: [PATCH 165/210] [WIP] Update LAPACK to 3.9.0 (#2353)

* Update make.inc entries for LAPACK 3.9.0

Reference-LAPACK PR 347 changed some variable names and relative paths

* Update LAPACK to 3.9.0

* Add new functions from LAPACK 3.9.0

* Add new functions from LAPACK 3.9.0

* Restore LOADER command

as it makes it easier to specify pthread as needed

* Restore LOADER

* Restore EIG/LIN prefixes in cmdbase

* add binary path to lapack_testing.py call

* Restore OpenMP version check

* Restore OpenMP version check

* Restore fix for out-of-bounds array accesses

from #2096
---
 Makefile                                      |    20 +-
 cmake/lapack.cmake                            |    16 +-
 cmake/lapacke.cmake                           |     6 +
 exports/gensymbol                             |    31 +-
 lapack-netlib/.appveyor.yml                   |    38 +
 lapack-netlib/.gitignore                      |     6 +
 lapack-netlib/.travis.yml                     |    49 +-
 lapack-netlib/BLAS/CMakeLists.txt             |     1 +
 lapack-netlib/BLAS/Makefile                   |     7 +-
 lapack-netlib/BLAS/SRC/Makefile               |    21 +-
 lapack-netlib/BLAS/SRC/icamax.f               |     2 +-
 lapack-netlib/BLAS/SRC/idamax.f               |     2 +-
 lapack-netlib/BLAS/SRC/izamax.f               |     2 +-
 lapack-netlib/BLAS/SRC/meson.build            |    29 +
 lapack-netlib/BLAS/SRC/sdsdot.f               |   146 +-
 lapack-netlib/BLAS/TESTING/Makefile           |    33 +-
 lapack-netlib/BLAS/TESTING/cblat1.f           |     2 +-
 lapack-netlib/BLAS/TESTING/dblat1.f           |     2 +-
 lapack-netlib/BLAS/TESTING/sblat1.f           |     2 +-
 lapack-netlib/BLAS/TESTING/zblat1.f           |     2 +-
 lapack-netlib/CBLAS/CMakeLists.txt            |    19 +-
 lapack-netlib/CBLAS/Makefile                  |    10 +-
 lapack-netlib/CBLAS/examples/Makefile         |    16 +-
 lapack-netlib/CBLAS/examples/cblas_example1.c |     2 +-
 lapack-netlib/CBLAS/src/Makefile              |    50 +-
 lapack-netlib/CBLAS/src/cblas_sgemm.c         |     2 +-
 lapack-netlib/CBLAS/testing/Makefile          |    40 +-
 lapack-netlib/CBLAS/testing/c_cblat1.f        |     2 +-
 lapack-netlib/CBLAS/testing/c_dblat1.f        |     2 +-
 lapack-netlib/CBLAS/testing/c_sblat1.f        |     2 +-
 lapack-netlib/CBLAS/testing/c_zblat1.f        |     2 +-
 .../CMAKE/CheckLAPACKCompilerFlags.cmake      |     2 +-
 lapack-netlib/CMAKE/FindGcov.cmake            |     2 +-
 lapack-netlib/CMAKE/Findcodecov.cmake         |     2 +-
 lapack-netlib/CMAKE/FortranMangling.cmake     |     2 +-
 .../CMAKE/lapack-config-build.cmake.in        |     4 +
 .../CMAKE/lapack-config-install.cmake.in      |     4 +
 lapack-netlib/CMakeLists.txt                  |    99 +-
 lapack-netlib/DOCS/Doxyfile                   |     2 +-
 lapack-netlib/DOCS/Doxyfile_man               |     2 +-
 lapack-netlib/DOCS/lawn81.tex                 |    90 +-
 lapack-netlib/INSTALL/Makefile                |    28 +-
 lapack-netlib/INSTALL/dlamch.f                |     5 +
 lapack-netlib/INSTALL/dlamchf77.f             |     4 +
 lapack-netlib/INSTALL/ilaver.f                |    10 +-
 lapack-netlib/INSTALL/make.inc.ALPHA          |    34 +-
 lapack-netlib/INSTALL/make.inc.HPPA           |    34 +-
 lapack-netlib/INSTALL/make.inc.IRIX64         |    39 +-
 lapack-netlib/INSTALL/make.inc.O2K            |    39 +-
 lapack-netlib/INSTALL/make.inc.SGI5           |    34 +-
 lapack-netlib/INSTALL/make.inc.SUN4           |    34 +-
 lapack-netlib/INSTALL/make.inc.SUN4SOL2       |    41 +-
 lapack-netlib/INSTALL/make.inc.XLF            |    34 +-
 lapack-netlib/INSTALL/make.inc.gfortran       |    34 +-
 lapack-netlib/INSTALL/make.inc.gfortran_debug |    34 +-
 lapack-netlib/INSTALL/make.inc.ifort          |    34 +-
 lapack-netlib/INSTALL/make.inc.pgf95          |    34 +-
 lapack-netlib/INSTALL/make.inc.pghpf          |    34 +-
 lapack-netlib/INSTALL/slamch.f                |     1 +
 lapack-netlib/LAPACKE/CMakeLists.txt          |    51 +-
 lapack-netlib/LAPACKE/Makefile                |    14 +-
 .../cmake/lapacke-config-build.cmake.in       |     5 +-
 .../cmake/lapacke-config-install.cmake.in     |     5 +-
 lapack-netlib/LAPACKE/example/Makefile        |    22 +-
 lapack-netlib/LAPACKE/include/CMakeLists.txt  |     2 +-
 lapack-netlib/LAPACKE/include/lapack.h        | 13715 ++++++++++++++++
 lapack-netlib/LAPACKE/include/lapacke.h       |  7025 +-------
 lapack-netlib/LAPACKE/src/CMakeLists.txt      |   315 +-
 lapack-netlib/LAPACKE/src/Makefile            |   355 +-
 lapack-netlib/LAPACKE/src/lapacke_cgejsv.c    |     3 -
 lapack-netlib/LAPACKE/src/lapacke_cgelsd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c   |   106 +
 .../LAPACKE/src/lapacke_cgesvdq_work.c        |   149 +
 lapack-netlib/LAPACKE/src/lapacke_cggesx.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_chbevd.c    |     2 +-
 .../LAPACKE/src/lapacke_chbevd_2stage.c       |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_chbgvd.c    |     2 +-
 .../LAPACKE/src/lapacke_cheev_work.c          |     4 +-
 lapack-netlib/LAPACKE/src/lapacke_cheevd.c    |     4 +-
 .../LAPACKE/src/lapacke_cheevd_2stage.c       |     4 +-
 .../LAPACKE/src/lapacke_cheevd_2stage_work.c  |     4 +-
 .../LAPACKE/src/lapacke_cheevd_work.c         |     5 +-
 lapack-netlib/LAPACKE/src/lapacke_cheevr.c    |     2 +-
 .../LAPACKE/src/lapacke_cheevr_2stage.c       |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_chegst.c    |     2 +-
 .../LAPACKE/src/lapacke_chegst_work.c         |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_chegvd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_chpevd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_chpgvd.c    |     2 +-
 .../LAPACKE/src/lapacke_clantr_work.c         |    19 +-
 lapack-netlib/LAPACKE/src/lapacke_cstedc.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_cstegr.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_cstemr.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_csytrs2.c   |     2 +-
 .../LAPACKE/src/lapacke_csytrs2_work.c        |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_ctgsen.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_ctprfb.c    |    35 +-
 lapack-netlib/LAPACKE/src/lapacke_cunmhr.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_dgeesx.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_dgejsv.c    |     3 -
 lapack-netlib/LAPACKE/src/lapacke_dgelsd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c   |   106 +
 .../LAPACKE/src/lapacke_dgesvdq_work.c        |   149 +
 lapack-netlib/LAPACKE/src/lapacke_dggesx.c    |     2 +-
 .../LAPACKE/src/lapacke_dlantr_work.c         |    19 +-
 lapack-netlib/LAPACKE/src/lapacke_dormhr.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_dsbevd.c    |     2 +-
 .../LAPACKE/src/lapacke_dsbevd_2stage.c       |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_dsbgvd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_dspevd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_dspgvd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_dstedc.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_dstegr.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_dstemr.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_dstevd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_dstevr.c    |     2 +-
 .../LAPACKE/src/lapacke_dsyev_work.c          |     4 +-
 lapack-netlib/LAPACKE/src/lapacke_dsyevd.c    |     4 +-
 .../LAPACKE/src/lapacke_dsyevd_2stage.c       |     4 +-
 .../LAPACKE/src/lapacke_dsyevd_2stage_work.c  |     4 +-
 .../LAPACKE/src/lapacke_dsyevd_work.c         |     4 +-
 lapack-netlib/LAPACKE/src/lapacke_dsyevr.c    |     2 +-
 .../LAPACKE/src/lapacke_dsyevr_2stage.c       |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_dsygvd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c   |     2 +-
 .../LAPACKE/src/lapacke_dsytrs2_work.c        |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_dtgsen.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_dtprfb.c    |    37 +-
 lapack-netlib/LAPACKE/src/lapacke_dtrsen.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_sgeesx.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_sgejsv.c    |     3 -
 lapack-netlib/LAPACKE/src/lapacke_sgelsd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c   |   106 +
 .../LAPACKE/src/lapacke_sgesvdq_work.c        |   148 +
 lapack-netlib/LAPACKE/src/lapacke_sggesx.c    |     2 +-
 .../LAPACKE/src/lapacke_slantr_work.c         |    19 +-
 lapack-netlib/LAPACKE/src/lapacke_sormhr.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_ssbevd.c    |     2 +-
 .../LAPACKE/src/lapacke_ssbevd_2stage.c       |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_ssbgvd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_sspevd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_sspgvd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_sstedc.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_sstegr.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_sstemr.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_sstevd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_sstevr.c    |     2 +-
 .../LAPACKE/src/lapacke_ssyev_work.c          |     4 +-
 lapack-netlib/LAPACKE/src/lapacke_ssyevd.c    |     4 +-
 .../LAPACKE/src/lapacke_ssyevd_2stage.c       |     4 +-
 .../LAPACKE/src/lapacke_ssyevd_2stage_work.c  |     4 +-
 .../LAPACKE/src/lapacke_ssyevd_work.c         |     4 +-
 lapack-netlib/LAPACKE/src/lapacke_ssyevr.c    |     2 +-
 .../LAPACKE/src/lapacke_ssyevr_2stage.c       |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_ssygvd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c   |     2 +-
 .../LAPACKE/src/lapacke_ssytrs2_work.c        |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_stgsen.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_stprfb.c    |    37 +-
 lapack-netlib/LAPACKE/src/lapacke_strsen.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_zgejsv.c    |     3 -
 lapack-netlib/LAPACKE/src/lapacke_zgelsd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c   |   106 +
 .../LAPACKE/src/lapacke_zgesvdq_work.c        |   149 +
 lapack-netlib/LAPACKE/src/lapacke_zggesx.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_zhbevd.c    |     2 +-
 .../LAPACKE/src/lapacke_zhbevd_2stage.c       |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_zhbgvd.c    |     2 +-
 .../LAPACKE/src/lapacke_zheev_work.c          |     4 +-
 lapack-netlib/LAPACKE/src/lapacke_zheevd.c    |     4 +-
 .../LAPACKE/src/lapacke_zheevd_2stage.c       |     4 +-
 .../LAPACKE/src/lapacke_zheevd_2stage_work.c  |     4 +-
 .../LAPACKE/src/lapacke_zheevd_work.c         |     4 +-
 lapack-netlib/LAPACKE/src/lapacke_zheevr.c    |     2 +-
 .../LAPACKE/src/lapacke_zheevr_2stage.c       |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_zhegst.c    |     2 +-
 .../LAPACKE/src/lapacke_zhegst_work.c         |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_zhegvd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_zhpevd.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_zhpgvd.c    |     2 +-
 .../LAPACKE/src/lapacke_zlantr_work.c         |    19 +-
 lapack-netlib/LAPACKE/src/lapacke_zstedc.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_zstegr.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_zstemr.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c   |     2 +-
 .../LAPACKE/src/lapacke_zsytrs2_work.c        |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_ztgsen.c    |     2 +-
 lapack-netlib/LAPACKE/src/lapacke_ztprfb.c    |    38 +-
 lapack-netlib/LAPACKE/src/lapacke_zunmhr.c    |     2 +-
 lapack-netlib/LAPACKE/utils/Makefile          |    17 +-
 .../LAPACKE/utils/lapacke_chp_nancheck.c      |     2 +-
 .../LAPACKE/utils/lapacke_cpf_nancheck.c      |     2 +-
 .../LAPACKE/utils/lapacke_cpp_nancheck.c      |     2 +-
 .../LAPACKE/utils/lapacke_csp_nancheck.c      |     2 +-
 .../LAPACKE/utils/lapacke_ctp_nancheck.c      |     2 +-
 .../LAPACKE/utils/lapacke_dpf_nancheck.c      |     2 +-
 .../LAPACKE/utils/lapacke_dpp_nancheck.c      |     2 +-
 .../LAPACKE/utils/lapacke_dsp_nancheck.c      |     2 +-
 .../LAPACKE/utils/lapacke_dtp_nancheck.c      |     2 +-
 .../LAPACKE/utils/lapacke_spf_nancheck.c      |     2 +-
 .../LAPACKE/utils/lapacke_spp_nancheck.c      |     2 +-
 .../LAPACKE/utils/lapacke_ssp_nancheck.c      |     2 +-
 .../LAPACKE/utils/lapacke_stp_nancheck.c      |     2 +-
 .../LAPACKE/utils/lapacke_zhp_nancheck.c      |     2 +-
 .../LAPACKE/utils/lapacke_zpf_nancheck.c      |     2 +-
 .../LAPACKE/utils/lapacke_zpp_nancheck.c      |     2 +-
 .../LAPACKE/utils/lapacke_zsp_nancheck.c      |     2 +-
 .../LAPACKE/utils/lapacke_ztp_nancheck.c      |     2 +-
 lapack-netlib/Makefile                        |    49 +-
 lapack-netlib/README.md                       |    13 +-
 lapack-netlib/SRC/CMakeLists.txt              |    26 +-
 lapack-netlib/SRC/Makefile                    |    91 +-
 lapack-netlib/SRC/VARIANTS/Makefile           |    22 +-
 lapack-netlib/SRC/VARIANTS/README             |    12 +-
 lapack-netlib/SRC/cgbrfsx.f                   |    12 +-
 lapack-netlib/SRC/cgbsvxx.f                   |    10 +-
 lapack-netlib/SRC/cgebak.f                    |     8 +-
 lapack-netlib/SRC/cgeev.f                     |     2 +-
 lapack-netlib/SRC/cgejsv.f                    |    66 +-
 lapack-netlib/SRC/cgelq.f                     |    19 +-
 lapack-netlib/SRC/cgelq2.f                    |    18 +-
 lapack-netlib/SRC/cgelqf.f                    |    16 +-
 lapack-netlib/SRC/cgelqt.f                    |     1 +
 lapack-netlib/SRC/cgelqt3.f                   |     2 +
 lapack-netlib/SRC/cgemlq.f                    |     3 +-
 lapack-netlib/SRC/cgemlqt.f                   |     2 +
 lapack-netlib/SRC/cgemqr.f                    |     3 +-
 lapack-netlib/SRC/cgeqr.f                     |    20 +-
 lapack-netlib/SRC/cgeqr2.f                    |    19 +-
 lapack-netlib/SRC/cgeqr2p.f                   |    20 +-
 lapack-netlib/SRC/cgeqrf.f                    |    17 +-
 lapack-netlib/SRC/cgeqrfp.f                   |    20 +-
 lapack-netlib/SRC/cgerfsx.f                   |    12 +-
 lapack-netlib/SRC/cgesc2.f                    |     2 +-
 lapack-netlib/SRC/cgesvdq.f                   |  1391 ++
 lapack-netlib/SRC/cgesvj.f                    |    46 +-
 lapack-netlib/SRC/cgesvxx.f                   |    10 +-
 lapack-netlib/SRC/cgetsls.f                   |     2 +
 lapack-netlib/SRC/cggesx.f                    |     8 +-
 lapack-netlib/SRC/cgsvj0.f                    |    18 +-
 lapack-netlib/SRC/cgsvj1.f                    |    20 +-
 lapack-netlib/SRC/chb2st_kernels.f            |    70 +-
 lapack-netlib/SRC/checon_3.f                  |     9 +-
 lapack-netlib/SRC/cheevr.f                    |     2 +-
 lapack-netlib/SRC/cheevr_2stage.f             |     2 +-
 lapack-netlib/SRC/chegs2.f                    |     1 +
 lapack-netlib/SRC/chegst.f                    |     1 +
 lapack-netlib/SRC/cherfsx.f                   |    12 +-
 lapack-netlib/SRC/chesv_aa.f                  |     6 +-
 lapack-netlib/SRC/chesv_aa_2stage.f           |     4 +-
 lapack-netlib/SRC/chesvxx.f                   |    16 +-
 lapack-netlib/SRC/chetf2_rk.f                 |     4 +-
 lapack-netlib/SRC/chetrd_2stage.f             |    13 +-
 lapack-netlib/SRC/chetrd_hb2st.F              |     4 +-
 lapack-netlib/SRC/chetrd_he2hb.f              |     2 +-
 lapack-netlib/SRC/chetrf_aa.f                 |     8 +-
 lapack-netlib/SRC/chetrf_aa_2stage.f          |    34 +-
 lapack-netlib/SRC/chetri2.f                   |     4 +-
 lapack-netlib/SRC/chetrs_aa.f                 |   132 +-
 lapack-netlib/SRC/chetrs_aa_2stage.f          |    10 +-
 lapack-netlib/SRC/chseqr.f                    |    44 +-
 lapack-netlib/SRC/cla_gbrcond_c.f             |     4 +-
 lapack-netlib/SRC/cla_gbrcond_x.f             |     4 +-
 lapack-netlib/SRC/cla_gbrfsx_extended.f       |    12 +-
 lapack-netlib/SRC/cla_gercond_c.f             |     8 +-
 lapack-netlib/SRC/cla_gercond_x.f             |     4 +-
 lapack-netlib/SRC/cla_gerfsx_extended.f       |    12 +-
 lapack-netlib/SRC/cla_hercond_c.f             |     4 +-
 lapack-netlib/SRC/cla_hercond_x.f             |     4 +-
 lapack-netlib/SRC/cla_herfsx_extended.f       |     8 +-
 lapack-netlib/SRC/cla_porcond_c.f             |     4 +-
 lapack-netlib/SRC/cla_porcond_x.f             |     4 +-
 lapack-netlib/SRC/cla_porfsx_extended.f       |     8 +-
 lapack-netlib/SRC/cla_porpvgrw.f              |     2 +-
 lapack-netlib/SRC/cla_syrcond_c.f             |     4 +-
 lapack-netlib/SRC/cla_syrcond_x.f             |     4 +-
 lapack-netlib/SRC/cla_syrfsx_extended.f       |     8 +-
 lapack-netlib/SRC/cla_syrpvgrw.f              |     2 +-
 lapack-netlib/SRC/cla_wwaddw.f                |     2 +-
 lapack-netlib/SRC/clahef_aa.f                 |    42 +-
 lapack-netlib/SRC/clahef_rk.f                 |     4 +-
 lapack-netlib/SRC/clahqr.f                    |    14 +-
 lapack-netlib/SRC/clamswlq.f                  |     1 +
 lapack-netlib/SRC/clamtsqr.f                  |     1 +
 lapack-netlib/SRC/clangb.f                    |    23 +-
 lapack-netlib/SRC/clange.f                    |    22 +-
 lapack-netlib/SRC/clanhb.f                    |    48 +-
 lapack-netlib/SRC/clanhe.f                    |    45 +-
 lapack-netlib/SRC/clanhp.f                    |    46 +-
 lapack-netlib/SRC/clanhs.f                    |    23 +-
 lapack-netlib/SRC/clansb.f                    |    42 +-
 lapack-netlib/SRC/clansp.f                    |    54 +-
 lapack-netlib/SRC/clansy.f                    |    40 +-
 lapack-netlib/SRC/clantb.f                    |    55 +-
 lapack-netlib/SRC/clantp.f                    |    53 +-
 lapack-netlib/SRC/clantr.f                    |    56 +-
 lapack-netlib/SRC/claqps.f                    |     2 +-
 lapack-netlib/SRC/claqr0.f                    |    34 +-
 lapack-netlib/SRC/claqr1.f                    |     2 +-
 lapack-netlib/SRC/claqr2.f                    |    18 +-
 lapack-netlib/SRC/claqr3.f                    |    18 +-
 lapack-netlib/SRC/claqr4.f                    |    34 +-
 lapack-netlib/SRC/claqr5.f                    |    52 +-
 lapack-netlib/SRC/clarfb.f                    |     2 +
 lapack-netlib/SRC/clarfx.f                    |     2 +-
 lapack-netlib/SRC/clarfy.f                    |     2 +-
 lapack-netlib/SRC/clarrv.f                    |     2 +-
 lapack-netlib/SRC/classq.f                    |     4 +-
 lapack-netlib/SRC/claswlq.f                   |    20 +-
 lapack-netlib/SRC/clasyf_aa.f                 |    48 +-
 lapack-netlib/SRC/clasyf_rk.f                 |     4 +-
 lapack-netlib/SRC/clatdf.f                    |     2 +-
 lapack-netlib/SRC/clatsqr.f                   |    25 +-
 lapack-netlib/SRC/claunhr_col_getrfnp.f       |   248 +
 lapack-netlib/SRC/claunhr_col_getrfnp2.f      |   314 +
 lapack-netlib/SRC/cporfsx.f                   |    16 +-
 lapack-netlib/SRC/cposvxx.f                   |    14 +-
 lapack-netlib/SRC/cpotrf2.f                   |     4 +-
 lapack-netlib/SRC/cstemr.f                    |     4 +-
 lapack-netlib/SRC/csycon_3.f                  |     9 +-
 lapack-netlib/SRC/csyconvf.f                  |     8 +-
 lapack-netlib/SRC/csyconvf_rook.f             |     8 +-
 lapack-netlib/SRC/csyrfsx.f                   |    10 +-
 lapack-netlib/SRC/csysv_aa.f                  |    12 +-
 lapack-netlib/SRC/csysv_aa_2stage.f           |     6 +-
 lapack-netlib/SRC/csysvxx.f                   |    10 +-
 lapack-netlib/SRC/csytf2_rk.f                 |     4 +-
 lapack-netlib/SRC/csytrf.f                    |     2 +-
 lapack-netlib/SRC/csytrf_aa.f                 |    12 +-
 lapack-netlib/SRC/csytrf_aa_2stage.f          |    26 +-
 lapack-netlib/SRC/csytri2.f                   |     4 +-
 lapack-netlib/SRC/csytrs2.f                   |     2 +-
 lapack-netlib/SRC/csytrs_aa.f                 |   117 +-
 lapack-netlib/SRC/csytrs_aa_2stage.f          |    18 +-
 lapack-netlib/SRC/ctgsy2.f                    |    18 +-
 lapack-netlib/SRC/ctplqt.f                    |     2 +
 lapack-netlib/SRC/ctplqt2.f                   |     2 +
 lapack-netlib/SRC/ctpmlqt.f                   |     4 +-
 lapack-netlib/SRC/ctpmqrt.f                   |     2 +-
 lapack-netlib/SRC/ctprfb.f                    |     4 +-
 lapack-netlib/SRC/cungtsqr.f                  |   307 +
 lapack-netlib/SRC/cunhr_col.f                 |   441 +
 lapack-netlib/SRC/dbdsqr.f                    |     2 +-
 lapack-netlib/SRC/dbdsvdx.f                   |     2 +-
 lapack-netlib/SRC/dcombssq.f                  |    92 +
 lapack-netlib/SRC/dgbrfsx.f                   |    10 +-
 lapack-netlib/SRC/dgbsvxx.f                   |    10 +-
 lapack-netlib/SRC/dgebak.f                    |     8 +-
 lapack-netlib/SRC/dgeesx.f                    |     4 +-
 lapack-netlib/SRC/dgejsv.f                    |    56 +-
 lapack-netlib/SRC/dgelq.f                     |    19 +-
 lapack-netlib/SRC/dgelq2.f                    |    18 +-
 lapack-netlib/SRC/dgelqf.f                    |    16 +-
 lapack-netlib/SRC/dgemlq.f                    |     3 +-
 lapack-netlib/SRC/dgemqr.f                    |     3 +-
 lapack-netlib/SRC/dgeqr.f                     |    20 +-
 lapack-netlib/SRC/dgeqr2.f                    |    19 +-
 lapack-netlib/SRC/dgeqr2p.f                   |    20 +-
 lapack-netlib/SRC/dgeqrf.f                    |    17 +-
 lapack-netlib/SRC/dgeqrfp.f                   |    20 +-
 lapack-netlib/SRC/dgerfsx.f                   |    10 +-
 lapack-netlib/SRC/dgesc2.f                    |     4 +-
 lapack-netlib/SRC/dgesdd.f                    |     4 +-
 lapack-netlib/SRC/dgesvdq.f                   |  1385 ++
 lapack-netlib/SRC/dgesvj.f                    |    44 +-
 lapack-netlib/SRC/dgesvxx.f                   |    10 +-
 lapack-netlib/SRC/dgetc2.f                    |     2 +-
 lapack-netlib/SRC/dgetsls.f                   |     2 +
 lapack-netlib/SRC/dggesx.f                    |     8 +-
 lapack-netlib/SRC/dgsvj0.f                    |    20 +-
 lapack-netlib/SRC/dgsvj1.f                    |    30 +-
 lapack-netlib/SRC/dhseqr.f                    |    46 +-
 lapack-netlib/SRC/dla_gbrcond.f               |     4 +-
 lapack-netlib/SRC/dla_gbrfsx_extended.f       |    12 +-
 lapack-netlib/SRC/dla_gercond.f               |     4 +-
 lapack-netlib/SRC/dla_gerfsx_extended.f       |    12 +-
 lapack-netlib/SRC/dla_porcond.f               |     4 +-
 lapack-netlib/SRC/dla_porfsx_extended.f       |     8 +-
 lapack-netlib/SRC/dla_porpvgrw.f              |     2 +-
 lapack-netlib/SRC/dla_syrcond.f               |     4 +-
 lapack-netlib/SRC/dla_syrfsx_extended.f       |     8 +-
 lapack-netlib/SRC/dla_syrpvgrw.f              |     2 +-
 lapack-netlib/SRC/dla_wwaddw.f                |     2 +-
 lapack-netlib/SRC/dlaed4.f                    |     2 +-
 lapack-netlib/SRC/dlaed8.f                    |     2 +-
 lapack-netlib/SRC/dlagtf.f                    |     6 +-
 lapack-netlib/SRC/dlagts.f                    |    20 +-
 lapack-netlib/SRC/dlahqr.f                    |    14 +-
 lapack-netlib/SRC/dlaln2.f                    |     2 +-
 lapack-netlib/SRC/dlamswlq.f                  |     1 +
 lapack-netlib/SRC/dlamtsqr.f                  |     1 +
 lapack-netlib/SRC/dlangb.f                    |    26 +-
 lapack-netlib/SRC/dlange.f                    |    22 +-
 lapack-netlib/SRC/dlanhs.f                    |    25 +-
 lapack-netlib/SRC/dlansb.f                    |    44 +-
 lapack-netlib/SRC/dlansp.f                    |    48 +-
 lapack-netlib/SRC/dlansy.f                    |    42 +-
 lapack-netlib/SRC/dlantb.f                    |    57 +-
 lapack-netlib/SRC/dlantp.f                    |    55 +-
 lapack-netlib/SRC/dlantr.f                    |    58 +-
 lapack-netlib/SRC/dlanv2.f                    |     8 +-
 lapack-netlib/SRC/dlaorhr_col_getrfnp.f       |   248 +
 lapack-netlib/SRC/dlaorhr_col_getrfnp2.f      |   305 +
 lapack-netlib/SRC/dlaqps.f                    |     2 +-
 lapack-netlib/SRC/dlaqr0.f                    |    38 +-
 lapack-netlib/SRC/dlaqr1.f                    |     2 +-
 lapack-netlib/SRC/dlaqr2.f                    |    18 +-
 lapack-netlib/SRC/dlaqr3.f                    |    18 +-
 lapack-netlib/SRC/dlaqr4.f                    |    38 +-
 lapack-netlib/SRC/dlaqr5.f                    |    52 +-
 lapack-netlib/SRC/dlarfb.f                    |     2 +
 lapack-netlib/SRC/dlarfx.f                    |     2 +-
 lapack-netlib/SRC/dlarfy.f                    |     2 +-
 lapack-netlib/SRC/dlarrb.f                    |     4 +-
 lapack-netlib/SRC/dlarre.f                    |     2 +-
 lapack-netlib/SRC/dlarrj.f                    |     2 +-
 lapack-netlib/SRC/dlarrv.f                    |     2 +-
 lapack-netlib/SRC/dlasd7.f                    |     2 +-
 lapack-netlib/SRC/dlasr.f                     |     2 +-
 lapack-netlib/SRC/dlassq.f                    |     2 +-
 lapack-netlib/SRC/dlaswlq.f                   |    20 +-
 lapack-netlib/SRC/dlasyf_aa.f                 |    42 +-
 lapack-netlib/SRC/dlasyf_rk.f                 |     4 +-
 lapack-netlib/SRC/dlasyf_rook.f               |     2 +-
 lapack-netlib/SRC/dlatdf.f                    |     4 +-
 lapack-netlib/SRC/dlatsqr.f                   |    23 +-
 lapack-netlib/SRC/dorgtsqr.f                  |   306 +
 lapack-netlib/SRC/dorhr_col.f                 |   440 +
 lapack-netlib/SRC/dporfsx.f                   |    12 +-
 lapack-netlib/SRC/dposvxx.f                   |    10 +-
 lapack-netlib/SRC/dsb2st_kernels.f            |    70 +-
 lapack-netlib/SRC/dsbgvx.f                    |     6 +-
 lapack-netlib/SRC/dsgesv.f                    |    10 +-
 lapack-netlib/SRC/dsposv.f                    |     6 +-
 lapack-netlib/SRC/dstemr.f                    |     4 +-
 lapack-netlib/SRC/dsyconvf.f                  |     8 +-
 lapack-netlib/SRC/dsyconvf_rook.f             |     8 +-
 lapack-netlib/SRC/dsyev_2stage.f              |     2 +-
 lapack-netlib/SRC/dsyevd_2stage.f             |     2 +-
 lapack-netlib/SRC/dsyrfsx.f                   |    10 +-
 lapack-netlib/SRC/dsysv_aa.f                  |     6 +-
 lapack-netlib/SRC/dsysv_aa_2stage.f           |     4 +-
 lapack-netlib/SRC/dsysvxx.f                   |    10 +-
 lapack-netlib/SRC/dsytf2_rk.f                 |     4 +-
 lapack-netlib/SRC/dsytrd_2stage.f             |    13 +-
 lapack-netlib/SRC/dsytrd_sb2st.F              |     4 +-
 lapack-netlib/SRC/dsytrd_sy2sb.f              |     2 +-
 lapack-netlib/SRC/dsytrf.f                    |     6 +-
 lapack-netlib/SRC/dsytrf_aa.f                 |     8 +-
 lapack-netlib/SRC/dsytrf_aa_2stage.f          |    58 +-
 lapack-netlib/SRC/dsytri2.f                   |     4 +-
 lapack-netlib/SRC/dsytrs_aa.f                 |   112 +-
 lapack-netlib/SRC/dsytrs_aa_2stage.f          |    18 +-
 lapack-netlib/SRC/dtgsy2.f                    |     4 +-
 lapack-netlib/SRC/dtgsyl.f                    |    14 +-
 lapack-netlib/SRC/dtpmlqt.f                   |     2 +-
 lapack-netlib/SRC/dtpmqrt.f                   |     2 +-
 lapack-netlib/SRC/dtprfb.f                    |     4 +-
 lapack-netlib/SRC/ilaenv.f                    |    17 +-
 lapack-netlib/SRC/ilaenv2stage.f              |     4 +-
 lapack-netlib/SRC/iparam2stage.F              |    10 +-
 lapack-netlib/SRC/iparmq.f                    |     6 +-
 lapack-netlib/SRC/meson.build                 |    11 +
 lapack-netlib/SRC/sbdsvdx.f                   |     2 +-
 lapack-netlib/SRC/scombssq.f                  |    92 +
 lapack-netlib/SRC/sgbrfsx.f                   |    10 +-
 lapack-netlib/SRC/sgbsvxx.f                   |    10 +-
 lapack-netlib/SRC/sgebak.f                    |     8 +-
 lapack-netlib/SRC/sgeesx.f                    |     4 +-
 lapack-netlib/SRC/sgejsv.f                    |    54 +-
 lapack-netlib/SRC/sgelq.f                     |    19 +-
 lapack-netlib/SRC/sgelq2.f                    |    18 +-
 lapack-netlib/SRC/sgelqf.f                    |    16 +-
 lapack-netlib/SRC/sgelqt.f                    |     2 +
 lapack-netlib/SRC/sgelqt3.f                   |     2 +
 lapack-netlib/SRC/sgemlq.f                    |     3 +-
 lapack-netlib/SRC/sgemlqt.f                   |     2 +
 lapack-netlib/SRC/sgemqr.f                    |     3 +-
 lapack-netlib/SRC/sgeqr.f                     |    20 +-
 lapack-netlib/SRC/sgeqr2.f                    |    19 +-
 lapack-netlib/SRC/sgeqr2p.f                   |    20 +-
 lapack-netlib/SRC/sgeqrf.f                    |    17 +-
 lapack-netlib/SRC/sgeqrfp.f                   |    20 +-
 lapack-netlib/SRC/sgerfsx.f                   |    10 +-
 lapack-netlib/SRC/sgesc2.f                    |     4 +-
 lapack-netlib/SRC/sgesdd.f                    |     4 +-
 lapack-netlib/SRC/sgesvdq.f                   |  1388 ++
 lapack-netlib/SRC/sgesvj.f                    |    44 +-
 lapack-netlib/SRC/sgesvxx.f                   |    10 +-
 lapack-netlib/SRC/sgetc2.f                    |     2 +-
 lapack-netlib/SRC/sgetsls.f                   |     4 +-
 lapack-netlib/SRC/sggesx.f                    |     8 +-
 lapack-netlib/SRC/sgsvj0.f                    |    20 +-
 lapack-netlib/SRC/sgsvj1.f                    |    20 +-
 lapack-netlib/SRC/shseqr.f                    |    46 +-
 lapack-netlib/SRC/sla_gbrcond.f               |     4 +-
 lapack-netlib/SRC/sla_gbrfsx_extended.f       |    12 +-
 lapack-netlib/SRC/sla_gercond.f               |     4 +-
 lapack-netlib/SRC/sla_gerfsx_extended.f       |    12 +-
 lapack-netlib/SRC/sla_porcond.f               |     4 +-
 lapack-netlib/SRC/sla_porfsx_extended.f       |     8 +-
 lapack-netlib/SRC/sla_syrcond.f               |     4 +-
 lapack-netlib/SRC/sla_syrfsx_extended.f       |     8 +-
 lapack-netlib/SRC/sla_syrpvgrw.f              |     2 +-
 lapack-netlib/SRC/sla_wwaddw.f                |     2 +-
 lapack-netlib/SRC/slaed4.f                    |     2 +-
 lapack-netlib/SRC/slaed8.f                    |     2 +-
 lapack-netlib/SRC/slagtf.f                    |     6 +-
 lapack-netlib/SRC/slagts.f                    |    20 +-
 lapack-netlib/SRC/slahqr.f                    |    14 +-
 lapack-netlib/SRC/slaln2.f                    |     2 +-
 lapack-netlib/SRC/slamswlq.f                  |     1 +
 lapack-netlib/SRC/slamtsqr.f                  |     1 +
 lapack-netlib/SRC/slangb.f                    |    26 +-
 lapack-netlib/SRC/slange.f                    |    22 +-
 lapack-netlib/SRC/slanhs.f                    |    25 +-
 lapack-netlib/SRC/slansb.f                    |    44 +-
 lapack-netlib/SRC/slansp.f                    |    48 +-
 lapack-netlib/SRC/slansy.f                    |    42 +-
 lapack-netlib/SRC/slantb.f                    |    57 +-
 lapack-netlib/SRC/slantp.f                    |    55 +-
 lapack-netlib/SRC/slantr.f                    |    58 +-
 lapack-netlib/SRC/slanv2.f                    |     8 +-
 lapack-netlib/SRC/slaorhr_col_getrfnp.f       |   248 +
 lapack-netlib/SRC/slaorhr_col_getrfnp2.f      |   305 +
 lapack-netlib/SRC/slaqps.f                    |     2 +-
 lapack-netlib/SRC/slaqr0.f                    |    38 +-
 lapack-netlib/SRC/slaqr1.f                    |     2 +-
 lapack-netlib/SRC/slaqr2.f                    |    18 +-
 lapack-netlib/SRC/slaqr3.f                    |    18 +-
 lapack-netlib/SRC/slaqr4.f                    |    38 +-
 lapack-netlib/SRC/slaqr5.f                    |    52 +-
 lapack-netlib/SRC/slarfb.f                    |     2 +
 lapack-netlib/SRC/slarfx.f                    |     2 +-
 lapack-netlib/SRC/slarfy.f                    |     2 +-
 lapack-netlib/SRC/slarrb.f                    |     4 +-
 lapack-netlib/SRC/slarre.f                    |     2 +-
 lapack-netlib/SRC/slarrj.f                    |     2 +-
 lapack-netlib/SRC/slarrv.f                    |     2 +-
 lapack-netlib/SRC/slasd7.f                    |     2 +-
 lapack-netlib/SRC/slassq.f                    |     2 +-
 lapack-netlib/SRC/slaswlq.f                   |    22 +-
 lapack-netlib/SRC/slasyf_aa.f                 |    42 +-
 lapack-netlib/SRC/slasyf_rk.f                 |     4 +-
 lapack-netlib/SRC/slatdf.f                    |     4 +-
 lapack-netlib/SRC/slatsqr.f                   |    23 +-
 lapack-netlib/SRC/sorgtsqr.f                  |   306 +
 lapack-netlib/SRC/sorhr_col.f                 |   439 +
 lapack-netlib/SRC/sporfsx.f                   |    12 +-
 lapack-netlib/SRC/sposvxx.f                   |    10 +-
 lapack-netlib/SRC/ssb2st_kernels.f            |    70 +-
 lapack-netlib/SRC/ssbgvx.f                    |     6 +-
 lapack-netlib/SRC/sstemr.f                    |     4 +-
 lapack-netlib/SRC/ssyconvf.f                  |     8 +-
 lapack-netlib/SRC/ssyconvf_rook.f             |     8 +-
 lapack-netlib/SRC/ssyev_2stage.f              |     2 +-
 lapack-netlib/SRC/ssyevd_2stage.f             |     2 +-
 lapack-netlib/SRC/ssyrfsx.f                   |    10 +-
 lapack-netlib/SRC/ssysv_aa.f                  |     6 +-
 lapack-netlib/SRC/ssysv_aa_2stage.f           |     4 +-
 lapack-netlib/SRC/ssysvxx.f                   |    10 +-
 lapack-netlib/SRC/ssytf2_rk.f                 |     4 +-
 lapack-netlib/SRC/ssytrd_2stage.f             |    13 +-
 lapack-netlib/SRC/ssytrd_sb2st.F              |     4 +-
 lapack-netlib/SRC/ssytrd_sy2sb.f              |     2 +-
 lapack-netlib/SRC/ssytrf.f                    |     6 +-
 lapack-netlib/SRC/ssytrf_aa.f                 |     8 +-
 lapack-netlib/SRC/ssytrf_aa_2stage.f          |    26 +-
 lapack-netlib/SRC/ssytri2.f                   |     4 +-
 lapack-netlib/SRC/ssytrs_aa.f                 |   128 +-
 lapack-netlib/SRC/ssytrs_aa_2stage.f          |    18 +-
 lapack-netlib/SRC/stgsy2.f                    |     4 +-
 lapack-netlib/SRC/stgsyl.f                    |    14 +-
 lapack-netlib/SRC/stpmlqt.f                   |     2 +-
 lapack-netlib/SRC/stpmqrt.f                   |     2 +-
 lapack-netlib/SRC/stprfb.f                    |     4 +-
 lapack-netlib/SRC/zcgesv.f                    |    10 +-
 lapack-netlib/SRC/zcposv.f                    |     6 +-
 lapack-netlib/SRC/zgbrfsx.f                   |    12 +-
 lapack-netlib/SRC/zgbsvxx.f                   |    10 +-
 lapack-netlib/SRC/zgebak.f                    |     8 +-
 lapack-netlib/SRC/zgeev.f                     |     2 +-
 lapack-netlib/SRC/zgejsv.f                    |    64 +-
 lapack-netlib/SRC/zgelq.f                     |    19 +-
 lapack-netlib/SRC/zgelq2.f                    |    18 +-
 lapack-netlib/SRC/zgelqf.f                    |    16 +-
 lapack-netlib/SRC/zgemlq.f                    |     3 +-
 lapack-netlib/SRC/zgemqr.f                    |     3 +-
 lapack-netlib/SRC/zgeqr.f                     |    20 +-
 lapack-netlib/SRC/zgeqr2.f                    |    19 +-
 lapack-netlib/SRC/zgeqr2p.f                   |    20 +-
 lapack-netlib/SRC/zgeqrf.f                    |    17 +-
 lapack-netlib/SRC/zgeqrfp.f                   |    20 +-
 lapack-netlib/SRC/zgerfsx.f                   |    12 +-
 lapack-netlib/SRC/zgesc2.f                    |     2 +-
 lapack-netlib/SRC/zgesvdq.f                   |  1389 ++
 lapack-netlib/SRC/zgesvdx.f                   |     2 +-
 lapack-netlib/SRC/zgesvj.f                    |    46 +-
 lapack-netlib/SRC/zgesvxx.f                   |    10 +-
 lapack-netlib/SRC/zgetsls.f                   |     2 +
 lapack-netlib/SRC/zggesx.f                    |     8 +-
 lapack-netlib/SRC/zgsvj0.f                    |    18 +-
 lapack-netlib/SRC/zgsvj1.f                    |    20 +-
 lapack-netlib/SRC/zhb2st_kernels.f            |    70 +-
 lapack-netlib/SRC/zhecon_3.f                  |     9 +-
 lapack-netlib/SRC/zheevr.f                    |     2 +-
 lapack-netlib/SRC/zheevr_2stage.f             |     2 +-
 lapack-netlib/SRC/zhegs2.f                    |     1 +
 lapack-netlib/SRC/zhegst.f                    |     1 +
 lapack-netlib/SRC/zherfsx.f                   |    12 +-
 lapack-netlib/SRC/zhesv_aa.f                  |     6 +-
 lapack-netlib/SRC/zhesv_aa_2stage.f           |     8 +-
 lapack-netlib/SRC/zhesvxx.f                   |    16 +-
 lapack-netlib/SRC/zhetf2_rk.f                 |     4 +-
 lapack-netlib/SRC/zhetrd_2stage.f             |    13 +-
 lapack-netlib/SRC/zhetrd_hb2st.F              |     4 +-
 lapack-netlib/SRC/zhetrd_he2hb.f              |     2 +-
 lapack-netlib/SRC/zhetrf_aa.f                 |     8 +-
 lapack-netlib/SRC/zhetrf_aa_2stage.f          |    42 +-
 lapack-netlib/SRC/zhetri2.f                   |     4 +-
 lapack-netlib/SRC/zhetrs_aa.f                 |   124 +-
 lapack-netlib/SRC/zhetrs_aa_2stage.f          |    30 +-
 lapack-netlib/SRC/zhseqr.f                    |    44 +-
 lapack-netlib/SRC/zla_gbrcond_c.f             |     4 +-
 lapack-netlib/SRC/zla_gbrcond_x.f             |     4 +-
 lapack-netlib/SRC/zla_gbrfsx_extended.f       |    12 +-
 lapack-netlib/SRC/zla_gercond_c.f             |     8 +-
 lapack-netlib/SRC/zla_gercond_x.f             |     4 +-
 lapack-netlib/SRC/zla_gerfsx_extended.f       |    12 +-
 lapack-netlib/SRC/zla_hercond_c.f             |     4 +-
 lapack-netlib/SRC/zla_hercond_x.f             |     4 +-
 lapack-netlib/SRC/zla_herfsx_extended.f       |     8 +-
 lapack-netlib/SRC/zla_herpvgrw.f              |     2 +-
 lapack-netlib/SRC/zla_porcond_c.f             |     4 +-
 lapack-netlib/SRC/zla_porcond_x.f             |     4 +-
 lapack-netlib/SRC/zla_porfsx_extended.f       |     8 +-
 lapack-netlib/SRC/zla_porpvgrw.f              |     2 +-
 lapack-netlib/SRC/zla_syrcond_c.f             |     4 +-
 lapack-netlib/SRC/zla_syrcond_x.f             |     4 +-
 lapack-netlib/SRC/zla_syrfsx_extended.f       |     8 +-
 lapack-netlib/SRC/zla_syrpvgrw.f              |     2 +-
 lapack-netlib/SRC/zla_wwaddw.f                |     2 +-
 lapack-netlib/SRC/zlahef_aa.f                 |    42 +-
 lapack-netlib/SRC/zlahef_rk.f                 |     4 +-
 lapack-netlib/SRC/zlahqr.f                    |    14 +-
 lapack-netlib/SRC/zlamswlq.f                  |     1 +
 lapack-netlib/SRC/zlamtsqr.f                  |     1 +
 lapack-netlib/SRC/zlangb.f                    |    23 +-
 lapack-netlib/SRC/zlange.f                    |    22 +-
 lapack-netlib/SRC/zlanhb.f                    |    48 +-
 lapack-netlib/SRC/zlanhe.f                    |    45 +-
 lapack-netlib/SRC/zlanhp.f                    |    46 +-
 lapack-netlib/SRC/zlanhs.f                    |    23 +-
 lapack-netlib/SRC/zlansb.f                    |    42 +-
 lapack-netlib/SRC/zlansp.f                    |    54 +-
 lapack-netlib/SRC/zlansy.f                    |    40 +-
 lapack-netlib/SRC/zlantb.f                    |    55 +-
 lapack-netlib/SRC/zlantp.f                    |    53 +-
 lapack-netlib/SRC/zlantr.f                    |    56 +-
 lapack-netlib/SRC/zlaqps.f                    |     2 +-
 lapack-netlib/SRC/zlaqr0.f                    |    34 +-
 lapack-netlib/SRC/zlaqr1.f                    |     2 +-
 lapack-netlib/SRC/zlaqr2.f                    |    18 +-
 lapack-netlib/SRC/zlaqr3.f                    |    18 +-
 lapack-netlib/SRC/zlaqr4.f                    |    32 +-
 lapack-netlib/SRC/zlaqr5.f                    |    52 +-
 lapack-netlib/SRC/zlarfb.f                    |     2 +
 lapack-netlib/SRC/zlarfx.f                    |     2 +-
 lapack-netlib/SRC/zlarfy.f                    |     2 +-
 lapack-netlib/SRC/zlarrv.f                    |     2 +-
 lapack-netlib/SRC/zlassq.f                    |     4 +-
 lapack-netlib/SRC/zlaswlq.f                   |    20 +-
 lapack-netlib/SRC/zlasyf_aa.f                 |    42 +-
 lapack-netlib/SRC/zlasyf_rk.f                 |     4 +-
 lapack-netlib/SRC/zlatdf.f                    |     2 +-
 lapack-netlib/SRC/zlatsqr.f                   |    25 +-
 lapack-netlib/SRC/zlaunhr_col_getrfnp.f       |   248 +
 lapack-netlib/SRC/zlaunhr_col_getrfnp2.f      |   314 +
 lapack-netlib/SRC/zporfsx.f                   |    16 +-
 lapack-netlib/SRC/zposvxx.f                   |    14 +-
 lapack-netlib/SRC/zpotrf2.f                   |     4 +-
 lapack-netlib/SRC/zstemr.f                    |     4 +-
 lapack-netlib/SRC/zsycon_3.f                  |     9 +-
 lapack-netlib/SRC/zsyconvf.f                  |     8 +-
 lapack-netlib/SRC/zsyconvf_rook.f             |     8 +-
 lapack-netlib/SRC/zsyrfsx.f                   |    10 +-
 lapack-netlib/SRC/zsysv_aa.f                  |     6 +-
 lapack-netlib/SRC/zsysv_aa_2stage.f           |     6 +-
 lapack-netlib/SRC/zsysvxx.f                   |    10 +-
 lapack-netlib/SRC/zsytf2_rk.f                 |     4 +-
 lapack-netlib/SRC/zsytrf.f                    |     2 +-
 lapack-netlib/SRC/zsytrf_aa.f                 |     8 +-
 lapack-netlib/SRC/zsytrf_aa_2stage.f          |    26 +-
 lapack-netlib/SRC/zsytri2.f                   |     4 +-
 lapack-netlib/SRC/zsytrs2.f                   |     2 +-
 lapack-netlib/SRC/zsytrs_aa.f                 |   112 +-
 lapack-netlib/SRC/zsytrs_aa_2stage.f          |    18 +-
 lapack-netlib/SRC/ztgsy2.f                    |     4 +-
 lapack-netlib/SRC/ztpmlqt.f                   |     2 +-
 lapack-netlib/SRC/ztpmqrt.f                   |     2 +-
 lapack-netlib/SRC/ztprfb.f                    |     4 +-
 lapack-netlib/SRC/zungtsqr.f                  |   307 +
 lapack-netlib/SRC/zunhr_col.f                 |   441 +
 lapack-netlib/TESTING/CMakeLists.txt          |     2 +-
 lapack-netlib/TESTING/EIG/Makefile            |    35 +-
 lapack-netlib/TESTING/EIG/cbdt05.f            |     1 +
 lapack-netlib/TESTING/EIG/cchkst.f            |     2 +-
 lapack-netlib/TESTING/EIG/cchkst2stg.f        |     2 +-
 lapack-netlib/TESTING/EIG/cdrgsx.f            |     2 +-
 lapack-netlib/TESTING/EIG/cdrvbd.f            |   199 +-
 lapack-netlib/TESTING/EIG/cerred.f            |    59 +-
 lapack-netlib/TESTING/EIG/cget51.f            |    17 +-
 lapack-netlib/TESTING/EIG/chbt21.f            |    12 +-
 lapack-netlib/TESTING/EIG/chet21.f            |    32 +-
 lapack-netlib/TESTING/EIG/chet22.f            |    12 +-
 lapack-netlib/TESTING/EIG/chpt21.f            |    35 +-
 lapack-netlib/TESTING/EIG/cstt21.f            |    13 +-
 lapack-netlib/TESTING/EIG/dbdt05.f            |     1 +
 lapack-netlib/TESTING/EIG/dchkst.f            |     2 +-
 lapack-netlib/TESTING/EIG/dchkst2stg.f        |     2 +-
 lapack-netlib/TESTING/EIG/ddrgsx.f            |     2 +-
 lapack-netlib/TESTING/EIG/ddrvbd.f            |   113 +-
 lapack-netlib/TESTING/EIG/derred.f            |    59 +-
 lapack-netlib/TESTING/EIG/dget39.f            |     2 +-
 lapack-netlib/TESTING/EIG/dsbt21.f            |    11 +-
 lapack-netlib/TESTING/EIG/dspt21.f            |    32 +-
 lapack-netlib/TESTING/EIG/dsyt21.f            |    30 +-
 lapack-netlib/TESTING/EIG/dsyt22.f            |    12 +-
 lapack-netlib/TESTING/EIG/sbdt05.f            |     1 +
 lapack-netlib/TESTING/EIG/schkst.f            |     2 +-
 lapack-netlib/TESTING/EIG/schkst2stg.f        |     2 +-
 lapack-netlib/TESTING/EIG/sdrgsx.f            |     2 +-
 lapack-netlib/TESTING/EIG/sdrvbd.f            |   111 +-
 lapack-netlib/TESTING/EIG/serred.f            |    59 +-
 lapack-netlib/TESTING/EIG/sget39.f            |     2 +-
 lapack-netlib/TESTING/EIG/ssbt21.f            |    11 +-
 lapack-netlib/TESTING/EIG/sspt21.f            |    32 +-
 lapack-netlib/TESTING/EIG/ssyt21.f            |    30 +-
 lapack-netlib/TESTING/EIG/ssyt22.f            |    12 +-
 lapack-netlib/TESTING/EIG/zbdt05.f            |     1 +
 lapack-netlib/TESTING/EIG/zchkst.f            |     2 +-
 lapack-netlib/TESTING/EIG/zchkst2stg.f        |     2 +-
 lapack-netlib/TESTING/EIG/zdrgev3.f           |     2 +-
 lapack-netlib/TESTING/EIG/zdrgsx.f            |     2 +-
 lapack-netlib/TESTING/EIG/zdrvbd.f            |   196 +-
 lapack-netlib/TESTING/EIG/zerred.f            |    59 +-
 lapack-netlib/TESTING/EIG/zget51.f            |    17 +-
 lapack-netlib/TESTING/EIG/zhbt21.f            |    12 +-
 lapack-netlib/TESTING/EIG/zhet21.f            |    32 +-
 lapack-netlib/TESTING/EIG/zhet22.f            |    12 +-
 lapack-netlib/TESTING/EIG/zhpt21.f            |    36 +-
 lapack-netlib/TESTING/EIG/zstt21.f            |    11 +-
 lapack-netlib/TESTING/LIN/CMakeLists.txt      |    12 +-
 lapack-netlib/TESTING/LIN/Makefile            |    75 +-
 lapack-netlib/TESTING/LIN/cchkaa.f            |    40 +-
 lapack-netlib/TESTING/LIN/cchkunhr_col.f      |   239 +
 lapack-netlib/TESTING/LIN/cdrvls.f            |    25 +-
 lapack-netlib/TESTING/LIN/cdrvsy_rk.f         |     3 +-
 lapack-netlib/TESTING/LIN/cerrunhr_col.f      |   164 +
 lapack-netlib/TESTING/LIN/cerrvx.f            |     4 +-
 lapack-netlib/TESTING/LIN/clahilb.f           |     2 +-
 lapack-netlib/TESTING/LIN/ctsqr01.f           |    26 +-
 lapack-netlib/TESTING/LIN/cunhr_col01.f       |   390 +
 lapack-netlib/TESTING/LIN/dchkaa.f            |    39 +-
 lapack-netlib/TESTING/LIN/dchkorhr_col.f      |   239 +
 lapack-netlib/TESTING/LIN/ddrvls.f            |    16 +-
 lapack-netlib/TESTING/LIN/derrorhr_col.f      |   164 +
 lapack-netlib/TESTING/LIN/derrvx.f            |     2 +-
 lapack-netlib/TESTING/LIN/dorhr_col01.f       |   386 +
 lapack-netlib/TESTING/LIN/dtsqr01.f           |    26 +-
 lapack-netlib/TESTING/LIN/schkaa.f            |    35 +-
 lapack-netlib/TESTING/LIN/schkorhr_col.f      |   239 +
 lapack-netlib/TESTING/LIN/sdrvls.f            |    20 +-
 lapack-netlib/TESTING/LIN/serrorhr_col.f      |   164 +
 lapack-netlib/TESTING/LIN/serrvx.f            |     2 +-
 lapack-netlib/TESTING/LIN/sorhr_col01.f       |   386 +
 lapack-netlib/TESTING/LIN/stsqr01.f           |    26 +-
 lapack-netlib/TESTING/LIN/zchkaa.f            |    43 +-
 lapack-netlib/TESTING/LIN/zchkunhr_col.f      |   239 +
 lapack-netlib/TESTING/LIN/zdrvhe_rk.f         |     1 +
 lapack-netlib/TESTING/LIN/zdrvls.f            |    25 +-
 lapack-netlib/TESTING/LIN/zerrunhr_col.f      |   164 +
 lapack-netlib/TESTING/LIN/zerrvx.f            |    36 +-
 lapack-netlib/TESTING/LIN/zlahilb.f           |     2 +-
 lapack-netlib/TESTING/LIN/ztsqr01.f           |    26 +-
 lapack-netlib/TESTING/LIN/zunhr_col01.f       |   390 +
 lapack-netlib/TESTING/MATGEN/Makefile         |    43 +-
 lapack-netlib/TESTING/MATGEN/clahilb.f        |     2 +-
 lapack-netlib/TESTING/MATGEN/clatm2.f         |     2 +-
 lapack-netlib/TESTING/MATGEN/clatm3.f         |     2 +-
 lapack-netlib/TESTING/MATGEN/clatmr.f         |    30 +-
 lapack-netlib/TESTING/MATGEN/dlatm2.f         |     2 +-
 lapack-netlib/TESTING/MATGEN/dlatm3.f         |     2 +-
 lapack-netlib/TESTING/MATGEN/dlatmr.f         |    30 +-
 lapack-netlib/TESTING/MATGEN/slatm2.f         |     2 +-
 lapack-netlib/TESTING/MATGEN/slatm3.f         |     2 +-
 lapack-netlib/TESTING/MATGEN/slatmr.f         |    30 +-
 lapack-netlib/TESTING/MATGEN/zlahilb.f        |     2 +-
 lapack-netlib/TESTING/MATGEN/zlatm2.f         |     2 +-
 lapack-netlib/TESTING/MATGEN/zlatm3.f         |     2 +-
 lapack-netlib/TESTING/MATGEN/zlatmr.f         |    30 +-
 lapack-netlib/TESTING/Makefile                |   188 +-
 lapack-netlib/TESTING/ctest.in                |     1 +
 lapack-netlib/TESTING/dtest.in                |     1 +
 lapack-netlib/TESTING/stest.in                |     1 +
 lapack-netlib/TESTING/ztest.in                |     1 +
 lapack-netlib/appveyor.yml                    |    64 -
 lapack-netlib/lapack_build.cmake              |     8 +-
 lapack-netlib/lapack_testing.py               |    16 +-
 lapack-netlib/make.inc.example                |    34 +-
 lapack-netlib/meson.build                     |    28 +
 lapack-netlib/meson_options.txt               |     3 +
 812 files changed, 36335 insertions(+), 11964 deletions(-)
 create mode 100644 lapack-netlib/.appveyor.yml
 create mode 100644 lapack-netlib/BLAS/SRC/meson.build
 create mode 100644 lapack-netlib/LAPACKE/include/lapack.h
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_cgesvdq_work.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_dgesvdq_work.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_sgesvdq_work.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c
 create mode 100644 lapack-netlib/LAPACKE/src/lapacke_zgesvdq_work.c
 create mode 100644 lapack-netlib/SRC/cgesvdq.f
 create mode 100644 lapack-netlib/SRC/claunhr_col_getrfnp.f
 create mode 100644 lapack-netlib/SRC/claunhr_col_getrfnp2.f
 create mode 100644 lapack-netlib/SRC/cungtsqr.f
 create mode 100644 lapack-netlib/SRC/cunhr_col.f
 create mode 100644 lapack-netlib/SRC/dcombssq.f
 create mode 100644 lapack-netlib/SRC/dgesvdq.f
 create mode 100644 lapack-netlib/SRC/dlaorhr_col_getrfnp.f
 create mode 100644 lapack-netlib/SRC/dlaorhr_col_getrfnp2.f
 create mode 100644 lapack-netlib/SRC/dorgtsqr.f
 create mode 100644 lapack-netlib/SRC/dorhr_col.f
 create mode 100644 lapack-netlib/SRC/meson.build
 create mode 100644 lapack-netlib/SRC/scombssq.f
 create mode 100644 lapack-netlib/SRC/sgesvdq.f
 create mode 100644 lapack-netlib/SRC/slaorhr_col_getrfnp.f
 create mode 100644 lapack-netlib/SRC/slaorhr_col_getrfnp2.f
 create mode 100644 lapack-netlib/SRC/sorgtsqr.f
 create mode 100644 lapack-netlib/SRC/sorhr_col.f
 create mode 100644 lapack-netlib/SRC/zgesvdq.f
 create mode 100644 lapack-netlib/SRC/zlaunhr_col_getrfnp.f
 create mode 100644 lapack-netlib/SRC/zlaunhr_col_getrfnp2.f
 create mode 100644 lapack-netlib/SRC/zungtsqr.f
 create mode 100644 lapack-netlib/SRC/zunhr_col.f
 create mode 100644 lapack-netlib/TESTING/LIN/cchkunhr_col.f
 create mode 100644 lapack-netlib/TESTING/LIN/cerrunhr_col.f
 create mode 100644 lapack-netlib/TESTING/LIN/cunhr_col01.f
 create mode 100644 lapack-netlib/TESTING/LIN/dchkorhr_col.f
 create mode 100644 lapack-netlib/TESTING/LIN/derrorhr_col.f
 create mode 100644 lapack-netlib/TESTING/LIN/dorhr_col01.f
 create mode 100644 lapack-netlib/TESTING/LIN/schkorhr_col.f
 create mode 100644 lapack-netlib/TESTING/LIN/serrorhr_col.f
 create mode 100644 lapack-netlib/TESTING/LIN/sorhr_col01.f
 create mode 100644 lapack-netlib/TESTING/LIN/zchkunhr_col.f
 create mode 100644 lapack-netlib/TESTING/LIN/zerrunhr_col.f
 create mode 100644 lapack-netlib/TESTING/LIN/zunhr_col01.f
 delete mode 100644 lapack-netlib/appveyor.yml
 create mode 100644 lapack-netlib/meson.build
 create mode 100644 lapack-netlib/meson_options.txt

diff --git a/Makefile b/Makefile
index 60f189ef2..a22e16bab 100644
--- a/Makefile
+++ b/Makefile
@@ -247,21 +247,21 @@ prof_lapack : lapack_prebuild
 
 lapack_prebuild :
 ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN)))
-	-@echo "FORTRAN     = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "OPTS        = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "FC          = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "FFLAGS      = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "POPTS       = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "NOOPT       = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "FFLAGS_NOOPT       = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "PNOOPT      = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "LOADOPTS    = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "LDFLAGS     = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "CC          = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "override CFLAGS      = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "override ARCH        = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "ARCHFLAGS   = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "AR          = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "ARFLAGS     = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "RANLIB      = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "LAPACKLIB   = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "TMGLIB      = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "LAPACKLIB   = ../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "TMGLIB      = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "BLASLIB     = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
-	-@echo "LAPACKELIB  = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
+	-@echo "LAPACKELIB  = ../../../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "LAPACKLIB_P = ../$(LIBNAME_P)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "SUFFIX      = $(SUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
 	-@echo "PSUFFIX     = $(PSUFFIX)" >> $(NETLIB_LAPACK_DIR)/make.inc
@@ -319,7 +319,7 @@ lapack-test :
 ifneq ($(CROSS), 1)
 	( cd $(NETLIB_LAPACK_DIR)/INSTALL; make all; ./testlsame; ./testslamch; ./testdlamch; \
         ./testsecond; ./testdsecnd; ./testieee; ./testversion )
-	(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r )
+	(cd $(NETLIB_LAPACK_DIR); ./lapack_testing.py -r -b TESTING)
 endif
 
 lapack-runtest:
diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake
index d1d2cdd3b..18a74d18e 100644
--- a/cmake/lapack.cmake
+++ b/cmake/lapack.cmake
@@ -115,7 +115,9 @@ set(SLASRC
    stplqt.f stplqt2.f stpmlqt.f
    ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f
    ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
-   ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f)
+   ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
+   scombssq.f sgesvdq.f slaorhr_col_getrfnp.f
+   slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f )
 
 set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f
    sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f
@@ -210,7 +212,9 @@ set(CLASRC
    ctplqt.f ctplqt2.f ctpmlqt.f
    chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f
    cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
-   chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f)
+   chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
+   cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f 
+   cungtsqr.f cunhr_col.f )
 
 set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
    cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
@@ -299,7 +303,9 @@ set(DLASRC
    dtplqt.f dtplqt2.f dtpmlqt.f
    dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f
    dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
-   dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f)
+   dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
+   dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f
+   dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f )
 
 set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
    dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
@@ -398,7 +404,9 @@ set(ZLASRC
    zgelq.f zlaswlq.f zlamswlq.f zgemlq.f
    zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f
    zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
-   zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f)
+   zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
+   zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
+   zungtsqr.f zunhr_col.f)
 
 set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
    zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake
index 0fc88b882..f10905c4d 100644
--- a/cmake/lapacke.cmake
+++ b/cmake/lapacke.cmake
@@ -715,6 +715,8 @@ set(DSRC
   lapacke_dgesv_work.c
   lapacke_dgesvd.c
   lapacke_dgesvd_work.c
+  lapacke_dgesvdq.c
+  lapacke_dgesvdq_work.c
   lapacke_dgesvdx.c
   lapacke_dgesvdx_work.c
   lapacke_dgesvj.c
@@ -1287,6 +1289,8 @@ set(SSRC
   lapacke_sgesv_work.c
   lapacke_sgesvd.c
   lapacke_sgesvd_work.c
+  lapacke_sgesvdq.c
+  lapacke_sgesvdq_work.c
   lapacke_sgesvdx.c
   lapacke_sgesvdx_work.c
   lapacke_sgesvj.c
@@ -1853,6 +1857,8 @@ set(ZSRC
   lapacke_zgesv_work.c
   lapacke_zgesvd.c
   lapacke_zgesvd_work.c
+  lapacke_zgesvdq.c
+  lapacke_zgesvdq_work.c
   lapacke_zgesvdx.c
   lapacke_zgesvdx_work.c
   lapacke_zgesvj.c
diff --git a/exports/gensymbol b/exports/gensymbol
index 37ba0b191..d2894e6c8 100644
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -694,7 +694,19 @@
     
     # functions added for lapack-3.8.0
 
-    ilaenv2stage
+    ilaenv2stage,
+
+    # functions added for lapack-3.9.0
+    cgesvdq,
+    cungtsqr,
+    dcombssq,
+    dgesvdq,
+    dorgtsqr,
+    scombssq,
+    sgesvdq,
+    sorgtsqr,
+    zgesvdq,
+    zungtsqr
 );
 
 @lapack_extendedprecision_objs = (
@@ -3347,6 +3359,15 @@
     LAPACKE_zsytrf_aa_2stage_work,
     LAPACKE_zsytrs_aa_2stage,
     LAPACKE_zsytrs_aa_2stage_work,
+    
+    # new functions from 3.9.0
+    LAPACKE_dgesvdq,
+    LAPACKE_dgesvdq_work,
+    LAPACKE_sgesvdq,
+    LAPACKE_sgesvdq_work,
+    LAPACKE_zgesvdq,
+    LAPACKE_zgesvdq_work
+
 );
 
 #These function may need 2 underscores.
@@ -3419,7 +3440,13 @@
     dsytrf_aa_2stage, dsytrs_aa_2stage,
     zhesv_aa_2stage, zhetrf_aa_2stage,
     zhetrs_aa_2stage, zsysv_aa_2stage,
-    zsytrf_aa_2stage, zsytrs_aa_2stage
+    zsytrf_aa_2stage, zsytrs_aa_2stage,
+# 3.9.0
+    claunhr_col_getrfnp, claunhr_col_getrfnp2, cunhr_col,
+    dlaorhr_col_getrfnp, dlaorhr_col_getrfnp2, dorhr_col,
+    slaorhr_col_getrfnp, slaorhr_col_getrfnp2, sorhr_col,
+    zlaunhr_col_getrfnp, zlaunhr_col_getrfnp2, zunhr_col
+
 );
 
 
diff --git a/lapack-netlib/.appveyor.yml b/lapack-netlib/.appveyor.yml
new file mode 100644
index 000000000..0c16dcf7b
--- /dev/null
+++ b/lapack-netlib/.appveyor.yml
@@ -0,0 +1,38 @@
+image:
+- Visual Studio 2017
+
+configuration: Release
+clone_depth: 3
+
+matrix:
+  fast_finish: false
+
+skip_commits:
+# Add [av skip] to commit messages
+  message: /\[av skip\]/
+
+cache:
+  - '%APPVEYOR_BUILD_FOLDER%\build'
+
+environment:
+  global:
+    CONDA_INSTALL_LOCN: C:\\Miniconda36-x64
+
+install:
+  - call %CONDA_INSTALL_LOCN%\Scripts\activate.bat
+  - conda config --add channels conda-forge --force
+  - conda install --yes --quiet flang jom
+  - call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64
+  - set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%"
+  - set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%"
+
+before_build:
+  - ps: if (-Not (Test-Path .\build)) { mkdir build }
+  - cd build
+  - cmake -G "NMake Makefiles JOM" -DCMAKE_Fortran_COMPILER=flang -DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=ON ..
+
+build_script:
+  - cmake --build .
+
+test_script:
+  - ctest -j2
diff --git a/lapack-netlib/.gitignore b/lapack-netlib/.gitignore
index 4ac90962e..015f09d77 100644
--- a/lapack-netlib/.gitignore
+++ b/lapack-netlib/.gitignore
@@ -35,3 +35,9 @@ LAPACKE/example/xexample*
 # SED
 SRC/*-e
 LAPACKE/src/*-e
+build*
+
+# DOCS documentation
+DOCS/man
+DOCS/explore-html
+output_err
diff --git a/lapack-netlib/.travis.yml b/lapack-netlib/.travis.yml
index 68cfa607a..04369dafb 100644
--- a/lapack-netlib/.travis.yml
+++ b/lapack-netlib/.travis.yml
@@ -1,33 +1,32 @@
-language: cpp
+language: c
+dist: xenial
+group: travis_latest
+
+git:
+  depth: 3
+  quiet: true
 
 addons:
   apt:
-    sources:
-      - george-edison55-precise-backports # cmake
     packages:
-      - cmake
-      - cmake-data
-      - gfortran
-
-os:
-  - linux
-  - osx
-
-env:
-  - CMAKE_BUILD_TYPE=Release
-  - CMAKE_BUILD_TYPE=Coverage
+    - gfortran
 
-install:
-  - if [[ "$TRAVIS_OS_NAME" == "osx" ]];
-    then
-      for pkg in gcc cmake; do
-        if brew list -1 | grep -q "^${pkg}\$"; then
-          brew outdated $pkg || brew upgrade $pkg;
-        else
-          brew install $pkg;
-        fi
-      done
-    fi
+matrix:
+  include:
+  - os: linux
+    env: CMAKE_BUILD_TYPE=Release
+  - os: linux
+    env: CMAKE_BUILD_TYPE=Coverage
+  - os: osx
+    env: CMAKE_BUILD_TYPE=Release
+    before_install:
+    - brew update > /dev/null
+    - brew install gcc > /dev/null
+  - os: osx
+    env: CMAKE_BUILD_TYPE=Coverage
+    before_install:
+    - brew update > /dev/null
+    - brew install gcc > /dev/null
 
 script:
   - export PR=https://api.github.com/repos/$TRAVIS_REPO_SLUG/pulls/$TRAVIS_PULL_REQUEST
diff --git a/lapack-netlib/BLAS/CMakeLists.txt b/lapack-netlib/BLAS/CMakeLists.txt
index e122b2b33..ee5676fc6 100644
--- a/lapack-netlib/BLAS/CMakeLists.txt
+++ b/lapack-netlib/BLAS/CMakeLists.txt
@@ -6,4 +6,5 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/blas.pc.in ${CMAKE_CURRENT_BINARY_DIR
 install(FILES
   ${CMAKE_CURRENT_BINARY_DIR}/blas.pc
   DESTINATION ${PKG_CONFIG_DIR}
+  COMPONENT Development
   )
diff --git a/lapack-netlib/BLAS/Makefile b/lapack-netlib/BLAS/Makefile
index f9c4b534c..088ea5d50 100644
--- a/lapack-netlib/BLAS/Makefile
+++ b/lapack-netlib/BLAS/Makefile
@@ -1,13 +1,18 @@
-include ../make.inc
+TOPSRCDIR = ..
+include $(TOPSRCDIR)/make.inc
 
+.PHONY: all
 all: blas
 
+.PHONY: blas
 blas:
 	$(MAKE) -C SRC
 
+.PHONY: blas_testing
 blas_testing: blas
 	$(MAKE) -C TESTING run
 
+.PHONY: clean cleanobj cleanlib cleanexe cleantest
 clean:
 	$(MAKE) -C SRC clean
 	$(MAKE) -C TESTING clean
diff --git a/lapack-netlib/BLAS/SRC/Makefile b/lapack-netlib/BLAS/SRC/Makefile
index a436365aa..66bb96421 100644
--- a/lapack-netlib/BLAS/SRC/Makefile
+++ b/lapack-netlib/BLAS/SRC/Makefile
@@ -1,5 +1,3 @@
-include ../../make.inc
-
 #######################################################################
 #  This is the makefile to create a library for the BLAS.
 #  The files are grouped as follows:
@@ -55,6 +53,10 @@ include ../../make.inc
 #
 #######################################################################
 
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc
+
+.PHONY: all
 all: $(BLASLIB)
 
 #---------------------------------------------------------
@@ -138,33 +140,32 @@ ALLOBJ = $(SBLAS1) $(SBLAS2) $(SBLAS3) $(DBLAS1) $(DBLAS2) $(DBLAS3) \
 	$(ZBLAS2) $(ZBLAS3) $(ALLBLAS)
 
 $(BLASLIB): $(ALLOBJ)
-	$(ARCH) $(ARCHFLAGS) $@ $^
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@
 
+.PHONY: single double complex complex16
 single: $(SBLAS1) $(ALLBLAS) $(SBLAS2) $(SBLAS3)
-	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $^
+	$(AR) $(ARFLAGS) $(BLASLIB) $^
 	$(RANLIB) $(BLASLIB)
 
 double: $(DBLAS1) $(ALLBLAS) $(DBLAS2) $(DBLAS3)
-	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $^
+	$(AR) $(ARFLAGS) $(BLASLIB) $^
 	$(RANLIB) $(BLASLIB)
 
 complex: $(CBLAS1) $(CB1AUX) $(ALLBLAS) $(CBLAS2) $(CBLAS3)
-	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $^
+	$(AR) $(ARFLAGS) $(BLASLIB) $^
 	$(RANLIB) $(BLASLIB)
 
 complex16: $(ZBLAS1) $(ZB1AUX) $(ALLBLAS) $(ZBLAS2) $(ZBLAS3)
-	$(ARCH) $(ARCHFLAGS) $(BLASLIB) $^
+	$(AR) $(ARFLAGS) $(BLASLIB) $^
 	$(RANLIB) $(BLASLIB)
 
 FRC:
 	@FRC=$(FRC)
 
+.PHONY: clean cleanobj cleanlib
 clean: cleanobj cleanlib
 cleanobj:
 	rm -f *.o
 cleanlib:
 	#rm -f $(BLASLIB)  # May point to a system lib, e.g. -lblas
-
-.f.o:
-	$(FORTRAN) $(OPTS) -c -o $@ $<
diff --git a/lapack-netlib/BLAS/SRC/icamax.f b/lapack-netlib/BLAS/SRC/icamax.f
index 8057ab095..02bc90ae4 100644
--- a/lapack-netlib/BLAS/SRC/icamax.f
+++ b/lapack-netlib/BLAS/SRC/icamax.f
@@ -43,7 +43,7 @@
 *> \param[in] INCX
 *> \verbatim
 *>          INCX is INTEGER
-*>         storage spacing between elements of SX
+*>         storage spacing between elements of CX
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/BLAS/SRC/idamax.f b/lapack-netlib/BLAS/SRC/idamax.f
index 7268534db..1578ea950 100644
--- a/lapack-netlib/BLAS/SRC/idamax.f
+++ b/lapack-netlib/BLAS/SRC/idamax.f
@@ -43,7 +43,7 @@
 *> \param[in] INCX
 *> \verbatim
 *>          INCX is INTEGER
-*>         storage spacing between elements of SX
+*>         storage spacing between elements of DX
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/BLAS/SRC/izamax.f b/lapack-netlib/BLAS/SRC/izamax.f
index 63d8e97de..c0aabb7bc 100644
--- a/lapack-netlib/BLAS/SRC/izamax.f
+++ b/lapack-netlib/BLAS/SRC/izamax.f
@@ -43,7 +43,7 @@
 *> \param[in] INCX
 *> \verbatim
 *>          INCX is INTEGER
-*>         storage spacing between elements of SX
+*>         storage spacing between elements of ZX
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/BLAS/SRC/meson.build b/lapack-netlib/BLAS/SRC/meson.build
new file mode 100644
index 000000000..8d96f2acd
--- /dev/null
+++ b/lapack-netlib/BLAS/SRC/meson.build
@@ -0,0 +1,29 @@
+SBLAS1 = files('isamax.f', 'sasum.f', 'saxpy.f', 'scopy.f', 'sdot.f', 'snrm2.f', 'srot.f', 'srotg.f', 'sscal.f', 'sswap.f', 'sdsdot.f', 'srotmg.f', 'srotm.f')
+
+CBLAS1 = files('scabs1.f', 'scasum.f', 'scnrm2.f', 'icamax.f', 'caxpy.f', 'ccopy.f', 'cdotc.f', 'cdotu.f', 'csscal.f', 'crotg.f', 'cscal.f', 'cswap.f', 'csrot.f')
+
+DBLAS1 = files('idamax.f', 'dasum.f', 'daxpy.f', 'dcopy.f', 'ddot.f', 'dnrm2.f', 'drot.f', 'drotg.f', 'dscal.f', 'dsdot.f', 'dswap.f', 'drotmg.f', 'drotm.f')
+
+ZBLAS1 = files('dcabs1.f', 'dzasum.f', 'dznrm2.f', 'izamax.f', 'zaxpy.f', 'zcopy.f', 'zdotc.f', 'zdotu.f', 'zdscal.f', 'zrotg.f', 'zscal.f', 'zswap.f', 'zdrot.f')
+
+CB1AUX = files('isamax.f', 'sasum.f', 'saxpy.f', 'scopy.f', 'snrm2.f', 'sscal.f')
+
+ZB1AUX = files('idamax.f', 'dasum.f', 'daxpy.f', 'dcopy.f', 'dnrm2.f', 'dscal.f')
+
+ALLBLAS = files('lsame.f', 'xerbla.f', 'xerbla_array.f')
+
+SBLAS2 = files('sgemv.f', 'sgbmv.f', 'ssymv.f', 'ssbmv.f', 'sspmv.f', 'strmv.f', 'stbmv.f', 'stpmv.f', 'strsv.f', 'stbsv.f', 'stpsv.f', 'sger.f', 'ssyr.f', 'sspr.f', 'ssyr2.f', 'sspr2.f')
+
+CBLAS2 = files('cgemv.f', 'cgbmv.f', 'chemv.f', 'chbmv.f', 'chpmv.f', 'ctrmv.f', 'ctbmv.f', 'ctpmv.f', 'ctrsv.f', 'ctbsv.f', 'ctpsv.f', 'cgerc.f', 'cgeru.f', 'cher.f', 'chpr.f', 'cher2.f', 'chpr2.f')
+
+DBLAS2 = files('dgemv.f', 'dgbmv.f', 'dsymv.f', 'dsbmv.f', 'dspmv.f', 'dtrmv.f', 'dtbmv.f', 'dtpmv.f', 'dtrsv.f', 'dtbsv.f', 'dtpsv.f', 'dger.f', 'dsyr.f', 'dspr.f', 'dsyr2.f', 'dspr2.f')
+
+ZBLAS2 = files('zgemv.f', 'zgbmv.f', 'zhemv.f', 'zhbmv.f', 'zhpmv.f', 'ztrmv.f', 'ztbmv.f', 'ztpmv.f', 'ztrsv.f', 'ztbsv.f', 'ztpsv.f', 'zgerc.f', 'zgeru.f', 'zher.f', 'zhpr.f', 'zher2.f', 'zhpr2.f')
+
+SBLAS3 = files('sgemm.f', 'ssymm.f', 'ssyrk.f', 'ssyr2k.f', 'strmm.f', 'strsm.f')
+
+CBLAS3 = files('cgemm.f', 'csymm.f', 'csyrk.f', 'csyr2k.f', 'ctrmm.f', 'ctrsm.f', 'chemm.f', 'cherk.f', 'cher2k.f')
+
+DBLAS3 = files('dgemm.f', 'dsymm.f', 'dsyrk.f', 'dsyr2k.f', 'dtrmm.f', 'dtrsm.f')
+
+ZBLAS3 = files('zgemm.f', 'zsymm.f', 'zsyrk.f', 'zsyr2k.f', 'ztrmm.f', 'ztrsm.f', 'zhemm.f', 'zherk.f', 'zher2k.f')
diff --git a/lapack-netlib/BLAS/SRC/sdsdot.f b/lapack-netlib/BLAS/SRC/sdsdot.f
index a0ec32b6f..a491e6982 100644
--- a/lapack-netlib/BLAS/SRC/sdsdot.f
+++ b/lapack-netlib/BLAS/SRC/sdsdot.f
@@ -23,13 +23,13 @@
 *>
 *> \verbatim
 *>
-*    Compute the inner product of two vectors with extended
-*    precision accumulation.
-*
-*    Returns S.P. result with dot product accumulated in D.P.
-*    SDSDOT = SB + sum for I = 0 to N-1 of SX(LX+I*INCX)*SY(LY+I*INCY),
-*    where LX = 1 if INCX .GE. 0, else LX = 1+(1-N)*INCX, and LY is
-*    defined in a similar way using INCY.
+*>   Compute the inner product of two vectors with extended
+*>   precision accumulation.
+*>
+*>   Returns S.P. result with dot product accumulated in D.P.
+*>   SDSDOT = SB + sum for I = 0 to N-1 of SX(LX+I*INCX)*SY(LY+I*INCY),
+*>   where LX = 1 if INCX .GE. 0, else LX = 1+(1-N)*INCX, and LY is
+*>   defined in a similar way using INCY.
 *> \endverbatim
 *
 *  Arguments:
@@ -77,7 +77,14 @@
 *> \author Lawson, C. L., (JPL), Hanson, R. J., (SNLA),
 *> \author Kincaid, D. R., (U. of Texas), Krogh, F. T., (JPL)
 *
-*> \ingroup complex_blas_level1
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2017
+*
+*> \ingroup single_blas_level1
 *
 *> \par Further Details:
 *  =====================
@@ -102,65 +109,7 @@
 *>    920501  Reformatted the REFERENCES section.  (WRB)
 *>    070118  Reformat to LAPACK coding style
 *> \endverbatim
-*
-*    =====================================================================
-*
-*       .. Local Scalars ..
-*       DOUBLE PRECISION DSDOT
-*       INTEGER I,KX,KY,NS
-*       ..
-*       .. Intrinsic Functions ..
-*       INTRINSIC DBLE
-*       ..
-*       DSDOT = SB
-*       IF (N.LE.0) THEN
-*          SDSDOT = DSDOT
-*          RETURN
-*       END IF
-*       IF (INCX.EQ.INCY .AND. INCX.GT.0) THEN
-*
-*       Code for equal and positive increments.
-*
-*          NS = N*INCX
-*          DO I = 1,NS,INCX
-*             DSDOT = DSDOT + DBLE(SX(I))*DBLE(SY(I))
-*          END DO
-*       ELSE
-*
-*       Code for unequal or nonpositive increments.
-*
-*          KX = 1
-*          KY = 1
-*          IF (INCX.LT.0) KX = 1 + (1-N)*INCX
-*          IF (INCY.LT.0) KY = 1 + (1-N)*INCY
-*          DO I = 1,N
-*             DSDOT = DSDOT + DBLE(SX(KX))*DBLE(SY(KY))
-*             KX = KX + INCX
-*             KY = KY + INCY
-*          END DO
-*       END IF
-*       SDSDOT = DSDOT
-*       RETURN
-*       END
-*
-*> \par Purpose:
-*  =============
 *>
-*> \verbatim
-*> \endverbatim
-*
-*  Authors:
-*  ========
-*
-*> \author Univ. of Tennessee
-*> \author Univ. of California Berkeley
-*> \author Univ. of Colorado Denver
-*> \author NAG Ltd.
-*
-*> \date November 2017
-*
-*> \ingroup single_blas_level1
-*
 *  =====================================================================
       REAL FUNCTION SDSDOT(N,SB,SX,INCX,SY,INCY)
 *
@@ -175,71 +124,6 @@
 *     ..
 *     .. Array Arguments ..
       REAL SX(*),SY(*)
-*     ..
-*
-*  PURPOSE
-*  =======
-*
-*  Compute the inner product of two vectors with extended
-*  precision accumulation.
-*
-*  Returns S.P. result with dot product accumulated in D.P.
-*  SDSDOT = SB + sum for I = 0 to N-1 of SX(LX+I*INCX)*SY(LY+I*INCY),
-*  where LX = 1 if INCX .GE. 0, else LX = 1+(1-N)*INCX, and LY is
-*  defined in a similar way using INCY.
-*
-*  AUTHOR
-*  ======
-*  Lawson, C. L., (JPL), Hanson, R. J., (SNLA),
-*  Kincaid, D. R., (U. of Texas), Krogh, F. T., (JPL)
-*
-*  ARGUMENTS
-*  =========
-*
-*  N      (input) INTEGER
-*         number of elements in input vector(s)
-*
-*  SB     (input) REAL
-*         single precision scalar to be added to inner product
-*
-*  SX     (input) REAL array, dimension (N)
-*         single precision vector with N elements
-*
-*  INCX   (input) INTEGER
-*         storage spacing between elements of SX
-*
-*  SY     (input) REAL array, dimension (N)
-*         single precision vector with N elements
-*
-*  INCY   (input) INTEGER
-*         storage spacing between elements of SY
-*
-*  SDSDOT (output) REAL
-*         single precision dot product (SB if N .LE. 0)
-*
-*  Further Details
-*  ===============
-*
-*  REFERENCES
-*
-*  C. L. Lawson, R. J. Hanson, D. R. Kincaid and F. T.
-*  Krogh, Basic linear algebra subprograms for Fortran
-*  usage, Algorithm No. 539, Transactions on Mathematical
-*  Software 5, 3 (September 1979), pp. 308-323.
-*
-*  REVISION HISTORY  (YYMMDD)
-*
-*  791001  DATE WRITTEN
-*  890531  Changed all specific intrinsics to generic.  (WRB)
-*  890831  Modified array declarations.  (WRB)
-*  890831  REVISION DATE from Version 3.2
-*  891214  Prologue converted to Version 4.0 format.  (BAB)
-*  920310  Corrected definition of LX in DESCRIPTION.  (WRB)
-*  920501  Reformatted the REFERENCES section.  (WRB)
-*  070118  Reformat to LAPACK coding style
-*
-*  =====================================================================
-*
 *     .. Local Scalars ..
       DOUBLE PRECISION DSDOT
       INTEGER I,KX,KY,NS
diff --git a/lapack-netlib/BLAS/TESTING/Makefile b/lapack-netlib/BLAS/TESTING/Makefile
index 97150b1a3..5b3b0d6ee 100644
--- a/lapack-netlib/BLAS/TESTING/Makefile
+++ b/lapack-netlib/BLAS/TESTING/Makefile
@@ -1,5 +1,7 @@
-include ../../make.inc
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc
 
+.PHONY: all single double complex complex16
 all: single double complex complex16
 single:    xblat1s xblat2s xblat3s
 double:    xblat1d xblat2d xblat3d
@@ -7,32 +9,33 @@ complex:   xblat1c xblat2c xblat3c
 complex16: xblat1z xblat2z xblat3z
 
 xblat1s: sblat1.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat1d: dblat1.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat1c: cblat1.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat1z: zblat1.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 xblat2s: sblat2.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat2d: dblat2.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat2c: cblat2.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat2z: zblat2.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 xblat3s: sblat3.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat3d: dblat3.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat3c: cblat3.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xblat3z: zblat3.o $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
+.PHONY: run
 run: all
 	./xblat1s > sblat1.out
 	./xblat1d > dblat1.out
@@ -47,6 +50,7 @@ run: all
 	./xblat3c < cblat3.in
 	./xblat3z < zblat3.in
 
+.PHONY: clean cleanobj cleanexe cleantest
 clean: cleanobj cleanexe cleantest
 cleanobj:
 	rm -f *.o
@@ -54,6 +58,3 @@ cleanexe:
 	rm -f xblat*
 cleantest:
 	rm -f *.out core
-
-.f.o:
-	$(FORTRAN) $(OPTS) -c -o $@ $<
diff --git a/lapack-netlib/BLAS/TESTING/cblat1.f b/lapack-netlib/BLAS/TESTING/cblat1.f
index 036dca3e0..ecf2a44cb 100644
--- a/lapack-netlib/BLAS/TESTING/cblat1.f
+++ b/lapack-netlib/BLAS/TESTING/cblat1.f
@@ -619,7 +619,7 @@
       SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
diff --git a/lapack-netlib/BLAS/TESTING/dblat1.f b/lapack-netlib/BLAS/TESTING/dblat1.f
index f3255fef4..28af121cd 100644
--- a/lapack-netlib/BLAS/TESTING/dblat1.f
+++ b/lapack-netlib/BLAS/TESTING/dblat1.f
@@ -991,7 +991,7 @@
       SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
diff --git a/lapack-netlib/BLAS/TESTING/sblat1.f b/lapack-netlib/BLAS/TESTING/sblat1.f
index a5c1c6af6..fe05bbe87 100644
--- a/lapack-netlib/BLAS/TESTING/sblat1.f
+++ b/lapack-netlib/BLAS/TESTING/sblat1.f
@@ -946,7 +946,7 @@
       SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
diff --git a/lapack-netlib/BLAS/TESTING/zblat1.f b/lapack-netlib/BLAS/TESTING/zblat1.f
index 4b0bcf884..2d7b88490 100644
--- a/lapack-netlib/BLAS/TESTING/zblat1.f
+++ b/lapack-netlib/BLAS/TESTING/zblat1.f
@@ -619,7 +619,7 @@
       SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
diff --git a/lapack-netlib/CBLAS/CMakeLists.txt b/lapack-netlib/CBLAS/CMakeLists.txt
index d9fa24530..04c5ab795 100644
--- a/lapack-netlib/CBLAS/CMakeLists.txt
+++ b/lapack-netlib/CBLAS/CMakeLists.txt
@@ -12,8 +12,10 @@ FortranCInterface_HEADER(${LAPACK_BINARY_DIR}/include/cblas_mangling.h
                          SYMBOL_NAMESPACE "F77_")
 if(NOT FortranCInterface_GLOBAL_FOUND OR NOT FortranCInterface_MODULE_FOUND)
   message(WARNING "Reverting to pre-defined include/lapacke_mangling.h")
-  configure_file(include/lapacke_mangling_with_flags.h.in
-                 ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h)
+    configure_file(include/lapacke_mangling_with_flags.h.in
+                  ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h)
+    configure_file(include/cblas_mangling_with_flags.h.in
+                 ${LAPACK_BINARY_DIR}/include/cblas_mangling.h)
 endif()
 
 include_directories(include ${LAPACK_BINARY_DIR}/include)
@@ -28,7 +30,10 @@ endforeach()
 endmacro()
 
 append_subdir_files(CBLAS_INCLUDE "include")
-install(FILES ${CBLAS_INCLUDE} ${LAPACK_BINARY_DIR}/include/cblas_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+install(FILES ${CBLAS_INCLUDE} ${LAPACK_BINARY_DIR}/include/cblas_mangling.h
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+  COMPONENT Development
+  )
 
 # --------------------------------------------------
 if(BUILD_TESTING)
@@ -45,7 +50,9 @@ endif()
 set(_cblas_config_install_guard_target "")
 if(ALL_TARGETS)
   install(EXPORT cblas-targets
-    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION})
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION}
+    COMPONENT Development
+    )
   # Choose one of the cblas targets to use as a guard for
   # cblas-config.cmake to load targets from the install tree.
   list(GET ALL_TARGETS 0 _cblas_config_install_guard_target)
@@ -82,4 +89,6 @@ install(FILES
   )
 
 #install(EXPORT cblas-targets
-#  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION})
+#  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/cblas-${LAPACK_VERSION}
+#  COMPONENT Development
+#  )
diff --git a/lapack-netlib/CBLAS/Makefile b/lapack-netlib/CBLAS/Makefile
index 513e8fc82..6e199cdce 100644
--- a/lapack-netlib/CBLAS/Makefile
+++ b/lapack-netlib/CBLAS/Makefile
@@ -1,19 +1,25 @@
-include ../make.inc
+TOPSRCDIR = ..
+include $(TOPSRCDIR)/make.inc
 
+.PHONY: all
 all: cblas
 
+.PHONY: cblas
 cblas: include/cblas_mangling.h
 	$(MAKE) -C src
 
 include/cblas_mangling.h: include/cblas_mangling_with_flags.h.in
-	cp $< $@
+	cp include/cblas_mangling_with_flags.h.in $@
 
+.PHONY: cblas_testing
 cblas_testing: cblas
 	$(MAKE) -C testing run
 
+.PHONY: cblas_example
 cblas_example: cblas
 	$(MAKE) -C examples
 
+.PHONY: clean cleanobj cleanlib cleanexe cleantest
 clean:
 	$(MAKE) -C src clean
 	$(MAKE) -C testing clean
diff --git a/lapack-netlib/CBLAS/examples/Makefile b/lapack-netlib/CBLAS/examples/Makefile
index 664b8bc57..84acd6561 100644
--- a/lapack-netlib/CBLAS/examples/Makefile
+++ b/lapack-netlib/CBLAS/examples/Makefile
@@ -1,17 +1,21 @@
-include ../../make.inc
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc
 
+.SUFFIXES: .c .o
+.c.o:
+	$(CC) $(CFLAGS) -I../include -c -o $@ $<
+
+.PHONY: all
 all: cblas_ex1 cblas_ex2
 
 cblas_ex1: cblas_example1.o $(CBLASLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 cblas_ex2: cblas_example2.o $(CBLASLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
+.PHONY: clean cleanobj cleanexe
 clean: cleanobj cleanexe
 cleanobj:
 	rm -f *.o
 cleanexe:
 	rm -f cblas_ex1 cblas_ex2
-
-.c.o:
-	$(CC) $(CFLAGS) -I../include -c -o $@ $<
diff --git a/lapack-netlib/CBLAS/examples/cblas_example1.c b/lapack-netlib/CBLAS/examples/cblas_example1.c
index c3acd554d..3d5ed330c 100644
--- a/lapack-netlib/CBLAS/examples/cblas_example1.c
+++ b/lapack-netlib/CBLAS/examples/cblas_example1.c
@@ -47,7 +47,7 @@ int main ( )
    a[m*3+1] = 6;
    a[m*3+2] = 7;
    a[m*3+3] = 8;
-   /* The elemetns of x and y */
+   /* The elements of x and y */
    x[0] = 1;
    x[1] = 2;
    x[2] = 1;
diff --git a/lapack-netlib/CBLAS/src/Makefile b/lapack-netlib/CBLAS/src/Makefile
index 6c0518ac7..7100568e4 100644
--- a/lapack-netlib/CBLAS/src/Makefile
+++ b/lapack-netlib/CBLAS/src/Makefile
@@ -1,7 +1,13 @@
 # This Makefile compiles the CBLAS routines
 
-include ../../make.inc
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc
 
+.SUFFIXES: .c .o
+.c.o:
+	$(CC) $(CFLAGS) -I../include -c -o $@ $<
+
+.PHONY: all
 all: $(CBLASLIB)
 
 # Error handling routines for level 2 & 3
@@ -43,24 +49,25 @@ zlev1 = cblas_zswap.o cblas_zscal.o cblas_zdscal.o cblas_zcopy.o \
 # Common files for level 1 single precision
 sclev1 = cblas_scasum.o scasumsub.o cblas_scnrm2.o scnrm2sub.o
 
+.PHONY: slib1 dlib1 clib1 zlib1
 # Single precision real
 slib1: $(slev1) $(sclev1)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)
 
 # Double precision real
 dlib1: $(dlev1)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)
 
 # Single precision complex
 clib1: $(clev1) $(sclev1)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)
 
 # Double precision complex
 zlib1: $(zlev1)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)
 
 #
@@ -95,24 +102,25 @@ zlev2 = cblas_zgemv.o cblas_zgbmv.o cblas_zhemv.o cblas_zhbmv.o cblas_zhpmv.o \
         cblas_ztpsv.o cblas_zgeru.o cblas_zgerc.o cblas_zher.o  cblas_zher2.o \
         cblas_zhpr.o  cblas_zhpr2.o
 
+.PHONY: slib2 dlib2 clib2 zlib2
 # Single precision real
 slib2: $(slev2) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)
 
 # Double precision real
 dlib2: $(dlev2) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)
 
 # Single precision complex
 clib2: $(clev2) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)
 
 # Double precision complex
 zlib2: $(zlev2) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)
 
 #
@@ -141,24 +149,25 @@ zlev3 = cblas_zgemm.o cblas_zsymm.o cblas_zhemm.o cblas_zherk.o \
         cblas_zher2k.o cblas_ztrmm.o cblas_ztrsm.o cblas_zsyrk.o \
         cblas_zsyr2k.o
 
+.PHONY: slib3 dlib3 clib3 zlib3
 # Single precision real
 slib3: $(slev3) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)
 
 # Double precision real
 dlib3: $(dlev3) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)
 
 # Single precision complex
 clib3: $(clev3) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)
 
 # Double precision complex
 zlib3: $(zlev3) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)
 
 
@@ -166,36 +175,33 @@ alev1 = $(slev1) $(dlev1) $(clev1) $(zlev1) $(sclev1)
 alev2 = $(slev2) $(dlev2) $(clev2) $(zlev2)
 alev3 = $(slev3) $(dlev3) $(clev3) $(zlev3)
 
+.PHONY: all1 all2 all3
 # All level 1
 all1: $(alev1)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)
 
 # All level 2
 all2: $(alev2) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)
 
 # All level 3
 all3: $(alev3) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^
+	$(AR) $(ARFLAGS) $(CBLASLIB) $^
 	$(RANLIB) $(CBLASLIB)
 
 # All levels and precisions
 $(CBLASLIB): $(alev1) $(alev2) $(alev3) $(errhand)
-	$(ARCH) $(ARCHFLAGS) $@ $^
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@
 
 FRC:
 	@FRC=$(FRC)
 
+.PHONY: clean cleanobj cleanlib
 clean: cleanobj cleanlib
 cleanobj:
 	rm -f *.o
 cleanlib:
 	rm -f $(CBLASLIB)
-
-.c.o:
-	$(CC) $(CFLAGS) -I../include -c -o $@ $<
-.f.o:
-	$(FORTRAN) $(OPTS) -c -o $@ $<
diff --git a/lapack-netlib/CBLAS/src/cblas_sgemm.c b/lapack-netlib/CBLAS/src/cblas_sgemm.c
index c4a49a2db..51cd7d18a 100644
--- a/lapack-netlib/CBLAS/src/cblas_sgemm.c
+++ b/lapack-netlib/CBLAS/src/cblas_sgemm.c
@@ -91,7 +91,7 @@ void cblas_sgemm(const CBLAS_LAYOUT layout, const CBLAS_TRANSPOSE TransA,
       else
       {
          cblas_xerbla(2, "cblas_sgemm",
-                       "Illegal TransA setting, %d\n", TransA);
+                       "Illegal TransB setting, %d\n", TransB);
          CBLAS_CallFromC = 0;
          RowMajorStrg = 0;
          return;
diff --git a/lapack-netlib/CBLAS/testing/Makefile b/lapack-netlib/CBLAS/testing/Makefile
index 0182c3e88..e3b615b41 100644
--- a/lapack-netlib/CBLAS/testing/Makefile
+++ b/lapack-netlib/CBLAS/testing/Makefile
@@ -2,7 +2,12 @@
 # The Makefile compiles c wrappers and testers for CBLAS.
 #
 
-include ../../make.inc
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc
+
+.SUFFIXES: .c .o
+.c.o:
+	$(CC) $(CFLAGS) -I../include -c -o $@ $<
 
 # Archive files necessary to compile
 LIB = $(CBLASLIB) $(BLASLIB)
@@ -27,6 +32,7 @@ ztestl1o = c_zblas1.o
 ztestl2o = c_zblas2.o c_z2chke.o auxiliary.o c_xerbla.o
 ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o
 
+.PHONY: all all1 all2 all3
 all: all1 all2 all3
 all1: xscblat1 xdcblat1 xccblat1 xzcblat1
 all2: xscblat2 xdcblat2 xccblat2 xzcblat2
@@ -38,37 +44,38 @@ all3: xscblat3 xdcblat3 xccblat3 xzcblat3
 
 # Single real
 xscblat1: c_sblat1.o $(stestl1o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xscblat2: c_sblat2.o $(stestl2o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xscblat3: c_sblat3.o $(stestl3o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 # Double real
 xdcblat1: c_dblat1.o $(dtestl1o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xdcblat2: c_dblat2.o $(dtestl2o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xdcblat3: c_dblat3.o $(dtestl3o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 # Single complex
 xccblat1: c_cblat1.o $(ctestl1o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xccblat2: c_cblat2.o $(ctestl2o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xccblat3: c_cblat3.o $(ctestl3o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 # Double complex
 xzcblat1: c_zblat1.o $(ztestl1o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xzcblat2: c_zblat2.o $(ztestl2o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 xzcblat3: c_zblat3.o $(ztestl3o) $(LIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 
 # RUN TESTS
+.PHONY: run
 run: all
 	@echo "--> TESTING CBLAS 1 - SINGLE PRECISION REAL <--"
 	@./xscblat1 > stest1.out
@@ -95,6 +102,7 @@ run: all
 	@echo "--> TESTING CBLAS 3 - DOUBLE PRECISION COMPLEX <--"
 	@./xzcblat3 < zin3 > ztest3.out
 
+.PHONY: clean cleanobj cleanexe cleantest
 clean: cleanobj cleanexe cleantest
 cleanobj:
 	rm -f *.o
@@ -102,9 +110,3 @@ cleanexe:
 	rm -f x*
 cleantest:
 	rm -f *.out core
-
-.SUFFIXES: .o .f .c
-.c.o:
-	$(CC) $(CFLAGS) -I../include -c -o $@ $<
-.f.o:
-	$(FORTRAN) $(OPTS) -c -o $@ $<
diff --git a/lapack-netlib/CBLAS/testing/c_cblat1.f b/lapack-netlib/CBLAS/testing/c_cblat1.f
index c741ce506..1a123d74d 100644
--- a/lapack-netlib/CBLAS/testing/c_cblat1.f
+++ b/lapack-netlib/CBLAS/testing/c_cblat1.f
@@ -577,7 +577,7 @@
       SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
diff --git a/lapack-netlib/CBLAS/testing/c_dblat1.f b/lapack-netlib/CBLAS/testing/c_dblat1.f
index c570a9140..4a71b4dcf 100644
--- a/lapack-netlib/CBLAS/testing/c_dblat1.f
+++ b/lapack-netlib/CBLAS/testing/c_dblat1.f
@@ -653,7 +653,7 @@
       SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
diff --git a/lapack-netlib/CBLAS/testing/c_sblat1.f b/lapack-netlib/CBLAS/testing/c_sblat1.f
index 773787d6f..89902f12d 100644
--- a/lapack-netlib/CBLAS/testing/c_sblat1.f
+++ b/lapack-netlib/CBLAS/testing/c_sblat1.f
@@ -653,7 +653,7 @@
       SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
diff --git a/lapack-netlib/CBLAS/testing/c_zblat1.f b/lapack-netlib/CBLAS/testing/c_zblat1.f
index 03753e782..cd0c8541d 100644
--- a/lapack-netlib/CBLAS/testing/c_zblat1.f
+++ b/lapack-netlib/CBLAS/testing/c_zblat1.f
@@ -577,7 +577,7 @@
       SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC)
 *     ************************* STEST1 *****************************
 *
-*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN
+*     THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN
 *     REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE
 *     ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT.
 *
diff --git a/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake b/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake
index acc51629e..add0d1797 100644
--- a/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake
+++ b/lapack-netlib/CMAKE/CheckLAPACKCompilerFlags.cmake
@@ -1,4 +1,4 @@
-# This module checks against various known compilers and thier respective
+# This module checks against various known compilers and their respective
 # flags to determine any specific flags needing to be set.
 #
 #  1.  If FPE traps are enabled either abort or disable them
diff --git a/lapack-netlib/CMAKE/FindGcov.cmake b/lapack-netlib/CMAKE/FindGcov.cmake
index 4807f903e..3d4c0a2a0 100644
--- a/lapack-netlib/CMAKE/FindGcov.cmake
+++ b/lapack-netlib/CMAKE/FindGcov.cmake
@@ -20,7 +20,7 @@ set(CMAKE_REQUIRED_QUIET ${codecov_FIND_QUIETLY})
 
 get_property(ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES)
 foreach (LANG ${ENABLED_LANGUAGES})
-  # Gcov evaluation is dependend on the used compiler. Check gcov support for
+  # Gcov evaluation is dependent on the used compiler. Check gcov support for
   # each compiler that is used. If gcov binary was already found for this
   # compiler, do not try to find it again.
   if(NOT GCOV_${CMAKE_${LANG}_COMPILER_ID}_BIN)
diff --git a/lapack-netlib/CMAKE/Findcodecov.cmake b/lapack-netlib/CMAKE/Findcodecov.cmake
index 1f33b2c09..384064007 100644
--- a/lapack-netlib/CMAKE/Findcodecov.cmake
+++ b/lapack-netlib/CMAKE/Findcodecov.cmake
@@ -42,7 +42,7 @@ set(CMAKE_REQUIRED_QUIET ${codecov_FIND_QUIETLY})
 
 get_property(ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES)
 foreach (LANG ${ENABLED_LANGUAGES})
-  # Coverage flags are not dependend on language, but the used compiler. So
+  # Coverage flags are not dependent on language, but the used compiler. So
   # instead of searching flags foreach language, search flags foreach compiler
   # used.
   set(COMPILER ${CMAKE_${LANG}_COMPILER_ID})
diff --git a/lapack-netlib/CMAKE/FortranMangling.cmake b/lapack-netlib/CMAKE/FortranMangling.cmake
index d772dc9bb..734ab6f4c 100644
--- a/lapack-netlib/CMAKE/FortranMangling.cmake
+++ b/lapack-netlib/CMAKE/FortranMangling.cmake
@@ -24,7 +24,7 @@ message(STATUS "=========")
     set(F77_OUTPUT_EXE "/Fe" CACHE INTERNAL
       "Fortran compiler option for setting executable file name.")
   else()
-    # in other case, let user specify their fortran configrations.
+    # in other case, let user specify their fortran configurations.
     set(F77_OPTION_COMPILE "-c" CACHE STRING
       "Fortran compiler option for compiling without linking.")
     set(F77_OUTPUT_OBJ "-o" CACHE STRING
diff --git a/lapack-netlib/CMAKE/lapack-config-build.cmake.in b/lapack-netlib/CMAKE/lapack-config-build.cmake.in
index 1d084fe13..f7e041663 100644
--- a/lapack-netlib/CMAKE/lapack-config-build.cmake.in
+++ b/lapack-netlib/CMAKE/lapack-config-build.cmake.in
@@ -5,6 +5,10 @@ if(_LAPACK_TARGET AND NOT TARGET "${_LAPACK_TARGET}")
 endif()
 unset(_LAPACK_TARGET)
 
+# Hint for project building against lapack
+set(LAPACK_Fortran_COMPILER_ID "@CMAKE_Fortran_COMPILER_ID@")
+
 # Report the blas and lapack raw or imported libraries.
 set(LAPACK_blas_LIBRARIES "@BLAS_LIBRARIES@")
 set(LAPACK_lapack_LIBRARIES "@LAPACK_LIBRARIES@")
+set(LAPACK_LIBRARIES ${LAPACK_blas_LIBRARIES} ${LAPACK_lapack_LIBRARIES})
diff --git a/lapack-netlib/CMAKE/lapack-config-install.cmake.in b/lapack-netlib/CMAKE/lapack-config-install.cmake.in
index 4e04f8711..3de7362ea 100644
--- a/lapack-netlib/CMAKE/lapack-config-install.cmake.in
+++ b/lapack-netlib/CMAKE/lapack-config-install.cmake.in
@@ -8,8 +8,12 @@ if(_LAPACK_TARGET AND NOT TARGET "${_LAPACK_TARGET}")
 endif()
 unset(_LAPACK_TARGET)
 
+# Hint for project building against lapack
+set(LAPACK_Fortran_COMPILER_ID "@CMAKE_Fortran_COMPILER_ID@")
+
 # Report the blas and lapack raw or imported libraries.
 set(LAPACK_blas_LIBRARIES "@BLAS_LIBRARIES@")
 set(LAPACK_lapack_LIBRARIES "@LAPACK_LIBRARIES@")
+set(LAPACK_LIBRARIES ${LAPACK_blas_LIBRARIES} ${LAPACK_lapack_LIBRARIES})
 
 unset(_LAPACK_SELF_DIR)
diff --git a/lapack-netlib/CMakeLists.txt b/lapack-netlib/CMakeLists.txt
index caa0e7107..df43d91b1 100644
--- a/lapack-netlib/CMakeLists.txt
+++ b/lapack-netlib/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8.12)
 project(LAPACK Fortran C)
 
 set(LAPACK_MAJOR_VERSION 3)
-set(LAPACK_MINOR_VERSION 8)
+set(LAPACK_MINOR_VERSION 9)
 set(LAPACK_PATCH_VERSION 0)
 set(
   LAPACK_VERSION
@@ -13,6 +13,9 @@ set(
 # Add the CMake directory for custon CMake modules
 set(CMAKE_MODULE_PATH "${LAPACK_SOURCE_DIR}/CMAKE" ${CMAKE_MODULE_PATH})
 
+# Export all symbols on Windows when building shared libraries
+SET(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS TRUE)
+
 # Set a default build type if none was specified
 if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
   message(STATUS "Setting build type to 'Release' as none was specified.")
@@ -21,8 +24,19 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
   set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo" "Coverage")
 endif()
 
-string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE_UPPER)
-if(${CMAKE_BUILD_TYPE_UPPER} STREQUAL "COVERAGE")
+# Coverage
+set(_is_coverage_build 0)
+set(_msg "Checking if build type is 'Coverage'")
+message(STATUS "${_msg}")
+if(NOT CMAKE_CONFIGURATION_TYPES)
+  string(TOLOWER ${CMAKE_BUILD_TYPE} _build_type_lc)
+  if(${_build_type_lc} STREQUAL "coverage")
+    set(_is_coverage_build 1)
+  endif()
+endif()
+message(STATUS "${_msg}: ${_is_coverage_build}")
+
+if(_is_coverage_build)
   message(STATUS "Adding coverage")
   find_package(codecov)
 endif()
@@ -58,18 +72,18 @@ include(PreventInSourceBuilds)
 include(PreventInBuildInstalls)
 
 if(UNIX)
-  if("${CMAKE_Fortran_COMPILER}" MATCHES "ifort")
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fp-model strict")
+  if(CMAKE_Fortran_COMPILER_ID STREQUAL Intel)
+    list(APPEND CMAKE_Fortran_FLAGS "-fp-model strict")
   endif()
-  if("${CMAKE_Fortran_COMPILER}" MATCHES "xlf")
-    set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -qnosave -qstrict=none")
+  if(CMAKE_Fortran_COMPILER_ID STREQUAL XL)
+    list(APPEND CMAKE_Fortran_FLAGS "-qnosave -qstrict=none")
   endif()
 # Delete libmtsk in linking sequence for Sun/Oracle Fortran Compiler.
 # This library is not present in the Sun package SolarisStudio12.3-linux-x86-bin
   string(REPLACE \;mtsk\; \; CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES "${CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES}")
 endif()
 
-if(CMAKE_Fortran_COMPILER_ID STREQUAL "Compaq")
+if(CMAKE_Fortran_COMPILER_ID STREQUAL Compaq)
   if(WIN32)
     if(CMAKE_GENERATOR STREQUAL "NMake Makefiles")
       get_filename_component(CMAKE_Fortran_COMPILER_CMDNAM ${CMAKE_Fortran_COMPILER} NAME_WE)
@@ -96,24 +110,16 @@ if(CMAKE_Fortran_COMPILER_ID STREQUAL "Compaq")
   endif()
 endif()
 
-# Get Python
-message(STATUS "Looking for Python greater than 2.6 - ${PYTHONINTERP_FOUND}")
-find_package(PythonInterp 2.7) # lapack_testing.py uses features from python 2.7 and greater
-if(PYTHONINTERP_FOUND)
-  message(STATUS "Using Python version ${PYTHON_VERSION_STRING}")
-else()
-  message(STATUS "No suitable Python version found, so skipping summary tests.")
-endif()
-# --------------------------------------------------
 
+# --------------------------------------------------
 set(LAPACK_INSTALL_EXPORT_NAME lapack-targets)
 
 macro(lapack_install_library lib)
   install(TARGETS ${lib}
     EXPORT ${LAPACK_INSTALL_EXPORT_NAME}
-    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
-    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+    ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT Development
+    LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT RuntimeLibraries
+    RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT RuntimeLibraries
   )
 endmacro()
 
@@ -121,12 +127,22 @@ set(PKG_CONFIG_DIR ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
 
 # --------------------------------------------------
 # Testing
-option(BUILD_TESTING "Build tests" OFF)
-enable_testing()
+option(BUILD_TESTING "Build tests" ${_is_coverage_build})
 include(CTest)
-enable_testing()
 message(STATUS "Build tests: ${BUILD_TESTING}")
 
+# lapack_testing.py uses features from python 2.7 and greater
+if(BUILD_TESTING)
+  set(_msg "Looking for Python >= 2.7 needed for summary tests")
+  message(STATUS "${_msg}")
+  find_package(PythonInterp 2.7 QUIET)
+  if(PYTHONINTERP_FOUND)
+    message(STATUS "${_msg} - found (${PYTHON_VERSION_STRING})")
+  else()
+    message(STATUS "${_msg} - not found (skipping summary tests)")
+  endif()
+endif()
+
 # --------------------------------------------------
 # Organize output files.  On Windows this also keeps .dll files next
 # to the .exe files that need them, making tests easy to run.
@@ -299,16 +315,40 @@ if(LAPACKE)
   add_subdirectory(LAPACKE)
 endif()
 
+#-------------------------------------
+# BLAS++ / LAPACK++
+option(BLAS++ "Build BLAS++" OFF)
+option(LAPACK++ "Build LAPACK++" OFF)
+ 
+ 
+function(_display_cpp_implementation_msg name)
+  string(TOLOWER ${name} name_lc)
+  message(STATUS "${name}++ enable")
+  message(STATUS "----------------")
+  message(STATUS "Thank you for your interest in ${name}++, a newly developed C++ API for ${name} library")
+  message(STATUS "The objective of ${name}++ is to provide a convenient, performance oriented API for development in the C++ language, that, for the most part, preserves established conventions, while, at the same time, takes advantages of modern C++ features, such as: namespaces, templates, exceptions, etc.")
+  message(STATUS "We are still working on integrating ${name}++ in our library. For the moment, you can download directly ${name_lc}++ from https://bitbucket.org/icl/${name_lc}pp")
+  message(STATUS "For support ${name}++ related question, please email: slate-user@icl.utk.edu")
+  message(STATUS "----------------")
+endfunction()
+if(BLAS++)
+  _display_cpp_implementation_msg("BLAS")
+endif()
+if(LAPACK++)
+  _display_cpp_implementation_msg("LAPACK")
+endif()
+
 # --------------------------------------------------
 # CPACK Packaging
 
 set(CPACK_PACKAGE_NAME "LAPACK")
 set(CPACK_PACKAGE_VENDOR "University of Tennessee, Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd")
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "LAPACK- Linear Algebra Package")
-set(CPACK_PACKAGE_VERSION_MAJOR 3)
-set(CPACK_PACKAGE_VERSION_MINOR 5)
-set(CPACK_PACKAGE_VERSION_PATCH 0)
+set(CPACK_PACKAGE_VERSION_MAJOR ${LAPACK_MAJOR_VERSION})
+set(CPACK_PACKAGE_VERSION_MINOR ${LAPACK_MINOR_VERSION})
+set(CPACK_PACKAGE_VERSION_PATCH ${LAPACK_PATCH_VERSION})
 set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
+set(CPACK_MONOLITHIC_INSTALL ON)
 set(CPACK_PACKAGE_INSTALL_DIRECTORY "LAPACK")
 if(WIN32 AND NOT UNIX)
   # There is a bug in NSI that does not handle full unix paths properly. Make
@@ -347,7 +387,9 @@ endif()
 set(_lapack_config_install_guard_target "")
 if(ALL_TARGETS)
   install(EXPORT lapack-targets
-    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapack-${LAPACK_VERSION})
+    DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapack-${LAPACK_VERSION}
+    COMPONENT Development
+    )
 
   # Choose one of the lapack targets to use as a guard for
   # lapack-config.cmake to load targets from the install tree.
@@ -382,6 +424,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapack.pc.in ${CMAKE_CURRENT_BINARY_D
   install(FILES
   ${CMAKE_CURRENT_BINARY_DIR}/lapack.pc
   DESTINATION ${PKG_CONFIG_DIR}
+  COMPONENT Development
   )
 
 configure_file(${LAPACK_SOURCE_DIR}/CMAKE/lapack-config-install.cmake.in
@@ -398,4 +441,6 @@ install(FILES
   ${LAPACK_BINARY_DIR}/CMakeFiles/lapack-config.cmake
   ${LAPACK_BINARY_DIR}/lapack-config-version.cmake
   DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapack-${LAPACK_VERSION}
+  COMPONENT Development
   )
+  
diff --git a/lapack-netlib/DOCS/Doxyfile b/lapack-netlib/DOCS/Doxyfile
index 8f3558597..43cea43b5 100644
--- a/lapack-netlib/DOCS/Doxyfile
+++ b/lapack-netlib/DOCS/Doxyfile
@@ -38,7 +38,7 @@ PROJECT_NAME           = LAPACK
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = 3.8.0
+PROJECT_NUMBER         = 3.9.0
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/lapack-netlib/DOCS/Doxyfile_man b/lapack-netlib/DOCS/Doxyfile_man
index 6fb339a73..1767cf5f4 100644
--- a/lapack-netlib/DOCS/Doxyfile_man
+++ b/lapack-netlib/DOCS/Doxyfile_man
@@ -38,7 +38,7 @@ PROJECT_NAME           = LAPACK
 # could be handy for archiving the generated documentation or if some version
 # control system is used.
 
-PROJECT_NUMBER         = 3.8.0
+PROJECT_NUMBER         = 3.9.0
 
 # Using the PROJECT_BRIEF tag one can provide an optional one line description
 # for a project that appears at the top of each page and should give viewer a
diff --git a/lapack-netlib/DOCS/lawn81.tex b/lapack-netlib/DOCS/lawn81.tex
index 291735299..794c2a7aa 100644
--- a/lapack-netlib/DOCS/lawn81.tex
+++ b/lapack-netlib/DOCS/lawn81.tex
@@ -439,39 +439,39 @@ SHELL = /bin/sh
 \end{quote}
 and it will need to be modified to \texttt{SHELL = /sbin/sh} if you are
 installing LAPACK on an SGI architecture.
-Second, you will
-need to modify the \texttt{PLAT} definition, which is appended to all
-library names, to specify the architecture to which you are installing
-LAPACK.  This features avoids confusion in library names when you are
-installing LAPACK on more than one architecture.  Next, you will need
-to modify \texttt{FORTRAN}, \texttt{OPTS}, \texttt{DRVOPTS}, \texttt{NOOPT}, \texttt{LOADER},
-and \texttt{LOADOPTS} to specify
+Next, you will need to modify \texttt{FC}, \texttt{FFLAGS},
+\texttt{FFLAGS\_DRV}, \texttt{FFLAGS\_NOOPT}, and \texttt{LDFLAGS} to specify
 the compiler, compiler options, compiler options for the testing and
-timing\footnotemark[\value{footnote}] main programs, loader, loader options.
-Next you will have to choose which function you will use to time in the \texttt{SECOND} and \texttt{DSECND} routines.
+timing\footnotemark[\value{footnote}] main programs, and linker options.
+Next you will have to choose which function you will use to time in the
+\texttt{SECOND} and \texttt{DSECND} routines.
 \begin{verbatim}
-#The Default : SECOND and DSECND will use a call to the EXTERNAL FUNCTION ETIME
-TIMER    = EXT_ETIME
-# For RS6K : SECOND and DSECND will use a call to the EXTERNAL FUNCTION ETIME_
-# TIMER    = EXT_ETIME_
-# For gfortran compiler: SECOND and DSECND will use the INTERNAL FUNCTION ETIME
-# TIMER    = INT_ETIME
-# If your Fortran compiler does not provide etime (like Nag Fortran Compiler, etc...)
-# SECOND and DSECND will use a call to the INTERNAL FUNCTION CPU_TIME
-# TIMER    = INT_CPU_TIME
-# If neither of this works...you can use the NONE value...
-# In that case, SECOND and DSECND will always return 0
-# TIMER     = NONE
+#  Default:  SECOND and DSECND will use a call to the
+#  EXTERNAL FUNCTION ETIME
+#TIMER = EXT_ETIME
+#  For RS6K:  SECOND and DSECND will use a call to the
+#  EXTERNAL FUNCTION ETIME_
+#TIMER = EXT_ETIME_
+#  For gfortran compiler:  SECOND and DSECND will use a call to the
+#  INTERNAL FUNCTION ETIME
+TIMER = INT_ETIME
+#  If your Fortran compiler does not provide etime (like Nag Fortran
+#  Compiler, etc...) SECOND and DSECND will use a call to the
+#  INTERNAL FUNCTION CPU_TIME
+#TIMER = INT_CPU_TIME
+#  If none of these work, you can use the NONE value.
+#  In that case, SECOND and DSECND will always return 0.
+#TIMER = NONE
 \end{verbatim}
 Refer to the section~\ref{second} to get more information.
 
 
-Next, you will need to modify \texttt{ARCH}, \texttt{ARCHFLAGS}, and \texttt{RANLIB} to specify archiver,
+Next, you will need to modify \texttt{AR}, \texttt{ARFLAGS}, and \texttt{RANLIB} to specify archiver,
 archiver options, and ranlib for your machine.  If your architecture
 does not require \texttt{ranlib} to be run after each archive command (as
 is the case with CRAY computers running UNICOS, Hewlett Packard
 computers running HP-UX, or SUN SPARCstations running Solaris), set
-\texttt{ranlib=echo}.  And finally, you must
+\texttt{RANLIB = echo}.  And finally, you must
 modify the \texttt{BLASLIB} definition to specify the BLAS library to which
 you will be linking.  If an optimized version of the BLAS is available
 on your machine, you are highly recommended to link to that library.
@@ -721,24 +721,24 @@ The version that will be used depends on the value of the TIMER variable in the
 
 \begin{itemize}
 \item If ETIME is available as an external function, set the value of the TIMER variable in your
-make.inc to \texttt{EXT\_ETIME}:\texttt{second\_EXT\_ETIME.f} and \texttt{dsecnd\_EXT\_ETIME.f} will be used.
+make.inc to \texttt{EXT\_ETIME}: \texttt{second\_EXT\_ETIME.f} and \texttt{dsecnd\_EXT\_ETIME.f} will be used.
 Usually on HPPA architectures,
-the compiler and loader flag \texttt{+U77} should be included to access
+the compiler and linker flag \texttt{+U77} should be included to access
 the function \texttt{ETIME}.
 
 \item If ETIME\_ is available as an external function, set the value of the TIMER variable in your make.inc
-to \texttt{EXT\_ETIME\_}:\texttt{second\_EXT\_ETIME\_.f} and \texttt{dsecnd\_EXT\_ETIME\_.f} will be used.
+to \texttt{EXT\_ETIME\_}: \texttt{second\_EXT\_ETIME\_.f} and \texttt{dsecnd\_EXT\_ETIME\_.f} will be used.
 It is the case on some IBM architectures such as IBM RS/6000s.
 
 \item If ETIME is available as an internal function, set the value of the TIMER variable in your make.inc
-to \texttt{INT\_ETIME}:\texttt{second\_INT\_ETIME.f}  and \texttt{dsecnd\_INT\_ETIME.f} will be used.
+to \texttt{INT\_ETIME}: \texttt{second\_INT\_ETIME.f}  and \texttt{dsecnd\_INT\_ETIME.f} will be used.
 This is the case with gfortan.
 
 \item If CPU\_TIME is available as an internal function, set the value of the TIMER variable in your make.inc
-to \texttt{INT\_CPU\_TIME}:\texttt{second\_INT\_CPU\_TIME.f} and \texttt{dsecnd\_INT\_CPU\_TIME.f} will be used.
+to \texttt{INT\_CPU\_TIME}: \texttt{second\_INT\_CPU\_TIME.f} and \texttt{dsecnd\_INT\_CPU\_TIME.f} will be used.
 
 \item If none of these function is available, set the value of the TIMER variable in your make.inc
-to \texttt{NONE:}\texttt{second\_NONE.f} and \texttt{dsecnd\_NONE.f} will be used.
+to \texttt{NONE}: \texttt{second\_NONE.f} and \texttt{dsecnd\_NONE.f} will be used.
 These routines will always return zero.
 \end{itemize}
 
@@ -829,8 +829,8 @@ data type to the library if necessary.
 \end{itemize}
 
 \noindent
-The BLAS library is created in \texttt{LAPACK/blas\_PLAT.a}, where
-\texttt{PLAT} is the user-defined architecture suffix specified in the file
+The BLAS library is created in \texttt{LAPACK/librefblas.a},
+or in the user-defined location specified by \texttt{BLASLIB} in the file
 \texttt{LAPACK/make.inc}.
 
 \subsection{Run the BLAS Test Programs}\label{testblas}
@@ -882,8 +882,8 @@ data type to the library if necessary.
 \end{itemize}
 
 \noindent
-The LAPACK library is created in \texttt{LAPACK/lapack\_PLAT.a}, where
-\texttt{PLAT} is the user-defined architecture suffix specified in the file
+The LAPACK library is created in \texttt{LAPACK/liblapack.a},
+or in the user-defined location specified by \texttt{LAPACKLIB} in the file
 \texttt{LAPACK/make.inc}.
 
 \subsection{Create the Test Matrix Generator Library}
@@ -902,9 +902,9 @@ data type to the library if necessary.
 \end{itemize}
 
 \noindent
-The test matrix generator library is created in \texttt{LAPACK/tmglib\_PLAT.a},
-where \texttt{PLAT} is the user-defined architecture suffix specified in the
-file \texttt{LAPACK/make.inc}.
+The test matrix generator library is created in \texttt{LAPACK/libtmglib.a},
+or in the user-defined location specified by \texttt{TMGLIB} in the file
+\texttt{LAPACK/make.inc}.
 
 \subsection{Run the LAPACK Test Programs}
 
@@ -1114,9 +1114,7 @@ To make a library of the instrumented LAPACK routines, first
 go to \texttt{LAPACK/TIMING/LIN/LINSRC} and type \texttt{make} followed
 by the data types desired, as in the examples of Section~\ref{toplevelmakefile}.
 The library of instrumented code is created in
-\texttt{LAPACK/TIMING/LIN/linsrc\_PLAT.a},
-where \texttt{PLAT} is the user-defined architecture suffix specified in the
-file \texttt{LAPACK/make.inc}.
+\texttt{LAPACK/TIMING/LIN/linsrc.a}.
 \end{sloppypar}
 
 \item[b)]
@@ -1251,9 +1249,7 @@ To make a library of the instrumented LAPACK routines, first
 go to \texttt{LAPACK/TIMING/EIG/EIGSRC} and type \texttt{make} followed
 by the data types desired, as in the examples of Section~\ref{toplevelmakefile}.
 The library of instrumented code is created in
-\texttt{LAPACK/TIMING/EIG/eigsrc\_PLAT.a},
-where \texttt{PLAT} is the user-defined architecture suffix specified in the
-file \texttt{LAPACK/make.inc}.
+\texttt{LAPACK/TIMING/EIG/eigsrc.a}.
 \end{sloppypar}
 
 \item[b)]
@@ -1389,7 +1385,7 @@ installing LAPACK on an SGI architecture.
 \section{ETIME}
 
 On HPPA architectures,
-the compiler and loader flag \texttt{+U77} should be included to access
+the compiler and linker flag \texttt{+U77} should be included to access
 the function \texttt{ETIME}.
 
 \section{ILAENV and IEEE-754 compliance}
@@ -1494,13 +1490,13 @@ has two options:  increase your stack size, or force all local variables
 to be allocated statically.
 
 On HPPA architectures, the
-compiler and loader flag \texttt{-K} should be used when compiling these testing
+compiler and linker flag \texttt{-K} should be used when compiling these testing
 and timing main programs to avoid such a stack overflow.  I.e., set
-\texttt{DRVOPTS = -K} in the \texttt{LAPACK/make.inc} file.
+\texttt{FFLAGS\_DRV = -K} in the \texttt{LAPACK/make.inc} file.
 
 For similar reasons,
-on SGI architectures, the compiler and loader flag \texttt{-static} should be
-used.  I.e., set \texttt{DRVOPTS = -static} in the \texttt{LAPACK/make.inc} file.
+on SGI architectures, the compiler and linker flag \texttt{-static} should be
+used.  I.e., set \texttt{FFLAGS\_DRV = -static} in the \texttt{LAPACK/make.inc} file.
 
 \section{IEEE arithmetic}
 
diff --git a/lapack-netlib/INSTALL/Makefile b/lapack-netlib/INSTALL/Makefile
index 150a061d6..1007c1bca 100644
--- a/lapack-netlib/INSTALL/Makefile
+++ b/lapack-netlib/INSTALL/Makefile
@@ -1,30 +1,33 @@
-include ../make.inc
+TOPSRCDIR = ..
+include $(TOPSRCDIR)/make.inc
 
+.PHONY: all testlsame testslamch testdlamch testsecond testdsecnd testieee testversion
 all: testlsame testslamch testdlamch testsecond testdsecnd testieee testversion
 
 testlsame: lsame.o lsametst.o
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 testslamch: slamch.o lsame.o slamchtst.o
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 testdlamch: dlamch.o lsame.o dlamchtst.o
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 testsecond: second_$(TIMER).o secondtst.o
 	@echo "[INFO] : TIMER value: $(TIMER) (given by make.inc)"
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 testdsecnd: dsecnd_$(TIMER).o dsecndtst.o
 	@echo "[INFO] : TIMER value: $(TIMER) (given by make.inc)"
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 testieee: tstiee.o
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 testversion: ilaver.o LAPACK_version.o
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
+.PHONY: run
 run: all
 	./testlsame
 	./testslamch
@@ -34,6 +37,7 @@ run: all
 	./testieee
 	./testversion
 
+.PHONY: clean cleanobj cleanexe cleantest
 clean: cleanobj cleanexe cleantest
 cleanobj:
 	rm -f *.o
@@ -42,9 +46,5 @@ cleanexe:
 cleantest:
 	rm -f core
 
-.SUFFIXES: .o .f
-.f.o:
-	$(FORTRAN) $(OPTS) -c -o $@ $<
-
-slamch.o: slamch.f ; $(FORTRAN) $(NOOPT) -c -o $@ $<
-dlamch.o: dlamch.f ; $(FORTRAN) $(NOOPT) -c -o $@ $<
+slamch.o: slamch.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $<
+dlamch.o: dlamch.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $<
diff --git a/lapack-netlib/INSTALL/dlamch.f b/lapack-netlib/INSTALL/dlamch.f
index 76f875cef..9073cd45e 100644
--- a/lapack-netlib/INSTALL/dlamch.f
+++ b/lapack-netlib/INSTALL/dlamch.f
@@ -10,6 +10,10 @@
 *
 *      DOUBLE PRECISION FUNCTION DLAMCH( CMACH )
 *
+*     .. Scalar Arguments ..
+*     CHARACTER          CMACH
+*     ..
+*
 *
 *> \par Purpose:
 *  =============
@@ -24,6 +28,7 @@
 *
 *> \param[in] CMACH
 *> \verbatim
+*>          CMACH is CHARACTER*1
 *>          Specifies the value to be returned by DLAMCH:
 *>          = 'E' or 'e',   DLAMCH := eps
 *>          = 'S' or 's ,   DLAMCH := sfmin
diff --git a/lapack-netlib/INSTALL/dlamchf77.f b/lapack-netlib/INSTALL/dlamchf77.f
index 3efd21535..37b30551f 100644
--- a/lapack-netlib/INSTALL/dlamchf77.f
+++ b/lapack-netlib/INSTALL/dlamchf77.f
@@ -10,6 +10,10 @@
 *
 *      DOUBLE PRECISION FUNCTION DLAMCH( CMACH )
 *
+*     .. Scalar Arguments ..
+*     CHARACTER          CMACH
+*     ..
+*
 *
 *> \par Purpose:
 *  =============
diff --git a/lapack-netlib/INSTALL/ilaver.f b/lapack-netlib/INSTALL/ilaver.f
index e1d59f465..79fe597ae 100644
--- a/lapack-netlib/INSTALL/ilaver.f
+++ b/lapack-netlib/INSTALL/ilaver.f
@@ -25,12 +25,15 @@
 *  ==========
 *
 *>  \param[out] VERS_MAJOR
+*>      VERS_MAJOR is INTEGER
 *>      return the lapack major version
 *>
 *>  \param[out] VERS_MINOR
+*>      VERS_MINOR is INTEGER
 *>      return the lapack minor version from the major version
 *>
 *>  \param[out] VERS_PATCH
+*>      VERS_PATCH is INTEGER
 *>      return the lapack patch version from the minor version
 *
 *  Authors:
@@ -41,24 +44,23 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date June 2017
+*> \date November 2019
 *
 *> \ingroup auxOTHERauxiliary
 *
 *  =====================================================================
       SUBROUTINE ILAVER( VERS_MAJOR, VERS_MINOR, VERS_PATCH )
 *
-*  -- LAPACK computational routine (version 3.7.1) --
+*  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     June 2017
 *
 *  =====================================================================
 *
       INTEGER VERS_MAJOR, VERS_MINOR, VERS_PATCH
 *  =====================================================================
       VERS_MAJOR = 3
-      VERS_MINOR = 8
+      VERS_MINOR = 9
       VERS_PATCH = 0
 *  =====================================================================
 *
diff --git a/lapack-netlib/INSTALL/make.inc.ALPHA b/lapack-netlib/INSTALL/make.inc.ALPHA
index 0ceeaa155..d6397e81d 100644
--- a/lapack-netlib/INSTALL/make.inc.ALPHA
+++ b/lapack-netlib/INSTALL/make.inc.ALPHA
@@ -8,30 +8,28 @@ SHELL = /bin/sh
 
 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = cc
+CC = cc
 CFLAGS = -O4
 
-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = f77
-OPTS    = -O4 -fpe1
-DRVOPTS = $(OPTS)
-NOOPT   =
+FC = f77
+FFLAGS = -O4 -fpe1
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT =
 
-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = f77
-LOADOPTS =
+LDFLAGS =
 
 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = ranlib
+AR = ar
+ARFLAGS = cr
+RANLIB = ranlib
 
 #  Timer for the SECOND and DSECND routines
 #
@@ -74,9 +72,9 @@ TIMER = EXT_ETIME
 #  machine-specific, optimized BLAS library should be used whenever
 #  possible.)
 #
-#BLASLIB      = ../../librefblas.a
+#BLASLIB      = $(TOPSRCDIR)/librefblas.a
 BLASLIB      = -ldxml
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
diff --git a/lapack-netlib/INSTALL/make.inc.HPPA b/lapack-netlib/INSTALL/make.inc.HPPA
index 8eabbbdf4..6ee2b2dfb 100644
--- a/lapack-netlib/INSTALL/make.inc.HPPA
+++ b/lapack-netlib/INSTALL/make.inc.HPPA
@@ -8,30 +8,28 @@ SHELL = /bin/sh
 
 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = cc
+CC = cc
 CFLAGS =
 
-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = f77
-OPTS    = +O4 +U77
-DRVOPTS = $(OPTS) -K
-NOOPT   = +U77
+FC = f77
+FFLAGS = +O4 +U77
+FFLAGS_DRV = $(FFLAGS) -K
+FFLAGS_NOOPT = +U77
 
-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = f77
-LOADOPTS = -Aa +U77
+LDFLAGS =
 
 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = echo
+AR = ar
+ARFLAGS = cr
+RANLIB = echo
 
 #  Timer for the SECOND and DSECND routines
 #
@@ -74,9 +72,9 @@ TIMER = EXT_ETIME
 #  machine-specific, optimized BLAS library should be used whenever
 #  possible.)
 #
-#BLASLIB      = ../../librefblas.a
+#BLASLIB      = $(TOPSRCDIR)/librefblas.a
 BLASLIB      = -lblas
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
diff --git a/lapack-netlib/INSTALL/make.inc.IRIX64 b/lapack-netlib/INSTALL/make.inc.IRIX64
index d9e71e1bf..59fe522eb 100644
--- a/lapack-netlib/INSTALL/make.inc.IRIX64
+++ b/lapack-netlib/INSTALL/make.inc.IRIX64
@@ -8,33 +8,30 @@ SHELL = /sbin/sh
 
 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = cc
+CC = cc
 CFLAGS = -O3
 
-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = f77
-OPTS    = -O3 -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON
-#OPTS    = -g  -DEBUG:subscript_check=ON -trapuv -OPT:IEEE_NaN_inf=ON
-DRVOPTS = $(OPTS) -static
-NOOPT   = -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON
-#NOOPT   = -g  -DEBUG:subscript_check=ON -trapuv -OPT:IEEE_NaN_inf=ON
+FC = f77
+FFLAGS = -O3 -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON
+#FFLAGS = -g -DEBUG:subscript_check=ON -trapuv -OPT:IEEE_NaN_inf=ON
+FFLAGS_DRV = $(FFLAGS) -static
+FFLAGS_NOOPT = -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON
+#FFLAGS_NOOPT = -g -DEBUG:subscript_check=ON -trapuv -OPT:IEEE_NaN_inf=ON
 
-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = f77
-LOADOPTS = -O3 -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON
-#LOADOPTS = -g  -DEBUG:subscript_check=ON -trapuv -OPT:IEEE_NaN_inf=ON
+LDFLAGS =
 
 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = echo
+AR = ar
+ARFLAGS = cr
+RANLIB = echo
 
 #  Timer for the SECOND and DSECND routines
 #
@@ -78,8 +75,8 @@ TIMER = EXT_ETIME
 #  possible.)
 #
 #BLASLIB      = -lblas
-BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
diff --git a/lapack-netlib/INSTALL/make.inc.O2K b/lapack-netlib/INSTALL/make.inc.O2K
index 3ffcadacc..3c3dbc800 100644
--- a/lapack-netlib/INSTALL/make.inc.O2K
+++ b/lapack-netlib/INSTALL/make.inc.O2K
@@ -8,33 +8,30 @@ SHELL = /sbin/sh
 
 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = cc
+CC = cc
 CFLAGS = -O3
 
-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = f77
-OPTS    = -O3 -64 -mips4 -r10000
-#OPTS    = -O3 -64 -mips4 -r10000 -mp
-DRVOPTS = $(OPTS) -static
-NOOPT   = -64 -mips4 -r10000
-#NOOPT   = -64 -mips4 -r10000 -mp
+FC = f77
+FFLAGS = -O3 -64 -mips4 -r10000
+#FFLAGS = -O3 -64 -mips4 -r10000 -mp
+FFLAGS_DRV = $(FFLAGS) -static
+FFLAGS_NOOPT = -64 -mips4 -r10000
+#FFLAGS_NOOPT = -64 -mips4 -r10000 -mp
 
-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = f77
-LOADOPTS = -O3 -64 -mips4 -r10000
-#LOADOPTS = -O3 -64 -mips4 -r10000 -mp
+LDFLAGS =
 
 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = echo
+AR = ar
+ARFLAGS = cr
+RANLIB = echo
 
 #  Timer for the SECOND and DSECND routines
 #
@@ -79,8 +76,8 @@ TIMER = EXT_ETIME
 #
 BLASLIB      = -lblas
 #BLASLIB      = -lblas_mp
-#BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+#BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
diff --git a/lapack-netlib/INSTALL/make.inc.SGI5 b/lapack-netlib/INSTALL/make.inc.SGI5
index c7019ac16..1013cffdb 100644
--- a/lapack-netlib/INSTALL/make.inc.SGI5
+++ b/lapack-netlib/INSTALL/make.inc.SGI5
@@ -8,30 +8,28 @@ SHELL = /sbin/sh
 
 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = cc
+CC = cc
 CFLAGS = -O4
 
-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = f77
-OPTS    = -O4
-DRVOPTS = $(OPTS) -static
-NOOPT   =
+FC = f77
+FFLAGS = -O4
+FFLAGS_DRV = $(FFLAGS) -static
+FFLAGS_NOOPT =
 
-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = f77
-LOADOPTS =
+LDFLAGS =
 
 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = echo
+AR = ar
+ARFLAGS = cr
+RANLIB = echo
 
 #  Timer for the SECOND and DSECND routines
 #
@@ -75,8 +73,8 @@ TIMER = EXT_ETIME
 #  possible.)
 #
 #BLASLIB      = -lblas
-BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
diff --git a/lapack-netlib/INSTALL/make.inc.SUN4 b/lapack-netlib/INSTALL/make.inc.SUN4
index 4e44f1beb..2da0ecb65 100644
--- a/lapack-netlib/INSTALL/make.inc.SUN4
+++ b/lapack-netlib/INSTALL/make.inc.SUN4
@@ -8,30 +8,28 @@ SHELL = /bin/sh
 
 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = cc
+CC = cc
 CFLAGS = -O3
 
-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = f77
-OPTS    = -dalign -O4 -fast
-DRVOPTS = $(OPTS)
-NOOPT   =
+FC = f77
+FFLAGS = -dalign -O4 -fast
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT =
 
-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = f77
-LOADOPTS = -dalign -O4 -fast
+LDFLAGS =
 
 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = ranlib
+AR = ar
+ARFLAGS = cr
+RANLIB = ranlib
 
 #  Timer for the SECOND and DSECND routines
 #
@@ -75,8 +73,8 @@ TIMER = EXT_ETIME
 #  possible.)
 #
 #BLASLIB      = -lblas
-BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
diff --git a/lapack-netlib/INSTALL/make.inc.SUN4SOL2 b/lapack-netlib/INSTALL/make.inc.SUN4SOL2
index e6d79add3..d2db07c61 100644
--- a/lapack-netlib/INSTALL/make.inc.SUN4SOL2
+++ b/lapack-netlib/INSTALL/make.inc.SUN4SOL2
@@ -8,34 +8,31 @@ SHELL = /bin/sh
 
 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = cc
+CC = cc
 CFLAGS = -O3
 
-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = f77
-#OPTS    = -O4 -u -f -mt
-#OPTS    = -u -f -dalign -native -xO5 -xarch=v8plusa
-OPTS    = -u -f -dalign -native -xO2 -xarch=v8plusa
-DRVOPTS = $(OPTS)
-NOOPT   = -u -f
-#NOOPT   = -u -f -mt
+FC = f77
+#FFLAGS = -O4 -u -f -mt
+#FFLAGS = -u -f -dalign -native -xO5 -xarch=v8plusa
+FFLAGS = -u -f -dalign -native -xO2 -xarch=v8plusa
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT = -u -f
+#FFLAGS_NOOPT = -u -f -mt
 
-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = f77
-#LOADOPTS = -mt
-LOADOPTS = -f -dalign -native -xO2 -xarch=v8plusa
+LDFLAGS =
 
 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = echo
+AR = ar
+ARFLAGS = cr
+RANLIB = echo
 
 #  Timer for the SECOND and DSECND routines
 #
@@ -78,10 +75,10 @@ TIMER = EXT_ETIME
 #  machine-specific, optimized BLAS library should be used whenever
 #  possible.)
 #
-#BLASLIB      = ../../librefblas.a
+#BLASLIB      = $(TOPSRCDIR)/librefblas.a
 #BLASLIB      = -xlic_lib=sunperf_mt
 BLASLIB      = -xlic_lib=sunperf
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
diff --git a/lapack-netlib/INSTALL/make.inc.XLF b/lapack-netlib/INSTALL/make.inc.XLF
index 9466ee332..cb9d791a7 100644
--- a/lapack-netlib/INSTALL/make.inc.XLF
+++ b/lapack-netlib/INSTALL/make.inc.XLF
@@ -8,31 +8,29 @@ SHELL = /bin/sh
 
 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = xlc
+CC = xlc
 CFLAGS = -O3 -qnosave
 
-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = xlf
-OPTS    = -O3 -qfixed -qnosave
+FC = xlf
+FFLAGS = -O3 -qfixed -qnosave
 # For -O2, add -qstrict=none
-DRVOPTS = $(OPTS)
-NOOPT   = -O0 -qfixed -qnosave
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT = -O0 -qfixed -qnosave
 
-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = xlf
-LOADOPTS = -qnosave
+LDFLAGS =
 
 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = ranlib
+AR = ar
+ARFLAGS = cr
+RANLIB = ranlib
 
 #  Timer for the SECOND and DSECND routines
 #
@@ -75,9 +73,9 @@ TIMER = EXT_ETIME_
 #  machine-specific, optimized BLAS library should be used whenever
 #  possible.)
 #
-#BLASLIB      = ../../librefblas.a
+#BLASLIB      = $(TOPSRCDIR)/librefblas.a
 BLASLIB      = -lessl
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
diff --git a/lapack-netlib/INSTALL/make.inc.gfortran b/lapack-netlib/INSTALL/make.inc.gfortran
index 39d98d4d4..104632747 100644
--- a/lapack-netlib/INSTALL/make.inc.gfortran
+++ b/lapack-netlib/INSTALL/make.inc.gfortran
@@ -8,10 +8,10 @@ SHELL = /bin/sh
 
 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = gcc
+CC = gcc
 CFLAGS = -O3
 
-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
@@ -19,23 +19,21 @@ CFLAGS = -O3
 #  and handle these quantities appropriately. As a consequence, one
 #  should not compile LAPACK with flags such as -ffpe-trap=overflow.
 #
-FORTRAN = gfortran
-OPTS    = -O2 -frecursive
-DRVOPTS = $(OPTS)
-NOOPT   = -O0 -frecursive
+FC = gfortran
+FFLAGS = -O2 -frecursive
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT = -O0 -frecursive
 
-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = gfortran
-LOADOPTS =
+LDFLAGS =
 
 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = ranlib
+AR = ar
+ARFLAGS = cr
+RANLIB = ranlib
 
 #  Timer for the SECOND and DSECND routines
 #
@@ -78,8 +76,8 @@ TIMER = INT_ETIME
 #  machine-specific, optimized BLAS library should be used whenever
 #  possible.)
 #
-BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
diff --git a/lapack-netlib/INSTALL/make.inc.gfortran_debug b/lapack-netlib/INSTALL/make.inc.gfortran_debug
index 10e6381df..246060827 100644
--- a/lapack-netlib/INSTALL/make.inc.gfortran_debug
+++ b/lapack-netlib/INSTALL/make.inc.gfortran_debug
@@ -8,10 +8,10 @@ SHELL = /bin/sh
 
 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = gcc
+CC = gcc
 CFLAGS = -g
 
-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
@@ -19,23 +19,21 @@ CFLAGS = -g
 #  and handle these quantities appropriately. As a consequence, one
 #  should not compile LAPACK with flags such as -ffpe-trap=overflow.
 #
-FORTRAN = gfortran -fimplicit-none -g -frecursive
-OPTS    =
-DRVOPTS = $(OPTS)
-NOOPT   = -g -O0 -frecursive
+FC = gfortran
+FFLAGS = -fimplicit-none -g -frecursive
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT = $(FFLAGS) -O0
 
-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = gfortran -g
-LOADOPTS =
+LDFLAGS =
 
 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = ranlib
+AR = ar
+ARFLAGS = cr
+RANLIB = ranlib
 
 #  Timer for the SECOND and DSECND routines
 #
@@ -78,8 +76,8 @@ TIMER = INT_CPU_TIME
 #  machine-specific, optimized BLAS library should be used whenever
 #  possible.)
 #
-BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
diff --git a/lapack-netlib/INSTALL/make.inc.ifort b/lapack-netlib/INSTALL/make.inc.ifort
index b067bd484..801b46aa5 100644
--- a/lapack-netlib/INSTALL/make.inc.ifort
+++ b/lapack-netlib/INSTALL/make.inc.ifort
@@ -8,30 +8,28 @@ SHELL = /bin/sh
 
 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = icc
+CC = icc
 CFLAGS = -O3
 
-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = ifort
-OPTS    = -O3 -fp-model strict -assume protect_parens
-DRVOPTS = $(OPTS)
-NOOPT   = -O0 -fp-model strict -assume protect_parens
+FC = ifort
+FFLAGS = -O3 -fp-model strict -assume protect_parens
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT = -O0 -fp-model strict -assume protect_parens
 
-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = ifort
-LOADOPTS =
+LDFLAGS =
 
 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = ranlib
+AR = ar
+ARFLAGS = cr
+RANLIB = ranlib
 
 #  Timer for the SECOND and DSECND routines
 #
@@ -74,8 +72,8 @@ TIMER = EXT_ETIME
 #  machine-specific, optimized BLAS library should be used whenever
 #  possible.)
 #
-BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
diff --git a/lapack-netlib/INSTALL/make.inc.pgf95 b/lapack-netlib/INSTALL/make.inc.pgf95
index a9a5cec98..87b691cdd 100644
--- a/lapack-netlib/INSTALL/make.inc.pgf95
+++ b/lapack-netlib/INSTALL/make.inc.pgf95
@@ -8,30 +8,28 @@ SHELL = /bin/sh
 
 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = pgcc
+CC = pgcc
 CFLAGS =
 
-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = pgf95
-OPTS    = -O3
-DRVOPTS = $(OPTS)
-NOOPT   = -O0
+FC = pgf95
+FFLAGS = -O3
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT = -O0
 
-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = $(FORTRAN)
-LOADOPTS =
+LDFLAGS =
 
 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = echo
+AR = ar
+ARFLAGS = cr
+RANLIB = echo
 
 #  Timer for the SECOND and DSECND routines
 #
@@ -74,8 +72,8 @@ TIMER = INT_CPU_TIME
 #  machine-specific, optimized BLAS library should be used whenever
 #  possible.)
 #
-BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
diff --git a/lapack-netlib/INSTALL/make.inc.pghpf b/lapack-netlib/INSTALL/make.inc.pghpf
index 1d9bf549c..97d22d27d 100644
--- a/lapack-netlib/INSTALL/make.inc.pghpf
+++ b/lapack-netlib/INSTALL/make.inc.pghpf
@@ -8,30 +8,28 @@ SHELL = /bin/sh
 
 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = pghpc
+CC = pghpc
 CFLAGS =
 
-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
-FORTRAN = pghpf
-OPTS    = -O4 -Mnohpfc -Mdclchk
-DRVOPTS = $(OPTS)
-NOOPT   = -Mnohpfc -Mdclchk
+FC = pghpf
+FFLAGS = -O4 -Mnohpfc -Mdclchk
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT = -Mnohpfc -Mdclchk
 
-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = pghpf
-LOADOPTS =
+LDFLAGS =
 
 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = echo
+AR = ar
+ARFLAGS = cr
+RANLIB = echo
 
 #  Timer for the SECOND and DSECND routines
 #
@@ -75,8 +73,8 @@ TIMER = EXT_ETIME
 #  possible.)
 #
 #BLASLIB      = -lessl
-BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
diff --git a/lapack-netlib/INSTALL/slamch.f b/lapack-netlib/INSTALL/slamch.f
index 3282fa6a3..342f446ff 100644
--- a/lapack-netlib/INSTALL/slamch.f
+++ b/lapack-netlib/INSTALL/slamch.f
@@ -28,6 +28,7 @@
 *
 *> \param[in] CMACH
 *> \verbatim
+*>          CMACH is CHARACTER*1
 *>          Specifies the value to be returned by SLAMCH:
 *>          = 'E' or 'e',   SLAMCH := eps
 *>          = 'S' or 's ,   SLAMCH := sfmin
diff --git a/lapack-netlib/LAPACKE/CMakeLists.txt b/lapack-netlib/LAPACKE/CMakeLists.txt
index 42faef5dd..0589a74ba 100644
--- a/lapack-netlib/LAPACKE/CMakeLists.txt
+++ b/lapack-netlib/LAPACKE/CMakeLists.txt
@@ -16,18 +16,16 @@ if(NOT FortranCInterface_GLOBAL_FOUND OR NOT FortranCInterface_MODULE_FOUND)
                  ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h)
 endif()
 
-if(WIN32 AND NOT UNIX)
-  add_definitions(-DHAVE_LAPACK_CONFIG_H -DLAPACK_COMPLEX_STRUCTURE)
-  message(STATUS "Windows BUILD")
-endif()
-
-get_directory_property(DirDefs COMPILE_DEFINITIONS)
-
 include_directories(include ${LAPACK_BINARY_DIR}/include)
 add_subdirectory(include)
 add_subdirectory(src)
 add_subdirectory(utils)
 
+option(LAPACKE_BUILD_SINGLE "Build LAPACKE single precision real" ON)
+option(LAPACKE_BUILD_DOUBLE "Build LAPACKE double precision real" ON)
+option(LAPACKE_BUILD_COMPLEX "Build LAPACKE single precision complex" ON)
+option(LAPACKE_BUILD_COMPLEX16 "Build LAPACKE double precision complex" ON)
+
 macro(append_subdir_files variable dirname)
   get_directory_property(holder DIRECTORY ${dirname} DEFINITION ${variable})
   foreach(depfile ${holder})
@@ -35,8 +33,29 @@ macro(append_subdir_files variable dirname)
   endforeach()
 endmacro()
 
+message(STATUS "Build LAPACKE single precision real: ${LAPACKE_BUILD_SINGLE}")
+message(STATUS "Build LAPACKE double precision real: ${LAPACKE_BUILD_DOUBLE}")
+message(STATUS "Build LAPACKE single precision complex: ${LAPACKE_BUILD_COMPLEX}")
+message(STATUS "Build LAPACKE double precision complex: ${LAPACKE_BUILD_COMPLEX16}")
+
 append_subdir_files(LAPACKE_INCLUDE "include")
 append_subdir_files(SOURCES "src")
+if (LAPACKE_BUILD_SINGLE)
+  append_subdir_files(SOURCES_SINGLE "src")
+  list(APPEND SOURCES ${SOURCES_SINGLE})
+endif()
+if (LAPACKE_BUILD_DOUBLE)
+  append_subdir_files(SOURCES_DOUBLE "src")
+  list(APPEND SOURCES ${SOURCES_DOUBLE})
+endif()
+if (LAPACKE_BUILD_COMPLEX)
+  append_subdir_files(SOURCES_COMPLEX "src")
+  list(APPEND SOURCES ${SOURCES_COMPLEX})
+endif()
+if (LAPACKE_BUILD_COMPLEX16)
+  append_subdir_files(SOURCES_COMPLEX16 "src")
+  list(APPEND SOURCES ${SOURCES_COMPLEX16})
+endif()
 append_subdir_files(DEPRECATED "src")
 append_subdir_files(EXTENDED "src")
 append_subdir_files(MATGEN "src")
@@ -61,9 +80,13 @@ set_target_properties(
   SOVERSION ${LAPACK_MAJOR_VERSION}
   )
 target_include_directories(lapacke PUBLIC
-  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
+  $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
   $<INSTALL_INTERFACE:include>
 )
+if(WIN32 AND NOT UNIX)
+  target_compile_definitions(lapacke PUBLIC HAVE_LAPACK_CONFIG_H LAPACK_COMPLEX_STRUCTURE)
+  message(STATUS "Windows BUILD")
+endif()
 
 if(LAPACKE_WITH_TMG)
   target_link_libraries(lapacke PRIVATE tmglib)
@@ -71,7 +94,11 @@ endif()
 target_link_libraries(lapacke PRIVATE ${LAPACK_LIBRARIES})
 
 lapack_install_library(lapacke)
-install(FILES ${LAPACKE_INCLUDE} ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
+install(
+  FILES ${LAPACKE_INCLUDE} ${LAPACK_BINARY_DIR}/include/lapacke_mangling.h
+  DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
+  COMPONENT Development
+  )
 
 if(BUILD_TESTING)
   add_subdirectory(example)
@@ -82,6 +109,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/lapacke.pc.in ${CMAKE_CURRENT_BINARY_
 install(FILES
   ${CMAKE_CURRENT_BINARY_DIR}/lapacke.pc
   DESTINATION ${PKG_CONFIG_DIR}
+  COMPONENT Development
   )
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/cmake/lapacke-config-version.cmake.in
@@ -95,7 +123,10 @@ install(FILES
   ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/lapacke-config.cmake
   ${LAPACK_BINARY_DIR}/lapacke-config-version.cmake
   DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapacke-${LAPACK_VERSION}
+  COMPONENT Development
   )
 
 install(EXPORT lapacke-targets
-  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapacke-${LAPACK_VERSION})
+  DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/lapacke-${LAPACK_VERSION}
+  COMPONENT Development
+  )
diff --git a/lapack-netlib/LAPACKE/Makefile b/lapack-netlib/LAPACKE/Makefile
index 016f8a2f2..a358d7c9f 100644
--- a/lapack-netlib/LAPACKE/Makefile
+++ b/lapack-netlib/LAPACKE/Makefile
@@ -40,22 +40,26 @@
 # To clean everything including lapacke library type
 # 'make cleanall'
 #
-include ../make.inc
+TOPSRCDIR = ..
+include $(TOPSRCDIR)/make.inc
 
+.PHONY: all
 all: lapacke
 
+.PHONY: lapacke
 lapacke: include/lapacke_mangling.h
 	$(MAKE) -C src
 	$(MAKE) -C utils
 
 include/lapacke_mangling.h: include/lapacke_mangling_with_flags.h.in
-	cp $< $@
+	cp include/lapacke_mangling_with_flags.h.in $@
 
+.PHONY: lapacke_example
 lapacke_example: lapacke
 	$(MAKE) -C example
 
-#clean: cleanlib
-clean: cleanobj
+.PHONY: clean cleanobj cleanlib cleanexe
+clean:
 	$(MAKE) -C src clean
 	$(MAKE) -C utils clean
 	$(MAKE) -C example clean
@@ -64,6 +68,6 @@ cleanobj:
 	$(MAKE) -C utils cleanobj
 	$(MAKE) -C example cleanobj
 cleanlib:
-	rm -f ../$(LAPACKELIB)
+	$(MAKE) -C src cleanlib
 cleanexe:
 	$(MAKE) -C example cleanexe
diff --git a/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in b/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in
index 6900f4533..0a1350172 100644
--- a/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in
+++ b/lapack-netlib/LAPACKE/cmake/lapacke-config-build.cmake.in
@@ -7,8 +7,11 @@ if(NOT TARGET lapacke)
   include("@LAPACK_BINARY_DIR@/lapack-targets.cmake")
 endif()
 
+# Hint for project building against lapack
+set(LAPACKE_Fortran_COMPILER_ID ${LAPACK_Fortran_COMPILER_ID})
+
 # Report lapacke header search locations from build tree.
 set(LAPACKE_INCLUDE_DIRS "@LAPACK_BINARY_DIR@/include")
 
 # Report lapacke libraries.
-set(LAPACKE_LIBRARIES lapacke)
+set(LAPACKE_LIBRARIES lapacke ${LAPACK_LIBRARIES})
diff --git a/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in b/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in
index caa459a24..57a5c2b2f 100644
--- a/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in
+++ b/lapack-netlib/LAPACKE/cmake/lapacke-config-install.cmake.in
@@ -13,11 +13,14 @@ if(NOT TARGET lapacke)
   include(${_LAPACKE_SELF_DIR}/lapacke-targets.cmake)
 endif()
 
+# Hint for project building against lapack
+set(LAPACKE_Fortran_COMPILER_ID ${LAPACK_Fortran_COMPILER_ID})
+
 # Report lapacke header search locations.
 set(LAPACKE_INCLUDE_DIRS ${_LAPACKE_PREFIX}/include)
 
 # Report lapacke libraries.
-set(LAPACKE_LIBRARIES lapacke)
+set(LAPACKE_LIBRARIES lapacke ${LAPACK_LIBRARIES})
 
 unset(_LAPACKE_PREFIX)
 unset(_LAPACKE_SELF_DIR)
diff --git a/lapack-netlib/LAPACKE/example/Makefile b/lapack-netlib/LAPACKE/example/Makefile
index f959a2be0..77526dc42 100644
--- a/lapack-netlib/LAPACKE/example/Makefile
+++ b/lapack-netlib/LAPACKE/example/Makefile
@@ -1,34 +1,38 @@
-include ../../make.inc
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc
 
+.SUFFIXES: .c .o
+.c.o:
+	$(CC) $(CFLAGS) -I. -I../include -c -o $@ $<
+
+.PHONY: all
 all: xexample_DGESV_rowmajor \
      xexample_DGESV_colmajor \
      xexample_DGELS_rowmajor \
      xexample_DGELS_colmajor
 
-LIBRARIES = ../../$(LAPACKELIB) ../../$(LAPACKLIB) $(BLASLIB)
+LIBRARIES = $(LAPACKELIB) $(LAPACKLIB) $(BLASLIB)
 
 # Double Precision Examples
 xexample_DGESV_rowmajor: example_DGESV_rowmajor.o lapacke_example_aux.o $(LIBRARIES)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 	./$@
 
 xexample_DGESV_colmajor: example_DGESV_colmajor.o lapacke_example_aux.o $(LIBRARIES)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 	./$@
 
 xexample_DGELS_rowmajor: example_DGELS_rowmajor.o lapacke_example_aux.o $(LIBRARIES)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 	./$@
 
 xexample_DGELS_colmajor: example_DGELS_colmajor.o lapacke_example_aux.o $(LIBRARIES)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+	$(FC) $(FFLAGS) $(LDFLAGS) -o $@ $^
 	./$@
 
+.PHONY: clean cleanobj cleanexe
 clean: cleanobj cleanexe
 cleanobj:
 	rm -f *.o
 cleanexe:
 	rm -f x*
-
-.c.o:
-	$(CC) $(CFLAGS) -I. -I../include -c -o $@ $<
diff --git a/lapack-netlib/LAPACKE/include/CMakeLists.txt b/lapack-netlib/LAPACKE/include/CMakeLists.txt
index 4c30c0501..b690dc554 100644
--- a/lapack-netlib/LAPACKE/include/CMakeLists.txt
+++ b/lapack-netlib/LAPACKE/include/CMakeLists.txt
@@ -1,3 +1,3 @@
-set(LAPACKE_INCLUDE lapacke.h lapacke_config.h lapacke_utils.h)
+set(LAPACKE_INCLUDE lapacke.h lapack.h lapacke_config.h lapacke_utils.h)
 
 file(COPY ${LAPACKE_INCLUDE} DESTINATION ${LAPACK_BINARY_DIR}/include)
diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h
new file mode 100644
index 000000000..0a6226fe4
--- /dev/null
+++ b/lapack-netlib/LAPACKE/include/lapack.h
@@ -0,0 +1,13715 @@
+#ifndef LAPACK_H
+#define LAPACK_H
+
+/*
+*  Turn on HAVE_LAPACK_CONFIG_H to redefine C-LAPACK datatypes
+*/
+#ifdef HAVE_LAPACK_CONFIG_H
+#include "lapacke_config.h"
+#endif
+
+#include "lapacke_mangling.h"
+
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*----------------------------------------------------------------------------*/
+#ifndef lapack_int
+#define lapack_int     int
+#endif
+
+#ifndef lapack_logical
+#define lapack_logical lapack_int
+#endif
+
+/* f2c, hence clapack and MacOS Accelerate, returns double instead of float
+ * for sdot, slange, clange, etc. */
+#if defined(LAPACK_F2C)
+    typedef double lapack_float_return;
+#else
+    typedef float lapack_float_return;
+#endif
+
+/* Complex types are structures equivalent to the
+* Fortran complex types COMPLEX(4) and COMPLEX(8).
+*
+* One can also redefine the types with his own types
+* for example by including in the code definitions like
+*
+* #define lapack_complex_float std::complex<float>
+* #define lapack_complex_double std::complex<double>
+*
+* or define these types in the command line:
+*
+* -Dlapack_complex_float="std::complex<float>"
+* -Dlapack_complex_double="std::complex<double>"
+*/
+
+#ifndef LAPACK_COMPLEX_CUSTOM
+
+/* Complex type (single precision) */
+#ifndef lapack_complex_float
+#ifndef __cplusplus
+#include <complex.h>
+#else
+#include <complex>
+#endif
+#define lapack_complex_float    float _Complex
+#endif
+
+#ifndef lapack_complex_float_real
+#define lapack_complex_float_real(z)       (creal(z))
+#endif
+
+#ifndef lapack_complex_float_imag
+#define lapack_complex_float_imag(z)       (cimag(z))
+#endif
+
+/* Complex type (double precision) */
+#ifndef lapack_complex_double
+#ifndef __cplusplus
+#include <complex.h>
+#else
+#include <complex>
+#endif
+#define lapack_complex_double   double _Complex
+#endif
+
+#ifndef lapack_complex_double_real
+#define lapack_complex_double_real(z)      (creal(z))
+#endif
+
+#ifndef lapack_complex_double_imag
+#define lapack_complex_double_imag(z)       (cimag(z))
+#endif
+
+#endif /* LAPACK_COMPLEX_CUSTOM */
+
+/* Callback logical functions of one, two, or three arguments are used
+*  to select eigenvalues to sort to the top left of the Schur form.
+*  The value is selected if function returns TRUE (non-zero). */
+
+typedef lapack_logical (*LAPACK_S_SELECT2) ( const float*, const float* );
+typedef lapack_logical (*LAPACK_S_SELECT3)
+    ( const float*, const float*, const float* );
+typedef lapack_logical (*LAPACK_D_SELECT2) ( const double*, const double* );
+typedef lapack_logical (*LAPACK_D_SELECT3)
+    ( const double*, const double*, const double* );
+
+typedef lapack_logical (*LAPACK_C_SELECT1) ( const lapack_complex_float* );
+typedef lapack_logical (*LAPACK_C_SELECT2)
+    ( const lapack_complex_float*, const lapack_complex_float* );
+typedef lapack_logical (*LAPACK_Z_SELECT1) ( const lapack_complex_double* );
+typedef lapack_logical (*LAPACK_Z_SELECT2)
+    ( const lapack_complex_double*, const lapack_complex_double* );
+
+#define LAPACK_lsame LAPACK_GLOBAL(lsame,LSAME)
+lapack_logical LAPACK_lsame( char* ca,  char* cb,
+                              lapack_int lca, lapack_int lcb );
+
+
+/*----------------------------------------------------------------------------*/
+/* This is in alphabetical order (ignoring leading precision). */
+
+#define LAPACK_cbbcsd LAPACK_GLOBAL(cbbcsd,CBBCSD)
+void LAPACK_cbbcsd(
+    char const* jobu1, char const* jobu2, char const* jobv1t, char const* jobv2t, char const* trans,
+    lapack_int const* m, lapack_int const* p, lapack_int const* q,
+    float* theta,
+    float* phi,
+    lapack_complex_float* U1, lapack_int const* ldu1,
+    lapack_complex_float* U2, lapack_int const* ldu2,
+    lapack_complex_float* V1T, lapack_int const* ldv1t,
+    lapack_complex_float* V2T, lapack_int const* ldv2t,
+    float* B11D,
+    float* B11E,
+    float* B12D,
+    float* B12E,
+    float* B21D,
+    float* B21E,
+    float* B22D,
+    float* B22E,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* info );
+
+#define LAPACK_dbbcsd LAPACK_GLOBAL(dbbcsd,DBBCSD)
+void LAPACK_dbbcsd(
+    char const* jobu1, char const* jobu2, char const* jobv1t, char const* jobv2t, char const* trans,
+    lapack_int const* m, lapack_int const* p, lapack_int const* q,
+    double* theta,
+    double* phi,
+    double* U1, lapack_int const* ldu1,
+    double* U2, lapack_int const* ldu2,
+    double* V1T, lapack_int const* ldv1t,
+    double* V2T, lapack_int const* ldv2t,
+    double* B11D,
+    double* B11E,
+    double* B12D,
+    double* B12E,
+    double* b21d,
+    double* b21e,
+    double* b22d,
+    double* b22e,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sbbcsd LAPACK_GLOBAL(sbbcsd,SBBCSD)
+void LAPACK_sbbcsd(
+    char const* jobu1, char const* jobu2, char const* jobv1t, char const* jobv2t, char const* trans,
+    lapack_int const* m, lapack_int const* p, lapack_int const* q,
+    float* theta,
+    float* phi,
+    float* U1, lapack_int const* ldu1,
+    float* U2, lapack_int const* ldu2,
+    float* V1T, lapack_int const* ldv1t,
+    float* V2T, lapack_int const* ldv2t,
+    float* B11D,
+    float* B11E,
+    float* B12D,
+    float* B12E,
+    float* B21D,
+    float* B21E,
+    float* B22D,
+    float* B22E,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zbbcsd LAPACK_GLOBAL(zbbcsd,ZBBCSD)
+void LAPACK_zbbcsd(
+    char const* jobu1, char const* jobu2, char const* jobv1t, char const* jobv2t, char const* trans,
+    lapack_int const* m, lapack_int const* p, lapack_int const* q,
+    double* theta,
+    double* phi,
+    lapack_complex_double* U1, lapack_int const* ldu1,
+    lapack_complex_double* U2, lapack_int const* ldu2,
+    lapack_complex_double* V1T, lapack_int const* ldv1t,
+    lapack_complex_double* V2T, lapack_int const* ldv2t,
+    double* B11D,
+    double* B11E,
+    double* B12D,
+    double* B12E,
+    double* B21D,
+    double* B21E,
+    double* B22D,
+    double* B22E,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* info );
+
+#define LAPACK_dbdsdc LAPACK_GLOBAL(dbdsdc,DBDSDC)
+void LAPACK_dbdsdc(
+    char const* uplo, char const* compq,
+    lapack_int const* n,
+    double* D,
+    double* E,
+    double* U, lapack_int const* ldu,
+    double* VT, lapack_int const* ldvt,
+    double* Q, lapack_int* IQ,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sbdsdc LAPACK_GLOBAL(sbdsdc,SBDSDC)
+void LAPACK_sbdsdc(
+    char const* uplo, char const* compq,
+    lapack_int const* n,
+    float* D,
+    float* E,
+    float* U, lapack_int const* ldu,
+    float* VT, lapack_int const* ldvt,
+    float* Q, lapack_int* IQ,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_cbdsqr LAPACK_GLOBAL(cbdsqr,CBDSQR)
+void LAPACK_cbdsqr(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* ncvt, lapack_int const* nru, lapack_int const* ncc,
+    float* D,
+    float* E,
+    lapack_complex_float* VT, lapack_int const* ldvt,
+    lapack_complex_float* U, lapack_int const* ldu,
+    lapack_complex_float* C, lapack_int const* ldc,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dbdsqr LAPACK_GLOBAL(dbdsqr,DBDSQR)
+void LAPACK_dbdsqr(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* ncvt, lapack_int const* nru, lapack_int const* ncc,
+    double* D,
+    double* E,
+    double* VT, lapack_int const* ldvt,
+    double* U, lapack_int const* ldu,
+    double* C, lapack_int const* ldc,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_sbdsqr LAPACK_GLOBAL(sbdsqr,SBDSQR)
+void LAPACK_sbdsqr(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* ncvt, lapack_int const* nru, lapack_int const* ncc,
+    float* D,
+    float* E,
+    float* VT, lapack_int const* ldvt,
+    float* U, lapack_int const* ldu,
+    float* C, lapack_int const* ldc,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zbdsqr LAPACK_GLOBAL(zbdsqr,ZBDSQR)
+void LAPACK_zbdsqr(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* ncvt, lapack_int const* nru, lapack_int const* ncc,
+    double* D,
+    double* E,
+    lapack_complex_double* VT, lapack_int const* ldvt,
+    lapack_complex_double* U, lapack_int const* ldu,
+    lapack_complex_double* C, lapack_int const* ldc,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_dbdsvdx LAPACK_GLOBAL(dbdsvdx,DBDSVDX)
+void LAPACK_dbdsvdx(
+    char const* uplo, char const* jobz, char const* range,
+    lapack_int const* n,
+    double const* D,
+    double const* E,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* ns,
+    double* S,
+    double* Z, lapack_int const* ldz,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sbdsvdx LAPACK_GLOBAL(sbdsvdx,SBDSVDX)
+void LAPACK_sbdsvdx(
+    char const* uplo, char const* jobz, char const* range,
+    lapack_int const* n,
+    float const* D,
+    float const* E,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* ns,
+    float* S,
+    float* Z, lapack_int const* ldz,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ddisna LAPACK_GLOBAL(ddisna,DDISNA)
+void LAPACK_ddisna(
+    char const* job,
+    lapack_int const* m, lapack_int const* n,
+    double const* D,
+    double* SEP,
+    lapack_int* info );
+
+#define LAPACK_sdisna LAPACK_GLOBAL(sdisna,SDISNA)
+void LAPACK_sdisna(
+    char const* job,
+    lapack_int const* m, lapack_int const* n,
+    float const* D,
+    float* SEP,
+    lapack_int* info );
+
+#define LAPACK_cgbbrd LAPACK_GLOBAL(cgbbrd,CGBBRD)
+void LAPACK_cgbbrd(
+    char const* vect,
+    lapack_int const* m, lapack_int const* n, lapack_int const* ncc, lapack_int const* kl, lapack_int const* ku,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    float* D,
+    float* E,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    lapack_complex_float* PT, lapack_int const* ldpt,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgbbrd LAPACK_GLOBAL(dgbbrd,DGBBRD)
+void LAPACK_dgbbrd(
+    char const* vect,
+    lapack_int const* m, lapack_int const* n, lapack_int const* ncc, lapack_int const* kl, lapack_int const* ku,
+    double* AB, lapack_int const* ldab,
+    double* D,
+    double* E,
+    double* Q, lapack_int const* ldq,
+    double* PT, lapack_int const* ldpt,
+    double* C, lapack_int const* ldc,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_sgbbrd LAPACK_GLOBAL(sgbbrd,SGBBRD)
+void LAPACK_sgbbrd(
+    char const* vect,
+    lapack_int const* m, lapack_int const* n, lapack_int const* ncc, lapack_int const* kl, lapack_int const* ku,
+    float* AB, lapack_int const* ldab,
+    float* D,
+    float* E,
+    float* Q, lapack_int const* ldq,
+    float* PT, lapack_int const* ldpt,
+    float* C, lapack_int const* ldc,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zgbbrd LAPACK_GLOBAL(zgbbrd,ZGBBRD)
+void LAPACK_zgbbrd(
+    char const* vect,
+    lapack_int const* m, lapack_int const* n, lapack_int const* ncc, lapack_int const* kl, lapack_int const* ku,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    double* D,
+    double* E,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    lapack_complex_double* PT, lapack_int const* ldpt,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgbcon LAPACK_GLOBAL(cgbcon,CGBCON)
+void LAPACK_cgbcon(
+    char const* norm,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    lapack_complex_float const* AB, lapack_int const* ldab, lapack_int const* ipiv,
+    float const* anorm,
+    float* rcond,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgbcon LAPACK_GLOBAL(dgbcon,DGBCON)
+void LAPACK_dgbcon(
+    char const* norm,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    double const* AB, lapack_int const* ldab, lapack_int const* ipiv,
+    double const* anorm,
+    double* rcond,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgbcon LAPACK_GLOBAL(sgbcon,SGBCON)
+void LAPACK_sgbcon(
+    char const* norm,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    float const* AB, lapack_int const* ldab, lapack_int const* ipiv,
+    float const* anorm,
+    float* rcond,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgbcon LAPACK_GLOBAL(zgbcon,ZGBCON)
+void LAPACK_zgbcon(
+    char const* norm,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    lapack_complex_double const* AB, lapack_int const* ldab, lapack_int const* ipiv,
+    double const* anorm,
+    double* rcond,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgbequ LAPACK_GLOBAL(cgbequ,CGBEQU)
+void LAPACK_cgbequ(
+    lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    lapack_complex_float const* AB, lapack_int const* ldab,
+    float* R,
+    float* C,
+    float* rowcnd,
+    float* colcnd,
+    float* amax,
+    lapack_int* info );
+
+#define LAPACK_dgbequ LAPACK_GLOBAL(dgbequ,DGBEQU)
+void LAPACK_dgbequ(
+    lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    double const* AB, lapack_int const* ldab,
+    double* R,
+    double* C,
+    double* rowcnd,
+    double* colcnd,
+    double* amax,
+    lapack_int* info );
+
+#define LAPACK_sgbequ LAPACK_GLOBAL(sgbequ,SGBEQU)
+void LAPACK_sgbequ(
+    lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    float const* AB, lapack_int const* ldab,
+    float* R,
+    float* C,
+    float* rowcnd,
+    float* colcnd,
+    float* amax,
+    lapack_int* info );
+
+#define LAPACK_zgbequ LAPACK_GLOBAL(zgbequ,ZGBEQU)
+void LAPACK_zgbequ(
+    lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    lapack_complex_double const* AB, lapack_int const* ldab,
+    double* R,
+    double* C,
+    double* rowcnd,
+    double* colcnd,
+    double* amax,
+    lapack_int* info );
+
+#define LAPACK_cgbequb LAPACK_GLOBAL(cgbequb,CGBEQUB)
+void LAPACK_cgbequb(
+    lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    lapack_complex_float const* AB, lapack_int const* ldab,
+    float* R,
+    float* C,
+    float* rowcnd,
+    float* colcnd,
+    float* amax,
+    lapack_int* info );
+
+#define LAPACK_dgbequb LAPACK_GLOBAL(dgbequb,DGBEQUB)
+void LAPACK_dgbequb(
+    lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    double const* AB, lapack_int const* ldab,
+    double* R,
+    double* C,
+    double* rowcnd,
+    double* colcnd,
+    double* amax,
+    lapack_int* info );
+
+#define LAPACK_sgbequb LAPACK_GLOBAL(sgbequb,SGBEQUB)
+void LAPACK_sgbequb(
+    lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    float const* AB, lapack_int const* ldab,
+    float* R,
+    float* C,
+    float* rowcnd,
+    float* colcnd,
+    float* amax,
+    lapack_int* info );
+
+#define LAPACK_zgbequb LAPACK_GLOBAL(zgbequb,ZGBEQUB)
+void LAPACK_zgbequb(
+    lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    lapack_complex_double const* AB, lapack_int const* ldab,
+    double* R,
+    double* C,
+    double* rowcnd,
+    double* colcnd,
+    double* amax,
+    lapack_int* info );
+
+#define LAPACK_cgbrfs LAPACK_GLOBAL(cgbrfs,CGBRFS)
+void LAPACK_cgbrfs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    lapack_complex_float const* AB, lapack_int const* ldab,
+    lapack_complex_float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgbrfs LAPACK_GLOBAL(dgbrfs,DGBRFS)
+void LAPACK_dgbrfs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    double const* AB, lapack_int const* ldab,
+    double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgbrfs LAPACK_GLOBAL(sgbrfs,SGBRFS)
+void LAPACK_sgbrfs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    float const* AB, lapack_int const* ldab,
+    float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
+    float const* B, lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgbrfs LAPACK_GLOBAL(zgbrfs,ZGBRFS)
+void LAPACK_zgbrfs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    lapack_complex_double const* AB, lapack_int const* ldab,
+    lapack_complex_double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgbrfsx LAPACK_GLOBAL(cgbrfsx,CGBRFSX)
+void LAPACK_cgbrfsx(
+    char const* trans, char const* equed,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    lapack_complex_float const* AB, lapack_int const* ldab,
+    lapack_complex_float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
+    float* R,
+    float* C,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgbrfsx LAPACK_GLOBAL(dgbrfsx,DGBRFSX)
+void LAPACK_dgbrfsx(
+    char const* trans, char const* equed,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    double const* AB, lapack_int const* ldab,
+    double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
+    double* R,
+    double* C,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* rcond,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgbrfsx LAPACK_GLOBAL(sgbrfsx,SGBRFSX)
+void LAPACK_sgbrfsx(
+    char const* trans, char const* equed,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    float const* AB, lapack_int const* ldab,
+    float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
+    float* R,
+    float* C,
+    float const* B, lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* rcond,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgbrfsx LAPACK_GLOBAL(zgbrfsx,ZGBRFSX)
+void LAPACK_zgbrfsx(
+    char const* trans, char const* equed,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    lapack_complex_double const* AB, lapack_int const* ldab,
+    lapack_complex_double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
+    double* R,
+    double* C,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgbsv LAPACK_GLOBAL(cgbsv,CGBSV)
+void LAPACK_cgbsv(
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    lapack_complex_float* AB, lapack_int const* ldab, lapack_int* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dgbsv LAPACK_GLOBAL(dgbsv,DGBSV)
+void LAPACK_dgbsv(
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    double* AB, lapack_int const* ldab, lapack_int* ipiv,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_sgbsv LAPACK_GLOBAL(sgbsv,SGBSV)
+void LAPACK_sgbsv(
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    float* AB, lapack_int const* ldab, lapack_int* ipiv,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zgbsv LAPACK_GLOBAL(zgbsv,ZGBSV)
+void LAPACK_zgbsv(
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    lapack_complex_double* AB, lapack_int const* ldab, lapack_int* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_cgbsvx LAPACK_GLOBAL(cgbsvx,CGBSVX)
+void LAPACK_cgbsvx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    lapack_complex_float* AFB, lapack_int const* ldafb, lapack_int* ipiv, char* equed,
+    float* R,
+    float* C,
+    lapack_complex_float* B,
+    lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgbsvx LAPACK_GLOBAL(dgbsvx,DGBSVX)
+void LAPACK_dgbsvx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    double* AB, lapack_int const* ldab,
+    double* AFB, lapack_int const* ldafb, lapack_int* ipiv, char* equed,
+    double* R,
+    double* C,
+    double* B,
+    lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgbsvx LAPACK_GLOBAL(sgbsvx,SGBSVX)
+void LAPACK_sgbsvx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    float* AB, lapack_int const* ldab,
+    float* AFB, lapack_int const* ldafb, lapack_int* ipiv, char* equed,
+    float* R,
+    float* C,
+    float* B,
+    lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgbsvx LAPACK_GLOBAL(zgbsvx,ZGBSVX)
+void LAPACK_zgbsvx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    lapack_complex_double* AFB, lapack_int const* ldafb, lapack_int* ipiv, char* equed,
+    double* R,
+    double* C,
+    lapack_complex_double* B,
+    lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgbsvxx LAPACK_GLOBAL(cgbsvxx,CGBSVXX)
+void LAPACK_cgbsvxx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    lapack_complex_float* AFB, lapack_int const* ldafb, lapack_int* ipiv, char* equed,
+    float* R,
+    float* C,
+    lapack_complex_float* B,
+    lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* rpvgrw,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgbsvxx LAPACK_GLOBAL(dgbsvxx,DGBSVXX)
+void LAPACK_dgbsvxx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    double* AB, lapack_int const* ldab,
+    double* AFB, lapack_int const* ldafb, lapack_int* ipiv, char* equed,
+    double* R,
+    double* C,
+    double* B,
+    lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* rcond,
+    double* rpvgrw,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgbsvxx LAPACK_GLOBAL(sgbsvxx,SGBSVXX)
+void LAPACK_sgbsvxx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    float* AB, lapack_int const* ldab,
+    float* AFB, lapack_int const* ldafb, lapack_int* ipiv, char* equed,
+    float* R,
+    float* C,
+    float* B,
+    lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* rcond,
+    float* rpvgrw,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgbsvxx LAPACK_GLOBAL(zgbsvxx,ZGBSVXX)
+void LAPACK_zgbsvxx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    lapack_complex_double* AFB, lapack_int const* ldafb, lapack_int* ipiv, char* equed,
+    double* R,
+    double* C,
+    lapack_complex_double* B,
+    lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* rpvgrw,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgbtrf LAPACK_GLOBAL(cgbtrf,CGBTRF)
+void LAPACK_cgbtrf(
+    lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    lapack_complex_float* AB, lapack_int const* ldab, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_dgbtrf LAPACK_GLOBAL(dgbtrf,DGBTRF)
+void LAPACK_dgbtrf(
+    lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    double* AB, lapack_int const* ldab, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_sgbtrf LAPACK_GLOBAL(sgbtrf,SGBTRF)
+void LAPACK_sgbtrf(
+    lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    float* AB, lapack_int const* ldab, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_zgbtrf LAPACK_GLOBAL(zgbtrf,ZGBTRF)
+void LAPACK_zgbtrf(
+    lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    lapack_complex_double* AB, lapack_int const* ldab, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_cgbtrs LAPACK_GLOBAL(cgbtrs,CGBTRS)
+void LAPACK_cgbtrs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    lapack_complex_float const* AB, lapack_int const* ldab, lapack_int const* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dgbtrs LAPACK_GLOBAL(dgbtrs,DGBTRS)
+void LAPACK_dgbtrs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    double const* AB, lapack_int const* ldab, lapack_int const* ipiv,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_sgbtrs LAPACK_GLOBAL(sgbtrs,SGBTRS)
+void LAPACK_sgbtrs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    float const* AB, lapack_int const* ldab, lapack_int const* ipiv,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zgbtrs LAPACK_GLOBAL(zgbtrs,ZGBTRS)
+void LAPACK_zgbtrs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
+    lapack_complex_double const* AB, lapack_int const* ldab, lapack_int const* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_cgebak LAPACK_GLOBAL(cgebak,CGEBAK)
+void LAPACK_cgebak(
+    char const* job, char const* side,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    float const* scale, lapack_int const* m,
+    lapack_complex_float* V, lapack_int const* ldv,
+    lapack_int* info );
+
+#define LAPACK_dgebak LAPACK_GLOBAL(dgebak,DGEBAK)
+void LAPACK_dgebak(
+    char const* job, char const* side,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    double const* scale, lapack_int const* m,
+    double* V, lapack_int const* ldv,
+    lapack_int* info );
+
+#define LAPACK_sgebak LAPACK_GLOBAL(sgebak,SGEBAK)
+void LAPACK_sgebak(
+    char const* job, char const* side,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    float const* scale, lapack_int const* m,
+    float* V, lapack_int const* ldv,
+    lapack_int* info );
+
+#define LAPACK_zgebak LAPACK_GLOBAL(zgebak,ZGEBAK)
+void LAPACK_zgebak(
+    char const* job, char const* side,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    double const* scale, lapack_int const* m,
+    lapack_complex_double* V, lapack_int const* ldv,
+    lapack_int* info );
+
+#define LAPACK_cgebal LAPACK_GLOBAL(cgebal,CGEBAL)
+void LAPACK_cgebal(
+    char const* job,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* ilo, lapack_int* ihi,
+    float* scale,
+    lapack_int* info );
+
+#define LAPACK_dgebal LAPACK_GLOBAL(dgebal,DGEBAL)
+void LAPACK_dgebal(
+    char const* job,
+    lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int* ilo, lapack_int* ihi,
+    double* scale,
+    lapack_int* info );
+
+#define LAPACK_sgebal LAPACK_GLOBAL(sgebal,SGEBAL)
+void LAPACK_sgebal(
+    char const* job,
+    lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int* ilo, lapack_int* ihi,
+    float* scale,
+    lapack_int* info );
+
+#define LAPACK_zgebal LAPACK_GLOBAL(zgebal,ZGEBAL)
+void LAPACK_zgebal(
+    char const* job,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ilo, lapack_int* ihi,
+    double* scale,
+    lapack_int* info );
+
+#define LAPACK_cgebrd LAPACK_GLOBAL(cgebrd,CGEBRD)
+void LAPACK_cgebrd(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float* D,
+    float* E,
+    lapack_complex_float* tauq,
+    lapack_complex_float* taup,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgebrd LAPACK_GLOBAL(dgebrd,DGEBRD)
+void LAPACK_dgebrd(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* D,
+    double* E,
+    double* tauq,
+    double* taup,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgebrd LAPACK_GLOBAL(sgebrd,SGEBRD)
+void LAPACK_sgebrd(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* D,
+    float* E,
+    float* tauq,
+    float* taup,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgebrd LAPACK_GLOBAL(zgebrd,ZGEBRD)
+void LAPACK_zgebrd(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double* D,
+    double* E,
+    lapack_complex_double* tauq,
+    lapack_complex_double* taup,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cgecon LAPACK_GLOBAL(cgecon,CGECON)
+void LAPACK_cgecon(
+    char const* norm,
+    lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    float const* anorm,
+    float* rcond,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgecon LAPACK_GLOBAL(dgecon,DGECON)
+void LAPACK_dgecon(
+    char const* norm,
+    lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double const* anorm,
+    double* rcond,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgecon LAPACK_GLOBAL(sgecon,SGECON)
+void LAPACK_sgecon(
+    char const* norm,
+    lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float const* anorm,
+    float* rcond,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgecon LAPACK_GLOBAL(zgecon,ZGECON)
+void LAPACK_zgecon(
+    char const* norm,
+    lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    double const* anorm,
+    double* rcond,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgeequ LAPACK_GLOBAL(cgeequ,CGEEQU)
+void LAPACK_cgeequ(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    float* R,
+    float* C,
+    float* rowcnd,
+    float* colcnd,
+    float* amax,
+    lapack_int* info );
+
+#define LAPACK_dgeequ LAPACK_GLOBAL(dgeequ,DGEEQU)
+void LAPACK_dgeequ(
+    lapack_int const* m, lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double* R,
+    double* C,
+    double* rowcnd,
+    double* colcnd,
+    double* amax,
+    lapack_int* info );
+
+#define LAPACK_sgeequ LAPACK_GLOBAL(sgeequ,SGEEQU)
+void LAPACK_sgeequ(
+    lapack_int const* m, lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float* R,
+    float* C,
+    float* rowcnd,
+    float* colcnd,
+    float* amax,
+    lapack_int* info );
+
+#define LAPACK_zgeequ LAPACK_GLOBAL(zgeequ,ZGEEQU)
+void LAPACK_zgeequ(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    double* R,
+    double* C,
+    double* rowcnd,
+    double* colcnd,
+    double* amax,
+    lapack_int* info );
+
+#define LAPACK_cgeequb LAPACK_GLOBAL(cgeequb,CGEEQUB)
+void LAPACK_cgeequb(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    float* R,
+    float* C,
+    float* rowcnd,
+    float* colcnd,
+    float* amax,
+    lapack_int* info );
+
+#define LAPACK_dgeequb LAPACK_GLOBAL(dgeequb,DGEEQUB)
+void LAPACK_dgeequb(
+    lapack_int const* m, lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double* R,
+    double* C,
+    double* rowcnd,
+    double* colcnd,
+    double* amax,
+    lapack_int* info );
+
+#define LAPACK_sgeequb LAPACK_GLOBAL(sgeequb,SGEEQUB)
+void LAPACK_sgeequb(
+    lapack_int const* m, lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float* R,
+    float* C,
+    float* rowcnd,
+    float* colcnd,
+    float* amax,
+    lapack_int* info );
+
+#define LAPACK_zgeequb LAPACK_GLOBAL(zgeequb,ZGEEQUB)
+void LAPACK_zgeequb(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    double* R,
+    double* C,
+    double* rowcnd,
+    double* colcnd,
+    double* amax,
+    lapack_int* info );
+
+#define LAPACK_cgees LAPACK_GLOBAL(cgees,CGEES)
+void LAPACK_cgees(
+    char const* jobvs, char const* sort, LAPACK_C_SELECT1 select,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* sdim,
+    lapack_complex_float* W,
+    lapack_complex_float* VS, lapack_int const* ldvs,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_dgees LAPACK_GLOBAL(dgees,DGEES)
+void LAPACK_dgees(
+    char const* jobvs, char const* sort, LAPACK_D_SELECT2 select,
+    lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int* sdim,
+    double* WR,
+    double* WI,
+    double* VS, lapack_int const* ldvs,
+    double* work, lapack_int const* lwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_sgees LAPACK_GLOBAL(sgees,SGEES)
+void LAPACK_sgees(
+    char const* jobvs, char const* sort, LAPACK_S_SELECT2 select,
+    lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int* sdim,
+    float* WR,
+    float* WI,
+    float* VS, lapack_int const* ldvs,
+    float* work, lapack_int const* lwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_zgees LAPACK_GLOBAL(zgees,ZGEES)
+void LAPACK_zgees(
+    char const* jobvs, char const* sort, LAPACK_Z_SELECT1 select,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* sdim,
+    lapack_complex_double* W,
+    lapack_complex_double* VS, lapack_int const* ldvs,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_cgeesx LAPACK_GLOBAL(cgeesx,CGEESX)
+void LAPACK_cgeesx(
+    char const* jobvs, char const* sort, LAPACK_C_SELECT1 select, char const* sense,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* sdim,
+    lapack_complex_float* W,
+    lapack_complex_float* VS, lapack_int const* ldvs,
+    float* rconde,
+    float* rcondv,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_dgeesx LAPACK_GLOBAL(dgeesx,DGEESX)
+void LAPACK_dgeesx(
+    char const* jobvs, char const* sort, LAPACK_D_SELECT2 select, char const* sense,
+    lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int* sdim,
+    double* WR,
+    double* WI,
+    double* VS, lapack_int const* ldvs,
+    double* rconde,
+    double* rcondv,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_sgeesx LAPACK_GLOBAL(sgeesx,SGEESX)
+void LAPACK_sgeesx(
+    char const* jobvs, char const* sort, LAPACK_S_SELECT2 select, char const* sense,
+    lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int* sdim,
+    float* WR,
+    float* WI,
+    float* VS, lapack_int const* ldvs,
+    float* rconde,
+    float* rcondv,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_zgeesx LAPACK_GLOBAL(zgeesx,ZGEESX)
+void LAPACK_zgeesx(
+    char const* jobvs, char const* sort, LAPACK_Z_SELECT1 select, char const* sense,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* sdim,
+    lapack_complex_double* W,
+    lapack_complex_double* VS, lapack_int const* ldvs,
+    double* rconde,
+    double* rcondv,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_cgeev LAPACK_GLOBAL(cgeev,CGEEV)
+void LAPACK_cgeev(
+    char const* jobvl, char const* jobvr,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* W,
+    lapack_complex_float* VL, lapack_int const* ldvl,
+    lapack_complex_float* VR, lapack_int const* ldvr,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgeev LAPACK_GLOBAL(dgeev,DGEEV)
+void LAPACK_dgeev(
+    char const* jobvl, char const* jobvr,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* WR,
+    double* WI,
+    double* VL, lapack_int const* ldvl,
+    double* VR, lapack_int const* ldvr,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgeev LAPACK_GLOBAL(sgeev,SGEEV)
+void LAPACK_sgeev(
+    char const* jobvl, char const* jobvr,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* WR,
+    float* WI,
+    float* VL, lapack_int const* ldvl,
+    float* VR, lapack_int const* ldvr,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgeev LAPACK_GLOBAL(zgeev,ZGEEV)
+void LAPACK_zgeev(
+    char const* jobvl, char const* jobvr,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* W,
+    lapack_complex_double* VL, lapack_int const* ldvl,
+    lapack_complex_double* VR, lapack_int const* ldvr,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgeevx LAPACK_GLOBAL(cgeevx,CGEEVX)
+void LAPACK_cgeevx(
+    char const* balanc, char const* jobvl, char const* jobvr, char const* sense,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* W,
+    lapack_complex_float* VL, lapack_int const* ldvl,
+    lapack_complex_float* VR, lapack_int const* ldvr, lapack_int* ilo, lapack_int* ihi,
+    float* scale,
+    float* abnrm,
+    float* rconde,
+    float* rcondv,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgeevx LAPACK_GLOBAL(dgeevx,DGEEVX)
+void LAPACK_dgeevx(
+    char const* balanc, char const* jobvl, char const* jobvr, char const* sense,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* WR,
+    double* WI,
+    double* VL, lapack_int const* ldvl,
+    double* VR, lapack_int const* ldvr, lapack_int* ilo, lapack_int* ihi,
+    double* scale,
+    double* abnrm,
+    double* rconde,
+    double* rcondv,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgeevx LAPACK_GLOBAL(sgeevx,SGEEVX)
+void LAPACK_sgeevx(
+    char const* balanc, char const* jobvl, char const* jobvr, char const* sense,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* WR,
+    float* WI,
+    float* VL, lapack_int const* ldvl,
+    float* VR, lapack_int const* ldvr, lapack_int* ilo, lapack_int* ihi,
+    float* scale,
+    float* abnrm,
+    float* rconde,
+    float* rcondv,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgeevx LAPACK_GLOBAL(zgeevx,ZGEEVX)
+void LAPACK_zgeevx(
+    char const* balanc, char const* jobvl, char const* jobvr, char const* sense,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* W,
+    lapack_complex_double* VL, lapack_int const* ldvl,
+    lapack_complex_double* VR, lapack_int const* ldvr, lapack_int* ilo, lapack_int* ihi,
+    double* scale,
+    double* abnrm,
+    double* rconde,
+    double* rcondv,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgehrd LAPACK_GLOBAL(cgehrd,CGEHRD)
+void LAPACK_cgehrd(
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgehrd LAPACK_GLOBAL(dgehrd,DGEHRD)
+void LAPACK_dgehrd(
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    double* A, lapack_int const* lda,
+    double* tau,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgehrd LAPACK_GLOBAL(sgehrd,SGEHRD)
+void LAPACK_sgehrd(
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    float* A, lapack_int const* lda,
+    float* tau,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgehrd LAPACK_GLOBAL(zgehrd,ZGEHRD)
+void LAPACK_zgehrd(
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* tau,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cgejsv LAPACK_GLOBAL(cgejsv,CGEJSV)
+void LAPACK_cgejsv(
+    char const* joba, char const* jobu, char const* jobv, char const* jobr, char const* jobt, char const* jobp,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float* SVA,
+    lapack_complex_float* U, lapack_int const* ldu,
+    lapack_complex_float* V, lapack_int const* ldv,
+    lapack_complex_float* cwork, lapack_int const* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_dgejsv LAPACK_GLOBAL(dgejsv,DGEJSV)
+void LAPACK_dgejsv(
+    char const* joba, char const* jobu, char const* jobv, char const* jobr, char const* jobt, char const* jobp,
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* SVA,
+    double* U, lapack_int const* ldu,
+    double* V, lapack_int const* ldv,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgejsv LAPACK_GLOBAL(sgejsv,SGEJSV)
+void LAPACK_sgejsv(
+    char const* joba, char const* jobu, char const* jobv, char const* jobr, char const* jobt, char const* jobp,
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* SVA,
+    float* U, lapack_int const* ldu,
+    float* V, lapack_int const* ldv,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgejsv LAPACK_GLOBAL(zgejsv,ZGEJSV)
+void LAPACK_zgejsv(
+    char const* joba, char const* jobu, char const* jobv, char const* jobr, char const* jobt, char const* jobp,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double* SVA,
+    lapack_complex_double* U, lapack_int const* ldu,
+    lapack_complex_double* V, lapack_int const* ldv,
+    lapack_complex_double* cwork, lapack_int const* lwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_cgelq LAPACK_GLOBAL(cgelq,CGELQ)
+void LAPACK_cgelq(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* T, lapack_int const* tsize,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgelq LAPACK_GLOBAL(dgelq,DGELQ)
+void LAPACK_dgelq(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* T, lapack_int const* tsize,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgelq LAPACK_GLOBAL(sgelq,SGELQ)
+void LAPACK_sgelq(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* T, lapack_int const* tsize,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgelq LAPACK_GLOBAL(zgelq,ZGELQ)
+void LAPACK_zgelq(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* T, lapack_int const* tsize,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cgelq2 LAPACK_GLOBAL(cgelq2,CGELQ2)
+void LAPACK_cgelq2(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* tau,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dgelq2 LAPACK_GLOBAL(dgelq2,DGELQ2)
+void LAPACK_dgelq2(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* tau,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_sgelq2 LAPACK_GLOBAL(sgelq2,SGELQ2)
+void LAPACK_sgelq2(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* tau,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zgelq2 LAPACK_GLOBAL(zgelq2,ZGELQ2)
+void LAPACK_zgelq2(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* tau,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_cgelqf LAPACK_GLOBAL(cgelqf,CGELQF)
+void LAPACK_cgelqf(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgelqf LAPACK_GLOBAL(dgelqf,DGELQF)
+void LAPACK_dgelqf(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* tau,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgelqf LAPACK_GLOBAL(sgelqf,SGELQF)
+void LAPACK_sgelqf(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* tau,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgelqf LAPACK_GLOBAL(zgelqf,ZGELQF)
+void LAPACK_zgelqf(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* tau,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cgels LAPACK_GLOBAL(cgels,CGELS)
+void LAPACK_cgels(
+    char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgels LAPACK_GLOBAL(dgels,DGELS)
+void LAPACK_dgels(
+    char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgels LAPACK_GLOBAL(sgels,SGELS)
+void LAPACK_sgels(
+    char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgels LAPACK_GLOBAL(zgels,ZGELS)
+void LAPACK_zgels(
+    char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cgelsd LAPACK_GLOBAL(cgelsd,CGELSD)
+void LAPACK_cgelsd(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    float* S,
+    float const* rcond, lapack_int* rank,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_dgelsd LAPACK_GLOBAL(dgelsd,DGELSD)
+void LAPACK_dgelsd(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* S,
+    double const* rcond, lapack_int* rank,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgelsd LAPACK_GLOBAL(sgelsd,SGELSD)
+void LAPACK_sgelsd(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* S,
+    float const* rcond, lapack_int* rank,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgelsd LAPACK_GLOBAL(zgelsd,ZGELSD)
+void LAPACK_zgelsd(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    double* S,
+    double const* rcond, lapack_int* rank,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_cgelss LAPACK_GLOBAL(cgelss,CGELSS)
+void LAPACK_cgelss(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    float* S,
+    float const* rcond, lapack_int* rank,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgelss LAPACK_GLOBAL(dgelss,DGELSS)
+void LAPACK_dgelss(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* S,
+    double const* rcond, lapack_int* rank,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgelss LAPACK_GLOBAL(sgelss,SGELSS)
+void LAPACK_sgelss(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* S,
+    float const* rcond, lapack_int* rank,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgelss LAPACK_GLOBAL(zgelss,ZGELSS)
+void LAPACK_zgelss(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    double* S,
+    double const* rcond, lapack_int* rank,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgelsy LAPACK_GLOBAL(cgelsy,CGELSY)
+void LAPACK_cgelsy(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb, lapack_int* JPVT,
+    float const* rcond, lapack_int* rank,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgelsy LAPACK_GLOBAL(dgelsy,DGELSY)
+void LAPACK_dgelsy(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb, lapack_int* JPVT,
+    double const* rcond, lapack_int* rank,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgelsy LAPACK_GLOBAL(sgelsy,SGELSY)
+void LAPACK_sgelsy(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb, lapack_int* JPVT,
+    float const* rcond, lapack_int* rank,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgelsy LAPACK_GLOBAL(zgelsy,ZGELSY)
+void LAPACK_zgelsy(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb, lapack_int* JPVT,
+    double const* rcond, lapack_int* rank,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgemlq LAPACK_GLOBAL(cgemlq,CGEMLQ)
+void LAPACK_cgemlq(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* T, lapack_int const* tsize,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgemlq LAPACK_GLOBAL(dgemlq,DGEMLQ)
+void LAPACK_dgemlq(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    double const* A, lapack_int const* lda,
+    double const* T, lapack_int const* tsize,
+    double* C, lapack_int const* ldc,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgemlq LAPACK_GLOBAL(sgemlq,SGEMLQ)
+void LAPACK_sgemlq(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    float const* A, lapack_int const* lda,
+    float const* T, lapack_int const* tsize,
+    float* C, lapack_int const* ldc,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgemlq LAPACK_GLOBAL(zgemlq,ZGEMLQ)
+void LAPACK_zgemlq(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* T, lapack_int const* tsize,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cgemqr LAPACK_GLOBAL(cgemqr,CGEMQR)
+void LAPACK_cgemqr(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* T, lapack_int const* tsize,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgemqr LAPACK_GLOBAL(dgemqr,DGEMQR)
+void LAPACK_dgemqr(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    double const* A, lapack_int const* lda,
+    double const* T, lapack_int const* tsize,
+    double* C, lapack_int const* ldc,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgemqr LAPACK_GLOBAL(sgemqr,SGEMQR)
+void LAPACK_sgemqr(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    float const* A, lapack_int const* lda,
+    float const* T, lapack_int const* tsize,
+    float* C, lapack_int const* ldc,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgemqr LAPACK_GLOBAL(zgemqr,ZGEMQR)
+void LAPACK_zgemqr(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* T, lapack_int const* tsize,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cgemqrt LAPACK_GLOBAL(cgemqrt,CGEMQRT)
+void LAPACK_cgemqrt(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* nb,
+    lapack_complex_float const* V, lapack_int const* ldv,
+    lapack_complex_float const* T, lapack_int const* ldt,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dgemqrt LAPACK_GLOBAL(dgemqrt,DGEMQRT)
+void LAPACK_dgemqrt(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* nb,
+    double const* V, lapack_int const* ldv,
+    double const* T, lapack_int const* ldt,
+    double* C, lapack_int const* ldc,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_sgemqrt LAPACK_GLOBAL(sgemqrt,SGEMQRT)
+void LAPACK_sgemqrt(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* nb,
+    float const* V, lapack_int const* ldv,
+    float const* T, lapack_int const* ldt,
+    float* C, lapack_int const* ldc,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zgemqrt LAPACK_GLOBAL(zgemqrt,ZGEMQRT)
+void LAPACK_zgemqrt(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* nb,
+    lapack_complex_double const* V, lapack_int const* ldv,
+    lapack_complex_double const* T, lapack_int const* ldt,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_cgeql2 LAPACK_GLOBAL(cgeql2,CGEQL2)
+void LAPACK_cgeql2(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* tau,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dgeql2 LAPACK_GLOBAL(dgeql2,DGEQL2)
+void LAPACK_dgeql2(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* tau,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_sgeql2 LAPACK_GLOBAL(sgeql2,SGEQL2)
+void LAPACK_sgeql2(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* tau,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zgeql2 LAPACK_GLOBAL(zgeql2,ZGEQL2)
+void LAPACK_zgeql2(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* tau,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_cgeqlf LAPACK_GLOBAL(cgeqlf,CGEQLF)
+void LAPACK_cgeqlf(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgeqlf LAPACK_GLOBAL(dgeqlf,DGEQLF)
+void LAPACK_dgeqlf(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* tau,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgeqlf LAPACK_GLOBAL(sgeqlf,SGEQLF)
+void LAPACK_sgeqlf(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* tau,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgeqlf LAPACK_GLOBAL(zgeqlf,ZGEQLF)
+void LAPACK_zgeqlf(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* tau,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgeqpf LAPACK_GLOBAL(sgeqpf,SGEQPF)
+void LAPACK_sgeqpf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
+                    lapack_int* jpvt, float* tau, float* work,
+                    lapack_int *info );
+
+#define LAPACK_dgeqpf LAPACK_GLOBAL(dgeqpf,DGEQPF)
+void LAPACK_dgeqpf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
+                    lapack_int* jpvt, double* tau, double* work,
+                    lapack_int *info );
+
+#define LAPACK_cgeqpf LAPACK_GLOBAL(cgeqpf,CGEQPF)
+void LAPACK_cgeqpf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
+                    lapack_int* lda, lapack_int* jpvt,
+                    lapack_complex_float* tau, lapack_complex_float* work,
+                    float* rwork, lapack_int *info );
+
+#define LAPACK_zgeqpf LAPACK_GLOBAL(zgeqpf,ZGEQPF)
+void LAPACK_zgeqpf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
+                    lapack_int* lda, lapack_int* jpvt,
+                    lapack_complex_double* tau, lapack_complex_double* work,
+                    double* rwork, lapack_int *info );
+
+#define LAPACK_cgeqp3 LAPACK_GLOBAL(cgeqp3,CGEQP3)
+void LAPACK_cgeqp3(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* JPVT,
+    lapack_complex_float* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgeqp3 LAPACK_GLOBAL(dgeqp3,DGEQP3)
+void LAPACK_dgeqp3(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int* JPVT,
+    double* tau,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgeqp3 LAPACK_GLOBAL(sgeqp3,SGEQP3)
+void LAPACK_sgeqp3(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int* JPVT,
+    float* tau,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgeqp3 LAPACK_GLOBAL(zgeqp3,ZGEQP3)
+void LAPACK_zgeqp3(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* JPVT,
+    lapack_complex_double* tau,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgeqr LAPACK_GLOBAL(cgeqr,CGEQR)
+void LAPACK_cgeqr(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* T, lapack_int const* tsize,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgeqr LAPACK_GLOBAL(dgeqr,DGEQR)
+void LAPACK_dgeqr(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* T, lapack_int const* tsize,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgeqr LAPACK_GLOBAL(sgeqr,SGEQR)
+void LAPACK_sgeqr(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* T, lapack_int const* tsize,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgeqr LAPACK_GLOBAL(zgeqr,ZGEQR)
+void LAPACK_zgeqr(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* T, lapack_int const* tsize,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cgeqr2 LAPACK_GLOBAL(cgeqr2,CGEQR2)
+void LAPACK_cgeqr2(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* tau,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dgeqr2 LAPACK_GLOBAL(dgeqr2,DGEQR2)
+void LAPACK_dgeqr2(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* tau,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_sgeqr2 LAPACK_GLOBAL(sgeqr2,SGEQR2)
+void LAPACK_sgeqr2(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* tau,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zgeqr2 LAPACK_GLOBAL(zgeqr2,ZGEQR2)
+void LAPACK_zgeqr2(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* tau,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_cgeqrf LAPACK_GLOBAL(cgeqrf,CGEQRF)
+void LAPACK_cgeqrf(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgeqrf LAPACK_GLOBAL(dgeqrf,DGEQRF)
+void LAPACK_dgeqrf(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* tau,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgeqrf LAPACK_GLOBAL(sgeqrf,SGEQRF)
+void LAPACK_sgeqrf(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* tau,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgeqrf LAPACK_GLOBAL(zgeqrf,ZGEQRF)
+void LAPACK_zgeqrf(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* tau,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cgeqrfp LAPACK_GLOBAL(cgeqrfp,CGEQRFP)
+void LAPACK_cgeqrfp(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgeqrfp LAPACK_GLOBAL(dgeqrfp,DGEQRFP)
+void LAPACK_dgeqrfp(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* tau,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgeqrfp LAPACK_GLOBAL(sgeqrfp,SGEQRFP)
+void LAPACK_sgeqrfp(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* tau,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgeqrfp LAPACK_GLOBAL(zgeqrfp,ZGEQRFP)
+void LAPACK_zgeqrfp(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* tau,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cgeqrt LAPACK_GLOBAL(cgeqrt,CGEQRT)
+void LAPACK_cgeqrt(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nb,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* T, lapack_int const* ldt,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dgeqrt LAPACK_GLOBAL(dgeqrt,DGEQRT)
+void LAPACK_dgeqrt(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nb,
+    double* A, lapack_int const* lda,
+    double* T, lapack_int const* ldt,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_sgeqrt LAPACK_GLOBAL(sgeqrt,SGEQRT)
+void LAPACK_sgeqrt(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nb,
+    float* A, lapack_int const* lda,
+    float* T, lapack_int const* ldt,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zgeqrt LAPACK_GLOBAL(zgeqrt,ZGEQRT)
+void LAPACK_zgeqrt(
+    lapack_int const* m, lapack_int const* n, lapack_int const* nb,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* T, lapack_int const* ldt,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_cgeqrt2 LAPACK_GLOBAL(cgeqrt2,CGEQRT2)
+void LAPACK_cgeqrt2(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* T, lapack_int const* ldt,
+    lapack_int* info );
+
+#define LAPACK_dgeqrt2 LAPACK_GLOBAL(dgeqrt2,DGEQRT2)
+void LAPACK_dgeqrt2(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* T, lapack_int const* ldt,
+    lapack_int* info );
+
+#define LAPACK_sgeqrt2 LAPACK_GLOBAL(sgeqrt2,SGEQRT2)
+void LAPACK_sgeqrt2(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* T, lapack_int const* ldt,
+    lapack_int* info );
+
+#define LAPACK_zgeqrt2 LAPACK_GLOBAL(zgeqrt2,ZGEQRT2)
+void LAPACK_zgeqrt2(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* T, lapack_int const* ldt,
+    lapack_int* info );
+
+#define LAPACK_cgeqrt3 LAPACK_GLOBAL(cgeqrt3,CGEQRT3)
+void LAPACK_cgeqrt3(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* T, lapack_int const* ldt,
+    lapack_int* info );
+
+#define LAPACK_dgeqrt3 LAPACK_GLOBAL(dgeqrt3,DGEQRT3)
+void LAPACK_dgeqrt3(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* T, lapack_int const* ldt,
+    lapack_int* info );
+
+#define LAPACK_sgeqrt3 LAPACK_GLOBAL(sgeqrt3,SGEQRT3)
+void LAPACK_sgeqrt3(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* T, lapack_int const* ldt,
+    lapack_int* info );
+
+#define LAPACK_zgeqrt3 LAPACK_GLOBAL(zgeqrt3,ZGEQRT3)
+void LAPACK_zgeqrt3(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* T, lapack_int const* ldt,
+    lapack_int* info );
+
+#define LAPACK_cgerfs LAPACK_GLOBAL(cgerfs,CGERFS)
+void LAPACK_cgerfs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgerfs LAPACK_GLOBAL(dgerfs,DGERFS)
+void LAPACK_dgerfs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* A, lapack_int const* lda,
+    double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgerfs LAPACK_GLOBAL(sgerfs,SGERFS)
+void LAPACK_sgerfs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* A, lapack_int const* lda,
+    float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    float const* B, lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgerfs LAPACK_GLOBAL(zgerfs,ZGERFS)
+void LAPACK_zgerfs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgerfsx LAPACK_GLOBAL(cgerfsx,CGERFSX)
+void LAPACK_cgerfsx(
+    char const* trans, char const* equed,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    float const* R,
+    float const* C,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgerfsx LAPACK_GLOBAL(dgerfsx,DGERFSX)
+void LAPACK_dgerfsx(
+    char const* trans, char const* equed,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* A, lapack_int const* lda,
+    double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    double const* R,
+    double const* C,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* rcond,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgerfsx LAPACK_GLOBAL(sgerfsx,SGERFSX)
+void LAPACK_sgerfsx(
+    char const* trans, char const* equed,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* A, lapack_int const* lda,
+    float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    float const* R,
+    float const* C,
+    float const* B, lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* rcond,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgerfsx LAPACK_GLOBAL(zgerfsx,ZGERFSX)
+void LAPACK_zgerfsx(
+    char const* trans, char const* equed,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    double const* R,
+    double const* C,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgerq2 LAPACK_GLOBAL(cgerq2,CGERQ2)
+void LAPACK_cgerq2(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* tau,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dgerq2 LAPACK_GLOBAL(dgerq2,DGERQ2)
+void LAPACK_dgerq2(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* tau,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_sgerq2 LAPACK_GLOBAL(sgerq2,SGERQ2)
+void LAPACK_sgerq2(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* tau,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zgerq2 LAPACK_GLOBAL(zgerq2,ZGERQ2)
+void LAPACK_zgerq2(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* tau,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_cgerqf LAPACK_GLOBAL(cgerqf,CGERQF)
+void LAPACK_cgerqf(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgerqf LAPACK_GLOBAL(dgerqf,DGERQF)
+void LAPACK_dgerqf(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* tau,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgerqf LAPACK_GLOBAL(sgerqf,SGERQF)
+void LAPACK_sgerqf(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* tau,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgerqf LAPACK_GLOBAL(zgerqf,ZGERQF)
+void LAPACK_zgerqf(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* tau,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cgesdd LAPACK_GLOBAL(cgesdd,CGESDD)
+void LAPACK_cgesdd(
+    char const* jobz,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float* S,
+    lapack_complex_float* U, lapack_int const* ldu,
+    lapack_complex_float* VT, lapack_int const* ldvt,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_dgesdd LAPACK_GLOBAL(dgesdd,DGESDD)
+void LAPACK_dgesdd(
+    char const* jobz,
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* S,
+    double* U, lapack_int const* ldu,
+    double* VT, lapack_int const* ldvt,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgesdd LAPACK_GLOBAL(sgesdd,SGESDD)
+void LAPACK_sgesdd(
+    char const* jobz,
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* S,
+    float* U, lapack_int const* ldu,
+    float* VT, lapack_int const* ldvt,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgesdd LAPACK_GLOBAL(zgesdd,ZGESDD)
+void LAPACK_zgesdd(
+    char const* jobz,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double* S,
+    lapack_complex_double* U, lapack_int const* ldu,
+    lapack_complex_double* VT, lapack_int const* ldvt,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_cgesv LAPACK_GLOBAL(cgesv,CGESV)
+void LAPACK_cgesv(
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dgesv LAPACK_GLOBAL(dgesv,DGESV)
+void LAPACK_dgesv(
+    lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda, lapack_int* ipiv,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_sgesv LAPACK_GLOBAL(sgesv,SGESV)
+void LAPACK_sgesv(
+    lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda, lapack_int* ipiv,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zgesv LAPACK_GLOBAL(zgesv,ZGESV)
+void LAPACK_zgesv(
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dsgesv LAPACK_GLOBAL(dsgesv,DSGESV)
+void LAPACK_dsgesv(
+    lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda, lapack_int* ipiv,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* work,
+    float* swork, lapack_int* iter,
+    lapack_int* info );
+
+#define LAPACK_zcgesv LAPACK_GLOBAL(zcgesv,ZCGESV)
+void LAPACK_zcgesv(
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    lapack_complex_double* work,
+    lapack_complex_float* swork,
+    double* rwork, lapack_int* iter,
+    lapack_int* info );
+
+#define LAPACK_cgesvd LAPACK_GLOBAL(cgesvd,CGESVD)
+void LAPACK_cgesvd(
+    char const* jobu, char const* jobvt,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float* S,
+    lapack_complex_float* U, lapack_int const* ldu,
+    lapack_complex_float* VT, lapack_int const* ldvt,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgesvd LAPACK_GLOBAL(dgesvd,DGESVD)
+void LAPACK_dgesvd(
+    char const* jobu, char const* jobvt,
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* S,
+    double* U, lapack_int const* ldu,
+    double* VT, lapack_int const* ldvt,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgesvd LAPACK_GLOBAL(sgesvd,SGESVD)
+void LAPACK_sgesvd(
+    char const* jobu, char const* jobvt,
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* S,
+    float* U, lapack_int const* ldu,
+    float* VT, lapack_int const* ldvt,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgesvd LAPACK_GLOBAL(zgesvd,ZGESVD)
+void LAPACK_zgesvd(
+    char const* jobu, char const* jobvt,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double* S,
+    lapack_complex_double* U, lapack_int const* ldu,
+    lapack_complex_double* VT, lapack_int const* ldvt,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgesvdq LAPACK_GLOBAL(cgesvdq,CGESVDQ)
+void LAPACK_cgesvdq(
+    char const* joba, char const* jobp, char const* jobr, char const* jobu, char const* jobv,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float* S,
+    lapack_complex_float* U, lapack_int const* ldu,
+    lapack_complex_float* V, lapack_int const* ldv, lapack_int* numrank,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_complex_float* cwork, lapack_int* lcwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* info );
+
+#define LAPACK_dgesvdq LAPACK_GLOBAL(dgesvdq,DGESVDQ)
+void LAPACK_dgesvdq(
+    char const* joba, char const* jobp, char const* jobr, char const* jobu, char const* jobv,
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* S,
+    double* U, lapack_int const* ldu,
+    double* V, lapack_int const* ldv, lapack_int* numrank,
+    lapack_int* iwork, lapack_int const* liwork,
+    double* work, lapack_int* lwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* info );
+
+#define LAPACK_sgesvdq LAPACK_GLOBAL(sgesvdq,SGESVDQ)
+void LAPACK_sgesvdq(
+    char const* joba, char const* jobp, char const* jobr, char const* jobu, char const* jobv,
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* S,
+    float* U, lapack_int const* ldu,
+    float* V, lapack_int const* ldv, lapack_int* numrank,
+    lapack_int* iwork, lapack_int const* liwork,
+    float* work, lapack_int* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* info );
+
+#define LAPACK_zgesvdq LAPACK_GLOBAL(zgesvdq,ZGESVDQ)
+void LAPACK_zgesvdq(
+    char const* joba, char const* jobp, char const* jobr, char const* jobu, char const* jobv,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double* S,
+    lapack_complex_double* U, lapack_int const* ldu,
+    lapack_complex_double* V, lapack_int const* ldv, lapack_int* numrank,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_complex_float* cwork, lapack_int* lcwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* info );
+
+#define LAPACK_cgesvdx LAPACK_GLOBAL(cgesvdx,CGESVDX)
+void LAPACK_cgesvdx(
+    char const* jobu, char const* jobvt, char const* range,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* ns,
+    float* S,
+    lapack_complex_float* U, lapack_int const* ldu,
+    lapack_complex_float* VT, lapack_int const* ldvt,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_dgesvdx LAPACK_GLOBAL(dgesvdx,DGESVDX)
+void LAPACK_dgesvdx(
+    char const* jobu, char const* jobvt, char const* range,
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* ns,
+    double* S,
+    double* U, lapack_int const* ldu,
+    double* VT, lapack_int const* ldvt,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgesvdx LAPACK_GLOBAL(sgesvdx,SGESVDX)
+void LAPACK_sgesvdx(
+    char const* jobu, char const* jobvt, char const* range,
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* ns,
+    float* S,
+    float* U, lapack_int const* ldu,
+    float* VT, lapack_int const* ldvt,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgesvdx LAPACK_GLOBAL(zgesvdx,ZGESVDX)
+void LAPACK_zgesvdx(
+    char const* jobu, char const* jobvt, char const* range,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* ns,
+    double* S,
+    lapack_complex_double* U, lapack_int const* ldu,
+    lapack_complex_double* VT, lapack_int const* ldvt,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_cgesvj LAPACK_GLOBAL(cgesvj,CGESVJ)
+void LAPACK_cgesvj(
+    char const* joba, char const* jobu, char const* jobv,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float* SVA, lapack_int const* mv,
+    lapack_complex_float* V, lapack_int const* ldv,
+    lapack_complex_float* cwork, lapack_int const* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* info );
+
+#define LAPACK_dgesvj LAPACK_GLOBAL(dgesvj,DGESVJ)
+void LAPACK_dgesvj(
+    char const* joba, char const* jobu, char const* jobv,
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* SVA, lapack_int const* mv,
+    double* V, lapack_int const* ldv,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgesvj LAPACK_GLOBAL(sgesvj,SGESVJ)
+void LAPACK_sgesvj(
+    char const* joba, char const* jobu, char const* jobv,
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* SVA, lapack_int const* mv,
+    float* V, lapack_int const* ldv,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgesvj LAPACK_GLOBAL(zgesvj,ZGESVJ)
+void LAPACK_zgesvj(
+    char const* joba, char const* jobu, char const* jobv,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double* SVA, lapack_int const* mv,
+    lapack_complex_double* V, lapack_int const* ldv,
+    lapack_complex_double* cwork, lapack_int const* lwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* info );
+
+#define LAPACK_cgesvx LAPACK_GLOBAL(cgesvx,CGESVX)
+void LAPACK_cgesvx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed,
+    float* R,
+    float* C,
+    lapack_complex_float* B,
+    lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgesvx LAPACK_GLOBAL(dgesvx,DGESVX)
+void LAPACK_dgesvx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda,
+    double* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed,
+    double* R,
+    double* C,
+    double* B,
+    lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgesvx LAPACK_GLOBAL(sgesvx,SGESVX)
+void LAPACK_sgesvx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda,
+    float* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed,
+    float* R,
+    float* C,
+    float* B,
+    lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgesvx LAPACK_GLOBAL(zgesvx,ZGESVX)
+void LAPACK_zgesvx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed,
+    double* R,
+    double* C,
+    lapack_complex_double* B,
+    lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgesvxx LAPACK_GLOBAL(cgesvxx,CGESVXX)
+void LAPACK_cgesvxx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed,
+    float* R,
+    float* C,
+    lapack_complex_float* B,
+    lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* rpvgrw,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgesvxx LAPACK_GLOBAL(dgesvxx,DGESVXX)
+void LAPACK_dgesvxx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda,
+    double* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed,
+    double* R,
+    double* C,
+    double* B,
+    lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* rcond,
+    double* rpvgrw,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgesvxx LAPACK_GLOBAL(sgesvxx,SGESVXX)
+void LAPACK_sgesvxx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda,
+    float* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed,
+    float* R,
+    float* C,
+    float* B,
+    lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* rcond,
+    float* rpvgrw,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgesvxx LAPACK_GLOBAL(zgesvxx,ZGESVXX)
+void LAPACK_zgesvxx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed,
+    double* R,
+    double* C,
+    lapack_complex_double* B,
+    lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* rpvgrw,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgetf2 LAPACK_GLOBAL(cgetf2,CGETF2)
+void LAPACK_cgetf2(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_dgetf2 LAPACK_GLOBAL(dgetf2,DGETF2)
+void LAPACK_dgetf2(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_sgetf2 LAPACK_GLOBAL(sgetf2,SGETF2)
+void LAPACK_sgetf2(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_zgetf2 LAPACK_GLOBAL(zgetf2,ZGETF2)
+void LAPACK_zgetf2(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_cgetrf LAPACK_GLOBAL(cgetrf,CGETRF)
+void LAPACK_cgetrf(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_dgetrf LAPACK_GLOBAL(dgetrf,DGETRF)
+void LAPACK_dgetrf(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_sgetrf LAPACK_GLOBAL(sgetrf,SGETRF)
+void LAPACK_sgetrf(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_zgetrf LAPACK_GLOBAL(zgetrf,ZGETRF)
+void LAPACK_zgetrf(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_cgetrf2 LAPACK_GLOBAL(cgetrf2,CGETRF2)
+void LAPACK_cgetrf2(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_dgetrf2 LAPACK_GLOBAL(dgetrf2,DGETRF2)
+void LAPACK_dgetrf2(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_sgetrf2 LAPACK_GLOBAL(sgetrf2,SGETRF2)
+void LAPACK_sgetrf2(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_zgetrf2 LAPACK_GLOBAL(zgetrf2,ZGETRF2)
+void LAPACK_zgetrf2(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_cgetri LAPACK_GLOBAL(cgetri,CGETRI)
+void LAPACK_cgetri(
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgetri LAPACK_GLOBAL(dgetri,DGETRI)
+void LAPACK_dgetri(
+    lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int const* ipiv,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgetri LAPACK_GLOBAL(sgetri,SGETRI)
+void LAPACK_sgetri(
+    lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int const* ipiv,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgetri LAPACK_GLOBAL(zgetri,ZGETRI)
+void LAPACK_zgetri(
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cgetrs LAPACK_GLOBAL(cgetrs,CGETRS)
+void LAPACK_cgetrs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dgetrs LAPACK_GLOBAL(dgetrs,DGETRS)
+void LAPACK_dgetrs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* A, lapack_int const* lda, lapack_int const* ipiv,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_sgetrs LAPACK_GLOBAL(sgetrs,SGETRS)
+void LAPACK_sgetrs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* A, lapack_int const* lda, lapack_int const* ipiv,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zgetrs LAPACK_GLOBAL(zgetrs,ZGETRS)
+void LAPACK_zgetrs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_cgetsls LAPACK_GLOBAL(cgetsls,CGETSLS)
+void LAPACK_cgetsls(
+    char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgetsls LAPACK_GLOBAL(dgetsls,DGETSLS)
+void LAPACK_dgetsls(
+    char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgetsls LAPACK_GLOBAL(sgetsls,SGETSLS)
+void LAPACK_sgetsls(
+    char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgetsls LAPACK_GLOBAL(zgetsls,ZGETSLS)
+void LAPACK_zgetsls(
+    char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cggbak LAPACK_GLOBAL(cggbak,CGGBAK)
+void LAPACK_cggbak(
+    char const* job, char const* side,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    float const* lscale,
+    float const* rscale, lapack_int const* m,
+    lapack_complex_float* V, lapack_int const* ldv,
+    lapack_int* info );
+
+#define LAPACK_dggbak LAPACK_GLOBAL(dggbak,DGGBAK)
+void LAPACK_dggbak(
+    char const* job, char const* side,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    double const* lscale,
+    double const* rscale, lapack_int const* m,
+    double* V, lapack_int const* ldv,
+    lapack_int* info );
+
+#define LAPACK_sggbak LAPACK_GLOBAL(sggbak,SGGBAK)
+void LAPACK_sggbak(
+    char const* job, char const* side,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    float const* lscale,
+    float const* rscale, lapack_int const* m,
+    float* V, lapack_int const* ldv,
+    lapack_int* info );
+
+#define LAPACK_zggbak LAPACK_GLOBAL(zggbak,ZGGBAK)
+void LAPACK_zggbak(
+    char const* job, char const* side,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    double const* lscale,
+    double const* rscale, lapack_int const* m,
+    lapack_complex_double* V, lapack_int const* ldv,
+    lapack_int* info );
+
+#define LAPACK_cggbal LAPACK_GLOBAL(cggbal,CGGBAL)
+void LAPACK_cggbal(
+    char const* job,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb, lapack_int* ilo, lapack_int* ihi,
+    float* lscale,
+    float* rscale,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_dggbal LAPACK_GLOBAL(dggbal,DGGBAL)
+void LAPACK_dggbal(
+    char const* job,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb, lapack_int* ilo, lapack_int* ihi,
+    double* lscale,
+    double* rscale,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_sggbal LAPACK_GLOBAL(sggbal,SGGBAL)
+void LAPACK_sggbal(
+    char const* job,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb, lapack_int* ilo, lapack_int* ihi,
+    float* lscale,
+    float* rscale,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zggbal LAPACK_GLOBAL(zggbal,ZGGBAL)
+void LAPACK_zggbal(
+    char const* job,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb, lapack_int* ilo, lapack_int* ihi,
+    double* lscale,
+    double* rscale,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_cgges LAPACK_GLOBAL(cgges,CGGES)
+void LAPACK_cgges(
+    char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_C_SELECT2 selctg,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb, lapack_int* sdim,
+    lapack_complex_float* alpha,
+    lapack_complex_float* beta,
+    lapack_complex_float* VSL, lapack_int const* ldvsl,
+    lapack_complex_float* VSR, lapack_int const* ldvsr,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_dgges LAPACK_GLOBAL(dgges,DGGES)
+void LAPACK_dgges(
+    char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_D_SELECT3 selctg,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb, lapack_int* sdim,
+    double* alphar,
+    double* alphai,
+    double* beta,
+    double* VSL, lapack_int const* ldvsl,
+    double* VSR, lapack_int const* ldvsr,
+    double* work, lapack_int const* lwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_sgges LAPACK_GLOBAL(sgges,SGGES)
+void LAPACK_sgges(
+    char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_S_SELECT3 selctg,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb, lapack_int* sdim,
+    float* alphar,
+    float* alphai,
+    float* beta,
+    float* VSL, lapack_int const* ldvsl,
+    float* VSR, lapack_int const* ldvsr,
+    float* work, lapack_int const* lwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_zgges LAPACK_GLOBAL(zgges,ZGGES)
+void LAPACK_zgges(
+    char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_Z_SELECT2 selctg,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb, lapack_int* sdim,
+    lapack_complex_double* alpha,
+    lapack_complex_double* beta,
+    lapack_complex_double* VSL, lapack_int const* ldvsl,
+    lapack_complex_double* VSR, lapack_int const* ldvsr,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_cgges3 LAPACK_GLOBAL(cgges3,CGGES3)
+void LAPACK_cgges3(
+    char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_C_SELECT2 selctg,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb, lapack_int* sdim,
+    lapack_complex_float* alpha,
+    lapack_complex_float* beta,
+    lapack_complex_float* VSL, lapack_int const* ldvsl,
+    lapack_complex_float* VSR, lapack_int const* ldvsr,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_dgges3 LAPACK_GLOBAL(dgges3,DGGES3)
+void LAPACK_dgges3(
+    char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_D_SELECT3 selctg,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb, lapack_int* sdim,
+    double* alphar,
+    double* alphai,
+    double* beta,
+    double* VSL, lapack_int const* ldvsl,
+    double* VSR, lapack_int const* ldvsr,
+    double* work, lapack_int const* lwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_sgges3 LAPACK_GLOBAL(sgges3,SGGES3)
+void LAPACK_sgges3(
+    char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_S_SELECT3 selctg,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb, lapack_int* sdim,
+    float* alphar,
+    float* alphai,
+    float* beta,
+    float* VSL, lapack_int const* ldvsl,
+    float* VSR, lapack_int const* ldvsr,
+    float* work, lapack_int const* lwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_zgges3 LAPACK_GLOBAL(zgges3,ZGGES3)
+void LAPACK_zgges3(
+    char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_Z_SELECT2 selctg,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb, lapack_int* sdim,
+    lapack_complex_double* alpha,
+    lapack_complex_double* beta,
+    lapack_complex_double* VSL, lapack_int const* ldvsl,
+    lapack_complex_double* VSR, lapack_int const* ldvsr,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_cggesx LAPACK_GLOBAL(cggesx,CGGESX)
+void LAPACK_cggesx(
+    char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_C_SELECT2 selctg, char const* sense,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb, lapack_int* sdim,
+    lapack_complex_float* alpha,
+    lapack_complex_float* beta,
+    lapack_complex_float* VSL, lapack_int const* ldvsl,
+    lapack_complex_float* VSR, lapack_int const* ldvsr,
+    float* rconde,
+    float* rcondv,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* iwork, lapack_int const* liwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_dggesx LAPACK_GLOBAL(dggesx,DGGESX)
+void LAPACK_dggesx(
+    char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_D_SELECT3 selctg, char const* sense,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb, lapack_int* sdim,
+    double* alphar,
+    double* alphai,
+    double* beta,
+    double* VSL, lapack_int const* ldvsl,
+    double* VSR, lapack_int const* ldvsr,
+    double* rconde,
+    double* rcondv,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_sggesx LAPACK_GLOBAL(sggesx,SGGESX)
+void LAPACK_sggesx(
+    char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_S_SELECT3 selctg, char const* sense,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb, lapack_int* sdim,
+    float* alphar,
+    float* alphai,
+    float* beta,
+    float* VSL, lapack_int const* ldvsl,
+    float* VSR, lapack_int const* ldvsr,
+    float* rconde,
+    float* rcondv,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_zggesx LAPACK_GLOBAL(zggesx,ZGGESX)
+void LAPACK_zggesx(
+    char const* jobvsl, char const* jobvsr, char const* sort, LAPACK_Z_SELECT2 selctg, char const* sense,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb, lapack_int* sdim,
+    lapack_complex_double* alpha,
+    lapack_complex_double* beta,
+    lapack_complex_double* VSL, lapack_int const* ldvsl,
+    lapack_complex_double* VSR, lapack_int const* ldvsr,
+    double* rconde,
+    double* rcondv,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* iwork, lapack_int const* liwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_cggev LAPACK_GLOBAL(cggev,CGGEV)
+void LAPACK_cggev(
+    char const* jobvl, char const* jobvr,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* alpha,
+    lapack_complex_float* beta,
+    lapack_complex_float* VL, lapack_int const* ldvl,
+    lapack_complex_float* VR, lapack_int const* ldvr,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dggev LAPACK_GLOBAL(dggev,DGGEV)
+void LAPACK_dggev(
+    char const* jobvl, char const* jobvr,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* alphar,
+    double* alphai,
+    double* beta,
+    double* VL, lapack_int const* ldvl,
+    double* VR, lapack_int const* ldvr,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sggev LAPACK_GLOBAL(sggev,SGGEV)
+void LAPACK_sggev(
+    char const* jobvl, char const* jobvr,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* alphar,
+    float* alphai,
+    float* beta,
+    float* VL, lapack_int const* ldvl,
+    float* VR, lapack_int const* ldvr,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zggev LAPACK_GLOBAL(zggev,ZGGEV)
+void LAPACK_zggev(
+    char const* jobvl, char const* jobvr,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* alpha,
+    lapack_complex_double* beta,
+    lapack_complex_double* VL, lapack_int const* ldvl,
+    lapack_complex_double* VR, lapack_int const* ldvr,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cggev3 LAPACK_GLOBAL(cggev3,CGGEV3)
+void LAPACK_cggev3(
+    char const* jobvl, char const* jobvr,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* alpha,
+    lapack_complex_float* beta,
+    lapack_complex_float* VL, lapack_int const* ldvl,
+    lapack_complex_float* VR, lapack_int const* ldvr,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dggev3 LAPACK_GLOBAL(dggev3,DGGEV3)
+void LAPACK_dggev3(
+    char const* jobvl, char const* jobvr,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* alphar,
+    double* alphai,
+    double* beta,
+    double* VL, lapack_int const* ldvl,
+    double* VR, lapack_int const* ldvr,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sggev3 LAPACK_GLOBAL(sggev3,SGGEV3)
+void LAPACK_sggev3(
+    char const* jobvl, char const* jobvr,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* alphar,
+    float* alphai,
+    float* beta,
+    float* VL, lapack_int const* ldvl,
+    float* VR, lapack_int const* ldvr,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zggev3 LAPACK_GLOBAL(zggev3,ZGGEV3)
+void LAPACK_zggev3(
+    char const* jobvl, char const* jobvr,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* alpha,
+    lapack_complex_double* beta,
+    lapack_complex_double* VL, lapack_int const* ldvl,
+    lapack_complex_double* VR, lapack_int const* ldvr,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cggevx LAPACK_GLOBAL(cggevx,CGGEVX)
+void LAPACK_cggevx(
+    char const* balanc, char const* jobvl, char const* jobvr, char const* sense,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* alpha,
+    lapack_complex_float* beta,
+    lapack_complex_float* VL, lapack_int const* ldvl,
+    lapack_complex_float* VR, lapack_int const* ldvr, lapack_int* ilo, lapack_int* ihi,
+    float* lscale,
+    float* rscale,
+    float* abnrm,
+    float* bbnrm,
+    float* rconde,
+    float* rcondv,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* iwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_dggevx LAPACK_GLOBAL(dggevx,DGGEVX)
+void LAPACK_dggevx(
+    char const* balanc, char const* jobvl, char const* jobvr, char const* sense,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* alphar,
+    double* alphai,
+    double* beta,
+    double* VL, lapack_int const* ldvl,
+    double* VR, lapack_int const* ldvr, lapack_int* ilo, lapack_int* ihi,
+    double* lscale,
+    double* rscale,
+    double* abnrm,
+    double* bbnrm,
+    double* rconde,
+    double* rcondv,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_sggevx LAPACK_GLOBAL(sggevx,SGGEVX)
+void LAPACK_sggevx(
+    char const* balanc, char const* jobvl, char const* jobvr, char const* sense,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* alphar,
+    float* alphai,
+    float* beta,
+    float* VL, lapack_int const* ldvl,
+    float* VR, lapack_int const* ldvr, lapack_int* ilo, lapack_int* ihi,
+    float* lscale,
+    float* rscale,
+    float* abnrm,
+    float* bbnrm,
+    float* rconde,
+    float* rcondv,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_zggevx LAPACK_GLOBAL(zggevx,ZGGEVX)
+void LAPACK_zggevx(
+    char const* balanc, char const* jobvl, char const* jobvr, char const* sense,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* alpha,
+    lapack_complex_double* beta,
+    lapack_complex_double* VL, lapack_int const* ldvl,
+    lapack_complex_double* VR, lapack_int const* ldvr, lapack_int* ilo, lapack_int* ihi,
+    double* lscale,
+    double* rscale,
+    double* abnrm,
+    double* bbnrm,
+    double* rconde,
+    double* rcondv,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* iwork, lapack_logical* BWORK,
+    lapack_int* info );
+
+#define LAPACK_cggglm LAPACK_GLOBAL(cggglm,CGGGLM)
+void LAPACK_cggglm(
+    lapack_int const* n, lapack_int const* m, lapack_int const* p,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* D,
+    lapack_complex_float* X,
+    lapack_complex_float* Y,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dggglm LAPACK_GLOBAL(dggglm,DGGGLM)
+void LAPACK_dggglm(
+    lapack_int const* n, lapack_int const* m, lapack_int const* p,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* D,
+    double* X,
+    double* Y,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sggglm LAPACK_GLOBAL(sggglm,SGGGLM)
+void LAPACK_sggglm(
+    lapack_int const* n, lapack_int const* m, lapack_int const* p,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* D,
+    float* X,
+    float* Y,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zggglm LAPACK_GLOBAL(zggglm,ZGGGLM)
+void LAPACK_zggglm(
+    lapack_int const* n, lapack_int const* m, lapack_int const* p,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* D,
+    lapack_complex_double* X,
+    lapack_complex_double* Y,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cgghd3 LAPACK_GLOBAL(cgghd3,CGGHD3)
+void LAPACK_cgghd3(
+    char const* compq, char const* compz,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgghd3 LAPACK_GLOBAL(dgghd3,DGGHD3)
+void LAPACK_dgghd3(
+    char const* compq, char const* compz,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* Q, lapack_int const* ldq,
+    double* Z, lapack_int const* ldz,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgghd3 LAPACK_GLOBAL(sgghd3,SGGHD3)
+void LAPACK_sgghd3(
+    char const* compq, char const* compz,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* Q, lapack_int const* ldq,
+    float* Z, lapack_int const* ldz,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgghd3 LAPACK_GLOBAL(zgghd3,ZGGHD3)
+void LAPACK_zgghd3(
+    char const* compq, char const* compz,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cgghrd LAPACK_GLOBAL(cgghrd,CGGHRD)
+void LAPACK_cgghrd(
+    char const* compq, char const* compz,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_int* info );
+
+#define LAPACK_dgghrd LAPACK_GLOBAL(dgghrd,DGGHRD)
+void LAPACK_dgghrd(
+    char const* compq, char const* compz,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* Q, lapack_int const* ldq,
+    double* Z, lapack_int const* ldz,
+    lapack_int* info );
+
+#define LAPACK_sgghrd LAPACK_GLOBAL(sgghrd,SGGHRD)
+void LAPACK_sgghrd(
+    char const* compq, char const* compz,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* Q, lapack_int const* ldq,
+    float* Z, lapack_int const* ldz,
+    lapack_int* info );
+
+#define LAPACK_zgghrd LAPACK_GLOBAL(zgghrd,ZGGHRD)
+void LAPACK_zgghrd(
+    char const* compq, char const* compz,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_int* info );
+
+#define LAPACK_cgglse LAPACK_GLOBAL(cgglse,CGGLSE)
+void LAPACK_cgglse(
+    lapack_int const* m, lapack_int const* n, lapack_int const* p,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* C,
+    lapack_complex_float* D,
+    lapack_complex_float* X,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgglse LAPACK_GLOBAL(dgglse,DGGLSE)
+void LAPACK_dgglse(
+    lapack_int const* m, lapack_int const* n, lapack_int const* p,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* C,
+    double* D,
+    double* X,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgglse LAPACK_GLOBAL(sgglse,SGGLSE)
+void LAPACK_sgglse(
+    lapack_int const* m, lapack_int const* n, lapack_int const* p,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* C,
+    float* D,
+    float* X,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgglse LAPACK_GLOBAL(zgglse,ZGGLSE)
+void LAPACK_zgglse(
+    lapack_int const* m, lapack_int const* n, lapack_int const* p,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* C,
+    lapack_complex_double* D,
+    lapack_complex_double* X,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cggqrf LAPACK_GLOBAL(cggqrf,CGGQRF)
+void LAPACK_cggqrf(
+    lapack_int const* n, lapack_int const* m, lapack_int const* p,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* taua,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* taub,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dggqrf LAPACK_GLOBAL(dggqrf,DGGQRF)
+void LAPACK_dggqrf(
+    lapack_int const* n, lapack_int const* m, lapack_int const* p,
+    double* A, lapack_int const* lda,
+    double* taua,
+    double* B, lapack_int const* ldb,
+    double* taub,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sggqrf LAPACK_GLOBAL(sggqrf,SGGQRF)
+void LAPACK_sggqrf(
+    lapack_int const* n, lapack_int const* m, lapack_int const* p,
+    float* A, lapack_int const* lda,
+    float* taua,
+    float* B, lapack_int const* ldb,
+    float* taub,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zggqrf LAPACK_GLOBAL(zggqrf,ZGGQRF)
+void LAPACK_zggqrf(
+    lapack_int const* n, lapack_int const* m, lapack_int const* p,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* taua,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* taub,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cggrqf LAPACK_GLOBAL(cggrqf,CGGRQF)
+void LAPACK_cggrqf(
+    lapack_int const* m, lapack_int const* p, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* taua,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* taub,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dggrqf LAPACK_GLOBAL(dggrqf,DGGRQF)
+void LAPACK_dggrqf(
+    lapack_int const* m, lapack_int const* p, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* taua,
+    double* B, lapack_int const* ldb,
+    double* taub,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sggrqf LAPACK_GLOBAL(sggrqf,SGGRQF)
+void LAPACK_sggrqf(
+    lapack_int const* m, lapack_int const* p, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* taua,
+    float* B, lapack_int const* ldb,
+    float* taub,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zggrqf LAPACK_GLOBAL(zggrqf,ZGGRQF)
+void LAPACK_zggrqf(
+    lapack_int const* m, lapack_int const* p, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* taua,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* taub,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd,SGGSVD)
+lapack_int LAPACKE_sggsvd( int matrix_layout, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int n, lapack_int p,
+                           lapack_int* k, lapack_int* l, float* a,
+                           lapack_int lda, float* b, lapack_int ldb,
+                           float* alpha, float* beta, float* u, lapack_int ldu,
+                           float* v, lapack_int ldv, float* q, lapack_int ldq,
+                           lapack_int* iwork );
+
+#define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd,DGGSVD)
+lapack_int LAPACKE_dggsvd( int matrix_layout, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int n, lapack_int p,
+                           lapack_int* k, lapack_int* l, double* a,
+                           lapack_int lda, double* b, lapack_int ldb,
+                           double* alpha, double* beta, double* u,
+                           lapack_int ldu, double* v, lapack_int ldv, double* q,
+                           lapack_int ldq, lapack_int* iwork );
+
+#define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD)
+lapack_int LAPACKE_cggsvd( int matrix_layout, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int n, lapack_int p,
+                           lapack_int* k, lapack_int* l,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb,
+                           float* alpha, float* beta, lapack_complex_float* u,
+                           lapack_int ldu, lapack_complex_float* v,
+                           lapack_int ldv, lapack_complex_float* q,
+                           lapack_int ldq, lapack_int* iwork );
+
+#define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD)
+lapack_int LAPACKE_zggsvd( int matrix_layout, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int n, lapack_int p,
+                           lapack_int* k, lapack_int* l,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           double* alpha, double* beta,
+                           lapack_complex_double* u, lapack_int ldu,
+                           lapack_complex_double* v, lapack_int ldv,
+                           lapack_complex_double* q, lapack_int ldq,
+                           lapack_int* iwork );
+
+#define LAPACK_cggsvd3 LAPACK_GLOBAL(cggsvd3,CGGSVD3)
+void LAPACK_cggsvd3(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    float* alpha,
+    float* beta,
+    lapack_complex_float* U, lapack_int const* ldu,
+    lapack_complex_float* V, lapack_int const* ldv,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_dggsvd3 LAPACK_GLOBAL(dggsvd3,DGGSVD3)
+void LAPACK_dggsvd3(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* alpha,
+    double* beta,
+    double* U, lapack_int const* ldu,
+    double* V, lapack_int const* ldv,
+    double* Q, lapack_int const* ldq,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sggsvd3 LAPACK_GLOBAL(sggsvd3,SGGSVD3)
+void LAPACK_sggsvd3(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* alpha,
+    float* beta,
+    float* U, lapack_int const* ldu,
+    float* V, lapack_int const* ldv,
+    float* Q, lapack_int const* ldq,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zggsvd3 LAPACK_GLOBAL(zggsvd3,ZGGSVD3)
+void LAPACK_zggsvd3(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* n, lapack_int const* p, lapack_int* k, lapack_int* l,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    double* alpha,
+    double* beta,
+    lapack_complex_double* U, lapack_int const* ldu,
+    lapack_complex_double* V, lapack_int const* ldv,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp,SGGSVP)
+lapack_int LAPACKE_sggsvp( int matrix_layout, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n, float* a,
+                           lapack_int lda, float* b, lapack_int ldb, float tola,
+                           float tolb, lapack_int* k, lapack_int* l, float* u,
+                           lapack_int ldu, float* v, lapack_int ldv, float* q,
+                           lapack_int ldq );
+
+#define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp,DGGSVP)
+lapack_int LAPACKE_dggsvp( int matrix_layout, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n, double* a,
+                           lapack_int lda, double* b, lapack_int ldb,
+                           double tola, double tolb, lapack_int* k,
+                           lapack_int* l, double* u, lapack_int ldu, double* v,
+                           lapack_int ldv, double* q, lapack_int ldq );
+
+#define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp,CGGSVP)
+lapack_int LAPACKE_cggsvp( int matrix_layout, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_complex_float* a, lapack_int lda,
+                           lapack_complex_float* b, lapack_int ldb, float tola,
+                           float tolb, lapack_int* k, lapack_int* l,
+                           lapack_complex_float* u, lapack_int ldu,
+                           lapack_complex_float* v, lapack_int ldv,
+                           lapack_complex_float* q, lapack_int ldq );
+
+#define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp,ZGGSVP)
+lapack_int LAPACKE_zggsvp( int matrix_layout, char jobu, char jobv, char jobq,
+                           lapack_int m, lapack_int p, lapack_int n,
+                           lapack_complex_double* a, lapack_int lda,
+                           lapack_complex_double* b, lapack_int ldb,
+                           double tola, double tolb, lapack_int* k,
+                           lapack_int* l, lapack_complex_double* u,
+                           lapack_int ldu, lapack_complex_double* v,
+                           lapack_int ldv, lapack_complex_double* q,
+                           lapack_int ldq );
+
+#define LAPACK_cggsvp3 LAPACK_GLOBAL(cggsvp3,CGGSVP3)
+void LAPACK_cggsvp3(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* p, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    float const* tola,
+    float const* tolb, lapack_int* k, lapack_int* l,
+    lapack_complex_float* U, lapack_int const* ldu,
+    lapack_complex_float* V, lapack_int const* ldv,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    lapack_int* iwork,
+    float* rwork,
+    lapack_complex_float* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dggsvp3 LAPACK_GLOBAL(dggsvp3,DGGSVP3)
+void LAPACK_dggsvp3(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* p, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double const* tola,
+    double const* tolb, lapack_int* k, lapack_int* l,
+    double* U, lapack_int const* ldu,
+    double* V, lapack_int const* ldv,
+    double* Q, lapack_int const* ldq,
+    lapack_int* iwork,
+    double* tau,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sggsvp3 LAPACK_GLOBAL(sggsvp3,SGGSVP3)
+void LAPACK_sggsvp3(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* p, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float const* tola,
+    float const* tolb, lapack_int* k, lapack_int* l,
+    float* U, lapack_int const* ldu,
+    float* V, lapack_int const* ldv,
+    float* Q, lapack_int const* ldq,
+    lapack_int* iwork,
+    float* tau,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zggsvp3 LAPACK_GLOBAL(zggsvp3,ZGGSVP3)
+void LAPACK_zggsvp3(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* p, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    double const* tola,
+    double const* tolb, lapack_int* k, lapack_int* l,
+    lapack_complex_double* U, lapack_int const* ldu,
+    lapack_complex_double* V, lapack_int const* ldv,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    lapack_int* iwork,
+    double* rwork,
+    lapack_complex_double* tau,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cgtcon LAPACK_GLOBAL(cgtcon,CGTCON)
+void LAPACK_cgtcon(
+    char const* norm,
+    lapack_int const* n,
+    lapack_complex_float const* DL,
+    lapack_complex_float const* D,
+    lapack_complex_float const* DU,
+    lapack_complex_float const* DU2, lapack_int const* ipiv,
+    float const* anorm,
+    float* rcond,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dgtcon LAPACK_GLOBAL(dgtcon,DGTCON)
+void LAPACK_dgtcon(
+    char const* norm,
+    lapack_int const* n,
+    double const* DL,
+    double const* D,
+    double const* DU,
+    double const* DU2, lapack_int const* ipiv,
+    double const* anorm,
+    double* rcond,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgtcon LAPACK_GLOBAL(sgtcon,SGTCON)
+void LAPACK_sgtcon(
+    char const* norm,
+    lapack_int const* n,
+    float const* DL,
+    float const* D,
+    float const* DU,
+    float const* DU2, lapack_int const* ipiv,
+    float const* anorm,
+    float* rcond,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgtcon LAPACK_GLOBAL(zgtcon,ZGTCON)
+void LAPACK_zgtcon(
+    char const* norm,
+    lapack_int const* n,
+    lapack_complex_double const* DL,
+    lapack_complex_double const* D,
+    lapack_complex_double const* DU,
+    lapack_complex_double const* DU2, lapack_int const* ipiv,
+    double const* anorm,
+    double* rcond,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_cgtrfs LAPACK_GLOBAL(cgtrfs,CGTRFS)
+void LAPACK_cgtrfs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* DL,
+    lapack_complex_float const* D,
+    lapack_complex_float const* DU,
+    lapack_complex_float const* DLF,
+    lapack_complex_float const* DF,
+    lapack_complex_float const* DUF,
+    lapack_complex_float const* DU2, lapack_int const* ipiv,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgtrfs LAPACK_GLOBAL(dgtrfs,DGTRFS)
+void LAPACK_dgtrfs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* DL,
+    double const* D,
+    double const* DU,
+    double const* DLF,
+    double const* DF,
+    double const* DUF,
+    double const* DU2, lapack_int const* ipiv,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgtrfs LAPACK_GLOBAL(sgtrfs,SGTRFS)
+void LAPACK_sgtrfs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* DL,
+    float const* D,
+    float const* DU,
+    float const* DLF,
+    float const* DF,
+    float const* DUF,
+    float const* DU2, lapack_int const* ipiv,
+    float const* B, lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgtrfs LAPACK_GLOBAL(zgtrfs,ZGTRFS)
+void LAPACK_zgtrfs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* DL,
+    lapack_complex_double const* D,
+    lapack_complex_double const* DU,
+    lapack_complex_double const* DLF,
+    lapack_complex_double const* DF,
+    lapack_complex_double const* DUF,
+    lapack_complex_double const* DU2, lapack_int const* ipiv,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgtsv LAPACK_GLOBAL(cgtsv,CGTSV)
+void LAPACK_cgtsv(
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* DL,
+    lapack_complex_float* D,
+    lapack_complex_float* DU,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dgtsv LAPACK_GLOBAL(dgtsv,DGTSV)
+void LAPACK_dgtsv(
+    lapack_int const* n, lapack_int const* nrhs,
+    double* DL,
+    double* D,
+    double* DU,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_sgtsv LAPACK_GLOBAL(sgtsv,SGTSV)
+void LAPACK_sgtsv(
+    lapack_int const* n, lapack_int const* nrhs,
+    float* DL,
+    float* D,
+    float* DU,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zgtsv LAPACK_GLOBAL(zgtsv,ZGTSV)
+void LAPACK_zgtsv(
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* DL,
+    lapack_complex_double* D,
+    lapack_complex_double* DU,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_cgtsvx LAPACK_GLOBAL(cgtsvx,CGTSVX)
+void LAPACK_cgtsvx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* DL,
+    lapack_complex_float const* D,
+    lapack_complex_float const* DU,
+    lapack_complex_float* DLF,
+    lapack_complex_float* DF,
+    lapack_complex_float* DUF,
+    lapack_complex_float* DU2, lapack_int* ipiv,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dgtsvx LAPACK_GLOBAL(dgtsvx,DGTSVX)
+void LAPACK_dgtsvx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* DL,
+    double const* D,
+    double const* DU,
+    double* DLF,
+    double* DF,
+    double* DUF,
+    double* DU2, lapack_int* ipiv,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sgtsvx LAPACK_GLOBAL(sgtsvx,SGTSVX)
+void LAPACK_sgtsvx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* DL,
+    float const* D,
+    float const* DU,
+    float* DLF,
+    float* DF,
+    float* DUF,
+    float* DU2, lapack_int* ipiv,
+    float const* B, lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zgtsvx LAPACK_GLOBAL(zgtsvx,ZGTSVX)
+void LAPACK_zgtsvx(
+    char const* fact, char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* DL,
+    lapack_complex_double const* D,
+    lapack_complex_double const* DU,
+    lapack_complex_double* DLF,
+    lapack_complex_double* DF,
+    lapack_complex_double* DUF,
+    lapack_complex_double* DU2, lapack_int* ipiv,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cgttrf LAPACK_GLOBAL(cgttrf,CGTTRF)
+void LAPACK_cgttrf(
+    lapack_int const* n,
+    lapack_complex_float* DL,
+    lapack_complex_float* D,
+    lapack_complex_float* DU,
+    lapack_complex_float* DU2, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_dgttrf LAPACK_GLOBAL(dgttrf,DGTTRF)
+void LAPACK_dgttrf(
+    lapack_int const* n,
+    double* DL,
+    double* D,
+    double* DU,
+    double* DU2, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_sgttrf LAPACK_GLOBAL(sgttrf,SGTTRF)
+void LAPACK_sgttrf(
+    lapack_int const* n,
+    float* DL,
+    float* D,
+    float* DU,
+    float* DU2, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_zgttrf LAPACK_GLOBAL(zgttrf,ZGTTRF)
+void LAPACK_zgttrf(
+    lapack_int const* n,
+    lapack_complex_double* DL,
+    lapack_complex_double* D,
+    lapack_complex_double* DU,
+    lapack_complex_double* DU2, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_cgttrs LAPACK_GLOBAL(cgttrs,CGTTRS)
+void LAPACK_cgttrs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* DL,
+    lapack_complex_float const* D,
+    lapack_complex_float const* DU,
+    lapack_complex_float const* DU2, lapack_int const* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dgttrs LAPACK_GLOBAL(dgttrs,DGTTRS)
+void LAPACK_dgttrs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* DL,
+    double const* D,
+    double const* DU,
+    double const* DU2, lapack_int const* ipiv,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_sgttrs LAPACK_GLOBAL(sgttrs,SGTTRS)
+void LAPACK_sgttrs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* DL,
+    float const* D,
+    float const* DU,
+    float const* DU2, lapack_int const* ipiv,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zgttrs LAPACK_GLOBAL(zgttrs,ZGTTRS)
+void LAPACK_zgttrs(
+    char const* trans,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* DL,
+    lapack_complex_double const* D,
+    lapack_complex_double const* DU,
+    lapack_complex_double const* DU2, lapack_int const* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_chbev LAPACK_GLOBAL(chbev,CHBEV)
+void LAPACK_chbev(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_zhbev LAPACK_GLOBAL(zhbev,ZHBEV)
+void LAPACK_zhbev(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_chbev_2stage LAPACK_GLOBAL(chbev_2stage,CHBEV_2STAGE)
+void LAPACK_chbev_2stage(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_zhbev_2stage LAPACK_GLOBAL(zhbev_2stage,ZHBEV_2STAGE)
+void LAPACK_zhbev_2stage(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_chbevd LAPACK_GLOBAL(chbevd,CHBEVD)
+void LAPACK_chbevd(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_zhbevd LAPACK_GLOBAL(zhbevd,ZHBEVD)
+void LAPACK_zhbevd(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_chbevd_2stage LAPACK_GLOBAL(chbevd_2stage,CHBEVD_2STAGE)
+void LAPACK_chbevd_2stage(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_zhbevd_2stage LAPACK_GLOBAL(zhbevd_2stage,ZHBEVD_2STAGE)
+void LAPACK_zhbevd_2stage(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_chbevx LAPACK_GLOBAL(chbevx,CHBEVX)
+void LAPACK_chbevx(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_zhbevx LAPACK_GLOBAL(zhbevx,ZHBEVX)
+void LAPACK_zhbevx(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_chbevx_2stage LAPACK_GLOBAL(chbevx_2stage,CHBEVX_2STAGE)
+void LAPACK_chbevx_2stage(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_zhbevx_2stage LAPACK_GLOBAL(zhbevx_2stage,ZHBEVX_2STAGE)
+void LAPACK_zhbevx_2stage(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_chbgst LAPACK_GLOBAL(chbgst,CHBGST)
+void LAPACK_chbgst(
+    char const* vect, char const* uplo,
+    lapack_int const* n, lapack_int const* ka, lapack_int const* kb,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    lapack_complex_float const* BB, lapack_int const* ldbb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_zhbgst LAPACK_GLOBAL(zhbgst,ZHBGST)
+void LAPACK_zhbgst(
+    char const* vect, char const* uplo,
+    lapack_int const* n, lapack_int const* ka, lapack_int const* kb,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    lapack_complex_double const* BB, lapack_int const* ldbb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_chbgv LAPACK_GLOBAL(chbgv,CHBGV)
+void LAPACK_chbgv(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* ka, lapack_int const* kb,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    lapack_complex_float* BB, lapack_int const* ldbb,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_zhbgv LAPACK_GLOBAL(zhbgv,ZHBGV)
+void LAPACK_zhbgv(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* ka, lapack_int const* kb,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    lapack_complex_double* BB, lapack_int const* ldbb,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_chbgvd LAPACK_GLOBAL(chbgvd,CHBGVD)
+void LAPACK_chbgvd(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* ka, lapack_int const* kb,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    lapack_complex_float* BB, lapack_int const* ldbb,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_zhbgvd LAPACK_GLOBAL(zhbgvd,ZHBGVD)
+void LAPACK_zhbgvd(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* ka, lapack_int const* kb,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    lapack_complex_double* BB, lapack_int const* ldbb,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_chbgvx LAPACK_GLOBAL(chbgvx,CHBGVX)
+void LAPACK_chbgvx(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n, lapack_int const* ka, lapack_int const* kb,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    lapack_complex_float* BB, lapack_int const* ldbb,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_zhbgvx LAPACK_GLOBAL(zhbgvx,ZHBGVX)
+void LAPACK_zhbgvx(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n, lapack_int const* ka, lapack_int const* kb,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    lapack_complex_double* BB, lapack_int const* ldbb,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_chbtrd LAPACK_GLOBAL(chbtrd,CHBTRD)
+void LAPACK_chbtrd(
+    char const* vect, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    float* D,
+    float* E,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_zhbtrd LAPACK_GLOBAL(zhbtrd,ZHBTRD)
+void LAPACK_zhbtrd(
+    char const* vect, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    double* D,
+    double* E,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_checon LAPACK_GLOBAL(checon,CHECON)
+void LAPACK_checon(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv,
+    float const* anorm,
+    float* rcond,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_zhecon LAPACK_GLOBAL(zhecon,ZHECON)
+void LAPACK_zhecon(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv,
+    double const* anorm,
+    double* rcond,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_checon_3 LAPACK_GLOBAL(checon_3,CHECON_3)
+void LAPACK_checon_3(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* E, lapack_int const* ipiv,
+    float const* anorm,
+    float* rcond,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_zhecon_3 LAPACK_GLOBAL(zhecon_3,ZHECON_3)
+void LAPACK_zhecon_3(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* E, lapack_int const* ipiv,
+    double const* anorm,
+    double* rcond,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_cheequb LAPACK_GLOBAL(cheequb,CHEEQUB)
+void LAPACK_cheequb(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    float* S,
+    float* scond,
+    float* amax,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_zheequb LAPACK_GLOBAL(zheequb,ZHEEQUB)
+void LAPACK_zheequb(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    double* S,
+    double* scond,
+    double* amax,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_cheev LAPACK_GLOBAL(cheev,CHEEV)
+void LAPACK_cheev(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float* W,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_zheev LAPACK_GLOBAL(zheev,ZHEEV)
+void LAPACK_zheev(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double* W,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cheev_2stage LAPACK_GLOBAL(cheev_2stage,CHEEV_2STAGE)
+void LAPACK_cheev_2stage(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float* W,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_zheev_2stage LAPACK_GLOBAL(zheev_2stage,ZHEEV_2STAGE)
+void LAPACK_zheev_2stage(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double* W,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cheevd LAPACK_GLOBAL(cheevd,CHEEVD)
+void LAPACK_cheevd(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float* W,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_zheevd LAPACK_GLOBAL(zheevd,ZHEEVD)
+void LAPACK_zheevd(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double* W,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_cheevd_2stage LAPACK_GLOBAL(cheevd_2stage,CHEEVD_2STAGE)
+void LAPACK_cheevd_2stage(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float* W,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_zheevd_2stage LAPACK_GLOBAL(zheevd_2stage,ZHEEVD_2STAGE)
+void LAPACK_zheevd_2stage(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double* W,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_cheevr LAPACK_GLOBAL(cheevr,CHEEVR)
+void LAPACK_cheevr(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz, lapack_int* ISUPPZ,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_zheevr LAPACK_GLOBAL(zheevr,ZHEEVR)
+void LAPACK_zheevr(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz, lapack_int* ISUPPZ,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_cheevr_2stage LAPACK_GLOBAL(cheevr_2stage,CHEEVR_2STAGE)
+void LAPACK_cheevr_2stage(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz, lapack_int* ISUPPZ,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_zheevr_2stage LAPACK_GLOBAL(zheevr_2stage,ZHEEVR_2STAGE)
+void LAPACK_zheevr_2stage(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz, lapack_int* ISUPPZ,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_cheevx LAPACK_GLOBAL(cheevx,CHEEVX)
+void LAPACK_cheevx(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_zheevx LAPACK_GLOBAL(zheevx,ZHEEVX)
+void LAPACK_zheevx(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_cheevx_2stage LAPACK_GLOBAL(cheevx_2stage,CHEEVX_2STAGE)
+void LAPACK_cheevx_2stage(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_zheevx_2stage LAPACK_GLOBAL(zheevx_2stage,ZHEEVX_2STAGE)
+void LAPACK_zheevx_2stage(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_chegst LAPACK_GLOBAL(chegst,CHEGST)
+void LAPACK_chegst(
+    lapack_int const* itype, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zhegst LAPACK_GLOBAL(zhegst,ZHEGST)
+void LAPACK_zhegst(
+    lapack_int const* itype, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_chegv LAPACK_GLOBAL(chegv,CHEGV)
+void LAPACK_chegv(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    float* W,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_zhegv LAPACK_GLOBAL(zhegv,ZHEGV)
+void LAPACK_zhegv(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    double* W,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_chegv_2stage LAPACK_GLOBAL(chegv_2stage,CHEGV_2STAGE)
+void LAPACK_chegv_2stage(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    float* W,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_zhegv_2stage LAPACK_GLOBAL(zhegv_2stage,ZHEGV_2STAGE)
+void LAPACK_zhegv_2stage(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    double* W,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_chegvd LAPACK_GLOBAL(chegvd,CHEGVD)
+void LAPACK_chegvd(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    float* W,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_zhegvd LAPACK_GLOBAL(zhegvd,ZHEGVD)
+void LAPACK_zhegvd(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    double* W,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_chegvx LAPACK_GLOBAL(chegvx,CHEGVX)
+void LAPACK_chegvx(
+    lapack_int const* itype, char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_zhegvx LAPACK_GLOBAL(zhegvx,ZHEGVX)
+void LAPACK_zhegvx(
+    lapack_int const* itype, char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_cherfs LAPACK_GLOBAL(cherfs,CHERFS)
+void LAPACK_cherfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_zherfs LAPACK_GLOBAL(zherfs,ZHERFS)
+void LAPACK_zherfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cherfsx LAPACK_GLOBAL(cherfsx,CHERFSX)
+void LAPACK_cherfsx(
+    char const* uplo, char const* equed,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    float* S,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_zherfsx LAPACK_GLOBAL(zherfsx,ZHERFSX)
+void LAPACK_zherfsx(
+    char const* uplo, char const* equed,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    double* S,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_chesv LAPACK_GLOBAL(chesv,CHESV)
+void LAPACK_chesv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zhesv LAPACK_GLOBAL(zhesv,ZHESV)
+void LAPACK_zhesv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_chesv_aa LAPACK_GLOBAL(chesv_aa,CHESV_AA)
+void LAPACK_chesv_aa(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zhesv_aa LAPACK_GLOBAL(zhesv_aa,ZHESV_AA)
+void LAPACK_zhesv_aa(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_chesv_aa_2stage LAPACK_GLOBAL(chesv_aa_2stage,CHESV_AA_2STAGE)
+void LAPACK_chesv_aa_2stage(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zhesv_aa_2stage LAPACK_GLOBAL(zhesv_aa_2stage,ZHESV_AA_2STAGE)
+void LAPACK_zhesv_aa_2stage(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_chesv_rk LAPACK_GLOBAL(chesv_rk,CHESV_RK)
+void LAPACK_chesv_rk(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* E, lapack_int* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zhesv_rk LAPACK_GLOBAL(zhesv_rk,ZHESV_RK)
+void LAPACK_zhesv_rk(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* E, lapack_int* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_chesv_rook LAPACK_GLOBAL(chesv_rook,CHESV_ROOK)
+void LAPACK_chesv_rook(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zhesv_rook LAPACK_GLOBAL(zhesv_rook,ZHESV_ROOK)
+void LAPACK_zhesv_rook(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_chesvx LAPACK_GLOBAL(chesvx,CHESVX)
+void LAPACK_chesvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float* AF, lapack_int const* ldaf, lapack_int* ipiv,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_zhesvx LAPACK_GLOBAL(zhesvx,ZHESVX)
+void LAPACK_zhesvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double* AF, lapack_int const* ldaf, lapack_int* ipiv,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_chesvxx LAPACK_GLOBAL(chesvxx,CHESVXX)
+void LAPACK_chesvxx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed,
+    float* S,
+    lapack_complex_float* B,
+    lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* rpvgrw,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_zhesvxx LAPACK_GLOBAL(zhesvxx,ZHESVXX)
+void LAPACK_zhesvxx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed,
+    double* S,
+    lapack_complex_double* B,
+    lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* rpvgrw,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cheswapr LAPACK_GLOBAL(cheswapr,CHESWAPR)
+void LAPACK_cheswapr(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int const* i1, lapack_int const* i2 );
+
+#define LAPACK_zheswapr LAPACK_GLOBAL(zheswapr,ZHESWAPR)
+void LAPACK_zheswapr(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int const* i1, lapack_int const* i2 );
+
+#define LAPACK_chetrd LAPACK_GLOBAL(chetrd,CHETRD)
+void LAPACK_chetrd(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float* D,
+    float* E,
+    lapack_complex_float* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zhetrd LAPACK_GLOBAL(zhetrd,ZHETRD)
+void LAPACK_zhetrd(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double* D,
+    double* E,
+    lapack_complex_double* tau,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_chetrd_2stage LAPACK_GLOBAL(chetrd_2stage,CHETRD_2STAGE)
+void LAPACK_chetrd_2stage(
+    char const* vect, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    float* D,
+    float* E,
+    lapack_complex_float* tau,
+    lapack_complex_float* HOUS2, lapack_int const* lhous2,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zhetrd_2stage LAPACK_GLOBAL(zhetrd_2stage,ZHETRD_2STAGE)
+void LAPACK_zhetrd_2stage(
+    char const* vect, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    double* D,
+    double* E,
+    lapack_complex_double* tau,
+    lapack_complex_double* HOUS2, lapack_int const* lhous2,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_chetrf LAPACK_GLOBAL(chetrf,CHETRF)
+void LAPACK_chetrf(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zhetrf LAPACK_GLOBAL(zhetrf,ZHETRF)
+void LAPACK_zhetrf(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_chetrf_aa LAPACK_GLOBAL(chetrf_aa,CHETRF_AA)
+void LAPACK_chetrf_aa(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zhetrf_aa LAPACK_GLOBAL(zhetrf_aa,ZHETRF_AA)
+void LAPACK_zhetrf_aa(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_chetrf_aa_2stage LAPACK_GLOBAL(chetrf_aa_2stage,CHETRF_AA_2STAGE)
+void LAPACK_chetrf_aa_2stage(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zhetrf_aa_2stage LAPACK_GLOBAL(zhetrf_aa_2stage,ZHETRF_AA_2STAGE)
+void LAPACK_zhetrf_aa_2stage(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_chetrf_rk LAPACK_GLOBAL(chetrf_rk,CHETRF_RK)
+void LAPACK_chetrf_rk(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* E, lapack_int* ipiv,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zhetrf_rk LAPACK_GLOBAL(zhetrf_rk,ZHETRF_RK)
+void LAPACK_zhetrf_rk(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* E, lapack_int* ipiv,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_chetrf_rook LAPACK_GLOBAL(chetrf_rook,CHETRF_ROOK)
+void LAPACK_chetrf_rook(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zhetrf_rook LAPACK_GLOBAL(zhetrf_rook,ZHETRF_ROOK)
+void LAPACK_zhetrf_rook(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_chetri LAPACK_GLOBAL(chetri,CHETRI)
+void LAPACK_chetri(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_zhetri LAPACK_GLOBAL(zhetri,ZHETRI)
+void LAPACK_zhetri(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_chetri2 LAPACK_GLOBAL(chetri2,CHETRI2)
+void LAPACK_chetri2(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zhetri2 LAPACK_GLOBAL(zhetri2,ZHETRI2)
+void LAPACK_zhetri2(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_chetri2x LAPACK_GLOBAL(chetri2x,CHETRI2X)
+void LAPACK_chetri2x(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_float* work, lapack_int const* nb,
+    lapack_int* info );
+
+#define LAPACK_zhetri2x LAPACK_GLOBAL(zhetri2x,ZHETRI2X)
+void LAPACK_zhetri2x(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_double* work, lapack_int const* nb,
+    lapack_int* info );
+
+#define LAPACK_chetri_3 LAPACK_GLOBAL(chetri_3,CHETRI_3)
+void LAPACK_chetri_3(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float const* E, lapack_int const* ipiv,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zhetri_3 LAPACK_GLOBAL(zhetri_3,ZHETRI_3)
+void LAPACK_zhetri_3(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double const* E, lapack_int const* ipiv,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_chetrs LAPACK_GLOBAL(chetrs,CHETRS)
+void LAPACK_chetrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zhetrs LAPACK_GLOBAL(zhetrs,ZHETRS)
+void LAPACK_zhetrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_chetrs2 LAPACK_GLOBAL(chetrs2,CHETRS2)
+void LAPACK_chetrs2(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_zhetrs2 LAPACK_GLOBAL(zhetrs2,ZHETRS2)
+void LAPACK_zhetrs2(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_chetrs_3 LAPACK_GLOBAL(chetrs_3,CHETRS_3)
+void LAPACK_chetrs_3(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* E, lapack_int const* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zhetrs_3 LAPACK_GLOBAL(zhetrs_3,ZHETRS_3)
+void LAPACK_zhetrs_3(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* E, lapack_int const* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_chetrs_aa LAPACK_GLOBAL(chetrs_aa,CHETRS_AA)
+void LAPACK_chetrs_aa(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zhetrs_aa LAPACK_GLOBAL(zhetrs_aa,ZHETRS_AA)
+void LAPACK_zhetrs_aa(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_chetrs_aa_2stage LAPACK_GLOBAL(chetrs_aa_2stage,CHETRS_AA_2STAGE)
+void LAPACK_chetrs_aa_2stage(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float* TB, lapack_int const* ltb, lapack_int const* ipiv, lapack_int const* ipiv2,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zhetrs_aa_2stage LAPACK_GLOBAL(zhetrs_aa_2stage,ZHETRS_AA_2STAGE)
+void LAPACK_zhetrs_aa_2stage(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double* TB, lapack_int const* ltb, lapack_int const* ipiv, lapack_int const* ipiv2,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_chetrs_rook LAPACK_GLOBAL(chetrs_rook,CHETRS_ROOK)
+void LAPACK_chetrs_rook(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zhetrs_rook LAPACK_GLOBAL(zhetrs_rook,ZHETRS_ROOK)
+void LAPACK_zhetrs_rook(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_chfrk LAPACK_GLOBAL(chfrk,CHFRK)
+void LAPACK_chfrk(
+    char const* transr, char const* uplo, char const* trans,
+    lapack_int const* n, lapack_int const* k,
+    float const* alpha,
+    lapack_complex_float const* A, lapack_int const* lda,
+    float const* beta,
+    lapack_complex_float* C );
+
+#define LAPACK_zhfrk LAPACK_GLOBAL(zhfrk,ZHFRK)
+void LAPACK_zhfrk(
+    char const* transr, char const* uplo, char const* trans,
+    lapack_int const* n, lapack_int const* k,
+    double const* alpha,
+    lapack_complex_double const* A, lapack_int const* lda,
+    double const* beta,
+    lapack_complex_double* C );
+
+#define LAPACK_chgeqz LAPACK_GLOBAL(chgeqz,CHGEQZ)
+void LAPACK_chgeqz(
+    char const* job, char const* compq, char const* compz,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    lapack_complex_float* H, lapack_int const* ldh,
+    lapack_complex_float* T, lapack_int const* ldt,
+    lapack_complex_float* alpha,
+    lapack_complex_float* beta,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dhgeqz LAPACK_GLOBAL(dhgeqz,DHGEQZ)
+void LAPACK_dhgeqz(
+    char const* job, char const* compq, char const* compz,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    double* H, lapack_int const* ldh,
+    double* T, lapack_int const* ldt,
+    double* alphar,
+    double* alphai,
+    double* beta,
+    double* Q, lapack_int const* ldq,
+    double* Z, lapack_int const* ldz,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_shgeqz LAPACK_GLOBAL(shgeqz,SHGEQZ)
+void LAPACK_shgeqz(
+    char const* job, char const* compq, char const* compz,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    float* H, lapack_int const* ldh,
+    float* T, lapack_int const* ldt,
+    float* alphar,
+    float* alphai,
+    float* beta,
+    float* Q, lapack_int const* ldq,
+    float* Z, lapack_int const* ldz,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zhgeqz LAPACK_GLOBAL(zhgeqz,ZHGEQZ)
+void LAPACK_zhgeqz(
+    char const* job, char const* compq, char const* compz,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    lapack_complex_double* H, lapack_int const* ldh,
+    lapack_complex_double* T, lapack_int const* ldt,
+    lapack_complex_double* alpha,
+    lapack_complex_double* beta,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_chpcon LAPACK_GLOBAL(chpcon,CHPCON)
+void LAPACK_chpcon(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* AP, lapack_int const* ipiv,
+    float const* anorm,
+    float* rcond,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_zhpcon LAPACK_GLOBAL(zhpcon,ZHPCON)
+void LAPACK_zhpcon(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* AP, lapack_int const* ipiv,
+    double const* anorm,
+    double* rcond,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_chpev LAPACK_GLOBAL(chpev,CHPEV)
+void LAPACK_chpev(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* AP,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_zhpev LAPACK_GLOBAL(zhpev,ZHPEV)
+void LAPACK_zhpev(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* AP,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_chpevd LAPACK_GLOBAL(chpevd,CHPEVD)
+void LAPACK_chpevd(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* AP,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_zhpevd LAPACK_GLOBAL(zhpevd,ZHPEVD)
+void LAPACK_zhpevd(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* AP,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_chpevx LAPACK_GLOBAL(chpevx,CHPEVX)
+void LAPACK_chpevx(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* AP,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_zhpevx LAPACK_GLOBAL(zhpevx,ZHPEVX)
+void LAPACK_zhpevx(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* AP,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_chpgst LAPACK_GLOBAL(chpgst,CHPGST)
+void LAPACK_chpgst(
+    lapack_int const* itype, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* AP,
+    lapack_complex_float const* BP,
+    lapack_int* info );
+
+#define LAPACK_zhpgst LAPACK_GLOBAL(zhpgst,ZHPGST)
+void LAPACK_zhpgst(
+    lapack_int const* itype, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* AP,
+    lapack_complex_double const* BP,
+    lapack_int* info );
+
+#define LAPACK_chpgv LAPACK_GLOBAL(chpgv,CHPGV)
+void LAPACK_chpgv(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* AP,
+    lapack_complex_float* BP,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_zhpgv LAPACK_GLOBAL(zhpgv,ZHPGV)
+void LAPACK_zhpgv(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* AP,
+    lapack_complex_double* BP,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_chpgvd LAPACK_GLOBAL(chpgvd,CHPGVD)
+void LAPACK_chpgvd(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* AP,
+    lapack_complex_float* BP,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_zhpgvd LAPACK_GLOBAL(zhpgvd,ZHPGVD)
+void LAPACK_zhpgvd(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* AP,
+    lapack_complex_double* BP,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_chpgvx LAPACK_GLOBAL(chpgvx,CHPGVX)
+void LAPACK_chpgvx(
+    lapack_int const* itype, char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* AP,
+    lapack_complex_float* BP,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_zhpgvx LAPACK_GLOBAL(zhpgvx,ZHPGVX)
+void LAPACK_zhpgvx(
+    lapack_int const* itype, char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* AP,
+    lapack_complex_double* BP,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_chprfs LAPACK_GLOBAL(chprfs,CHPRFS)
+void LAPACK_chprfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* AP,
+    lapack_complex_float const* AFP, lapack_int const* ipiv,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_zhprfs LAPACK_GLOBAL(zhprfs,ZHPRFS)
+void LAPACK_zhprfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* AP,
+    lapack_complex_double const* AFP, lapack_int const* ipiv,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_chpsv LAPACK_GLOBAL(chpsv,CHPSV)
+void LAPACK_chpsv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* AP, lapack_int* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zhpsv LAPACK_GLOBAL(zhpsv,ZHPSV)
+void LAPACK_zhpsv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* AP, lapack_int* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_chpsvx LAPACK_GLOBAL(chpsvx,CHPSVX)
+void LAPACK_chpsvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* AP,
+    lapack_complex_float* AFP, lapack_int* ipiv,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_zhpsvx LAPACK_GLOBAL(zhpsvx,ZHPSVX)
+void LAPACK_zhpsvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* AP,
+    lapack_complex_double* AFP, lapack_int* ipiv,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_chptrd LAPACK_GLOBAL(chptrd,CHPTRD)
+void LAPACK_chptrd(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* AP,
+    float* D,
+    float* E,
+    lapack_complex_float* tau,
+    lapack_int* info );
+
+#define LAPACK_zhptrd LAPACK_GLOBAL(zhptrd,ZHPTRD)
+void LAPACK_zhptrd(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* AP,
+    double* D,
+    double* E,
+    lapack_complex_double* tau,
+    lapack_int* info );
+
+#define LAPACK_chptrf LAPACK_GLOBAL(chptrf,CHPTRF)
+void LAPACK_chptrf(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* AP, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_zhptrf LAPACK_GLOBAL(zhptrf,ZHPTRF)
+void LAPACK_zhptrf(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* AP, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_chptri LAPACK_GLOBAL(chptri,CHPTRI)
+void LAPACK_chptri(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* AP, lapack_int const* ipiv,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_zhptri LAPACK_GLOBAL(zhptri,ZHPTRI)
+void LAPACK_zhptri(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* AP, lapack_int const* ipiv,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_chptrs LAPACK_GLOBAL(chptrs,CHPTRS)
+void LAPACK_chptrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* AP, lapack_int const* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zhptrs LAPACK_GLOBAL(zhptrs,ZHPTRS)
+void LAPACK_zhptrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* AP, lapack_int const* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_chsein LAPACK_GLOBAL(chsein,CHSEIN)
+void LAPACK_chsein(
+    char const* side, char const* eigsrc, char const* initv,
+    lapack_logical const* select,
+    lapack_int const* n,
+    lapack_complex_float const* H, lapack_int const* ldh,
+    lapack_complex_float* W,
+    lapack_complex_float* VL, lapack_int const* ldvl,
+    lapack_complex_float* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m,
+    lapack_complex_float* work,
+    float* rwork, lapack_int* IFAILL, lapack_int* IFAILR,
+    lapack_int* info );
+
+#define LAPACK_dhsein LAPACK_GLOBAL(dhsein,DHSEIN)
+void LAPACK_dhsein(
+    char const* side, char const* eigsrc, char const* initv,
+    lapack_logical* select,
+    lapack_int const* n,
+    double const* H, lapack_int const* ldh,
+    double* WR,
+    double const* WI,
+    double* VL, lapack_int const* ldvl,
+    double* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m,
+    double* work, lapack_int* IFAILL, lapack_int* IFAILR,
+    lapack_int* info );
+
+#define LAPACK_shsein LAPACK_GLOBAL(shsein,SHSEIN)
+void LAPACK_shsein(
+    char const* side, char const* eigsrc, char const* initv,
+    lapack_logical* select,
+    lapack_int const* n,
+    float const* H, lapack_int const* ldh,
+    float* WR,
+    float const* WI,
+    float* VL, lapack_int const* ldvl,
+    float* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m,
+    float* work, lapack_int* IFAILL, lapack_int* IFAILR,
+    lapack_int* info );
+
+#define LAPACK_zhsein LAPACK_GLOBAL(zhsein,ZHSEIN)
+void LAPACK_zhsein(
+    char const* side, char const* eigsrc, char const* initv,
+    lapack_logical const* select,
+    lapack_int const* n,
+    lapack_complex_double const* H, lapack_int const* ldh,
+    lapack_complex_double* W,
+    lapack_complex_double* VL, lapack_int const* ldvl,
+    lapack_complex_double* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m,
+    lapack_complex_double* work,
+    double* rwork, lapack_int* IFAILL, lapack_int* IFAILR,
+    lapack_int* info );
+
+#define LAPACK_chseqr LAPACK_GLOBAL(chseqr,CHSEQR)
+void LAPACK_chseqr(
+    char const* job, char const* compz,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    lapack_complex_float* H, lapack_int const* ldh,
+    lapack_complex_float* W,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dhseqr LAPACK_GLOBAL(dhseqr,DHSEQR)
+void LAPACK_dhseqr(
+    char const* job, char const* compz,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    double* H, lapack_int const* ldh,
+    double* WR,
+    double* WI,
+    double* Z, lapack_int const* ldz,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_shseqr LAPACK_GLOBAL(shseqr,SHSEQR)
+void LAPACK_shseqr(
+    char const* job, char const* compz,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    float* H, lapack_int const* ldh,
+    float* WR,
+    float* WI,
+    float* Z, lapack_int const* ldz,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zhseqr LAPACK_GLOBAL(zhseqr,ZHSEQR)
+void LAPACK_zhseqr(
+    char const* job, char const* compz,
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    lapack_complex_double* H, lapack_int const* ldh,
+    lapack_complex_double* W,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_clacgv LAPACK_GLOBAL(clacgv,CLACGV)
+void LAPACK_clacgv(
+    lapack_int const* n,
+    lapack_complex_float* X, lapack_int const* incx );
+
+#define LAPACK_zlacgv LAPACK_GLOBAL(zlacgv,ZLACGV)
+void LAPACK_zlacgv(
+    lapack_int const* n,
+    lapack_complex_double* X, lapack_int const* incx );
+
+#define LAPACK_clacn2 LAPACK_GLOBAL(clacn2,CLACN2)
+void LAPACK_clacn2(
+    lapack_int const* n,
+    lapack_complex_float* V,
+    lapack_complex_float* X,
+    float* est, lapack_int* kase, lapack_int* ISAVE );
+
+#define LAPACK_dlacn2 LAPACK_GLOBAL(dlacn2,DLACN2)
+void LAPACK_dlacn2(
+    lapack_int const* n,
+    double* V,
+    double* X, lapack_int* ISGN,
+    double* est, lapack_int* kase, lapack_int* ISAVE );
+
+#define LAPACK_slacn2 LAPACK_GLOBAL(slacn2,SLACN2)
+void LAPACK_slacn2(
+    lapack_int const* n,
+    float* V,
+    float* X, lapack_int* ISGN,
+    float* est, lapack_int* kase, lapack_int* ISAVE );
+
+#define LAPACK_zlacn2 LAPACK_GLOBAL(zlacn2,ZLACN2)
+void LAPACK_zlacn2(
+    lapack_int const* n,
+    lapack_complex_double* V,
+    lapack_complex_double* X,
+    double* est, lapack_int* kase, lapack_int* ISAVE );
+
+#define LAPACK_clacp2 LAPACK_GLOBAL(clacp2,CLACP2)
+void LAPACK_clacp2(
+    char const* uplo,
+    lapack_int const* m, lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb );
+
+#define LAPACK_zlacp2 LAPACK_GLOBAL(zlacp2,ZLACP2)
+void LAPACK_zlacp2(
+    char const* uplo,
+    lapack_int const* m, lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb );
+
+#define LAPACK_clacpy LAPACK_GLOBAL(clacpy,CLACPY)
+void LAPACK_clacpy(
+    char const* uplo,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb );
+
+#define LAPACK_dlacpy LAPACK_GLOBAL(dlacpy,DLACPY)
+void LAPACK_dlacpy(
+    char const* uplo,
+    lapack_int const* m, lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb );
+
+#define LAPACK_slacpy LAPACK_GLOBAL(slacpy,SLACPY)
+void LAPACK_slacpy(
+    char const* uplo,
+    lapack_int const* m, lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb );
+
+#define LAPACK_zlacpy LAPACK_GLOBAL(zlacpy,ZLACPY)
+void LAPACK_zlacpy(
+    char const* uplo,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb );
+
+#define LAPACK_clacrm LAPACK_GLOBAL(clacrm,CLACRM)
+void LAPACK_clacrm(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    float const* B, lapack_int const* ldb,
+    lapack_complex_float* C, lapack_int const* ldc,
+    float* rwork );
+
+#define LAPACK_zlacrm LAPACK_GLOBAL(zlacrm,ZLACRM)
+void LAPACK_zlacrm(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    double const* B, lapack_int const* ldb,
+    lapack_complex_double* C, lapack_int const* ldc,
+    double* rwork );
+
+#define LAPACK_zlag2c LAPACK_GLOBAL(zlag2c,ZLAG2C)
+void LAPACK_zlag2c(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_float* SA, lapack_int const* ldsa,
+    lapack_int* info );
+
+#define LAPACK_slag2d LAPACK_GLOBAL(slag2d,SLAG2D)
+void LAPACK_slag2d(
+    lapack_int const* m, lapack_int const* n,
+    float const* SA, lapack_int const* ldsa,
+    double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_dlag2s LAPACK_GLOBAL(dlag2s,DLAG2S)
+void LAPACK_dlag2s(
+    lapack_int const* m, lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    float* SA, lapack_int const* ldsa,
+    lapack_int* info );
+
+#define LAPACK_clag2z LAPACK_GLOBAL(clag2z,CLAG2Z)
+void LAPACK_clag2z(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float const* SA, lapack_int const* ldsa,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_clagge LAPACK_GLOBAL(clagge,CLAGGE)
+void LAPACK_clagge(
+    lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    float const* D,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* iseed,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dlagge LAPACK_GLOBAL(dlagge,DLAGGE)
+void LAPACK_dlagge(
+    lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    double const* D,
+    double* A, lapack_int const* lda, lapack_int* iseed,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_slagge LAPACK_GLOBAL(slagge,SLAGGE)
+void LAPACK_slagge(
+    lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    float const* D,
+    float* A, lapack_int const* lda, lapack_int* iseed,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zlagge LAPACK_GLOBAL(zlagge,ZLAGGE)
+void LAPACK_zlagge(
+    lapack_int const* m, lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    double const* D,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* iseed,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_claghe LAPACK_GLOBAL(claghe,CLAGHE)
+void LAPACK_claghe(
+    lapack_int const* n, lapack_int const* k,
+    float const* D,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* iseed,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_zlaghe LAPACK_GLOBAL(zlaghe,ZLAGHE)
+void LAPACK_zlaghe(
+    lapack_int const* n, lapack_int const* k,
+    double const* D,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* iseed,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_clagsy LAPACK_GLOBAL(clagsy,CLAGSY)
+void LAPACK_clagsy(
+    lapack_int const* n, lapack_int const* k,
+    float const* D,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* iseed,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dlagsy LAPACK_GLOBAL(dlagsy,DLAGSY)
+void LAPACK_dlagsy(
+    lapack_int const* n, lapack_int const* k,
+    double const* D,
+    double* A, lapack_int const* lda, lapack_int* iseed,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_slagsy LAPACK_GLOBAL(slagsy,SLAGSY)
+void LAPACK_slagsy(
+    lapack_int const* n, lapack_int const* k,
+    float const* D,
+    float* A, lapack_int const* lda, lapack_int* iseed,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zlagsy LAPACK_GLOBAL(zlagsy,ZLAGSY)
+void LAPACK_zlagsy(
+    lapack_int const* n, lapack_int const* k,
+    double const* D,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* iseed,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_dlamch LAPACK_GLOBAL(dlamch,DLAMCH)
+double LAPACK_dlamch(
+    char const* cmach );
+
+#define LAPACK_slamch LAPACK_GLOBAL(slamch,SLAMCH)
+lapack_float_return LAPACK_slamch(
+    char const* cmach );
+
+#define LAPACK_clangb LAPACK_GLOBAL(clangb,CLANGB)
+lapack_float_return LAPACK_clangb(
+    char const* norm,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    lapack_complex_float const* AB, lapack_int const* ldab,
+    float* work );
+
+#define LAPACK_dlangb LAPACK_GLOBAL(dlangb,DLANGB)
+double LAPACK_dlangb(
+    char const* norm,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    double const* AB, lapack_int const* ldab,
+    double* work );
+
+#define LAPACK_slangb LAPACK_GLOBAL(slangb,SLANGB)
+lapack_float_return LAPACK_slangb(
+    char const* norm,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    float const* AB, lapack_int const* ldab,
+    float* work );
+
+#define LAPACK_zlangb LAPACK_GLOBAL(zlangb,ZLANGB)
+double LAPACK_zlangb(
+    char const* norm,
+    lapack_int const* n, lapack_int const* kl, lapack_int const* ku,
+    lapack_complex_double const* AB, lapack_int const* ldab,
+    double* work );
+
+#define LAPACK_clange LAPACK_GLOBAL(clange,CLANGE)
+lapack_float_return LAPACK_clange(
+    char const* norm,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    float* work );
+
+#define LAPACK_dlange LAPACK_GLOBAL(dlange,DLANGE)
+double LAPACK_dlange(
+    char const* norm,
+    lapack_int const* m, lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double* work );
+
+#define LAPACK_slange LAPACK_GLOBAL(slange,SLANGE)
+lapack_float_return LAPACK_slange(
+    char const* norm,
+    lapack_int const* m, lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float* work );
+
+#define LAPACK_zlange LAPACK_GLOBAL(zlange,ZLANGE)
+double LAPACK_zlange(
+    char const* norm,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    double* work );
+
+#define LAPACK_clangt LAPACK_GLOBAL(clangt,CLANGT)
+lapack_float_return LAPACK_clangt(
+    char const* norm,
+    lapack_int const* n,
+    lapack_complex_float const* DL,
+    lapack_complex_float const* D,
+    lapack_complex_float const* DU );
+
+#define LAPACK_dlangt LAPACK_GLOBAL(dlangt,DLANGT)
+double LAPACK_dlangt(
+    char const* norm,
+    lapack_int const* n,
+    double const* DL,
+    double const* D,
+    double const* DU );
+
+#define LAPACK_slangt LAPACK_GLOBAL(slangt,SLANGT)
+lapack_float_return LAPACK_slangt(
+    char const* norm,
+    lapack_int const* n,
+    float const* DL,
+    float const* D,
+    float const* DU );
+
+#define LAPACK_zlangt LAPACK_GLOBAL(zlangt,ZLANGT)
+double LAPACK_zlangt(
+    char const* norm,
+    lapack_int const* n,
+    lapack_complex_double const* DL,
+    lapack_complex_double const* D,
+    lapack_complex_double const* DU );
+
+#define LAPACK_clanhb LAPACK_GLOBAL(clanhb,CLANHB)
+lapack_float_return LAPACK_clanhb(
+    char const* norm, char const* uplo,
+    lapack_int const* n, lapack_int const* k,
+    lapack_complex_float const* AB, lapack_int const* ldab,
+    float* work );
+
+#define LAPACK_zlanhb LAPACK_GLOBAL(zlanhb,ZLANHB)
+double LAPACK_zlanhb(
+    char const* norm, char const* uplo,
+    lapack_int const* n, lapack_int const* k,
+    lapack_complex_double const* AB, lapack_int const* ldab,
+    double* work );
+
+#define LAPACK_clanhe LAPACK_GLOBAL(clanhe,CLANHE)
+lapack_float_return LAPACK_clanhe(
+    char const* norm, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    float* work );
+
+#define LAPACK_zlanhe LAPACK_GLOBAL(zlanhe,ZLANHE)
+double LAPACK_zlanhe(
+    char const* norm, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    double* work );
+
+#define LAPACK_clanhp LAPACK_GLOBAL(clanhp,CLANHP)
+lapack_float_return LAPACK_clanhp(
+    char const* norm, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* AP,
+    float* work );
+
+#define LAPACK_zlanhp LAPACK_GLOBAL(zlanhp,ZLANHP)
+double LAPACK_zlanhp(
+    char const* norm, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* AP,
+    double* work );
+
+#define LAPACK_clanhs LAPACK_GLOBAL(clanhs,CLANHS)
+lapack_float_return LAPACK_clanhs(
+    char const* norm,
+    lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    float* work );
+
+#define LAPACK_dlanhs LAPACK_GLOBAL(dlanhs,DLANHS)
+double LAPACK_dlanhs(
+    char const* norm,
+    lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double* work );
+
+#define LAPACK_slanhs LAPACK_GLOBAL(slanhs,SLANHS)
+lapack_float_return LAPACK_slanhs(
+    char const* norm,
+    lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float* work );
+
+#define LAPACK_zlanhs LAPACK_GLOBAL(zlanhs,ZLANHS)
+double LAPACK_zlanhs(
+    char const* norm,
+    lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    double* work );
+
+#define LAPACK_clanht LAPACK_GLOBAL(clanht,CLANHT)
+lapack_float_return LAPACK_clanht(
+    char const* norm,
+    lapack_int const* n,
+    float const* D,
+    lapack_complex_float const* E );
+
+#define LAPACK_zlanht LAPACK_GLOBAL(zlanht,ZLANHT)
+double LAPACK_zlanht(
+    char const* norm,
+    lapack_int const* n,
+    double const* D,
+    lapack_complex_double const* E );
+
+#define LAPACK_clansb LAPACK_GLOBAL(clansb,CLANSB)
+lapack_float_return LAPACK_clansb(
+    char const* norm, char const* uplo,
+    lapack_int const* n, lapack_int const* k,
+    lapack_complex_float const* AB, lapack_int const* ldab,
+    float* work );
+
+#define LAPACK_dlansb LAPACK_GLOBAL(dlansb,DLANSB)
+double LAPACK_dlansb(
+    char const* norm, char const* uplo,
+    lapack_int const* n, lapack_int const* k,
+    double const* AB, lapack_int const* ldab,
+    double* work );
+
+#define LAPACK_slansb LAPACK_GLOBAL(slansb,SLANSB)
+lapack_float_return LAPACK_slansb(
+    char const* norm, char const* uplo,
+    lapack_int const* n, lapack_int const* k,
+    float const* AB, lapack_int const* ldab,
+    float* work );
+
+#define LAPACK_zlansb LAPACK_GLOBAL(zlansb,ZLANSB)
+double LAPACK_zlansb(
+    char const* norm, char const* uplo,
+    lapack_int const* n, lapack_int const* k,
+    lapack_complex_double const* AB, lapack_int const* ldab,
+    double* work );
+
+#define LAPACK_clansp LAPACK_GLOBAL(clansp,CLANSP)
+lapack_float_return LAPACK_clansp(
+    char const* norm, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* AP,
+    float* work );
+
+#define LAPACK_dlansp LAPACK_GLOBAL(dlansp,DLANSP)
+double LAPACK_dlansp(
+    char const* norm, char const* uplo,
+    lapack_int const* n,
+    double const* AP,
+    double* work );
+
+#define LAPACK_slansp LAPACK_GLOBAL(slansp,SLANSP)
+lapack_float_return LAPACK_slansp(
+    char const* norm, char const* uplo,
+    lapack_int const* n,
+    float const* AP,
+    float* work );
+
+#define LAPACK_zlansp LAPACK_GLOBAL(zlansp,ZLANSP)
+double LAPACK_zlansp(
+    char const* norm, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* AP,
+    double* work );
+
+#define LAPACK_dlanst LAPACK_GLOBAL(dlanst,DLANST)
+double LAPACK_dlanst(
+    char const* norm,
+    lapack_int const* n,
+    double const* D,
+    double const* E );
+
+#define LAPACK_slanst LAPACK_GLOBAL(slanst,SLANST)
+lapack_float_return LAPACK_slanst(
+    char const* norm,
+    lapack_int const* n,
+    float const* D,
+    float const* E );
+
+#define LAPACK_clansy LAPACK_GLOBAL(clansy,CLANSY)
+lapack_float_return LAPACK_clansy(
+    char const* norm, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    float* work );
+
+#define LAPACK_dlansy LAPACK_GLOBAL(dlansy,DLANSY)
+double LAPACK_dlansy(
+    char const* norm, char const* uplo,
+    lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double* work );
+
+#define LAPACK_slansy LAPACK_GLOBAL(slansy,SLANSY)
+lapack_float_return LAPACK_slansy(
+    char const* norm, char const* uplo,
+    lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float* work );
+
+#define LAPACK_zlansy LAPACK_GLOBAL(zlansy,ZLANSY)
+double LAPACK_zlansy(
+    char const* norm, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    double* work );
+
+#define LAPACK_clantb LAPACK_GLOBAL(clantb,CLANTB)
+lapack_float_return LAPACK_clantb(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n, lapack_int const* k,
+    lapack_complex_float const* AB, lapack_int const* ldab,
+    float* work );
+
+#define LAPACK_dlantb LAPACK_GLOBAL(dlantb,DLANTB)
+double LAPACK_dlantb(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n, lapack_int const* k,
+    double const* AB, lapack_int const* ldab,
+    double* work );
+
+#define LAPACK_slantb LAPACK_GLOBAL(slantb,SLANTB)
+lapack_float_return LAPACK_slantb(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n, lapack_int const* k,
+    float const* AB, lapack_int const* ldab,
+    float* work );
+
+#define LAPACK_zlantb LAPACK_GLOBAL(zlantb,ZLANTB)
+double LAPACK_zlantb(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n, lapack_int const* k,
+    lapack_complex_double const* AB, lapack_int const* ldab,
+    double* work );
+
+#define LAPACK_clantp LAPACK_GLOBAL(clantp,CLANTP)
+lapack_float_return LAPACK_clantp(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n,
+    lapack_complex_float const* AP,
+    float* work );
+
+#define LAPACK_dlantp LAPACK_GLOBAL(dlantp,DLANTP)
+double LAPACK_dlantp(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n,
+    double const* AP,
+    double* work );
+
+#define LAPACK_slantp LAPACK_GLOBAL(slantp,SLANTP)
+lapack_float_return LAPACK_slantp(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n,
+    float const* AP,
+    float* work );
+
+#define LAPACK_zlantp LAPACK_GLOBAL(zlantp,ZLANTP)
+double LAPACK_zlantp(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n,
+    lapack_complex_double const* AP,
+    double* work );
+
+#define LAPACK_clantr LAPACK_GLOBAL(clantr,CLANTR)
+lapack_float_return LAPACK_clantr(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    float* work );
+
+#define LAPACK_dlantr LAPACK_GLOBAL(dlantr,DLANTR)
+double LAPACK_dlantr(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* m, lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double* work );
+
+#define LAPACK_slantr LAPACK_GLOBAL(slantr,SLANTR)
+lapack_float_return LAPACK_slantr(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* m, lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float* work );
+
+#define LAPACK_zlantr LAPACK_GLOBAL(zlantr,ZLANTR)
+double LAPACK_zlantr(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    double* work );
+
+#define LAPACK_clapmr LAPACK_GLOBAL(clapmr,CLAPMR)
+void LAPACK_clapmr(
+    lapack_logical const* forwrd, lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* X, lapack_int const* ldx, lapack_int* K );
+
+#define LAPACK_dlapmr LAPACK_GLOBAL(dlapmr,DLAPMR)
+void LAPACK_dlapmr(
+    lapack_logical const* forwrd, lapack_int const* m, lapack_int const* n,
+    double* X, lapack_int const* ldx, lapack_int* K );
+
+#define LAPACK_slapmr LAPACK_GLOBAL(slapmr,SLAPMR)
+void LAPACK_slapmr(
+    lapack_logical const* forwrd, lapack_int const* m, lapack_int const* n,
+    float* X, lapack_int const* ldx, lapack_int* K );
+
+#define LAPACK_zlapmr LAPACK_GLOBAL(zlapmr,ZLAPMR)
+void LAPACK_zlapmr(
+    lapack_logical const* forwrd, lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* X, lapack_int const* ldx, lapack_int* K );
+
+#define LAPACK_clapmt LAPACK_GLOBAL(clapmt,CLAPMT)
+void LAPACK_clapmt(
+    lapack_logical const* forwrd, lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* X, lapack_int const* ldx, lapack_int* K );
+
+#define LAPACK_dlapmt LAPACK_GLOBAL(dlapmt,DLAPMT)
+void LAPACK_dlapmt(
+    lapack_logical const* forwrd, lapack_int const* m, lapack_int const* n,
+    double* X, lapack_int const* ldx, lapack_int* K );
+
+#define LAPACK_slapmt LAPACK_GLOBAL(slapmt,SLAPMT)
+void LAPACK_slapmt(
+    lapack_logical const* forwrd, lapack_int const* m, lapack_int const* n,
+    float* X, lapack_int const* ldx, lapack_int* K );
+
+#define LAPACK_zlapmt LAPACK_GLOBAL(zlapmt,ZLAPMT)
+void LAPACK_zlapmt(
+    lapack_logical const* forwrd, lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* X, lapack_int const* ldx, lapack_int* K );
+
+#define LAPACK_dlapy2 LAPACK_GLOBAL(dlapy2,DLAPY2)
+double LAPACK_dlapy2(
+    double const* x,
+    double const* y );
+
+#define LAPACK_slapy2 LAPACK_GLOBAL(slapy2,SLAPY2)
+lapack_float_return LAPACK_slapy2(
+    float const* x,
+    float const* y );
+
+#define LAPACK_dlapy3 LAPACK_GLOBAL(dlapy3,DLAPY3)
+double LAPACK_dlapy3(
+    double const* x,
+    double const* y,
+    double const* z );
+
+#define LAPACK_slapy3 LAPACK_GLOBAL(slapy3,SLAPY3)
+lapack_float_return LAPACK_slapy3(
+    float const* x,
+    float const* y,
+    float const* z );
+
+#define LAPACK_clarcm LAPACK_GLOBAL(clarcm,CLARCM)
+void LAPACK_clarcm(
+    lapack_int const* m, lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* C, lapack_int const* ldc,
+    float* rwork );
+
+#define LAPACK_zlarcm LAPACK_GLOBAL(zlarcm,ZLARCM)
+void LAPACK_zlarcm(
+    lapack_int const* m, lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* C, lapack_int const* ldc,
+    double* rwork );
+
+#define LAPACK_clarf LAPACK_GLOBAL(clarf,CLARF)
+void LAPACK_clarf(
+    char const* side,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float const* V, lapack_int const* incv,
+    lapack_complex_float const* tau,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float* work );
+
+#define LAPACK_dlarf LAPACK_GLOBAL(dlarf,DLARF)
+void LAPACK_dlarf(
+    char const* side,
+    lapack_int const* m, lapack_int const* n,
+    double const* V, lapack_int const* incv,
+    double const* tau,
+    double* C, lapack_int const* ldc,
+    double* work );
+
+#define LAPACK_slarf LAPACK_GLOBAL(slarf,SLARF)
+void LAPACK_slarf(
+    char const* side,
+    lapack_int const* m, lapack_int const* n,
+    float const* V, lapack_int const* incv,
+    float const* tau,
+    float* C, lapack_int const* ldc,
+    float* work );
+
+#define LAPACK_zlarf LAPACK_GLOBAL(zlarf,ZLARF)
+void LAPACK_zlarf(
+    char const* side,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double const* V, lapack_int const* incv,
+    lapack_complex_double const* tau,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double* work );
+
+#define LAPACK_clarfb LAPACK_GLOBAL(clarfb,CLARFB)
+void LAPACK_clarfb(
+    char const* side, char const* trans, char const* direct, char const* storev,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_float const* V, lapack_int const* ldv,
+    lapack_complex_float const* T, lapack_int const* ldt,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float* work, lapack_int const* ldwork );
+
+#define LAPACK_dlarfb LAPACK_GLOBAL(dlarfb,DLARFB)
+void LAPACK_dlarfb(
+    char const* side, char const* trans, char const* direct, char const* storev,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    double const* V, lapack_int const* ldv,
+    double const* T, lapack_int const* ldt,
+    double* C, lapack_int const* ldc,
+    double* work, lapack_int const* ldwork );
+
+#define LAPACK_slarfb LAPACK_GLOBAL(slarfb,SLARFB)
+void LAPACK_slarfb(
+    char const* side, char const* trans, char const* direct, char const* storev,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    float const* V, lapack_int const* ldv,
+    float const* T, lapack_int const* ldt,
+    float* C, lapack_int const* ldc,
+    float* work, lapack_int const* ldwork );
+
+#define LAPACK_zlarfb LAPACK_GLOBAL(zlarfb,ZLARFB)
+void LAPACK_zlarfb(
+    char const* side, char const* trans, char const* direct, char const* storev,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_double const* V, lapack_int const* ldv,
+    lapack_complex_double const* T, lapack_int const* ldt,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double* work, lapack_int const* ldwork );
+
+#define LAPACK_clarfg LAPACK_GLOBAL(clarfg,CLARFG)
+void LAPACK_clarfg(
+    lapack_int const* n,
+    lapack_complex_float* alpha,
+    lapack_complex_float* X, lapack_int const* incx,
+    lapack_complex_float* tau );
+
+#define LAPACK_dlarfg LAPACK_GLOBAL(dlarfg,DLARFG)
+void LAPACK_dlarfg(
+    lapack_int const* n,
+    double* alpha,
+    double* X, lapack_int const* incx,
+    double* tau );
+
+#define LAPACK_slarfg LAPACK_GLOBAL(slarfg,SLARFG)
+void LAPACK_slarfg(
+    lapack_int const* n,
+    float* alpha,
+    float* X, lapack_int const* incx,
+    float* tau );
+
+#define LAPACK_zlarfg LAPACK_GLOBAL(zlarfg,ZLARFG)
+void LAPACK_zlarfg(
+    lapack_int const* n,
+    lapack_complex_double* alpha,
+    lapack_complex_double* X, lapack_int const* incx,
+    lapack_complex_double* tau );
+
+#define LAPACK_clarft LAPACK_GLOBAL(clarft,CLARFT)
+void LAPACK_clarft(
+    char const* direct, char const* storev,
+    lapack_int const* n, lapack_int const* k,
+    lapack_complex_float const* V, lapack_int const* ldv,
+    lapack_complex_float const* tau,
+    lapack_complex_float* T, lapack_int const* ldt );
+
+#define LAPACK_dlarft LAPACK_GLOBAL(dlarft,DLARFT)
+void LAPACK_dlarft(
+    char const* direct, char const* storev,
+    lapack_int const* n, lapack_int const* k,
+    double const* V, lapack_int const* ldv,
+    double const* tau,
+    double* T, lapack_int const* ldt );
+
+#define LAPACK_slarft LAPACK_GLOBAL(slarft,SLARFT)
+void LAPACK_slarft(
+    char const* direct, char const* storev,
+    lapack_int const* n, lapack_int const* k,
+    float const* V, lapack_int const* ldv,
+    float const* tau,
+    float* T, lapack_int const* ldt );
+
+#define LAPACK_zlarft LAPACK_GLOBAL(zlarft,ZLARFT)
+void LAPACK_zlarft(
+    char const* direct, char const* storev,
+    lapack_int const* n, lapack_int const* k,
+    lapack_complex_double const* V, lapack_int const* ldv,
+    lapack_complex_double const* tau,
+    lapack_complex_double* T, lapack_int const* ldt );
+
+#define LAPACK_clarfx LAPACK_GLOBAL(clarfx,CLARFX)
+void LAPACK_clarfx(
+    char const* side,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float const* V,
+    lapack_complex_float const* tau,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float* work );
+
+#define LAPACK_dlarfx LAPACK_GLOBAL(dlarfx,DLARFX)
+void LAPACK_dlarfx(
+    char const* side,
+    lapack_int const* m, lapack_int const* n,
+    double const* V,
+    double const* tau,
+    double* C, lapack_int const* ldc,
+    double* work );
+
+#define LAPACK_slarfx LAPACK_GLOBAL(slarfx,SLARFX)
+void LAPACK_slarfx(
+    char const* side,
+    lapack_int const* m, lapack_int const* n,
+    float const* V,
+    float const* tau,
+    float* C, lapack_int const* ldc,
+    float* work );
+
+#define LAPACK_zlarfx LAPACK_GLOBAL(zlarfx,ZLARFX)
+void LAPACK_zlarfx(
+    char const* side,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double const* V,
+    lapack_complex_double const* tau,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double* work );
+
+#define LAPACK_clarnv LAPACK_GLOBAL(clarnv,CLARNV)
+void LAPACK_clarnv(
+    lapack_int const* idist, lapack_int* iseed, lapack_int const* n,
+    lapack_complex_float* X );
+
+#define LAPACK_dlarnv LAPACK_GLOBAL(dlarnv,DLARNV)
+void LAPACK_dlarnv(
+    lapack_int const* idist, lapack_int* iseed, lapack_int const* n,
+    double* X );
+
+#define LAPACK_slarnv LAPACK_GLOBAL(slarnv,SLARNV)
+void LAPACK_slarnv(
+    lapack_int const* idist, lapack_int* iseed, lapack_int const* n,
+    float* X );
+
+#define LAPACK_zlarnv LAPACK_GLOBAL(zlarnv,ZLARNV)
+void LAPACK_zlarnv(
+    lapack_int const* idist, lapack_int* iseed, lapack_int const* n,
+    lapack_complex_double* X );
+
+#define LAPACK_dlartgp LAPACK_GLOBAL(dlartgp,DLARTGP)
+void LAPACK_dlartgp(
+    double const* f,
+    double const* g,
+    double* cs,
+    double* sn,
+    double* r );
+
+#define LAPACK_slartgp LAPACK_GLOBAL(slartgp,SLARTGP)
+void LAPACK_slartgp(
+    float const* f,
+    float const* g,
+    float* cs,
+    float* sn,
+    float* r );
+
+#define LAPACK_dlartgs LAPACK_GLOBAL(dlartgs,DLARTGS)
+void LAPACK_dlartgs(
+    double const* x,
+    double const* y,
+    double const* sigma,
+    double* cs,
+    double* sn );
+
+#define LAPACK_slartgs LAPACK_GLOBAL(slartgs,SLARTGS)
+void LAPACK_slartgs(
+    float const* x,
+    float const* y,
+    float const* sigma,
+    float* cs,
+    float* sn );
+
+#define LAPACK_clascl LAPACK_GLOBAL(clascl,CLASCL)
+void LAPACK_clascl(
+    char const* type,
+    lapack_int const* kl, lapack_int const* ku,
+    float const* cfrom,
+    float const* cto, lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_dlascl LAPACK_GLOBAL(dlascl,DLASCL)
+void LAPACK_dlascl(
+    char const* type,
+    lapack_int const* kl, lapack_int const* ku,
+    double const* cfrom,
+    double const* cto, lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_slascl LAPACK_GLOBAL(slascl,SLASCL)
+void LAPACK_slascl(
+    char const* type,
+    lapack_int const* kl, lapack_int const* ku,
+    float const* cfrom,
+    float const* cto, lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_zlascl LAPACK_GLOBAL(zlascl,ZLASCL)
+void LAPACK_zlascl(
+    char const* type,
+    lapack_int const* kl, lapack_int const* ku,
+    double const* cfrom,
+    double const* cto, lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_claset LAPACK_GLOBAL(claset,CLASET)
+void LAPACK_claset(
+    char const* uplo,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float const* alpha,
+    lapack_complex_float const* beta,
+    lapack_complex_float* A, lapack_int const* lda );
+
+#define LAPACK_dlaset LAPACK_GLOBAL(dlaset,DLASET)
+void LAPACK_dlaset(
+    char const* uplo,
+    lapack_int const* m, lapack_int const* n,
+    double const* alpha,
+    double const* beta,
+    double* A, lapack_int const* lda );
+
+#define LAPACK_slaset LAPACK_GLOBAL(slaset,SLASET)
+void LAPACK_slaset(
+    char const* uplo,
+    lapack_int const* m, lapack_int const* n,
+    float const* alpha,
+    float const* beta,
+    float* A, lapack_int const* lda );
+
+#define LAPACK_zlaset LAPACK_GLOBAL(zlaset,ZLASET)
+void LAPACK_zlaset(
+    char const* uplo,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double const* alpha,
+    lapack_complex_double const* beta,
+    lapack_complex_double* A, lapack_int const* lda );
+
+#define LAPACK_dlasrt LAPACK_GLOBAL(dlasrt,DLASRT)
+void LAPACK_dlasrt(
+    char const* id,
+    lapack_int const* n,
+    double* D,
+    lapack_int* info );
+
+#define LAPACK_slasrt LAPACK_GLOBAL(slasrt,SLASRT)
+void LAPACK_slasrt(
+    char const* id,
+    lapack_int const* n,
+    float* D,
+    lapack_int* info );
+
+#define LAPACK_classq LAPACK_GLOBAL(classq,CLASSQ)
+void LAPACK_classq(
+    lapack_int const* n,
+    lapack_complex_float const* X, lapack_int const* incx,
+    float* scale,
+    float* sumsq );
+
+#define LAPACK_dlassq LAPACK_GLOBAL(dlassq,DLASSQ)
+void LAPACK_dlassq(
+    lapack_int const* n,
+    double const* X, lapack_int const* incx,
+    double* scale,
+    double* sumsq );
+
+#define LAPACK_slassq LAPACK_GLOBAL(slassq,SLASSQ)
+void LAPACK_slassq(
+    lapack_int const* n,
+    float const* X, lapack_int const* incx,
+    float* scale,
+    float* sumsq );
+
+#define LAPACK_zlassq LAPACK_GLOBAL(zlassq,ZLASSQ)
+void LAPACK_zlassq(
+    lapack_int const* n,
+    lapack_complex_double const* X, lapack_int const* incx,
+    double* scale,
+    double* sumsq );
+
+#define LAPACK_claswp LAPACK_GLOBAL(claswp,CLASWP)
+void LAPACK_claswp(
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int const* k1, lapack_int const* k2, lapack_int const* ipiv, lapack_int const* incx );
+
+#define LAPACK_dlaswp LAPACK_GLOBAL(dlaswp,DLASWP)
+void LAPACK_dlaswp(
+    lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int const* k1, lapack_int const* k2, lapack_int const* ipiv, lapack_int const* incx );
+
+#define LAPACK_slaswp LAPACK_GLOBAL(slaswp,SLASWP)
+void LAPACK_slaswp(
+    lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int const* k1, lapack_int const* k2, lapack_int const* ipiv, lapack_int const* incx );
+
+#define LAPACK_zlaswp LAPACK_GLOBAL(zlaswp,ZLASWP)
+void LAPACK_zlaswp(
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int const* k1, lapack_int const* k2, lapack_int const* ipiv, lapack_int const* incx );
+
+#define LAPACK_clatms LAPACK_GLOBAL(clatms,CLATMS)
+void LAPACK_clatms(
+    lapack_int const* m, lapack_int const* n, char const* dist,
+    lapack_int* iseed, char const* sym,
+    float* D,
+    lapack_int const* mode,
+    float const* cond,
+    float const* dmax, lapack_int const* kl, lapack_int const* ku, char const* pack,
+    lapack_complex_float* A,
+    lapack_int const* lda,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dlatms LAPACK_GLOBAL(dlatms,DLATMS)
+void LAPACK_dlatms(
+    lapack_int const* m, lapack_int const* n, char const* dist,
+    lapack_int* iseed, char const* sym,
+    double* D,
+    lapack_int const* mode,
+    double const* cond,
+    double const* dmax, lapack_int const* kl, lapack_int const* ku, char const* pack,
+    double* A,
+    lapack_int const* lda,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_slatms LAPACK_GLOBAL(slatms,SLATMS)
+void LAPACK_slatms(
+    lapack_int const* m, lapack_int const* n, char const* dist,
+    lapack_int* iseed, char const* sym,
+    float* D,
+    lapack_int const* mode,
+    float const* cond,
+    float const* dmax, lapack_int const* kl, lapack_int const* ku, char const* pack,
+    float* A,
+    lapack_int const* lda,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zlatms LAPACK_GLOBAL(zlatms,ZLATMS)
+void LAPACK_zlatms(
+    lapack_int const* m, lapack_int const* n, char const* dist,
+    lapack_int* iseed, char const* sym,
+    double* D,
+    lapack_int const* mode,
+    double const* cond,
+    double const* dmax, lapack_int const* kl, lapack_int const* ku, char const* pack,
+    lapack_complex_double* A,
+    lapack_int const* lda,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_clauum LAPACK_GLOBAL(clauum,CLAUUM)
+void LAPACK_clauum(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_dlauum LAPACK_GLOBAL(dlauum,DLAUUM)
+void LAPACK_dlauum(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_slauum LAPACK_GLOBAL(slauum,SLAUUM)
+void LAPACK_slauum(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_zlauum LAPACK_GLOBAL(zlauum,ZLAUUM)
+void LAPACK_zlauum(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_ilaver LAPACK_GLOBAL(ilaver,ILAVER)
+void LAPACK_ilaver(
+    lapack_int* vers_major, lapack_int* vers_minor, lapack_int* vers_patch );
+
+#define LAPACK_dopgtr LAPACK_GLOBAL(dopgtr,DOPGTR)
+void LAPACK_dopgtr(
+    char const* uplo,
+    lapack_int const* n,
+    double const* AP,
+    double const* tau,
+    double* Q, lapack_int const* ldq,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_sopgtr LAPACK_GLOBAL(sopgtr,SOPGTR)
+void LAPACK_sopgtr(
+    char const* uplo,
+    lapack_int const* n,
+    float const* AP,
+    float const* tau,
+    float* Q, lapack_int const* ldq,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_dopmtr LAPACK_GLOBAL(dopmtr,DOPMTR)
+void LAPACK_dopmtr(
+    char const* side, char const* uplo, char const* trans,
+    lapack_int const* m, lapack_int const* n,
+    double const* AP,
+    double const* tau,
+    double* C, lapack_int const* ldc,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_sopmtr LAPACK_GLOBAL(sopmtr,SOPMTR)
+void LAPACK_sopmtr(
+    char const* side, char const* uplo, char const* trans,
+    lapack_int const* m, lapack_int const* n,
+    float const* AP,
+    float const* tau,
+    float* C, lapack_int const* ldc,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_dorbdb LAPACK_GLOBAL(dorbdb,DORBDB)
+void LAPACK_dorbdb(
+    char const* trans, char const* signs,
+    lapack_int const* m, lapack_int const* p, lapack_int const* q,
+    double* X11, lapack_int const* ldx11,
+    double* X12, lapack_int const* ldx12,
+    double* X21, lapack_int const* ldx21,
+    double* X22, lapack_int const* ldx22,
+    double* theta,
+    double* phi,
+    double* TAUP1,
+    double* TAUP2,
+    double* TAUQ1,
+    double* TAUQ2,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sorbdb LAPACK_GLOBAL(sorbdb,SORBDB)
+void LAPACK_sorbdb(
+    char const* trans, char const* signs,
+    lapack_int const* m, lapack_int const* p, lapack_int const* q,
+    float* X11, lapack_int const* ldx11,
+    float* X12, lapack_int const* ldx12,
+    float* X21, lapack_int const* ldx21,
+    float* X22, lapack_int const* ldx22,
+    float* theta,
+    float* phi,
+    float* TAUP1,
+    float* TAUP2,
+    float* TAUQ1,
+    float* TAUQ2,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dorcsd LAPACK_GLOBAL(dorcsd,DORCSD)
+void LAPACK_dorcsd(
+    char const* jobu1, char const* jobu2, char const* jobv1t, char const* jobv2t, char const* trans, char const* signs,
+    lapack_int const* m, lapack_int const* p, lapack_int const* q,
+    double* X11, lapack_int const* ldx11,
+    double* X12, lapack_int const* ldx12,
+    double* X21, lapack_int const* ldx21,
+    double* X22, lapack_int const* ldx22,
+    double* theta,
+    double* U1, lapack_int const* ldu1,
+    double* U2, lapack_int const* ldu2,
+    double* V1T, lapack_int const* ldv1t,
+    double* V2T, lapack_int const* ldv2t,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sorcsd LAPACK_GLOBAL(sorcsd,SORCSD)
+void LAPACK_sorcsd(
+    char const* jobu1, char const* jobu2, char const* jobv1t, char const* jobv2t, char const* trans, char const* signs,
+    lapack_int const* m, lapack_int const* p, lapack_int const* q,
+    float* X11, lapack_int const* ldx11,
+    float* X12, lapack_int const* ldx12,
+    float* X21, lapack_int const* ldx21,
+    float* X22, lapack_int const* ldx22,
+    float* theta,
+    float* U1, lapack_int const* ldu1,
+    float* U2, lapack_int const* ldu2,
+    float* V1T, lapack_int const* ldv1t,
+    float* V2T, lapack_int const* ldv2t,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_dorcsd2by1 LAPACK_GLOBAL(dorcsd2by1,DORCSD2BY1)
+void LAPACK_dorcsd2by1(
+    char const* jobu1, char const* jobu2, char const* jobv1t,
+    lapack_int const* m, lapack_int const* p, lapack_int const* q,
+    double* X11, lapack_int const* ldx11,
+    double* X21, lapack_int const* ldx21,
+    double* theta,
+    double* U1, lapack_int const* ldu1,
+    double* U2, lapack_int const* ldu2,
+    double* V1T, lapack_int const* ldv1t,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sorcsd2by1 LAPACK_GLOBAL(sorcsd2by1,SORCSD2BY1)
+void LAPACK_sorcsd2by1(
+    char const* jobu1, char const* jobu2, char const* jobv1t,
+    lapack_int const* m, lapack_int const* p, lapack_int const* q,
+    float* X11, lapack_int const* ldx11,
+    float* X21, lapack_int const* ldx21,
+    float* theta,
+    float* U1, lapack_int const* ldu1,
+    float* U2, lapack_int const* ldu2,
+    float* V1T, lapack_int const* ldv1t,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_dorgbr LAPACK_GLOBAL(dorgbr,DORGBR)
+void LAPACK_dorgbr(
+    char const* vect,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    double* A, lapack_int const* lda,
+    double const* tau,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sorgbr LAPACK_GLOBAL(sorgbr,SORGBR)
+void LAPACK_sorgbr(
+    char const* vect,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    float* A, lapack_int const* lda,
+    float const* tau,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dorghr LAPACK_GLOBAL(dorghr,DORGHR)
+void LAPACK_dorghr(
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    double* A, lapack_int const* lda,
+    double const* tau,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sorghr LAPACK_GLOBAL(sorghr,SORGHR)
+void LAPACK_sorghr(
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    float* A, lapack_int const* lda,
+    float const* tau,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dorglq LAPACK_GLOBAL(dorglq,DORGLQ)
+void LAPACK_dorglq(
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    double* A, lapack_int const* lda,
+    double const* tau,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sorglq LAPACK_GLOBAL(sorglq,SORGLQ)
+void LAPACK_sorglq(
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    float* A, lapack_int const* lda,
+    float const* tau,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dorgql LAPACK_GLOBAL(dorgql,DORGQL)
+void LAPACK_dorgql(
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    double* A, lapack_int const* lda,
+    double const* tau,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sorgql LAPACK_GLOBAL(sorgql,SORGQL)
+void LAPACK_sorgql(
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    float* A, lapack_int const* lda,
+    float const* tau,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dorgqr LAPACK_GLOBAL(dorgqr,DORGQR)
+void LAPACK_dorgqr(
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    double* A, lapack_int const* lda,
+    double const* tau,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sorgqr LAPACK_GLOBAL(sorgqr,SORGQR)
+void LAPACK_sorgqr(
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    float* A, lapack_int const* lda,
+    float const* tau,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dorgrq LAPACK_GLOBAL(dorgrq,DORGRQ)
+void LAPACK_dorgrq(
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    double* A, lapack_int const* lda,
+    double const* tau,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sorgrq LAPACK_GLOBAL(sorgrq,SORGRQ)
+void LAPACK_sorgrq(
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    float* A, lapack_int const* lda,
+    float const* tau,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dorgtr LAPACK_GLOBAL(dorgtr,DORGTR)
+void LAPACK_dorgtr(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double const* tau,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sorgtr LAPACK_GLOBAL(sorgtr,SORGTR)
+void LAPACK_sorgtr(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float const* tau,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dormbr LAPACK_GLOBAL(dormbr,DORMBR)
+void LAPACK_dormbr(
+    char const* vect, char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    double const* A, lapack_int const* lda,
+    double const* tau,
+    double* C, lapack_int const* ldc,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sormbr LAPACK_GLOBAL(sormbr,SORMBR)
+void LAPACK_sormbr(
+    char const* vect, char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    float const* A, lapack_int const* lda,
+    float const* tau,
+    float* C, lapack_int const* ldc,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dormhr LAPACK_GLOBAL(dormhr,DORMHR)
+void LAPACK_dormhr(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    double const* A, lapack_int const* lda,
+    double const* tau,
+    double* C, lapack_int const* ldc,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sormhr LAPACK_GLOBAL(sormhr,SORMHR)
+void LAPACK_sormhr(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    float const* A, lapack_int const* lda,
+    float const* tau,
+    float* C, lapack_int const* ldc,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dormlq LAPACK_GLOBAL(dormlq,DORMLQ)
+void LAPACK_dormlq(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    double const* A, lapack_int const* lda,
+    double const* tau,
+    double* C, lapack_int const* ldc,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sormlq LAPACK_GLOBAL(sormlq,SORMLQ)
+void LAPACK_sormlq(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    float const* A, lapack_int const* lda,
+    float const* tau,
+    float* C, lapack_int const* ldc,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dormql LAPACK_GLOBAL(dormql,DORMQL)
+void LAPACK_dormql(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    double const* A, lapack_int const* lda,
+    double const* tau,
+    double* C, lapack_int const* ldc,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sormql LAPACK_GLOBAL(sormql,SORMQL)
+void LAPACK_sormql(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    float const* A, lapack_int const* lda,
+    float const* tau,
+    float* C, lapack_int const* ldc,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dormqr LAPACK_GLOBAL(dormqr,DORMQR)
+void LAPACK_dormqr(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    double const* A, lapack_int const* lda,
+    double const* tau,
+    double* C, lapack_int const* ldc,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sormqr LAPACK_GLOBAL(sormqr,SORMQR)
+void LAPACK_sormqr(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    float const* A, lapack_int const* lda,
+    float const* tau,
+    float* C, lapack_int const* ldc,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dormrq LAPACK_GLOBAL(dormrq,DORMRQ)
+void LAPACK_dormrq(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    double const* A, lapack_int const* lda,
+    double const* tau,
+    double* C, lapack_int const* ldc,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sormrq LAPACK_GLOBAL(sormrq,SORMRQ)
+void LAPACK_sormrq(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    float const* A, lapack_int const* lda,
+    float const* tau,
+    float* C, lapack_int const* ldc,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dormrz LAPACK_GLOBAL(dormrz,DORMRZ)
+void LAPACK_dormrz(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l,
+    double const* A, lapack_int const* lda,
+    double const* tau,
+    double* C, lapack_int const* ldc,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sormrz LAPACK_GLOBAL(sormrz,SORMRZ)
+void LAPACK_sormrz(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l,
+    float const* A, lapack_int const* lda,
+    float const* tau,
+    float* C, lapack_int const* ldc,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dormtr LAPACK_GLOBAL(dormtr,DORMTR)
+void LAPACK_dormtr(
+    char const* side, char const* uplo, char const* trans,
+    lapack_int const* m, lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double const* tau,
+    double* C, lapack_int const* ldc,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sormtr LAPACK_GLOBAL(sormtr,SORMTR)
+void LAPACK_sormtr(
+    char const* side, char const* uplo, char const* trans,
+    lapack_int const* m, lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float const* tau,
+    float* C, lapack_int const* ldc,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cpbcon LAPACK_GLOBAL(cpbcon,CPBCON)
+void LAPACK_cpbcon(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_float const* AB, lapack_int const* ldab,
+    float const* anorm,
+    float* rcond,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dpbcon LAPACK_GLOBAL(dpbcon,DPBCON)
+void LAPACK_dpbcon(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    double const* AB, lapack_int const* ldab,
+    double const* anorm,
+    double* rcond,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_spbcon LAPACK_GLOBAL(spbcon,SPBCON)
+void LAPACK_spbcon(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    float const* AB, lapack_int const* ldab,
+    float const* anorm,
+    float* rcond,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zpbcon LAPACK_GLOBAL(zpbcon,ZPBCON)
+void LAPACK_zpbcon(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_double const* AB, lapack_int const* ldab,
+    double const* anorm,
+    double* rcond,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cpbequ LAPACK_GLOBAL(cpbequ,CPBEQU)
+void LAPACK_cpbequ(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_float const* AB, lapack_int const* ldab,
+    float* S,
+    float* scond,
+    float* amax,
+    lapack_int* info );
+
+#define LAPACK_dpbequ LAPACK_GLOBAL(dpbequ,DPBEQU)
+void LAPACK_dpbequ(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    double const* AB, lapack_int const* ldab,
+    double* S,
+    double* scond,
+    double* amax,
+    lapack_int* info );
+
+#define LAPACK_spbequ LAPACK_GLOBAL(spbequ,SPBEQU)
+void LAPACK_spbequ(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    float const* AB, lapack_int const* ldab,
+    float* S,
+    float* scond,
+    float* amax,
+    lapack_int* info );
+
+#define LAPACK_zpbequ LAPACK_GLOBAL(zpbequ,ZPBEQU)
+void LAPACK_zpbequ(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_double const* AB, lapack_int const* ldab,
+    double* S,
+    double* scond,
+    double* amax,
+    lapack_int* info );
+
+#define LAPACK_cpbrfs LAPACK_GLOBAL(cpbrfs,CPBRFS)
+void LAPACK_cpbrfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    lapack_complex_float const* AB, lapack_int const* ldab,
+    lapack_complex_float const* AFB, lapack_int const* ldafb,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dpbrfs LAPACK_GLOBAL(dpbrfs,DPBRFS)
+void LAPACK_dpbrfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    double const* AB, lapack_int const* ldab,
+    double const* AFB, lapack_int const* ldafb,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_spbrfs LAPACK_GLOBAL(spbrfs,SPBRFS)
+void LAPACK_spbrfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    float const* AB, lapack_int const* ldab,
+    float const* AFB, lapack_int const* ldafb,
+    float const* B, lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zpbrfs LAPACK_GLOBAL(zpbrfs,ZPBRFS)
+void LAPACK_zpbrfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    lapack_complex_double const* AB, lapack_int const* ldab,
+    lapack_complex_double const* AFB, lapack_int const* ldafb,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cpbstf LAPACK_GLOBAL(cpbstf,CPBSTF)
+void LAPACK_cpbstf(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    lapack_int* info );
+
+#define LAPACK_dpbstf LAPACK_GLOBAL(dpbstf,DPBSTF)
+void LAPACK_dpbstf(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    double* AB, lapack_int const* ldab,
+    lapack_int* info );
+
+#define LAPACK_spbstf LAPACK_GLOBAL(spbstf,SPBSTF)
+void LAPACK_spbstf(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    float* AB, lapack_int const* ldab,
+    lapack_int* info );
+
+#define LAPACK_zpbstf LAPACK_GLOBAL(zpbstf,ZPBSTF)
+void LAPACK_zpbstf(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    lapack_int* info );
+
+#define LAPACK_cpbsv LAPACK_GLOBAL(cpbsv,CPBSV)
+void LAPACK_cpbsv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dpbsv LAPACK_GLOBAL(dpbsv,DPBSV)
+void LAPACK_dpbsv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    double* AB, lapack_int const* ldab,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_spbsv LAPACK_GLOBAL(spbsv,SPBSV)
+void LAPACK_spbsv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    float* AB, lapack_int const* ldab,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zpbsv LAPACK_GLOBAL(zpbsv,ZPBSV)
+void LAPACK_zpbsv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_cpbsvx LAPACK_GLOBAL(cpbsvx,CPBSVX)
+void LAPACK_cpbsvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    lapack_complex_float* AFB, lapack_int const* ldafb, char* equed,
+    float* S,
+    lapack_complex_float* B,
+    lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dpbsvx LAPACK_GLOBAL(dpbsvx,DPBSVX)
+void LAPACK_dpbsvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    double* AB, lapack_int const* ldab,
+    double* AFB, lapack_int const* ldafb, char* equed,
+    double* S,
+    double* B,
+    lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_spbsvx LAPACK_GLOBAL(spbsvx,SPBSVX)
+void LAPACK_spbsvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    float* AB, lapack_int const* ldab,
+    float* AFB, lapack_int const* ldafb, char* equed,
+    float* S,
+    float* B,
+    lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zpbsvx LAPACK_GLOBAL(zpbsvx,ZPBSVX)
+void LAPACK_zpbsvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    lapack_complex_double* AFB, lapack_int const* ldafb, char* equed,
+    double* S,
+    lapack_complex_double* B,
+    lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cpbtrf LAPACK_GLOBAL(cpbtrf,CPBTRF)
+void LAPACK_cpbtrf(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_float* AB, lapack_int const* ldab,
+    lapack_int* info );
+
+#define LAPACK_dpbtrf LAPACK_GLOBAL(dpbtrf,DPBTRF)
+void LAPACK_dpbtrf(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    double* AB, lapack_int const* ldab,
+    lapack_int* info );
+
+#define LAPACK_spbtrf LAPACK_GLOBAL(spbtrf,SPBTRF)
+void LAPACK_spbtrf(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    float* AB, lapack_int const* ldab,
+    lapack_int* info );
+
+#define LAPACK_zpbtrf LAPACK_GLOBAL(zpbtrf,ZPBTRF)
+void LAPACK_zpbtrf(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_double* AB, lapack_int const* ldab,
+    lapack_int* info );
+
+#define LAPACK_cpbtrs LAPACK_GLOBAL(cpbtrs,CPBTRS)
+void LAPACK_cpbtrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    lapack_complex_float const* AB, lapack_int const* ldab,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dpbtrs LAPACK_GLOBAL(dpbtrs,DPBTRS)
+void LAPACK_dpbtrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    double const* AB, lapack_int const* ldab,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_spbtrs LAPACK_GLOBAL(spbtrs,SPBTRS)
+void LAPACK_spbtrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    float const* AB, lapack_int const* ldab,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zpbtrs LAPACK_GLOBAL(zpbtrs,ZPBTRS)
+void LAPACK_zpbtrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    lapack_complex_double const* AB, lapack_int const* ldab,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_cpftrf LAPACK_GLOBAL(cpftrf,CPFTRF)
+void LAPACK_cpftrf(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A,
+    lapack_int* info );
+
+#define LAPACK_dpftrf LAPACK_GLOBAL(dpftrf,DPFTRF)
+void LAPACK_dpftrf(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    double* A,
+    lapack_int* info );
+
+#define LAPACK_spftrf LAPACK_GLOBAL(spftrf,SPFTRF)
+void LAPACK_spftrf(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    float* A,
+    lapack_int* info );
+
+#define LAPACK_zpftrf LAPACK_GLOBAL(zpftrf,ZPFTRF)
+void LAPACK_zpftrf(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A,
+    lapack_int* info );
+
+#define LAPACK_cpftri LAPACK_GLOBAL(cpftri,CPFTRI)
+void LAPACK_cpftri(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A,
+    lapack_int* info );
+
+#define LAPACK_dpftri LAPACK_GLOBAL(dpftri,DPFTRI)
+void LAPACK_dpftri(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    double* A,
+    lapack_int* info );
+
+#define LAPACK_spftri LAPACK_GLOBAL(spftri,SPFTRI)
+void LAPACK_spftri(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    float* A,
+    lapack_int* info );
+
+#define LAPACK_zpftri LAPACK_GLOBAL(zpftri,ZPFTRI)
+void LAPACK_zpftri(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A,
+    lapack_int* info );
+
+#define LAPACK_cpftrs LAPACK_GLOBAL(cpftrs,CPFTRS)
+void LAPACK_cpftrs(
+    char const* transr, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dpftrs LAPACK_GLOBAL(dpftrs,DPFTRS)
+void LAPACK_dpftrs(
+    char const* transr, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* A,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_spftrs LAPACK_GLOBAL(spftrs,SPFTRS)
+void LAPACK_spftrs(
+    char const* transr, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* A,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zpftrs LAPACK_GLOBAL(zpftrs,ZPFTRS)
+void LAPACK_zpftrs(
+    char const* transr, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_cpocon LAPACK_GLOBAL(cpocon,CPOCON)
+void LAPACK_cpocon(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    float const* anorm,
+    float* rcond,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dpocon LAPACK_GLOBAL(dpocon,DPOCON)
+void LAPACK_dpocon(
+    char const* uplo,
+    lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double const* anorm,
+    double* rcond,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_spocon LAPACK_GLOBAL(spocon,SPOCON)
+void LAPACK_spocon(
+    char const* uplo,
+    lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float const* anorm,
+    float* rcond,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zpocon LAPACK_GLOBAL(zpocon,ZPOCON)
+void LAPACK_zpocon(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    double const* anorm,
+    double* rcond,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cpoequ LAPACK_GLOBAL(cpoequ,CPOEQU)
+void LAPACK_cpoequ(
+    lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    float* S,
+    float* scond,
+    float* amax,
+    lapack_int* info );
+
+#define LAPACK_dpoequ LAPACK_GLOBAL(dpoequ,DPOEQU)
+void LAPACK_dpoequ(
+    lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double* S,
+    double* scond,
+    double* amax,
+    lapack_int* info );
+
+#define LAPACK_spoequ LAPACK_GLOBAL(spoequ,SPOEQU)
+void LAPACK_spoequ(
+    lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float* S,
+    float* scond,
+    float* amax,
+    lapack_int* info );
+
+#define LAPACK_zpoequ LAPACK_GLOBAL(zpoequ,ZPOEQU)
+void LAPACK_zpoequ(
+    lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    double* S,
+    double* scond,
+    double* amax,
+    lapack_int* info );
+
+#define LAPACK_cpoequb LAPACK_GLOBAL(cpoequb,CPOEQUB)
+void LAPACK_cpoequb(
+    lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    float* S,
+    float* scond,
+    float* amax,
+    lapack_int* info );
+
+#define LAPACK_dpoequb LAPACK_GLOBAL(dpoequb,DPOEQUB)
+void LAPACK_dpoequb(
+    lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double* S,
+    double* scond,
+    double* amax,
+    lapack_int* info );
+
+#define LAPACK_spoequb LAPACK_GLOBAL(spoequb,SPOEQUB)
+void LAPACK_spoequb(
+    lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float* S,
+    float* scond,
+    float* amax,
+    lapack_int* info );
+
+#define LAPACK_zpoequb LAPACK_GLOBAL(zpoequb,ZPOEQUB)
+void LAPACK_zpoequb(
+    lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    double* S,
+    double* scond,
+    double* amax,
+    lapack_int* info );
+
+#define LAPACK_cporfs LAPACK_GLOBAL(cporfs,CPORFS)
+void LAPACK_cporfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* AF, lapack_int const* ldaf,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dporfs LAPACK_GLOBAL(dporfs,DPORFS)
+void LAPACK_dporfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* A, lapack_int const* lda,
+    double const* AF, lapack_int const* ldaf,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sporfs LAPACK_GLOBAL(sporfs,SPORFS)
+void LAPACK_sporfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* A, lapack_int const* lda,
+    float const* AF, lapack_int const* ldaf,
+    float const* B, lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zporfs LAPACK_GLOBAL(zporfs,ZPORFS)
+void LAPACK_zporfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* AF, lapack_int const* ldaf,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cporfsx LAPACK_GLOBAL(cporfsx,CPORFSX)
+void LAPACK_cporfsx(
+    char const* uplo, char const* equed,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* AF, lapack_int const* ldaf,
+    float* S,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dporfsx LAPACK_GLOBAL(dporfsx,DPORFSX)
+void LAPACK_dporfsx(
+    char const* uplo, char const* equed,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* A, lapack_int const* lda,
+    double const* AF, lapack_int const* ldaf,
+    double* S,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* rcond,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sporfsx LAPACK_GLOBAL(sporfsx,SPORFSX)
+void LAPACK_sporfsx(
+    char const* uplo, char const* equed,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* A, lapack_int const* lda,
+    float const* AF, lapack_int const* ldaf,
+    float* S,
+    float const* B, lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* rcond,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zporfsx LAPACK_GLOBAL(zporfsx,ZPORFSX)
+void LAPACK_zporfsx(
+    char const* uplo, char const* equed,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* AF, lapack_int const* ldaf,
+    double* S,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cposv LAPACK_GLOBAL(cposv,CPOSV)
+void LAPACK_cposv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dposv LAPACK_GLOBAL(dposv,DPOSV)
+void LAPACK_dposv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_sposv LAPACK_GLOBAL(sposv,SPOSV)
+void LAPACK_sposv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zposv LAPACK_GLOBAL(zposv,ZPOSV)
+void LAPACK_zposv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dsposv LAPACK_GLOBAL(dsposv,DSPOSV)
+void LAPACK_dsposv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* work,
+    float* swork, lapack_int* iter,
+    lapack_int* info );
+
+#define LAPACK_zcposv LAPACK_GLOBAL(zcposv,ZCPOSV)
+void LAPACK_zcposv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    lapack_complex_double* work,
+    lapack_complex_float* swork,
+    double* rwork, lapack_int* iter,
+    lapack_int* info );
+
+#define LAPACK_cposvx LAPACK_GLOBAL(cposvx,CPOSVX)
+void LAPACK_cposvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* AF, lapack_int const* ldaf, char* equed,
+    float* S,
+    lapack_complex_float* B,
+    lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dposvx LAPACK_GLOBAL(dposvx,DPOSVX)
+void LAPACK_dposvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda,
+    double* AF, lapack_int const* ldaf, char* equed,
+    double* S,
+    double* B,
+    lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sposvx LAPACK_GLOBAL(sposvx,SPOSVX)
+void LAPACK_sposvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda,
+    float* AF, lapack_int const* ldaf, char* equed,
+    float* S,
+    float* B,
+    lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zposvx LAPACK_GLOBAL(zposvx,ZPOSVX)
+void LAPACK_zposvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* AF, lapack_int const* ldaf, char* equed,
+    double* S,
+    lapack_complex_double* B,
+    lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cposvxx LAPACK_GLOBAL(cposvxx,CPOSVXX)
+void LAPACK_cposvxx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* AF, lapack_int const* ldaf, char* equed,
+    float* S,
+    lapack_complex_float* B,
+    lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* rpvgrw,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dposvxx LAPACK_GLOBAL(dposvxx,DPOSVXX)
+void LAPACK_dposvxx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda,
+    double* AF, lapack_int const* ldaf, char* equed,
+    double* S,
+    double* B,
+    lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* rcond,
+    double* rpvgrw,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sposvxx LAPACK_GLOBAL(sposvxx,SPOSVXX)
+void LAPACK_sposvxx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda,
+    float* AF, lapack_int const* ldaf, char* equed,
+    float* S,
+    float* B,
+    lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* rcond,
+    float* rpvgrw,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zposvxx LAPACK_GLOBAL(zposvxx,ZPOSVXX)
+void LAPACK_zposvxx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* AF, lapack_int const* ldaf, char* equed,
+    double* S,
+    lapack_complex_double* B,
+    lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* rpvgrw,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cpotf2 LAPACK_GLOBAL(cpotf2,CPOTF2)
+void LAPACK_cpotf2(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_dpotf2 LAPACK_GLOBAL(dpotf2,DPOTF2)
+void LAPACK_dpotf2(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_spotf2 LAPACK_GLOBAL(spotf2,SPOTF2)
+void LAPACK_spotf2(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_zpotf2 LAPACK_GLOBAL(zpotf2,ZPOTF2)
+void LAPACK_zpotf2(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_cpotrf LAPACK_GLOBAL(cpotrf,CPOTRF)
+void LAPACK_cpotrf(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_dpotrf LAPACK_GLOBAL(dpotrf,DPOTRF)
+void LAPACK_dpotrf(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_spotrf LAPACK_GLOBAL(spotrf,SPOTRF)
+void LAPACK_spotrf(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_zpotrf LAPACK_GLOBAL(zpotrf,ZPOTRF)
+void LAPACK_zpotrf(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_cpotrf2 LAPACK_GLOBAL(cpotrf2,CPOTRF2)
+void LAPACK_cpotrf2(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_dpotrf2 LAPACK_GLOBAL(dpotrf2,DPOTRF2)
+void LAPACK_dpotrf2(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_spotrf2 LAPACK_GLOBAL(spotrf2,SPOTRF2)
+void LAPACK_spotrf2(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_zpotrf2 LAPACK_GLOBAL(zpotrf2,ZPOTRF2)
+void LAPACK_zpotrf2(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_cpotri LAPACK_GLOBAL(cpotri,CPOTRI)
+void LAPACK_cpotri(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_dpotri LAPACK_GLOBAL(dpotri,DPOTRI)
+void LAPACK_dpotri(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_spotri LAPACK_GLOBAL(spotri,SPOTRI)
+void LAPACK_spotri(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_zpotri LAPACK_GLOBAL(zpotri,ZPOTRI)
+void LAPACK_zpotri(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_cpotrs LAPACK_GLOBAL(cpotrs,CPOTRS)
+void LAPACK_cpotrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dpotrs LAPACK_GLOBAL(dpotrs,DPOTRS)
+void LAPACK_dpotrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_spotrs LAPACK_GLOBAL(spotrs,SPOTRS)
+void LAPACK_spotrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zpotrs LAPACK_GLOBAL(zpotrs,ZPOTRS)
+void LAPACK_zpotrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_cppcon LAPACK_GLOBAL(cppcon,CPPCON)
+void LAPACK_cppcon(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* AP,
+    float const* anorm,
+    float* rcond,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dppcon LAPACK_GLOBAL(dppcon,DPPCON)
+void LAPACK_dppcon(
+    char const* uplo,
+    lapack_int const* n,
+    double const* AP,
+    double const* anorm,
+    double* rcond,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sppcon LAPACK_GLOBAL(sppcon,SPPCON)
+void LAPACK_sppcon(
+    char const* uplo,
+    lapack_int const* n,
+    float const* AP,
+    float const* anorm,
+    float* rcond,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zppcon LAPACK_GLOBAL(zppcon,ZPPCON)
+void LAPACK_zppcon(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* AP,
+    double const* anorm,
+    double* rcond,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cppequ LAPACK_GLOBAL(cppequ,CPPEQU)
+void LAPACK_cppequ(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* AP,
+    float* S,
+    float* scond,
+    float* amax,
+    lapack_int* info );
+
+#define LAPACK_dppequ LAPACK_GLOBAL(dppequ,DPPEQU)
+void LAPACK_dppequ(
+    char const* uplo,
+    lapack_int const* n,
+    double const* AP,
+    double* S,
+    double* scond,
+    double* amax,
+    lapack_int* info );
+
+#define LAPACK_sppequ LAPACK_GLOBAL(sppequ,SPPEQU)
+void LAPACK_sppequ(
+    char const* uplo,
+    lapack_int const* n,
+    float const* AP,
+    float* S,
+    float* scond,
+    float* amax,
+    lapack_int* info );
+
+#define LAPACK_zppequ LAPACK_GLOBAL(zppequ,ZPPEQU)
+void LAPACK_zppequ(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* AP,
+    double* S,
+    double* scond,
+    double* amax,
+    lapack_int* info );
+
+#define LAPACK_cpprfs LAPACK_GLOBAL(cpprfs,CPPRFS)
+void LAPACK_cpprfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* AP,
+    lapack_complex_float const* AFP,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dpprfs LAPACK_GLOBAL(dpprfs,DPPRFS)
+void LAPACK_dpprfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* AP,
+    double const* AFP,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_spprfs LAPACK_GLOBAL(spprfs,SPPRFS)
+void LAPACK_spprfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* AP,
+    float const* AFP,
+    float const* B, lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zpprfs LAPACK_GLOBAL(zpprfs,ZPPRFS)
+void LAPACK_zpprfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* AP,
+    lapack_complex_double const* AFP,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cppsv LAPACK_GLOBAL(cppsv,CPPSV)
+void LAPACK_cppsv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* AP,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dppsv LAPACK_GLOBAL(dppsv,DPPSV)
+void LAPACK_dppsv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double* AP,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_sppsv LAPACK_GLOBAL(sppsv,SPPSV)
+void LAPACK_sppsv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float* AP,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zppsv LAPACK_GLOBAL(zppsv,ZPPSV)
+void LAPACK_zppsv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* AP,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_cppsvx LAPACK_GLOBAL(cppsvx,CPPSVX)
+void LAPACK_cppsvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* AP,
+    lapack_complex_float* AFP, char* equed,
+    float* S,
+    lapack_complex_float* B,
+    lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dppsvx LAPACK_GLOBAL(dppsvx,DPPSVX)
+void LAPACK_dppsvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double* AP,
+    double* AFP, char* equed,
+    double* S,
+    double* B,
+    lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sppsvx LAPACK_GLOBAL(sppsvx,SPPSVX)
+void LAPACK_sppsvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float* AP,
+    float* AFP, char* equed,
+    float* S,
+    float* B,
+    lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zppsvx LAPACK_GLOBAL(zppsvx,ZPPSVX)
+void LAPACK_zppsvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* AP,
+    lapack_complex_double* AFP, char* equed,
+    double* S,
+    lapack_complex_double* B,
+    lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cpptrf LAPACK_GLOBAL(cpptrf,CPPTRF)
+void LAPACK_cpptrf(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* AP,
+    lapack_int* info );
+
+#define LAPACK_dpptrf LAPACK_GLOBAL(dpptrf,DPPTRF)
+void LAPACK_dpptrf(
+    char const* uplo,
+    lapack_int const* n,
+    double* AP,
+    lapack_int* info );
+
+#define LAPACK_spptrf LAPACK_GLOBAL(spptrf,SPPTRF)
+void LAPACK_spptrf(
+    char const* uplo,
+    lapack_int const* n,
+    float* AP,
+    lapack_int* info );
+
+#define LAPACK_zpptrf LAPACK_GLOBAL(zpptrf,ZPPTRF)
+void LAPACK_zpptrf(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* AP,
+    lapack_int* info );
+
+#define LAPACK_cpptri LAPACK_GLOBAL(cpptri,CPPTRI)
+void LAPACK_cpptri(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* AP,
+    lapack_int* info );
+
+#define LAPACK_dpptri LAPACK_GLOBAL(dpptri,DPPTRI)
+void LAPACK_dpptri(
+    char const* uplo,
+    lapack_int const* n,
+    double* AP,
+    lapack_int* info );
+
+#define LAPACK_spptri LAPACK_GLOBAL(spptri,SPPTRI)
+void LAPACK_spptri(
+    char const* uplo,
+    lapack_int const* n,
+    float* AP,
+    lapack_int* info );
+
+#define LAPACK_zpptri LAPACK_GLOBAL(zpptri,ZPPTRI)
+void LAPACK_zpptri(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* AP,
+    lapack_int* info );
+
+#define LAPACK_cpptrs LAPACK_GLOBAL(cpptrs,CPPTRS)
+void LAPACK_cpptrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* AP,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dpptrs LAPACK_GLOBAL(dpptrs,DPPTRS)
+void LAPACK_dpptrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* AP,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_spptrs LAPACK_GLOBAL(spptrs,SPPTRS)
+void LAPACK_spptrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* AP,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zpptrs LAPACK_GLOBAL(zpptrs,ZPPTRS)
+void LAPACK_zpptrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* AP,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_cpstrf LAPACK_GLOBAL(cpstrf,CPSTRF)
+void LAPACK_cpstrf(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* piv, lapack_int* rank,
+    float const* tol,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_dpstrf LAPACK_GLOBAL(dpstrf,DPSTRF)
+void LAPACK_dpstrf(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int* piv, lapack_int* rank,
+    double const* tol,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_spstrf LAPACK_GLOBAL(spstrf,SPSTRF)
+void LAPACK_spstrf(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int* piv, lapack_int* rank,
+    float const* tol,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zpstrf LAPACK_GLOBAL(zpstrf,ZPSTRF)
+void LAPACK_zpstrf(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* piv, lapack_int* rank,
+    double const* tol,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_cptcon LAPACK_GLOBAL(cptcon,CPTCON)
+void LAPACK_cptcon(
+    lapack_int const* n,
+    float const* D,
+    lapack_complex_float const* E,
+    float const* anorm,
+    float* rcond,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dptcon LAPACK_GLOBAL(dptcon,DPTCON)
+void LAPACK_dptcon(
+    lapack_int const* n,
+    double const* D,
+    double const* E,
+    double const* anorm,
+    double* rcond,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_sptcon LAPACK_GLOBAL(sptcon,SPTCON)
+void LAPACK_sptcon(
+    lapack_int const* n,
+    float const* D,
+    float const* E,
+    float const* anorm,
+    float* rcond,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zptcon LAPACK_GLOBAL(zptcon,ZPTCON)
+void LAPACK_zptcon(
+    lapack_int const* n,
+    double const* D,
+    lapack_complex_double const* E,
+    double const* anorm,
+    double* rcond,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cpteqr LAPACK_GLOBAL(cpteqr,CPTEQR)
+void LAPACK_cpteqr(
+    char const* compz,
+    lapack_int const* n,
+    float* D,
+    float* E,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_dpteqr LAPACK_GLOBAL(dpteqr,DPTEQR)
+void LAPACK_dpteqr(
+    char const* compz,
+    lapack_int const* n,
+    double* D,
+    double* E,
+    double* Z, lapack_int const* ldz,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_spteqr LAPACK_GLOBAL(spteqr,SPTEQR)
+void LAPACK_spteqr(
+    char const* compz,
+    lapack_int const* n,
+    float* D,
+    float* E,
+    float* Z, lapack_int const* ldz,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zpteqr LAPACK_GLOBAL(zpteqr,ZPTEQR)
+void LAPACK_zpteqr(
+    char const* compz,
+    lapack_int const* n,
+    double* D,
+    double* E,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_cptrfs LAPACK_GLOBAL(cptrfs,CPTRFS)
+void LAPACK_cptrfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* D,
+    lapack_complex_float const* E,
+    float const* DF,
+    lapack_complex_float const* EF,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dptrfs LAPACK_GLOBAL(dptrfs,DPTRFS)
+void LAPACK_dptrfs(
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* D,
+    double const* E,
+    double const* DF,
+    double const* EF,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_sptrfs LAPACK_GLOBAL(sptrfs,SPTRFS)
+void LAPACK_sptrfs(
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* D,
+    float const* E,
+    float const* DF,
+    float const* EF,
+    float const* B, lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zptrfs LAPACK_GLOBAL(zptrfs,ZPTRFS)
+void LAPACK_zptrfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* D,
+    lapack_complex_double const* E,
+    double const* DF,
+    lapack_complex_double const* EF,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cptsv LAPACK_GLOBAL(cptsv,CPTSV)
+void LAPACK_cptsv(
+    lapack_int const* n, lapack_int const* nrhs,
+    float* D,
+    lapack_complex_float* E,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dptsv LAPACK_GLOBAL(dptsv,DPTSV)
+void LAPACK_dptsv(
+    lapack_int const* n, lapack_int const* nrhs,
+    double* D,
+    double* E,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_sptsv LAPACK_GLOBAL(sptsv,SPTSV)
+void LAPACK_sptsv(
+    lapack_int const* n, lapack_int const* nrhs,
+    float* D,
+    float* E,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zptsv LAPACK_GLOBAL(zptsv,ZPTSV)
+void LAPACK_zptsv(
+    lapack_int const* n, lapack_int const* nrhs,
+    double* D,
+    lapack_complex_double* E,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_cptsvx LAPACK_GLOBAL(cptsvx,CPTSVX)
+void LAPACK_cptsvx(
+    char const* fact,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* D,
+    lapack_complex_float const* E,
+    float* DF,
+    lapack_complex_float* EF,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dptsvx LAPACK_GLOBAL(dptsvx,DPTSVX)
+void LAPACK_dptsvx(
+    char const* fact,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* D,
+    double const* E,
+    double* DF,
+    double* EF,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_sptsvx LAPACK_GLOBAL(sptsvx,SPTSVX)
+void LAPACK_sptsvx(
+    char const* fact,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* D,
+    float const* E,
+    float* DF,
+    float* EF,
+    float const* B, lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zptsvx LAPACK_GLOBAL(zptsvx,ZPTSVX)
+void LAPACK_zptsvx(
+    char const* fact,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* D,
+    lapack_complex_double const* E,
+    double* DF,
+    lapack_complex_double* EF,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cpttrf LAPACK_GLOBAL(cpttrf,CPTTRF)
+void LAPACK_cpttrf(
+    lapack_int const* n,
+    float* D,
+    lapack_complex_float* E,
+    lapack_int* info );
+
+#define LAPACK_dpttrf LAPACK_GLOBAL(dpttrf,DPTTRF)
+void LAPACK_dpttrf(
+    lapack_int const* n,
+    double* D,
+    double* E,
+    lapack_int* info );
+
+#define LAPACK_spttrf LAPACK_GLOBAL(spttrf,SPTTRF)
+void LAPACK_spttrf(
+    lapack_int const* n,
+    float* D,
+    float* E,
+    lapack_int* info );
+
+#define LAPACK_zpttrf LAPACK_GLOBAL(zpttrf,ZPTTRF)
+void LAPACK_zpttrf(
+    lapack_int const* n,
+    double* D,
+    lapack_complex_double* E,
+    lapack_int* info );
+
+#define LAPACK_cpttrs LAPACK_GLOBAL(cpttrs,CPTTRS)
+void LAPACK_cpttrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* D,
+    lapack_complex_float const* E,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dpttrs LAPACK_GLOBAL(dpttrs,DPTTRS)
+void LAPACK_dpttrs(
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* D,
+    double const* E,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_spttrs LAPACK_GLOBAL(spttrs,SPTTRS)
+void LAPACK_spttrs(
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* D,
+    float const* E,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zpttrs LAPACK_GLOBAL(zpttrs,ZPTTRS)
+void LAPACK_zpttrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* D,
+    lapack_complex_double const* E,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dsbev LAPACK_GLOBAL(dsbev,DSBEV)
+void LAPACK_dsbev(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    double* AB, lapack_int const* ldab,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_ssbev LAPACK_GLOBAL(ssbev,SSBEV)
+void LAPACK_ssbev(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    float* AB, lapack_int const* ldab,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_dsbev_2stage LAPACK_GLOBAL(dsbev_2stage,DSBEV_2STAGE)
+void LAPACK_dsbev_2stage(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    double* AB, lapack_int const* ldab,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssbev_2stage LAPACK_GLOBAL(ssbev_2stage,SSBEV_2STAGE)
+void LAPACK_ssbev_2stage(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    float* AB, lapack_int const* ldab,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsbevd LAPACK_GLOBAL(dsbevd,DSBEVD)
+void LAPACK_dsbevd(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    double* AB, lapack_int const* ldab,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_ssbevd LAPACK_GLOBAL(ssbevd,SSBEVD)
+void LAPACK_ssbevd(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    float* AB, lapack_int const* ldab,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dsbevd_2stage LAPACK_GLOBAL(dsbevd_2stage,DSBEVD_2STAGE)
+void LAPACK_dsbevd_2stage(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    double* AB, lapack_int const* ldab,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_ssbevd_2stage LAPACK_GLOBAL(ssbevd_2stage,SSBEVD_2STAGE)
+void LAPACK_ssbevd_2stage(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    float* AB, lapack_int const* ldab,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dsbevx LAPACK_GLOBAL(dsbevx,DSBEVX)
+void LAPACK_dsbevx(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    double* AB, lapack_int const* ldab,
+    double* Q, lapack_int const* ldq,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_ssbevx LAPACK_GLOBAL(ssbevx,SSBEVX)
+void LAPACK_ssbevx(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    float* AB, lapack_int const* ldab,
+    float* Q, lapack_int const* ldq,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_dsbevx_2stage LAPACK_GLOBAL(dsbevx_2stage,DSBEVX_2STAGE)
+void LAPACK_dsbevx_2stage(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    double* AB, lapack_int const* ldab,
+    double* Q, lapack_int const* ldq,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_ssbevx_2stage LAPACK_GLOBAL(ssbevx_2stage,SSBEVX_2STAGE)
+void LAPACK_ssbevx_2stage(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    float* AB, lapack_int const* ldab,
+    float* Q, lapack_int const* ldq,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_dsbgst LAPACK_GLOBAL(dsbgst,DSBGST)
+void LAPACK_dsbgst(
+    char const* vect, char const* uplo,
+    lapack_int const* n, lapack_int const* ka, lapack_int const* kb,
+    double* AB, lapack_int const* ldab,
+    double const* BB, lapack_int const* ldbb,
+    double* X, lapack_int const* ldx,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_ssbgst LAPACK_GLOBAL(ssbgst,SSBGST)
+void LAPACK_ssbgst(
+    char const* vect, char const* uplo,
+    lapack_int const* n, lapack_int const* ka, lapack_int const* kb,
+    float* AB, lapack_int const* ldab,
+    float const* BB, lapack_int const* ldbb,
+    float* X, lapack_int const* ldx,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_dsbgv LAPACK_GLOBAL(dsbgv,DSBGV)
+void LAPACK_dsbgv(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* ka, lapack_int const* kb,
+    double* AB, lapack_int const* ldab,
+    double* BB, lapack_int const* ldbb,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_ssbgv LAPACK_GLOBAL(ssbgv,SSBGV)
+void LAPACK_ssbgv(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* ka, lapack_int const* kb,
+    float* AB, lapack_int const* ldab,
+    float* BB, lapack_int const* ldbb,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_dsbgvd LAPACK_GLOBAL(dsbgvd,DSBGVD)
+void LAPACK_dsbgvd(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* ka, lapack_int const* kb,
+    double* AB, lapack_int const* ldab,
+    double* BB, lapack_int const* ldbb,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_ssbgvd LAPACK_GLOBAL(ssbgvd,SSBGVD)
+void LAPACK_ssbgvd(
+    char const* jobz, char const* uplo,
+    lapack_int const* n, lapack_int const* ka, lapack_int const* kb,
+    float* AB, lapack_int const* ldab,
+    float* BB, lapack_int const* ldbb,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dsbgvx LAPACK_GLOBAL(dsbgvx,DSBGVX)
+void LAPACK_dsbgvx(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n, lapack_int const* ka, lapack_int const* kb,
+    double* AB, lapack_int const* ldab,
+    double* BB, lapack_int const* ldbb,
+    double* Q, lapack_int const* ldq,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_ssbgvx LAPACK_GLOBAL(ssbgvx,SSBGVX)
+void LAPACK_ssbgvx(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n, lapack_int const* ka, lapack_int const* kb,
+    float* AB, lapack_int const* ldab,
+    float* BB, lapack_int const* ldbb,
+    float* Q, lapack_int const* ldq,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_dsbtrd LAPACK_GLOBAL(dsbtrd,DSBTRD)
+void LAPACK_dsbtrd(
+    char const* vect, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    double* AB, lapack_int const* ldab,
+    double* D,
+    double* E,
+    double* Q, lapack_int const* ldq,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_ssbtrd LAPACK_GLOBAL(ssbtrd,SSBTRD)
+void LAPACK_ssbtrd(
+    char const* vect, char const* uplo,
+    lapack_int const* n, lapack_int const* kd,
+    float* AB, lapack_int const* ldab,
+    float* D,
+    float* E,
+    float* Q, lapack_int const* ldq,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_dsfrk LAPACK_GLOBAL(dsfrk,DSFRK)
+void LAPACK_dsfrk(
+    char const* transr, char const* uplo, char const* trans,
+    lapack_int const* n, lapack_int const* k,
+    double const* alpha,
+    double const* A, lapack_int const* lda,
+    double const* beta,
+    double* C );
+
+#define LAPACK_ssfrk LAPACK_GLOBAL(ssfrk,SSFRK)
+void LAPACK_ssfrk(
+    char const* transr, char const* uplo, char const* trans,
+    lapack_int const* n, lapack_int const* k,
+    float const* alpha,
+    float const* A, lapack_int const* lda,
+    float const* beta,
+    float* C );
+
+#define LAPACK_cspcon LAPACK_GLOBAL(cspcon,CSPCON)
+void LAPACK_cspcon(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* AP, lapack_int const* ipiv,
+    float const* anorm,
+    float* rcond,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dspcon LAPACK_GLOBAL(dspcon,DSPCON)
+void LAPACK_dspcon(
+    char const* uplo,
+    lapack_int const* n,
+    double const* AP, lapack_int const* ipiv,
+    double const* anorm,
+    double* rcond,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sspcon LAPACK_GLOBAL(sspcon,SSPCON)
+void LAPACK_sspcon(
+    char const* uplo,
+    lapack_int const* n,
+    float const* AP, lapack_int const* ipiv,
+    float const* anorm,
+    float* rcond,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zspcon LAPACK_GLOBAL(zspcon,ZSPCON)
+void LAPACK_zspcon(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* AP, lapack_int const* ipiv,
+    double const* anorm,
+    double* rcond,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_dspev LAPACK_GLOBAL(dspev,DSPEV)
+void LAPACK_dspev(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    double* AP,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_sspev LAPACK_GLOBAL(sspev,SSPEV)
+void LAPACK_sspev(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    float* AP,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_dspevd LAPACK_GLOBAL(dspevd,DSPEVD)
+void LAPACK_dspevd(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    double* AP,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_sspevd LAPACK_GLOBAL(sspevd,SSPEVD)
+void LAPACK_sspevd(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    float* AP,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dspevx LAPACK_GLOBAL(dspevx,DSPEVX)
+void LAPACK_dspevx(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    double* AP,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_sspevx LAPACK_GLOBAL(sspevx,SSPEVX)
+void LAPACK_sspevx(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    float* AP,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_dspgst LAPACK_GLOBAL(dspgst,DSPGST)
+void LAPACK_dspgst(
+    lapack_int const* itype, char const* uplo,
+    lapack_int const* n,
+    double* AP,
+    double const* BP,
+    lapack_int* info );
+
+#define LAPACK_sspgst LAPACK_GLOBAL(sspgst,SSPGST)
+void LAPACK_sspgst(
+    lapack_int const* itype, char const* uplo,
+    lapack_int const* n,
+    float* AP,
+    float const* BP,
+    lapack_int* info );
+
+#define LAPACK_dspgv LAPACK_GLOBAL(dspgv,DSPGV)
+void LAPACK_dspgv(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    double* AP,
+    double* BP,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_sspgv LAPACK_GLOBAL(sspgv,SSPGV)
+void LAPACK_sspgv(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    float* AP,
+    float* BP,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_dspgvd LAPACK_GLOBAL(dspgvd,DSPGVD)
+void LAPACK_dspgvd(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    double* AP,
+    double* BP,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_sspgvd LAPACK_GLOBAL(sspgvd,SSPGVD)
+void LAPACK_sspgvd(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    float* AP,
+    float* BP,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dspgvx LAPACK_GLOBAL(dspgvx,DSPGVX)
+void LAPACK_dspgvx(
+    lapack_int const* itype, char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    double* AP,
+    double* BP,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_sspgvx LAPACK_GLOBAL(sspgvx,SSPGVX)
+void LAPACK_sspgvx(
+    lapack_int const* itype, char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    float* AP,
+    float* BP,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_csprfs LAPACK_GLOBAL(csprfs,CSPRFS)
+void LAPACK_csprfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* AP,
+    lapack_complex_float const* AFP, lapack_int const* ipiv,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dsprfs LAPACK_GLOBAL(dsprfs,DSPRFS)
+void LAPACK_dsprfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* AP,
+    double const* AFP, lapack_int const* ipiv,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ssprfs LAPACK_GLOBAL(ssprfs,SSPRFS)
+void LAPACK_ssprfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* AP,
+    float const* AFP, lapack_int const* ipiv,
+    float const* B, lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zsprfs LAPACK_GLOBAL(zsprfs,ZSPRFS)
+void LAPACK_zsprfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* AP,
+    lapack_complex_double const* AFP, lapack_int const* ipiv,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_cspsv LAPACK_GLOBAL(cspsv,CSPSV)
+void LAPACK_cspsv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* AP, lapack_int* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dspsv LAPACK_GLOBAL(dspsv,DSPSV)
+void LAPACK_dspsv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double* AP, lapack_int* ipiv,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_sspsv LAPACK_GLOBAL(sspsv,SSPSV)
+void LAPACK_sspsv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float* AP, lapack_int* ipiv,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zspsv LAPACK_GLOBAL(zspsv,ZSPSV)
+void LAPACK_zspsv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* AP, lapack_int* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_cspsvx LAPACK_GLOBAL(cspsvx,CSPSVX)
+void LAPACK_cspsvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* AP,
+    lapack_complex_float* AFP, lapack_int* ipiv,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dspsvx LAPACK_GLOBAL(dspsvx,DSPSVX)
+void LAPACK_dspsvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* AP,
+    double* AFP, lapack_int* ipiv,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sspsvx LAPACK_GLOBAL(sspsvx,SSPSVX)
+void LAPACK_sspsvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* AP,
+    float* AFP, lapack_int* ipiv,
+    float const* B, lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zspsvx LAPACK_GLOBAL(zspsvx,ZSPSVX)
+void LAPACK_zspsvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* AP,
+    lapack_complex_double* AFP, lapack_int* ipiv,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_dsptrd LAPACK_GLOBAL(dsptrd,DSPTRD)
+void LAPACK_dsptrd(
+    char const* uplo,
+    lapack_int const* n,
+    double* AP,
+    double* D,
+    double* E,
+    double* tau,
+    lapack_int* info );
+
+#define LAPACK_ssptrd LAPACK_GLOBAL(ssptrd,SSPTRD)
+void LAPACK_ssptrd(
+    char const* uplo,
+    lapack_int const* n,
+    float* AP,
+    float* D,
+    float* E,
+    float* tau,
+    lapack_int* info );
+
+#define LAPACK_csptrf LAPACK_GLOBAL(csptrf,CSPTRF)
+void LAPACK_csptrf(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* AP, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_dsptrf LAPACK_GLOBAL(dsptrf,DSPTRF)
+void LAPACK_dsptrf(
+    char const* uplo,
+    lapack_int const* n,
+    double* AP, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_ssptrf LAPACK_GLOBAL(ssptrf,SSPTRF)
+void LAPACK_ssptrf(
+    char const* uplo,
+    lapack_int const* n,
+    float* AP, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_zsptrf LAPACK_GLOBAL(zsptrf,ZSPTRF)
+void LAPACK_zsptrf(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* AP, lapack_int* ipiv,
+    lapack_int* info );
+
+#define LAPACK_csptri LAPACK_GLOBAL(csptri,CSPTRI)
+void LAPACK_csptri(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* AP, lapack_int const* ipiv,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dsptri LAPACK_GLOBAL(dsptri,DSPTRI)
+void LAPACK_dsptri(
+    char const* uplo,
+    lapack_int const* n,
+    double* AP, lapack_int const* ipiv,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_ssptri LAPACK_GLOBAL(ssptri,SSPTRI)
+void LAPACK_ssptri(
+    char const* uplo,
+    lapack_int const* n,
+    float* AP, lapack_int const* ipiv,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zsptri LAPACK_GLOBAL(zsptri,ZSPTRI)
+void LAPACK_zsptri(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* AP, lapack_int const* ipiv,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_csptrs LAPACK_GLOBAL(csptrs,CSPTRS)
+void LAPACK_csptrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* AP, lapack_int const* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dsptrs LAPACK_GLOBAL(dsptrs,DSPTRS)
+void LAPACK_dsptrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* AP, lapack_int const* ipiv,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_ssptrs LAPACK_GLOBAL(ssptrs,SSPTRS)
+void LAPACK_ssptrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* AP, lapack_int const* ipiv,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zsptrs LAPACK_GLOBAL(zsptrs,ZSPTRS)
+void LAPACK_zsptrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* AP, lapack_int const* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dstebz LAPACK_GLOBAL(dstebz,DSTEBZ)
+void LAPACK_dstebz(
+    char const* range, char const* order,
+    lapack_int const* n,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol,
+    double const* D,
+    double const* E, lapack_int* m, lapack_int* nsplit,
+    double* W, lapack_int* IBLOCK, lapack_int* ISPLIT,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_sstebz LAPACK_GLOBAL(sstebz,SSTEBZ)
+void LAPACK_sstebz(
+    char const* range, char const* order,
+    lapack_int const* n,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol,
+    float const* D,
+    float const* E, lapack_int* m, lapack_int* nsplit,
+    float* W, lapack_int* IBLOCK, lapack_int* ISPLIT,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_cstedc LAPACK_GLOBAL(cstedc,CSTEDC)
+void LAPACK_cstedc(
+    char const* compz,
+    lapack_int const* n,
+    float* D,
+    float* E,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dstedc LAPACK_GLOBAL(dstedc,DSTEDC)
+void LAPACK_dstedc(
+    char const* compz,
+    lapack_int const* n,
+    double* D,
+    double* E,
+    double* Z, lapack_int const* ldz,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_sstedc LAPACK_GLOBAL(sstedc,SSTEDC)
+void LAPACK_sstedc(
+    char const* compz,
+    lapack_int const* n,
+    float* D,
+    float* E,
+    float* Z, lapack_int const* ldz,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_zstedc LAPACK_GLOBAL(zstedc,ZSTEDC)
+void LAPACK_zstedc(
+    char const* compz,
+    lapack_int const* n,
+    double* D,
+    double* E,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_cstegr LAPACK_GLOBAL(cstegr,CSTEGR)
+void LAPACK_cstegr(
+    char const* jobz, char const* range,
+    lapack_int const* n,
+    float* D,
+    float* E,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz, lapack_int* ISUPPZ,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dstegr LAPACK_GLOBAL(dstegr,DSTEGR)
+void LAPACK_dstegr(
+    char const* jobz, char const* range,
+    lapack_int const* n,
+    double* D,
+    double* E,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    double* Z, lapack_int const* ldz, lapack_int* ISUPPZ,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_sstegr LAPACK_GLOBAL(sstegr,SSTEGR)
+void LAPACK_sstegr(
+    char const* jobz, char const* range,
+    lapack_int const* n,
+    float* D,
+    float* E,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    float* Z, lapack_int const* ldz, lapack_int* ISUPPZ,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_zstegr LAPACK_GLOBAL(zstegr,ZSTEGR)
+void LAPACK_zstegr(
+    char const* jobz, char const* range,
+    lapack_int const* n,
+    double* D,
+    double* E,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz, lapack_int* ISUPPZ,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_cstein LAPACK_GLOBAL(cstein,CSTEIN)
+void LAPACK_cstein(
+    lapack_int const* n,
+    float const* D,
+    float const* E, lapack_int const* m,
+    float const* W, lapack_int const* IBLOCK, lapack_int const* ISPLIT,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    float* work,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_dstein LAPACK_GLOBAL(dstein,DSTEIN)
+void LAPACK_dstein(
+    lapack_int const* n,
+    double const* D,
+    double const* E, lapack_int const* m,
+    double const* W, lapack_int const* IBLOCK, lapack_int const* ISPLIT,
+    double* Z, lapack_int const* ldz,
+    double* work,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_sstein LAPACK_GLOBAL(sstein,SSTEIN)
+void LAPACK_sstein(
+    lapack_int const* n,
+    float const* D,
+    float const* E, lapack_int const* m,
+    float const* W, lapack_int const* IBLOCK, lapack_int const* ISPLIT,
+    float* Z, lapack_int const* ldz,
+    float* work,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_zstein LAPACK_GLOBAL(zstein,ZSTEIN)
+void LAPACK_zstein(
+    lapack_int const* n,
+    double const* D,
+    double const* E, lapack_int const* m,
+    double const* W, lapack_int const* IBLOCK, lapack_int const* ISPLIT,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    double* work,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_cstemr LAPACK_GLOBAL(cstemr,CSTEMR)
+void LAPACK_cstemr(
+    char const* jobz, char const* range,
+    lapack_int const* n,
+    float* D,
+    float* E,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* m,
+    float* W,
+    lapack_complex_float* Z, lapack_int const* ldz, lapack_int const* nzc, lapack_int* ISUPPZ, lapack_logical* tryrac,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dstemr LAPACK_GLOBAL(dstemr,DSTEMR)
+void LAPACK_dstemr(
+    char const* jobz, char const* range,
+    lapack_int const* n,
+    double* D,
+    double* E,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* m,
+    double* W,
+    double* Z, lapack_int const* ldz, lapack_int const* nzc, lapack_int* ISUPPZ, lapack_logical* tryrac,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_sstemr LAPACK_GLOBAL(sstemr,SSTEMR)
+void LAPACK_sstemr(
+    char const* jobz, char const* range,
+    lapack_int const* n,
+    float* D,
+    float* E,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* m,
+    float* W,
+    float* Z, lapack_int const* ldz, lapack_int const* nzc, lapack_int* ISUPPZ, lapack_logical* tryrac,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_zstemr LAPACK_GLOBAL(zstemr,ZSTEMR)
+void LAPACK_zstemr(
+    char const* jobz, char const* range,
+    lapack_int const* n,
+    double* D,
+    double* E,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu, lapack_int* m,
+    double* W,
+    lapack_complex_double* Z, lapack_int const* ldz, lapack_int const* nzc, lapack_int* ISUPPZ, lapack_logical* tryrac,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_csteqr LAPACK_GLOBAL(csteqr,CSTEQR)
+void LAPACK_csteqr(
+    char const* compz,
+    lapack_int const* n,
+    float* D,
+    float* E,
+    lapack_complex_float* Z, lapack_int const* ldz,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_dsteqr LAPACK_GLOBAL(dsteqr,DSTEQR)
+void LAPACK_dsteqr(
+    char const* compz,
+    lapack_int const* n,
+    double* D,
+    double* E,
+    double* Z, lapack_int const* ldz,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_ssteqr LAPACK_GLOBAL(ssteqr,SSTEQR)
+void LAPACK_ssteqr(
+    char const* compz,
+    lapack_int const* n,
+    float* D,
+    float* E,
+    float* Z, lapack_int const* ldz,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zsteqr LAPACK_GLOBAL(zsteqr,ZSTEQR)
+void LAPACK_zsteqr(
+    char const* compz,
+    lapack_int const* n,
+    double* D,
+    double* E,
+    lapack_complex_double* Z, lapack_int const* ldz,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_dsterf LAPACK_GLOBAL(dsterf,DSTERF)
+void LAPACK_dsterf(
+    lapack_int const* n,
+    double* D,
+    double* E,
+    lapack_int* info );
+
+#define LAPACK_ssterf LAPACK_GLOBAL(ssterf,SSTERF)
+void LAPACK_ssterf(
+    lapack_int const* n,
+    float* D,
+    float* E,
+    lapack_int* info );
+
+#define LAPACK_dstev LAPACK_GLOBAL(dstev,DSTEV)
+void LAPACK_dstev(
+    char const* jobz,
+    lapack_int const* n,
+    double* D,
+    double* E,
+    double* Z, lapack_int const* ldz,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_sstev LAPACK_GLOBAL(sstev,SSTEV)
+void LAPACK_sstev(
+    char const* jobz,
+    lapack_int const* n,
+    float* D,
+    float* E,
+    float* Z, lapack_int const* ldz,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_dstevd LAPACK_GLOBAL(dstevd,DSTEVD)
+void LAPACK_dstevd(
+    char const* jobz,
+    lapack_int const* n,
+    double* D,
+    double* E,
+    double* Z, lapack_int const* ldz,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_sstevd LAPACK_GLOBAL(sstevd,SSTEVD)
+void LAPACK_sstevd(
+    char const* jobz,
+    lapack_int const* n,
+    float* D,
+    float* E,
+    float* Z, lapack_int const* ldz,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dstevr LAPACK_GLOBAL(dstevr,DSTEVR)
+void LAPACK_dstevr(
+    char const* jobz, char const* range,
+    lapack_int const* n,
+    double* D,
+    double* E,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    double* Z, lapack_int const* ldz, lapack_int* ISUPPZ,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_sstevr LAPACK_GLOBAL(sstevr,SSTEVR)
+void LAPACK_sstevr(
+    char const* jobz, char const* range,
+    lapack_int const* n,
+    float* D,
+    float* E,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    float* Z, lapack_int const* ldz, lapack_int* ISUPPZ,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dstevx LAPACK_GLOBAL(dstevx,DSTEVX)
+void LAPACK_dstevx(
+    char const* jobz, char const* range,
+    lapack_int const* n,
+    double* D,
+    double* E,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_sstevx LAPACK_GLOBAL(sstevx,SSTEVX)
+void LAPACK_sstevx(
+    char const* jobz, char const* range,
+    lapack_int const* n,
+    float* D,
+    float* E,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_csycon LAPACK_GLOBAL(csycon,CSYCON)
+void LAPACK_csycon(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv,
+    float const* anorm,
+    float* rcond,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dsycon LAPACK_GLOBAL(dsycon,DSYCON)
+void LAPACK_dsycon(
+    char const* uplo,
+    lapack_int const* n,
+    double const* A, lapack_int const* lda, lapack_int const* ipiv,
+    double const* anorm,
+    double* rcond,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ssycon LAPACK_GLOBAL(ssycon,SSYCON)
+void LAPACK_ssycon(
+    char const* uplo,
+    lapack_int const* n,
+    float const* A, lapack_int const* lda, lapack_int const* ipiv,
+    float const* anorm,
+    float* rcond,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zsycon LAPACK_GLOBAL(zsycon,ZSYCON)
+void LAPACK_zsycon(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv,
+    double const* anorm,
+    double* rcond,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_csycon_3 LAPACK_GLOBAL(csycon_3,CSYCON_3)
+void LAPACK_csycon_3(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* E, lapack_int const* ipiv,
+    float const* anorm,
+    float* rcond,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dsycon_3 LAPACK_GLOBAL(dsycon_3,DSYCON_3)
+void LAPACK_dsycon_3(
+    char const* uplo,
+    lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double const* E, lapack_int const* ipiv,
+    double const* anorm,
+    double* rcond,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ssycon_3 LAPACK_GLOBAL(ssycon_3,SSYCON_3)
+void LAPACK_ssycon_3(
+    char const* uplo,
+    lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float const* E, lapack_int const* ipiv,
+    float const* anorm,
+    float* rcond,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zsycon_3 LAPACK_GLOBAL(zsycon_3,ZSYCON_3)
+void LAPACK_zsycon_3(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* E, lapack_int const* ipiv,
+    double const* anorm,
+    double* rcond,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_csyconv LAPACK_GLOBAL(csyconv,CSYCONV)
+void LAPACK_csyconv(
+    char const* uplo, char const* way,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_float* E,
+    lapack_int* info );
+
+#define LAPACK_dsyconv LAPACK_GLOBAL(dsyconv,DSYCONV)
+void LAPACK_dsyconv(
+    char const* uplo, char const* way,
+    lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int const* ipiv,
+    double* E,
+    lapack_int* info );
+
+#define LAPACK_ssyconv LAPACK_GLOBAL(ssyconv,SSYCONV)
+void LAPACK_ssyconv(
+    char const* uplo, char const* way,
+    lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int const* ipiv,
+    float* E,
+    lapack_int* info );
+
+#define LAPACK_zsyconv LAPACK_GLOBAL(zsyconv,ZSYCONV)
+void LAPACK_zsyconv(
+    char const* uplo, char const* way,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_double* E,
+    lapack_int* info );
+
+#define LAPACK_csyequb LAPACK_GLOBAL(csyequb,CSYEQUB)
+void LAPACK_csyequb(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    float* S,
+    float* scond,
+    float* amax,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dsyequb LAPACK_GLOBAL(dsyequb,DSYEQUB)
+void LAPACK_dsyequb(
+    char const* uplo,
+    lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double* S,
+    double* scond,
+    double* amax,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_ssyequb LAPACK_GLOBAL(ssyequb,SSYEQUB)
+void LAPACK_ssyequb(
+    char const* uplo,
+    lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float* S,
+    float* scond,
+    float* amax,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zsyequb LAPACK_GLOBAL(zsyequb,ZSYEQUB)
+void LAPACK_zsyequb(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    double* S,
+    double* scond,
+    double* amax,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_dsyev LAPACK_GLOBAL(dsyev,DSYEV)
+void LAPACK_dsyev(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* W,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssyev LAPACK_GLOBAL(ssyev,SSYEV)
+void LAPACK_ssyev(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* W,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsyev_2stage LAPACK_GLOBAL(dsyev_2stage,DSYEV_2STAGE)
+void LAPACK_dsyev_2stage(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* W,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssyev_2stage LAPACK_GLOBAL(ssyev_2stage,SSYEV_2STAGE)
+void LAPACK_ssyev_2stage(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* W,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsyevd LAPACK_GLOBAL(dsyevd,DSYEVD)
+void LAPACK_dsyevd(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* W,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_ssyevd LAPACK_GLOBAL(ssyevd,SSYEVD)
+void LAPACK_ssyevd(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* W,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dsyevd_2stage LAPACK_GLOBAL(dsyevd_2stage,DSYEVD_2STAGE)
+void LAPACK_dsyevd_2stage(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* W,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_ssyevd_2stage LAPACK_GLOBAL(ssyevd_2stage,SSYEVD_2STAGE)
+void LAPACK_ssyevd_2stage(
+    char const* jobz, char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* W,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dsyevr LAPACK_GLOBAL(dsyevr,DSYEVR)
+void LAPACK_dsyevr(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    double* Z, lapack_int const* ldz, lapack_int* ISUPPZ,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_ssyevr LAPACK_GLOBAL(ssyevr,SSYEVR)
+void LAPACK_ssyevr(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    float* Z, lapack_int const* ldz, lapack_int* ISUPPZ,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dsyevr_2stage LAPACK_GLOBAL(dsyevr_2stage,DSYEVR_2STAGE)
+void LAPACK_dsyevr_2stage(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    double* Z, lapack_int const* ldz, lapack_int* ISUPPZ,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_ssyevr_2stage LAPACK_GLOBAL(ssyevr_2stage,SSYEVR_2STAGE)
+void LAPACK_ssyevr_2stage(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    float* Z, lapack_int const* ldz, lapack_int* ISUPPZ,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dsyevx LAPACK_GLOBAL(dsyevx,DSYEVX)
+void LAPACK_dsyevx(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_ssyevx LAPACK_GLOBAL(ssyevx,SSYEVX)
+void LAPACK_ssyevx(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_dsyevx_2stage LAPACK_GLOBAL(dsyevx_2stage,DSYEVX_2STAGE)
+void LAPACK_dsyevx_2stage(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_ssyevx_2stage LAPACK_GLOBAL(ssyevx_2stage,SSYEVX_2STAGE)
+void LAPACK_ssyevx_2stage(
+    char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_dsygst LAPACK_GLOBAL(dsygst,DSYGST)
+void LAPACK_dsygst(
+    lapack_int const* itype, char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double const* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_ssygst LAPACK_GLOBAL(ssygst,SSYGST)
+void LAPACK_ssygst(
+    lapack_int const* itype, char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float const* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dsygv LAPACK_GLOBAL(dsygv,DSYGV)
+void LAPACK_dsygv(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* W,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssygv LAPACK_GLOBAL(ssygv,SSYGV)
+void LAPACK_ssygv(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* W,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsygv_2stage LAPACK_GLOBAL(dsygv_2stage,DSYGV_2STAGE)
+void LAPACK_dsygv_2stage(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* W,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssygv_2stage LAPACK_GLOBAL(ssygv_2stage,SSYGV_2STAGE)
+void LAPACK_ssygv_2stage(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* W,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsygvd LAPACK_GLOBAL(dsygvd,DSYGVD)
+void LAPACK_dsygvd(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* W,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_ssygvd LAPACK_GLOBAL(ssygvd,SSYGVD)
+void LAPACK_ssygvd(
+    lapack_int const* itype, char const* jobz, char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* W,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dsygvx LAPACK_GLOBAL(dsygvx,DSYGVX)
+void LAPACK_dsygvx(
+    lapack_int const* itype, char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double const* vl,
+    double const* vu, lapack_int const* il, lapack_int const* iu,
+    double const* abstol, lapack_int* m,
+    double* W,
+    double* Z, lapack_int const* ldz,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_ssygvx LAPACK_GLOBAL(ssygvx,SSYGVX)
+void LAPACK_ssygvx(
+    lapack_int const* itype, char const* jobz, char const* range, char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float const* vl,
+    float const* vu, lapack_int const* il, lapack_int const* iu,
+    float const* abstol, lapack_int* m,
+    float* W,
+    float* Z, lapack_int const* ldz,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int* IFAIL,
+    lapack_int* info );
+
+#define LAPACK_csyr LAPACK_GLOBAL(csyr,CSYR)
+void LAPACK_csyr(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* alpha,
+    lapack_complex_float const* X, lapack_int const* incx,
+    lapack_complex_float* A, lapack_int const* lda );
+
+#define LAPACK_zsyr LAPACK_GLOBAL(zsyr,ZSYR)
+void LAPACK_zsyr(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* alpha,
+    lapack_complex_double const* X, lapack_int const* incx,
+    lapack_complex_double* A, lapack_int const* lda );
+
+#define LAPACK_csyrfs LAPACK_GLOBAL(csyrfs,CSYRFS)
+void LAPACK_csyrfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dsyrfs LAPACK_GLOBAL(dsyrfs,DSYRFS)
+void LAPACK_dsyrfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* A, lapack_int const* lda,
+    double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ssyrfs LAPACK_GLOBAL(ssyrfs,SSYRFS)
+void LAPACK_ssyrfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* A, lapack_int const* lda,
+    float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    float const* B, lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zsyrfs LAPACK_GLOBAL(zsyrfs,ZSYRFS)
+void LAPACK_zsyrfs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_csyrfsx LAPACK_GLOBAL(csyrfsx,CSYRFSX)
+void LAPACK_csyrfsx(
+    char const* uplo, char const* equed,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    float* S,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dsyrfsx LAPACK_GLOBAL(dsyrfsx,DSYRFSX)
+void LAPACK_dsyrfsx(
+    char const* uplo, char const* equed,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* A, lapack_int const* lda,
+    double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    double* S,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* rcond,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ssyrfsx LAPACK_GLOBAL(ssyrfsx,SSYRFSX)
+void LAPACK_ssyrfsx(
+    char const* uplo, char const* equed,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* A, lapack_int const* lda,
+    float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    float* S,
+    float const* B, lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* rcond,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zsyrfsx LAPACK_GLOBAL(zsyrfsx,ZSYRFSX)
+void LAPACK_zsyrfsx(
+    char const* uplo, char const* equed,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
+    double* S,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_csysv LAPACK_GLOBAL(csysv,CSYSV)
+void LAPACK_csysv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsysv LAPACK_GLOBAL(dsysv,DSYSV)
+void LAPACK_dsysv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda, lapack_int* ipiv,
+    double* B, lapack_int const* ldb,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssysv LAPACK_GLOBAL(ssysv,SSYSV)
+void LAPACK_ssysv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda, lapack_int* ipiv,
+    float* B, lapack_int const* ldb,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zsysv LAPACK_GLOBAL(zsysv,ZSYSV)
+void LAPACK_zsysv(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_csysv_aa LAPACK_GLOBAL(csysv_aa,CSYSV_AA)
+void LAPACK_csysv_aa(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsysv_aa LAPACK_GLOBAL(dsysv_aa,DSYSV_AA)
+void LAPACK_dsysv_aa(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda, lapack_int* ipiv,
+    double* B, lapack_int const* ldb,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssysv_aa LAPACK_GLOBAL(ssysv_aa,SSYSV_AA)
+void LAPACK_ssysv_aa(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda, lapack_int* ipiv,
+    float* B, lapack_int const* ldb,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zsysv_aa LAPACK_GLOBAL(zsysv_aa,ZSYSV_AA)
+void LAPACK_zsysv_aa(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_csysv_aa_2stage LAPACK_GLOBAL(csysv_aa_2stage,CSYSV_AA_2STAGE)
+void LAPACK_csysv_aa_2stage(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsysv_aa_2stage LAPACK_GLOBAL(dsysv_aa_2stage,DSYSV_AA_2STAGE)
+void LAPACK_dsysv_aa_2stage(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda,
+    double* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2,
+    double* B, lapack_int const* ldb,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssysv_aa_2stage LAPACK_GLOBAL(ssysv_aa_2stage,SSYSV_AA_2STAGE)
+void LAPACK_ssysv_aa_2stage(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda,
+    float* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2,
+    float* B, lapack_int const* ldb,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zsysv_aa_2stage LAPACK_GLOBAL(zsysv_aa_2stage,ZSYSV_AA_2STAGE)
+void LAPACK_zsysv_aa_2stage(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_csysv_rk LAPACK_GLOBAL(csysv_rk,CSYSV_RK)
+void LAPACK_csysv_rk(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* E, lapack_int* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsysv_rk LAPACK_GLOBAL(dsysv_rk,DSYSV_RK)
+void LAPACK_dsysv_rk(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda,
+    double* E, lapack_int* ipiv,
+    double* B, lapack_int const* ldb,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssysv_rk LAPACK_GLOBAL(ssysv_rk,SSYSV_RK)
+void LAPACK_ssysv_rk(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda,
+    float* E, lapack_int* ipiv,
+    float* B, lapack_int const* ldb,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zsysv_rk LAPACK_GLOBAL(zsysv_rk,ZSYSV_RK)
+void LAPACK_zsysv_rk(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* E, lapack_int* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_csysv_rook LAPACK_GLOBAL(csysv_rook,CSYSV_ROOK)
+void LAPACK_csysv_rook(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsysv_rook LAPACK_GLOBAL(dsysv_rook,DSYSV_ROOK)
+void LAPACK_dsysv_rook(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda, lapack_int* ipiv,
+    double* B, lapack_int const* ldb,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssysv_rook LAPACK_GLOBAL(ssysv_rook,SSYSV_ROOK)
+void LAPACK_ssysv_rook(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda, lapack_int* ipiv,
+    float* B, lapack_int const* ldb,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zsysv_rook LAPACK_GLOBAL(zsysv_rook,ZSYSV_ROOK)
+void LAPACK_zsysv_rook(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_csysvx LAPACK_GLOBAL(csysvx,CSYSVX)
+void LAPACK_csysvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float* AF, lapack_int const* ldaf, lapack_int* ipiv,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dsysvx LAPACK_GLOBAL(dsysvx,DSYSVX)
+void LAPACK_dsysvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* A, lapack_int const* lda,
+    double* AF, lapack_int const* ldaf, lapack_int* ipiv,
+    double const* B, lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ssysvx LAPACK_GLOBAL(ssysvx,SSYSVX)
+void LAPACK_ssysvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* A, lapack_int const* lda,
+    float* AF, lapack_int const* ldaf, lapack_int* ipiv,
+    float const* B, lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* rcond,
+    float* ferr,
+    float* berr,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zsysvx LAPACK_GLOBAL(zsysvx,ZSYSVX)
+void LAPACK_zsysvx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double* AF, lapack_int const* ldaf, lapack_int* ipiv,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_csysvxx LAPACK_GLOBAL(csysvxx,CSYSVXX)
+void LAPACK_csysvxx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed,
+    float* S,
+    lapack_complex_float* B,
+    lapack_int const* ldb,
+    lapack_complex_float* X, lapack_int const* ldx,
+    float* rcond,
+    float* rpvgrw,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dsysvxx LAPACK_GLOBAL(dsysvxx,DSYSVXX)
+void LAPACK_dsysvxx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda,
+    double* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed,
+    double* S,
+    double* B,
+    lapack_int const* ldb,
+    double* X, lapack_int const* ldx,
+    double* rcond,
+    double* rpvgrw,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ssysvxx LAPACK_GLOBAL(ssysvxx,SSYSVXX)
+void LAPACK_ssysvxx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda,
+    float* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed,
+    float* S,
+    float* B,
+    lapack_int const* ldb,
+    float* X, lapack_int const* ldx,
+    float* rcond,
+    float* rpvgrw,
+    float* berr, lapack_int const* n_err_bnds,
+    float* err_bnds_norm,
+    float* err_bnds_comp, lapack_int const* nparams,
+    float* params,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zsysvxx LAPACK_GLOBAL(zsysvxx,ZSYSVXX)
+void LAPACK_zsysvxx(
+    char const* fact, char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* AF, lapack_int const* ldaf, lapack_int* ipiv, char* equed,
+    double* S,
+    lapack_complex_double* B,
+    lapack_int const* ldb,
+    lapack_complex_double* X, lapack_int const* ldx,
+    double* rcond,
+    double* rpvgrw,
+    double* berr, lapack_int const* n_err_bnds,
+    double* err_bnds_norm,
+    double* err_bnds_comp, lapack_int const* nparams,
+    double* params,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_csyswapr LAPACK_GLOBAL(csyswapr,CSYSWAPR)
+void LAPACK_csyswapr(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int const* i1, lapack_int const* i2 );
+
+#define LAPACK_dsyswapr LAPACK_GLOBAL(dsyswapr,DSYSWAPR)
+void LAPACK_dsyswapr(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int const* i1, lapack_int const* i2 );
+
+#define LAPACK_ssyswapr LAPACK_GLOBAL(ssyswapr,SSYSWAPR)
+void LAPACK_ssyswapr(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int const* i1, lapack_int const* i2 );
+
+#define LAPACK_zsyswapr LAPACK_GLOBAL(zsyswapr,ZSYSWAPR)
+void LAPACK_zsyswapr(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int const* i1, lapack_int const* i2 );
+
+#define LAPACK_dsytrd LAPACK_GLOBAL(dsytrd,DSYTRD)
+void LAPACK_dsytrd(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* D,
+    double* E,
+    double* tau,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssytrd LAPACK_GLOBAL(ssytrd,SSYTRD)
+void LAPACK_ssytrd(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* D,
+    float* E,
+    float* tau,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsytrd_2stage LAPACK_GLOBAL(dsytrd_2stage,DSYTRD_2STAGE)
+void LAPACK_dsytrd_2stage(
+    char const* vect, char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* D,
+    double* E,
+    double* tau,
+    double* HOUS2, lapack_int const* lhous2,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssytrd_2stage LAPACK_GLOBAL(ssytrd_2stage,SSYTRD_2STAGE)
+void LAPACK_ssytrd_2stage(
+    char const* vect, char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* D,
+    float* E,
+    float* tau,
+    float* HOUS2, lapack_int const* lhous2,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_csytrf LAPACK_GLOBAL(csytrf,CSYTRF)
+void LAPACK_csytrf(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsytrf LAPACK_GLOBAL(dsytrf,DSYTRF)
+void LAPACK_dsytrf(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int* ipiv,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssytrf LAPACK_GLOBAL(ssytrf,SSYTRF)
+void LAPACK_ssytrf(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int* ipiv,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zsytrf LAPACK_GLOBAL(zsytrf,ZSYTRF)
+void LAPACK_zsytrf(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_csytrf_aa LAPACK_GLOBAL(csytrf_aa,CSYTRF_AA)
+void LAPACK_csytrf_aa(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsytrf_aa LAPACK_GLOBAL(dsytrf_aa,DSYTRF_AA)
+void LAPACK_dsytrf_aa(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int* ipiv,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssytrf_aa LAPACK_GLOBAL(ssytrf_aa,SSYTRF_AA)
+void LAPACK_ssytrf_aa(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int* ipiv,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zsytrf_aa LAPACK_GLOBAL(zsytrf_aa,ZSYTRF_AA)
+void LAPACK_zsytrf_aa(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_csytrf_aa_2stage LAPACK_GLOBAL(csytrf_aa_2stage,CSYTRF_AA_2STAGE)
+void LAPACK_csytrf_aa_2stage(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsytrf_aa_2stage LAPACK_GLOBAL(dsytrf_aa_2stage,DSYTRF_AA_2STAGE)
+void LAPACK_dsytrf_aa_2stage(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssytrf_aa_2stage LAPACK_GLOBAL(ssytrf_aa_2stage,SSYTRF_AA_2STAGE)
+void LAPACK_ssytrf_aa_2stage(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zsytrf_aa_2stage LAPACK_GLOBAL(zsytrf_aa_2stage,ZSYTRF_AA_2STAGE)
+void LAPACK_zsytrf_aa_2stage(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* TB, lapack_int const* ltb, lapack_int* ipiv, lapack_int* ipiv2,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_csytrf_rk LAPACK_GLOBAL(csytrf_rk,CSYTRF_RK)
+void LAPACK_csytrf_rk(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* E, lapack_int* ipiv,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsytrf_rk LAPACK_GLOBAL(dsytrf_rk,DSYTRF_RK)
+void LAPACK_dsytrf_rk(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* E, lapack_int* ipiv,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssytrf_rk LAPACK_GLOBAL(ssytrf_rk,SSYTRF_RK)
+void LAPACK_ssytrf_rk(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* E, lapack_int* ipiv,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zsytrf_rk LAPACK_GLOBAL(zsytrf_rk,ZSYTRF_RK)
+void LAPACK_zsytrf_rk(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* E, lapack_int* ipiv,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_csytrf_rook LAPACK_GLOBAL(csytrf_rook,CSYTRF_ROOK)
+void LAPACK_csytrf_rook(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsytrf_rook LAPACK_GLOBAL(dsytrf_rook,DSYTRF_ROOK)
+void LAPACK_dsytrf_rook(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int* ipiv,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssytrf_rook LAPACK_GLOBAL(ssytrf_rook,SSYTRF_ROOK)
+void LAPACK_ssytrf_rook(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int* ipiv,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zsytrf_rook LAPACK_GLOBAL(zsytrf_rook,ZSYTRF_ROOK)
+void LAPACK_zsytrf_rook(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int* ipiv,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_csytri LAPACK_GLOBAL(csytri,CSYTRI)
+void LAPACK_csytri(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dsytri LAPACK_GLOBAL(dsytri,DSYTRI)
+void LAPACK_dsytri(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int const* ipiv,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_ssytri LAPACK_GLOBAL(ssytri,SSYTRI)
+void LAPACK_ssytri(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int const* ipiv,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zsytri LAPACK_GLOBAL(zsytri,ZSYTRI)
+void LAPACK_zsytri(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_csytri2 LAPACK_GLOBAL(csytri2,CSYTRI2)
+void LAPACK_csytri2(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsytri2 LAPACK_GLOBAL(dsytri2,DSYTRI2)
+void LAPACK_dsytri2(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int const* ipiv,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssytri2 LAPACK_GLOBAL(ssytri2,SSYTRI2)
+void LAPACK_ssytri2(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int const* ipiv,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zsytri2 LAPACK_GLOBAL(zsytri2,ZSYTRI2)
+void LAPACK_zsytri2(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_csytri2x LAPACK_GLOBAL(csytri2x,CSYTRI2X)
+void LAPACK_csytri2x(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_float* work, lapack_int const* nb,
+    lapack_int* info );
+
+#define LAPACK_dsytri2x LAPACK_GLOBAL(dsytri2x,DSYTRI2X)
+void LAPACK_dsytri2x(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda, lapack_int const* ipiv,
+    double* work, lapack_int const* nb,
+    lapack_int* info );
+
+#define LAPACK_ssytri2x LAPACK_GLOBAL(ssytri2x,SSYTRI2X)
+void LAPACK_ssytri2x(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda, lapack_int const* ipiv,
+    float* work, lapack_int const* nb,
+    lapack_int* info );
+
+#define LAPACK_zsytri2x LAPACK_GLOBAL(zsytri2x,ZSYTRI2X)
+void LAPACK_zsytri2x(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_double* work, lapack_int const* nb,
+    lapack_int* info );
+
+#define LAPACK_csytri_3 LAPACK_GLOBAL(csytri_3,CSYTRI_3)
+void LAPACK_csytri_3(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float const* E, lapack_int const* ipiv,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsytri_3 LAPACK_GLOBAL(dsytri_3,DSYTRI_3)
+void LAPACK_dsytri_3(
+    char const* uplo,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double const* E, lapack_int const* ipiv,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssytri_3 LAPACK_GLOBAL(ssytri_3,SSYTRI_3)
+void LAPACK_ssytri_3(
+    char const* uplo,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float const* E, lapack_int const* ipiv,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zsytri_3 LAPACK_GLOBAL(zsytri_3,ZSYTRI_3)
+void LAPACK_zsytri_3(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double const* E, lapack_int const* ipiv,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_csytrs LAPACK_GLOBAL(csytrs,CSYTRS)
+void LAPACK_csytrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dsytrs LAPACK_GLOBAL(dsytrs,DSYTRS)
+void LAPACK_dsytrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* A, lapack_int const* lda, lapack_int const* ipiv,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_ssytrs LAPACK_GLOBAL(ssytrs,SSYTRS)
+void LAPACK_ssytrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* A, lapack_int const* lda, lapack_int const* ipiv,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zsytrs LAPACK_GLOBAL(zsytrs,ZSYTRS)
+void LAPACK_zsytrs(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_csytrs2 LAPACK_GLOBAL(csytrs2,CSYTRS2)
+void LAPACK_csytrs2(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dsytrs2 LAPACK_GLOBAL(dsytrs2,DSYTRS2)
+void LAPACK_dsytrs2(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double* A, lapack_int const* lda, lapack_int const* ipiv,
+    double* B, lapack_int const* ldb,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_ssytrs2 LAPACK_GLOBAL(ssytrs2,SSYTRS2)
+void LAPACK_ssytrs2(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float* A, lapack_int const* lda, lapack_int const* ipiv,
+    float* B, lapack_int const* ldb,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_zsytrs2 LAPACK_GLOBAL(zsytrs2,ZSYTRS2)
+void LAPACK_zsytrs2(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_csytrs_3 LAPACK_GLOBAL(csytrs_3,CSYTRS_3)
+void LAPACK_csytrs_3(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* E, lapack_int const* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dsytrs_3 LAPACK_GLOBAL(dsytrs_3,DSYTRS_3)
+void LAPACK_dsytrs_3(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* A, lapack_int const* lda,
+    double const* E, lapack_int const* ipiv,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_ssytrs_3 LAPACK_GLOBAL(ssytrs_3,SSYTRS_3)
+void LAPACK_ssytrs_3(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* A, lapack_int const* lda,
+    float const* E, lapack_int const* ipiv,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zsytrs_3 LAPACK_GLOBAL(zsytrs_3,ZSYTRS_3)
+void LAPACK_zsytrs_3(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* E, lapack_int const* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_csytrs_aa LAPACK_GLOBAL(csytrs_aa,CSYTRS_AA)
+void LAPACK_csytrs_aa(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dsytrs_aa LAPACK_GLOBAL(dsytrs_aa,DSYTRS_AA)
+void LAPACK_dsytrs_aa(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* A, lapack_int const* lda, lapack_int const* ipiv,
+    double* B, lapack_int const* ldb,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ssytrs_aa LAPACK_GLOBAL(ssytrs_aa,SSYTRS_AA)
+void LAPACK_ssytrs_aa(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* A, lapack_int const* lda, lapack_int const* ipiv,
+    float* B, lapack_int const* ldb,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zsytrs_aa LAPACK_GLOBAL(zsytrs_aa,ZSYTRS_AA)
+void LAPACK_zsytrs_aa(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_csytrs_aa_2stage LAPACK_GLOBAL(csytrs_aa_2stage,CSYTRS_AA_2STAGE)
+void LAPACK_csytrs_aa_2stage(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float* TB, lapack_int const* ltb, lapack_int const* ipiv, lapack_int const* ipiv2,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dsytrs_aa_2stage LAPACK_GLOBAL(dsytrs_aa_2stage,DSYTRS_AA_2STAGE)
+void LAPACK_dsytrs_aa_2stage(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* A, lapack_int const* lda,
+    double* TB, lapack_int const* ltb, lapack_int const* ipiv, lapack_int const* ipiv2,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_ssytrs_aa_2stage LAPACK_GLOBAL(ssytrs_aa_2stage,SSYTRS_AA_2STAGE)
+void LAPACK_ssytrs_aa_2stage(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* A, lapack_int const* lda,
+    float* TB, lapack_int const* ltb, lapack_int const* ipiv, lapack_int const* ipiv2,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zsytrs_aa_2stage LAPACK_GLOBAL(zsytrs_aa_2stage,ZSYTRS_AA_2STAGE)
+void LAPACK_zsytrs_aa_2stage(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double* TB, lapack_int const* ltb, lapack_int const* ipiv, lapack_int const* ipiv2,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_csytrs_rook LAPACK_GLOBAL(csytrs_rook,CSYTRS_ROOK)
+void LAPACK_csytrs_rook(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dsytrs_rook LAPACK_GLOBAL(dsytrs_rook,DSYTRS_ROOK)
+void LAPACK_dsytrs_rook(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* A, lapack_int const* lda, lapack_int const* ipiv,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_ssytrs_rook LAPACK_GLOBAL(ssytrs_rook,SSYTRS_ROOK)
+void LAPACK_ssytrs_rook(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* A, lapack_int const* lda, lapack_int const* ipiv,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_zsytrs_rook LAPACK_GLOBAL(zsytrs_rook,ZSYTRS_ROOK)
+void LAPACK_zsytrs_rook(
+    char const* uplo,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda, lapack_int const* ipiv,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_ctbcon LAPACK_GLOBAL(ctbcon,CTBCON)
+void LAPACK_ctbcon(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_float const* AB, lapack_int const* ldab,
+    float* rcond,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dtbcon LAPACK_GLOBAL(dtbcon,DTBCON)
+void LAPACK_dtbcon(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n, lapack_int const* kd,
+    double const* AB, lapack_int const* ldab,
+    double* rcond,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_stbcon LAPACK_GLOBAL(stbcon,STBCON)
+void LAPACK_stbcon(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n, lapack_int const* kd,
+    float const* AB, lapack_int const* ldab,
+    float* rcond,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ztbcon LAPACK_GLOBAL(ztbcon,ZTBCON)
+void LAPACK_ztbcon(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n, lapack_int const* kd,
+    lapack_complex_double const* AB, lapack_int const* ldab,
+    double* rcond,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_ctbrfs LAPACK_GLOBAL(ctbrfs,CTBRFS)
+void LAPACK_ctbrfs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    lapack_complex_float const* AB, lapack_int const* ldab,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float const* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dtbrfs LAPACK_GLOBAL(dtbrfs,DTBRFS)
+void LAPACK_dtbrfs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    double const* AB, lapack_int const* ldab,
+    double const* B, lapack_int const* ldb,
+    double const* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_stbrfs LAPACK_GLOBAL(stbrfs,STBRFS)
+void LAPACK_stbrfs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    float const* AB, lapack_int const* ldab,
+    float const* B, lapack_int const* ldb,
+    float const* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ztbrfs LAPACK_GLOBAL(ztbrfs,ZTBRFS)
+void LAPACK_ztbrfs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    lapack_complex_double const* AB, lapack_int const* ldab,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double const* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_ctbtrs LAPACK_GLOBAL(ctbtrs,CTBTRS)
+void LAPACK_ctbtrs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    lapack_complex_float const* AB, lapack_int const* ldab,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dtbtrs LAPACK_GLOBAL(dtbtrs,DTBTRS)
+void LAPACK_dtbtrs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    double const* AB, lapack_int const* ldab,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_stbtrs LAPACK_GLOBAL(stbtrs,STBTRS)
+void LAPACK_stbtrs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    float const* AB, lapack_int const* ldab,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_ztbtrs LAPACK_GLOBAL(ztbtrs,ZTBTRS)
+void LAPACK_ztbtrs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* kd, lapack_int const* nrhs,
+    lapack_complex_double const* AB, lapack_int const* ldab,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_ctfsm LAPACK_GLOBAL(ctfsm,CTFSM)
+void LAPACK_ctfsm(
+    char const* transr, char const* side, char const* uplo, char const* trans, char const* diag,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float const* alpha,
+    lapack_complex_float const* A,
+    lapack_complex_float* B, lapack_int const* ldb );
+
+#define LAPACK_dtfsm LAPACK_GLOBAL(dtfsm,DTFSM)
+void LAPACK_dtfsm(
+    char const* transr, char const* side, char const* uplo, char const* trans, char const* diag,
+    lapack_int const* m, lapack_int const* n,
+    double const* alpha,
+    double const* A,
+    double* B, lapack_int const* ldb );
+
+#define LAPACK_stfsm LAPACK_GLOBAL(stfsm,STFSM)
+void LAPACK_stfsm(
+    char const* transr, char const* side, char const* uplo, char const* trans, char const* diag,
+    lapack_int const* m, lapack_int const* n,
+    float const* alpha,
+    float const* A,
+    float* B, lapack_int const* ldb );
+
+#define LAPACK_ztfsm LAPACK_GLOBAL(ztfsm,ZTFSM)
+void LAPACK_ztfsm(
+    char const* transr, char const* side, char const* uplo, char const* trans, char const* diag,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double const* alpha,
+    lapack_complex_double const* A,
+    lapack_complex_double* B, lapack_int const* ldb );
+
+#define LAPACK_ctftri LAPACK_GLOBAL(ctftri,CTFTRI)
+void LAPACK_ctftri(
+    char const* transr, char const* uplo, char const* diag,
+    lapack_int const* n,
+    lapack_complex_float* A,
+    lapack_int* info );
+
+#define LAPACK_dtftri LAPACK_GLOBAL(dtftri,DTFTRI)
+void LAPACK_dtftri(
+    char const* transr, char const* uplo, char const* diag,
+    lapack_int const* n,
+    double* A,
+    lapack_int* info );
+
+#define LAPACK_stftri LAPACK_GLOBAL(stftri,STFTRI)
+void LAPACK_stftri(
+    char const* transr, char const* uplo, char const* diag,
+    lapack_int const* n,
+    float* A,
+    lapack_int* info );
+
+#define LAPACK_ztftri LAPACK_GLOBAL(ztftri,ZTFTRI)
+void LAPACK_ztftri(
+    char const* transr, char const* uplo, char const* diag,
+    lapack_int const* n,
+    lapack_complex_double* A,
+    lapack_int* info );
+
+#define LAPACK_ctfttp LAPACK_GLOBAL(ctfttp,CTFTTP)
+void LAPACK_ctfttp(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* ARF,
+    lapack_complex_float* AP,
+    lapack_int* info );
+
+#define LAPACK_dtfttp LAPACK_GLOBAL(dtfttp,DTFTTP)
+void LAPACK_dtfttp(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    double const* ARF,
+    double* AP,
+    lapack_int* info );
+
+#define LAPACK_stfttp LAPACK_GLOBAL(stfttp,STFTTP)
+void LAPACK_stfttp(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    float const* ARF,
+    float* AP,
+    lapack_int* info );
+
+#define LAPACK_ztfttp LAPACK_GLOBAL(ztfttp,ZTFTTP)
+void LAPACK_ztfttp(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* ARF,
+    lapack_complex_double* AP,
+    lapack_int* info );
+
+#define LAPACK_ctfttr LAPACK_GLOBAL(ctfttr,CTFTTR)
+void LAPACK_ctfttr(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* ARF,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_dtfttr LAPACK_GLOBAL(dtfttr,DTFTTR)
+void LAPACK_dtfttr(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    double const* ARF,
+    double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_stfttr LAPACK_GLOBAL(stfttr,STFTTR)
+void LAPACK_stfttr(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    float const* ARF,
+    float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_ztfttr LAPACK_GLOBAL(ztfttr,ZTFTTR)
+void LAPACK_ztfttr(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* ARF,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_ctgevc LAPACK_GLOBAL(ctgevc,CTGEVC)
+void LAPACK_ctgevc(
+    char const* side, char const* howmny,
+    lapack_logical const* select,
+    lapack_int const* n,
+    lapack_complex_float const* S, lapack_int const* lds,
+    lapack_complex_float const* P, lapack_int const* ldp,
+    lapack_complex_float* VL, lapack_int const* ldvl,
+    lapack_complex_float* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dtgevc LAPACK_GLOBAL(dtgevc,DTGEVC)
+void LAPACK_dtgevc(
+    char const* side, char const* howmny,
+    lapack_logical const* select,
+    lapack_int const* n,
+    double const* S, lapack_int const* lds,
+    double const* P, lapack_int const* ldp,
+    double* VL, lapack_int const* ldvl,
+    double* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_stgevc LAPACK_GLOBAL(stgevc,STGEVC)
+void LAPACK_stgevc(
+    char const* side, char const* howmny,
+    lapack_logical const* select,
+    lapack_int const* n,
+    float const* S, lapack_int const* lds,
+    float const* P, lapack_int const* ldp,
+    float* VL, lapack_int const* ldvl,
+    float* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_ztgevc LAPACK_GLOBAL(ztgevc,ZTGEVC)
+void LAPACK_ztgevc(
+    char const* side, char const* howmny,
+    lapack_logical const* select,
+    lapack_int const* n,
+    lapack_complex_double const* S, lapack_int const* lds,
+    lapack_complex_double const* P, lapack_int const* ldp,
+    lapack_complex_double* VL, lapack_int const* ldvl,
+    lapack_complex_double* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_ctgexc LAPACK_GLOBAL(ctgexc,CTGEXC)
+void LAPACK_ctgexc(
+    lapack_logical const* wantq, lapack_logical const* wantz, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    lapack_complex_float* Z, lapack_int const* ldz, lapack_int const* ifst, lapack_int* ilst,
+    lapack_int* info );
+
+#define LAPACK_dtgexc LAPACK_GLOBAL(dtgexc,DTGEXC)
+void LAPACK_dtgexc(
+    lapack_logical const* wantq, lapack_logical const* wantz, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* Q, lapack_int const* ldq,
+    double* Z, lapack_int const* ldz, lapack_int* ifst, lapack_int* ilst,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_stgexc LAPACK_GLOBAL(stgexc,STGEXC)
+void LAPACK_stgexc(
+    lapack_logical const* wantq, lapack_logical const* wantz, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* Q, lapack_int const* ldq,
+    float* Z, lapack_int const* ldz, lapack_int* ifst, lapack_int* ilst,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ztgexc LAPACK_GLOBAL(ztgexc,ZTGEXC)
+void LAPACK_ztgexc(
+    lapack_logical const* wantq, lapack_logical const* wantz, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    lapack_complex_double* Z, lapack_int const* ldz, lapack_int const* ifst, lapack_int* ilst,
+    lapack_int* info );
+
+#define LAPACK_ctgsen LAPACK_GLOBAL(ctgsen,CTGSEN)
+void LAPACK_ctgsen(
+    lapack_int const* ijob, lapack_logical const* wantq, lapack_logical const* wantz, lapack_logical const* select, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* alpha,
+    lapack_complex_float* beta,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    lapack_complex_float* Z, lapack_int const* ldz, lapack_int* m,
+    float* pl,
+    float* pr,
+    float* DIF,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_dtgsen LAPACK_GLOBAL(dtgsen,DTGSEN)
+void LAPACK_dtgsen(
+    lapack_int const* ijob, lapack_logical const* wantq, lapack_logical const* wantz, lapack_logical const* select, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* alphar,
+    double* alphai,
+    double* beta,
+    double* Q, lapack_int const* ldq,
+    double* Z, lapack_int const* ldz, lapack_int* m,
+    double* pl,
+    double* pr,
+    double* DIF,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_stgsen LAPACK_GLOBAL(stgsen,STGSEN)
+void LAPACK_stgsen(
+    lapack_int const* ijob, lapack_logical const* wantq, lapack_logical const* wantz, lapack_logical const* select, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* alphar,
+    float* alphai,
+    float* beta,
+    float* Q, lapack_int const* ldq,
+    float* Z, lapack_int const* ldz, lapack_int* m,
+    float* pl,
+    float* pr,
+    float* DIF,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_ztgsen LAPACK_GLOBAL(ztgsen,ZTGSEN)
+void LAPACK_ztgsen(
+    lapack_int const* ijob, lapack_logical const* wantq, lapack_logical const* wantz, lapack_logical const* select, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* alpha,
+    lapack_complex_double* beta,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    lapack_complex_double* Z, lapack_int const* ldz, lapack_int* m,
+    double* pl,
+    double* pr,
+    double* DIF,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_ctgsja LAPACK_GLOBAL(ctgsja,CTGSJA)
+void LAPACK_ctgsja(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* p, lapack_int const* n, lapack_int const* k, lapack_int const* l,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    float const* tola,
+    float const* tolb,
+    float* alpha,
+    float* beta,
+    lapack_complex_float* U, lapack_int const* ldu,
+    lapack_complex_float* V, lapack_int const* ldv,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    lapack_complex_float* work, lapack_int* ncycle,
+    lapack_int* info );
+
+#define LAPACK_dtgsja LAPACK_GLOBAL(dtgsja,DTGSJA)
+void LAPACK_dtgsja(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* p, lapack_int const* n, lapack_int const* k, lapack_int const* l,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double const* tola,
+    double const* tolb,
+    double* alpha,
+    double* beta,
+    double* U, lapack_int const* ldu,
+    double* V, lapack_int const* ldv,
+    double* Q, lapack_int const* ldq,
+    double* work, lapack_int* ncycle,
+    lapack_int* info );
+
+#define LAPACK_stgsja LAPACK_GLOBAL(stgsja,STGSJA)
+void LAPACK_stgsja(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* p, lapack_int const* n, lapack_int const* k, lapack_int const* l,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float const* tola,
+    float const* tolb,
+    float* alpha,
+    float* beta,
+    float* U, lapack_int const* ldu,
+    float* V, lapack_int const* ldv,
+    float* Q, lapack_int const* ldq,
+    float* work, lapack_int* ncycle,
+    lapack_int* info );
+
+#define LAPACK_ztgsja LAPACK_GLOBAL(ztgsja,ZTGSJA)
+void LAPACK_ztgsja(
+    char const* jobu, char const* jobv, char const* jobq,
+    lapack_int const* m, lapack_int const* p, lapack_int const* n, lapack_int const* k, lapack_int const* l,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    double const* tola,
+    double const* tolb,
+    double* alpha,
+    double* beta,
+    lapack_complex_double* U, lapack_int const* ldu,
+    lapack_complex_double* V, lapack_int const* ldv,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    lapack_complex_double* work, lapack_int* ncycle,
+    lapack_int* info );
+
+#define LAPACK_ctgsna LAPACK_GLOBAL(ctgsna,CTGSNA)
+void LAPACK_ctgsna(
+    char const* job, char const* howmny,
+    lapack_logical const* select,
+    lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float const* VL, lapack_int const* ldvl,
+    lapack_complex_float const* VR, lapack_int const* ldvr,
+    float* S,
+    float* DIF, lapack_int const* mm, lapack_int* m,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_dtgsna LAPACK_GLOBAL(dtgsna,DTGSNA)
+void LAPACK_dtgsna(
+    char const* job, char const* howmny,
+    lapack_logical const* select,
+    lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double const* B, lapack_int const* ldb,
+    double const* VL, lapack_int const* ldvl,
+    double const* VR, lapack_int const* ldvr,
+    double* S,
+    double* DIF, lapack_int const* mm, lapack_int* m,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_stgsna LAPACK_GLOBAL(stgsna,STGSNA)
+void LAPACK_stgsna(
+    char const* job, char const* howmny,
+    lapack_logical const* select,
+    lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float const* B, lapack_int const* ldb,
+    float const* VL, lapack_int const* ldvl,
+    float const* VR, lapack_int const* ldvr,
+    float* S,
+    float* DIF, lapack_int const* mm, lapack_int* m,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ztgsna LAPACK_GLOBAL(ztgsna,ZTGSNA)
+void LAPACK_ztgsna(
+    char const* job, char const* howmny,
+    lapack_logical const* select,
+    lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double const* VL, lapack_int const* ldvl,
+    lapack_complex_double const* VR, lapack_int const* ldvr,
+    double* S,
+    double* DIF, lapack_int const* mm, lapack_int* m,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ctgsyl LAPACK_GLOBAL(ctgsyl,CTGSYL)
+void LAPACK_ctgsyl(
+    char const* trans,
+    lapack_int const* ijob, lapack_int const* m, lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float const* D, lapack_int const* ldd,
+    lapack_complex_float const* E, lapack_int const* lde,
+    lapack_complex_float* F, lapack_int const* ldf,
+    float* dif,
+    float* scale,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_dtgsyl LAPACK_GLOBAL(dtgsyl,DTGSYL)
+void LAPACK_dtgsyl(
+    char const* trans,
+    lapack_int const* ijob, lapack_int const* m, lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double const* B, lapack_int const* ldb,
+    double* C, lapack_int const* ldc,
+    double const* D, lapack_int const* ldd,
+    double const* E, lapack_int const* lde,
+    double* F, lapack_int const* ldf,
+    double* dif,
+    double* scale,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_stgsyl LAPACK_GLOBAL(stgsyl,STGSYL)
+void LAPACK_stgsyl(
+    char const* trans,
+    lapack_int const* ijob, lapack_int const* m, lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float const* B, lapack_int const* ldb,
+    float* C, lapack_int const* ldc,
+    float const* D, lapack_int const* ldd,
+    float const* E, lapack_int const* lde,
+    float* F, lapack_int const* ldf,
+    float* dif,
+    float* scale,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ztgsyl LAPACK_GLOBAL(ztgsyl,ZTGSYL)
+void LAPACK_ztgsyl(
+    char const* trans,
+    lapack_int const* ijob, lapack_int const* m, lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double const* D, lapack_int const* ldd,
+    lapack_complex_double const* E, lapack_int const* lde,
+    lapack_complex_double* F, lapack_int const* ldf,
+    double* dif,
+    double* scale,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ctpcon LAPACK_GLOBAL(ctpcon,CTPCON)
+void LAPACK_ctpcon(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n,
+    lapack_complex_float const* AP,
+    float* rcond,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dtpcon LAPACK_GLOBAL(dtpcon,DTPCON)
+void LAPACK_dtpcon(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n,
+    double const* AP,
+    double* rcond,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_stpcon LAPACK_GLOBAL(stpcon,STPCON)
+void LAPACK_stpcon(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n,
+    float const* AP,
+    float* rcond,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ztpcon LAPACK_GLOBAL(ztpcon,ZTPCON)
+void LAPACK_ztpcon(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n,
+    lapack_complex_double const* AP,
+    double* rcond,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_ctplqt LAPACK_GLOBAL(ctplqt,CTPLQT)
+void LAPACK_ctplqt(
+    lapack_int const* m, lapack_int const* n, lapack_int const* l, lapack_int const* mb,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* T, lapack_int const* ldt,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dtplqt LAPACK_GLOBAL(dtplqt,DTPLQT)
+void LAPACK_dtplqt(
+    lapack_int const* m, lapack_int const* n, lapack_int const* l, lapack_int const* mb,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* T, lapack_int const* ldt,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_stplqt LAPACK_GLOBAL(stplqt,STPLQT)
+void LAPACK_stplqt(
+    lapack_int const* m, lapack_int const* n, lapack_int const* l, lapack_int const* mb,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* T, lapack_int const* ldt,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_ztplqt LAPACK_GLOBAL(ztplqt,ZTPLQT)
+void LAPACK_ztplqt(
+    lapack_int const* m, lapack_int const* n, lapack_int const* l, lapack_int const* mb,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* T, lapack_int const* ldt,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_ctplqt2 LAPACK_GLOBAL(ctplqt2,CTPLQT2)
+void LAPACK_ctplqt2(
+    lapack_int const* m, lapack_int const* n, lapack_int const* l,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* T, lapack_int const* ldt,
+    lapack_int* info );
+
+#define LAPACK_dtplqt2 LAPACK_GLOBAL(dtplqt2,DTPLQT2)
+void LAPACK_dtplqt2(
+    lapack_int const* m, lapack_int const* n, lapack_int const* l,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* T, lapack_int const* ldt,
+    lapack_int* info );
+
+#define LAPACK_stplqt2 LAPACK_GLOBAL(stplqt2,STPLQT2)
+void LAPACK_stplqt2(
+    lapack_int const* m, lapack_int const* n, lapack_int const* l,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* T, lapack_int const* ldt,
+    lapack_int* info );
+
+#define LAPACK_ztplqt2 LAPACK_GLOBAL(ztplqt2,ZTPLQT2)
+void LAPACK_ztplqt2(
+    lapack_int const* m, lapack_int const* n, lapack_int const* l,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* T, lapack_int const* ldt,
+    lapack_int* info );
+
+#define LAPACK_ctpmlqt LAPACK_GLOBAL(ctpmlqt,CTPMLQT)
+void LAPACK_ctpmlqt(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, lapack_int const* mb,
+    lapack_complex_float const* V, lapack_int const* ldv,
+    lapack_complex_float const* T, lapack_int const* ldt,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dtpmlqt LAPACK_GLOBAL(dtpmlqt,DTPMLQT)
+void LAPACK_dtpmlqt(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, lapack_int const* mb,
+    double const* V, lapack_int const* ldv,
+    double const* T, lapack_int const* ldt,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_stpmlqt LAPACK_GLOBAL(stpmlqt,STPMLQT)
+void LAPACK_stpmlqt(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, lapack_int const* mb,
+    float const* V, lapack_int const* ldv,
+    float const* T, lapack_int const* ldt,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_ztpmlqt LAPACK_GLOBAL(ztpmlqt,ZTPMLQT)
+void LAPACK_ztpmlqt(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, lapack_int const* mb,
+    lapack_complex_double const* V, lapack_int const* ldv,
+    lapack_complex_double const* T, lapack_int const* ldt,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_ctpmqrt LAPACK_GLOBAL(ctpmqrt,CTPMQRT)
+void LAPACK_ctpmqrt(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, lapack_int const* nb,
+    lapack_complex_float const* V, lapack_int const* ldv,
+    lapack_complex_float const* T, lapack_int const* ldt,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dtpmqrt LAPACK_GLOBAL(dtpmqrt,DTPMQRT)
+void LAPACK_dtpmqrt(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, lapack_int const* nb,
+    double const* V, lapack_int const* ldv,
+    double const* T, lapack_int const* ldt,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_stpmqrt LAPACK_GLOBAL(stpmqrt,STPMQRT)
+void LAPACK_stpmqrt(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, lapack_int const* nb,
+    float const* V, lapack_int const* ldv,
+    float const* T, lapack_int const* ldt,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_ztpmqrt LAPACK_GLOBAL(ztpmqrt,ZTPMQRT)
+void LAPACK_ztpmqrt(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l, lapack_int const* nb,
+    lapack_complex_double const* V, lapack_int const* ldv,
+    lapack_complex_double const* T, lapack_int const* ldt,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_ctpqrt LAPACK_GLOBAL(ctpqrt,CTPQRT)
+void LAPACK_ctpqrt(
+    lapack_int const* m, lapack_int const* n, lapack_int const* l, lapack_int const* nb,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* T, lapack_int const* ldt,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_dtpqrt LAPACK_GLOBAL(dtpqrt,DTPQRT)
+void LAPACK_dtpqrt(
+    lapack_int const* m, lapack_int const* n, lapack_int const* l, lapack_int const* nb,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* T, lapack_int const* ldt,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_stpqrt LAPACK_GLOBAL(stpqrt,STPQRT)
+void LAPACK_stpqrt(
+    lapack_int const* m, lapack_int const* n, lapack_int const* l, lapack_int const* nb,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* T, lapack_int const* ldt,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_ztpqrt LAPACK_GLOBAL(ztpqrt,ZTPQRT)
+void LAPACK_ztpqrt(
+    lapack_int const* m, lapack_int const* n, lapack_int const* l, lapack_int const* nb,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* T, lapack_int const* ldt,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_ctpqrt2 LAPACK_GLOBAL(ctpqrt2,CTPQRT2)
+void LAPACK_ctpqrt2(
+    lapack_int const* m, lapack_int const* n, lapack_int const* l,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* T, lapack_int const* ldt,
+    lapack_int* info );
+
+#define LAPACK_dtpqrt2 LAPACK_GLOBAL(dtpqrt2,DTPQRT2)
+void LAPACK_dtpqrt2(
+    lapack_int const* m, lapack_int const* n, lapack_int const* l,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* T, lapack_int const* ldt,
+    lapack_int* info );
+
+#define LAPACK_stpqrt2 LAPACK_GLOBAL(stpqrt2,STPQRT2)
+void LAPACK_stpqrt2(
+    lapack_int const* m, lapack_int const* n, lapack_int const* l,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* T, lapack_int const* ldt,
+    lapack_int* info );
+
+#define LAPACK_ztpqrt2 LAPACK_GLOBAL(ztpqrt2,ZTPQRT2)
+void LAPACK_ztpqrt2(
+    lapack_int const* m, lapack_int const* n, lapack_int const* l,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* T, lapack_int const* ldt,
+    lapack_int* info );
+
+#define LAPACK_ctprfb LAPACK_GLOBAL(ctprfb,CTPRFB)
+void LAPACK_ctprfb(
+    char const* side, char const* trans, char const* direct, char const* storev,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l,
+    lapack_complex_float const* V, lapack_int const* ldv,
+    lapack_complex_float const* T, lapack_int const* ldt,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_complex_float* work, lapack_int const* ldwork );
+
+#define LAPACK_dtprfb LAPACK_GLOBAL(dtprfb,DTPRFB)
+void LAPACK_dtprfb(
+    char const* side, char const* trans, char const* direct, char const* storev,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l,
+    double const* V, lapack_int const* ldv,
+    double const* T, lapack_int const* ldt,
+    double* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    double* work, lapack_int const* ldwork );
+
+#define LAPACK_stprfb LAPACK_GLOBAL(stprfb,STPRFB)
+void LAPACK_stprfb(
+    char const* side, char const* trans, char const* direct, char const* storev,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l,
+    float const* V, lapack_int const* ldv,
+    float const* T, lapack_int const* ldt,
+    float* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    float* work, lapack_int const* ldwork );
+
+#define LAPACK_ztprfb LAPACK_GLOBAL(ztprfb,ZTPRFB)
+void LAPACK_ztprfb(
+    char const* side, char const* trans, char const* direct, char const* storev,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l,
+    lapack_complex_double const* V, lapack_int const* ldv,
+    lapack_complex_double const* T, lapack_int const* ldt,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_complex_double* work, lapack_int const* ldwork );
+
+#define LAPACK_ctprfs LAPACK_GLOBAL(ctprfs,CTPRFS)
+void LAPACK_ctprfs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* AP,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float const* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dtprfs LAPACK_GLOBAL(dtprfs,DTPRFS)
+void LAPACK_dtprfs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* AP,
+    double const* B, lapack_int const* ldb,
+    double const* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_stprfs LAPACK_GLOBAL(stprfs,STPRFS)
+void LAPACK_stprfs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* AP,
+    float const* B, lapack_int const* ldb,
+    float const* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ztprfs LAPACK_GLOBAL(ztprfs,ZTPRFS)
+void LAPACK_ztprfs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* AP,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double const* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_ctptri LAPACK_GLOBAL(ctptri,CTPTRI)
+void LAPACK_ctptri(
+    char const* uplo, char const* diag,
+    lapack_int const* n,
+    lapack_complex_float* AP,
+    lapack_int* info );
+
+#define LAPACK_dtptri LAPACK_GLOBAL(dtptri,DTPTRI)
+void LAPACK_dtptri(
+    char const* uplo, char const* diag,
+    lapack_int const* n,
+    double* AP,
+    lapack_int* info );
+
+#define LAPACK_stptri LAPACK_GLOBAL(stptri,STPTRI)
+void LAPACK_stptri(
+    char const* uplo, char const* diag,
+    lapack_int const* n,
+    float* AP,
+    lapack_int* info );
+
+#define LAPACK_ztptri LAPACK_GLOBAL(ztptri,ZTPTRI)
+void LAPACK_ztptri(
+    char const* uplo, char const* diag,
+    lapack_int const* n,
+    lapack_complex_double* AP,
+    lapack_int* info );
+
+#define LAPACK_ctptrs LAPACK_GLOBAL(ctptrs,CTPTRS)
+void LAPACK_ctptrs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* AP,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dtptrs LAPACK_GLOBAL(dtptrs,DTPTRS)
+void LAPACK_dtptrs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* AP,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_stptrs LAPACK_GLOBAL(stptrs,STPTRS)
+void LAPACK_stptrs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* AP,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_ztptrs LAPACK_GLOBAL(ztptrs,ZTPTRS)
+void LAPACK_ztptrs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* AP,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_ctpttf LAPACK_GLOBAL(ctpttf,CTPTTF)
+void LAPACK_ctpttf(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* AP,
+    lapack_complex_float* ARF,
+    lapack_int* info );
+
+#define LAPACK_dtpttf LAPACK_GLOBAL(dtpttf,DTPTTF)
+void LAPACK_dtpttf(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    double const* AP,
+    double* ARF,
+    lapack_int* info );
+
+#define LAPACK_stpttf LAPACK_GLOBAL(stpttf,STPTTF)
+void LAPACK_stpttf(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    float const* AP,
+    float* ARF,
+    lapack_int* info );
+
+#define LAPACK_ztpttf LAPACK_GLOBAL(ztpttf,ZTPTTF)
+void LAPACK_ztpttf(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* AP,
+    lapack_complex_double* ARF,
+    lapack_int* info );
+
+#define LAPACK_ctpttr LAPACK_GLOBAL(ctpttr,CTPTTR)
+void LAPACK_ctpttr(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* AP,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_dtpttr LAPACK_GLOBAL(dtpttr,DTPTTR)
+void LAPACK_dtpttr(
+    char const* uplo,
+    lapack_int const* n,
+    double const* AP,
+    double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_stpttr LAPACK_GLOBAL(stpttr,STPTTR)
+void LAPACK_stpttr(
+    char const* uplo,
+    lapack_int const* n,
+    float const* AP,
+    float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_ztpttr LAPACK_GLOBAL(ztpttr,ZTPTTR)
+void LAPACK_ztpttr(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* AP,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_ctrcon LAPACK_GLOBAL(ctrcon,CTRCON)
+void LAPACK_ctrcon(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    float* rcond,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dtrcon LAPACK_GLOBAL(dtrcon,DTRCON)
+void LAPACK_dtrcon(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double* rcond,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_strcon LAPACK_GLOBAL(strcon,STRCON)
+void LAPACK_strcon(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float* rcond,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ztrcon LAPACK_GLOBAL(ztrcon,ZTRCON)
+void LAPACK_ztrcon(
+    char const* norm, char const* uplo, char const* diag,
+    lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    double* rcond,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_ctrevc LAPACK_GLOBAL(ctrevc,CTREVC)
+void LAPACK_ctrevc(
+    char const* side, char const* howmny,
+    lapack_logical const* select,
+    lapack_int const* n,
+    lapack_complex_float* T, lapack_int const* ldt,
+    lapack_complex_float* VL, lapack_int const* ldvl,
+    lapack_complex_float* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dtrevc LAPACK_GLOBAL(dtrevc,DTREVC)
+void LAPACK_dtrevc(
+    char const* side, char const* howmny,
+    lapack_logical* select,
+    lapack_int const* n,
+    double const* T, lapack_int const* ldt,
+    double* VL, lapack_int const* ldvl,
+    double* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_strevc LAPACK_GLOBAL(strevc,STREVC)
+void LAPACK_strevc(
+    char const* side, char const* howmny,
+    lapack_logical* select,
+    lapack_int const* n,
+    float const* T, lapack_int const* ldt,
+    float* VL, lapack_int const* ldvl,
+    float* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_ztrevc LAPACK_GLOBAL(ztrevc,ZTREVC)
+void LAPACK_ztrevc(
+    char const* side, char const* howmny,
+    lapack_logical const* select,
+    lapack_int const* n,
+    lapack_complex_double* T, lapack_int const* ldt,
+    lapack_complex_double* VL, lapack_int const* ldvl,
+    lapack_complex_double* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_ctrevc3 LAPACK_GLOBAL(ctrevc3,CTREVC3)
+void LAPACK_ctrevc3(
+    char const* side, char const* howmny,
+    lapack_logical const* select,
+    lapack_int const* n,
+    lapack_complex_float* T, lapack_int const* ldt,
+    lapack_complex_float* VL, lapack_int const* ldvl,
+    lapack_complex_float* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* info );
+
+#define LAPACK_dtrevc3 LAPACK_GLOBAL(dtrevc3,DTREVC3)
+void LAPACK_dtrevc3(
+    char const* side, char const* howmny,
+    lapack_logical* select,
+    lapack_int const* n,
+    double const* T, lapack_int const* ldt,
+    double* VL, lapack_int const* ldvl,
+    double* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_strevc3 LAPACK_GLOBAL(strevc3,STREVC3)
+void LAPACK_strevc3(
+    char const* side, char const* howmny,
+    lapack_logical* select,
+    lapack_int const* n,
+    float const* T, lapack_int const* ldt,
+    float* VL, lapack_int const* ldvl,
+    float* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ztrevc3 LAPACK_GLOBAL(ztrevc3,ZTREVC3)
+void LAPACK_ztrevc3(
+    char const* side, char const* howmny,
+    lapack_logical const* select,
+    lapack_int const* n,
+    lapack_complex_double* T, lapack_int const* ldt,
+    lapack_complex_double* VL, lapack_int const* ldvl,
+    lapack_complex_double* VR, lapack_int const* ldvr, lapack_int const* mm, lapack_int* m,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* info );
+
+#define LAPACK_ctrexc LAPACK_GLOBAL(ctrexc,CTREXC)
+void LAPACK_ctrexc(
+    char const* compq,
+    lapack_int const* n,
+    lapack_complex_float* T, lapack_int const* ldt,
+    lapack_complex_float* Q, lapack_int const* ldq, lapack_int const* ifst, lapack_int const* ilst,
+    lapack_int* info );
+
+#define LAPACK_dtrexc LAPACK_GLOBAL(dtrexc,DTREXC)
+void LAPACK_dtrexc(
+    char const* compq,
+    lapack_int const* n,
+    double* T, lapack_int const* ldt,
+    double* Q, lapack_int const* ldq, lapack_int* ifst, lapack_int* ilst,
+    double* work,
+    lapack_int* info );
+
+#define LAPACK_strexc LAPACK_GLOBAL(strexc,STREXC)
+void LAPACK_strexc(
+    char const* compq,
+    lapack_int const* n,
+    float* T, lapack_int const* ldt,
+    float* Q, lapack_int const* ldq, lapack_int* ifst, lapack_int* ilst,
+    float* work,
+    lapack_int* info );
+
+#define LAPACK_ztrexc LAPACK_GLOBAL(ztrexc,ZTREXC)
+void LAPACK_ztrexc(
+    char const* compq,
+    lapack_int const* n,
+    lapack_complex_double* T, lapack_int const* ldt,
+    lapack_complex_double* Q, lapack_int const* ldq, lapack_int const* ifst, lapack_int const* ilst,
+    lapack_int* info );
+
+#define LAPACK_ctrrfs LAPACK_GLOBAL(ctrrfs,CTRRFS)
+void LAPACK_ctrrfs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float const* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    lapack_complex_float* work,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dtrrfs LAPACK_GLOBAL(dtrrfs,DTRRFS)
+void LAPACK_dtrrfs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* A, lapack_int const* lda,
+    double const* B, lapack_int const* ldb,
+    double const* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    double* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_strrfs LAPACK_GLOBAL(strrfs,STRRFS)
+void LAPACK_strrfs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* A, lapack_int const* lda,
+    float const* B, lapack_int const* ldb,
+    float const* X, lapack_int const* ldx,
+    float* ferr,
+    float* berr,
+    float* work,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ztrrfs LAPACK_GLOBAL(ztrrfs,ZTRRFS)
+void LAPACK_ztrrfs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double const* X, lapack_int const* ldx,
+    double* ferr,
+    double* berr,
+    lapack_complex_double* work,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_ctrsen LAPACK_GLOBAL(ctrsen,CTRSEN)
+void LAPACK_ctrsen(
+    char const* job, char const* compq,
+    lapack_logical const* select,
+    lapack_int const* n,
+    lapack_complex_float* T, lapack_int const* ldt,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    lapack_complex_float* W, lapack_int* m,
+    float* s,
+    float* sep,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dtrsen LAPACK_GLOBAL(dtrsen,DTRSEN)
+void LAPACK_dtrsen(
+    char const* job, char const* compq,
+    lapack_logical const* select,
+    lapack_int const* n,
+    double* T, lapack_int const* ldt,
+    double* Q, lapack_int const* ldq,
+    double* WR,
+    double* WI, lapack_int* m,
+    double* s,
+    double* sep,
+    double* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_strsen LAPACK_GLOBAL(strsen,STRSEN)
+void LAPACK_strsen(
+    char const* job, char const* compq,
+    lapack_logical const* select,
+    lapack_int const* n,
+    float* T, lapack_int const* ldt,
+    float* Q, lapack_int const* ldq,
+    float* WR,
+    float* WI, lapack_int* m,
+    float* s,
+    float* sep,
+    float* work, lapack_int const* lwork,
+    lapack_int* iwork, lapack_int const* liwork,
+    lapack_int* info );
+
+#define LAPACK_ztrsen LAPACK_GLOBAL(ztrsen,ZTRSEN)
+void LAPACK_ztrsen(
+    char const* job, char const* compq,
+    lapack_logical const* select,
+    lapack_int const* n,
+    lapack_complex_double* T, lapack_int const* ldt,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    lapack_complex_double* W, lapack_int* m,
+    double* s,
+    double* sep,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ctrsna LAPACK_GLOBAL(ctrsna,CTRSNA)
+void LAPACK_ctrsna(
+    char const* job, char const* howmny,
+    lapack_logical const* select,
+    lapack_int const* n,
+    lapack_complex_float const* T, lapack_int const* ldt,
+    lapack_complex_float const* VL, lapack_int const* ldvl,
+    lapack_complex_float const* VR, lapack_int const* ldvr,
+    float* S,
+    float* SEP, lapack_int const* mm, lapack_int* m,
+    lapack_complex_float* work, lapack_int const* ldwork,
+    float* rwork,
+    lapack_int* info );
+
+#define LAPACK_dtrsna LAPACK_GLOBAL(dtrsna,DTRSNA)
+void LAPACK_dtrsna(
+    char const* job, char const* howmny,
+    lapack_logical const* select,
+    lapack_int const* n,
+    double const* T, lapack_int const* ldt,
+    double const* VL, lapack_int const* ldvl,
+    double const* VR, lapack_int const* ldvr,
+    double* S,
+    double* SEP, lapack_int const* mm, lapack_int* m,
+    double* work, lapack_int const* ldwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_strsna LAPACK_GLOBAL(strsna,STRSNA)
+void LAPACK_strsna(
+    char const* job, char const* howmny,
+    lapack_logical const* select,
+    lapack_int const* n,
+    float const* T, lapack_int const* ldt,
+    float const* VL, lapack_int const* ldvl,
+    float const* VR, lapack_int const* ldvr,
+    float* S,
+    float* SEP, lapack_int const* mm, lapack_int* m,
+    float* work, lapack_int const* ldwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_ztrsna LAPACK_GLOBAL(ztrsna,ZTRSNA)
+void LAPACK_ztrsna(
+    char const* job, char const* howmny,
+    lapack_logical const* select,
+    lapack_int const* n,
+    lapack_complex_double const* T, lapack_int const* ldt,
+    lapack_complex_double const* VL, lapack_int const* ldvl,
+    lapack_complex_double const* VR, lapack_int const* ldvr,
+    double* S,
+    double* SEP, lapack_int const* mm, lapack_int* m,
+    lapack_complex_double* work, lapack_int const* ldwork,
+    double* rwork,
+    lapack_int* info );
+
+#define LAPACK_ctrsyl LAPACK_GLOBAL(ctrsyl,CTRSYL)
+void LAPACK_ctrsyl(
+    char const* trana, char const* tranb,
+    lapack_int const* isgn, lapack_int const* m, lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* B, lapack_int const* ldb,
+    lapack_complex_float* C, lapack_int const* ldc,
+    float* scale,
+    lapack_int* info );
+
+#define LAPACK_dtrsyl LAPACK_GLOBAL(dtrsyl,DTRSYL)
+void LAPACK_dtrsyl(
+    char const* trana, char const* tranb,
+    lapack_int const* isgn, lapack_int const* m, lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double const* B, lapack_int const* ldb,
+    double* C, lapack_int const* ldc,
+    double* scale,
+    lapack_int* info );
+
+#define LAPACK_strsyl LAPACK_GLOBAL(strsyl,STRSYL)
+void LAPACK_strsyl(
+    char const* trana, char const* tranb,
+    lapack_int const* isgn, lapack_int const* m, lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float const* B, lapack_int const* ldb,
+    float* C, lapack_int const* ldc,
+    float* scale,
+    lapack_int* info );
+
+#define LAPACK_ztrsyl LAPACK_GLOBAL(ztrsyl,ZTRSYL)
+void LAPACK_ztrsyl(
+    char const* trana, char const* tranb,
+    lapack_int const* isgn, lapack_int const* m, lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* B, lapack_int const* ldb,
+    lapack_complex_double* C, lapack_int const* ldc,
+    double* scale,
+    lapack_int* info );
+
+#define LAPACK_ctrtri LAPACK_GLOBAL(ctrtri,CTRTRI)
+void LAPACK_ctrtri(
+    char const* uplo, char const* diag,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_dtrtri LAPACK_GLOBAL(dtrtri,DTRTRI)
+void LAPACK_dtrtri(
+    char const* uplo, char const* diag,
+    lapack_int const* n,
+    double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_strtri LAPACK_GLOBAL(strtri,STRTRI)
+void LAPACK_strtri(
+    char const* uplo, char const* diag,
+    lapack_int const* n,
+    float* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_ztrtri LAPACK_GLOBAL(ztrtri,ZTRTRI)
+void LAPACK_ztrtri(
+    char const* uplo, char const* diag,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_int* info );
+
+#define LAPACK_ctrtrs LAPACK_GLOBAL(ctrtrs,CTRTRS)
+void LAPACK_ctrtrs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_dtrtrs LAPACK_GLOBAL(dtrtrs,DTRTRS)
+void LAPACK_dtrtrs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* nrhs,
+    double const* A, lapack_int const* lda,
+    double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_strtrs LAPACK_GLOBAL(strtrs,STRTRS)
+void LAPACK_strtrs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* nrhs,
+    float const* A, lapack_int const* lda,
+    float* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_ztrtrs LAPACK_GLOBAL(ztrtrs,ZTRTRS)
+void LAPACK_ztrtrs(
+    char const* uplo, char const* trans, char const* diag,
+    lapack_int const* n, lapack_int const* nrhs,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double* B, lapack_int const* ldb,
+    lapack_int* info );
+
+#define LAPACK_ctrttf LAPACK_GLOBAL(ctrttf,CTRTTF)
+void LAPACK_ctrttf(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float* ARF,
+    lapack_int* info );
+
+#define LAPACK_dtrttf LAPACK_GLOBAL(dtrttf,DTRTTF)
+void LAPACK_dtrttf(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double* ARF,
+    lapack_int* info );
+
+#define LAPACK_strttf LAPACK_GLOBAL(strttf,STRTTF)
+void LAPACK_strttf(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float* ARF,
+    lapack_int* info );
+
+#define LAPACK_ztrttf LAPACK_GLOBAL(ztrttf,ZTRTTF)
+void LAPACK_ztrttf(
+    char const* transr, char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double* ARF,
+    lapack_int* info );
+
+#define LAPACK_ctrttp LAPACK_GLOBAL(ctrttp,CTRTTP)
+void LAPACK_ctrttp(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float* AP,
+    lapack_int* info );
+
+#define LAPACK_dtrttp LAPACK_GLOBAL(dtrttp,DTRTTP)
+void LAPACK_dtrttp(
+    char const* uplo,
+    lapack_int const* n,
+    double const* A, lapack_int const* lda,
+    double* AP,
+    lapack_int* info );
+
+#define LAPACK_strttp LAPACK_GLOBAL(strttp,STRTTP)
+void LAPACK_strttp(
+    char const* uplo,
+    lapack_int const* n,
+    float const* A, lapack_int const* lda,
+    float* AP,
+    lapack_int* info );
+
+#define LAPACK_ztrttp LAPACK_GLOBAL(ztrttp,ZTRTTP)
+void LAPACK_ztrttp(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double* AP,
+    lapack_int* info );
+
+#define LAPACK_ctzrzf LAPACK_GLOBAL(ctzrzf,CTZRZF)
+void LAPACK_ctzrzf(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dtzrzf LAPACK_GLOBAL(dtzrzf,DTZRZF)
+void LAPACK_dtzrzf(
+    lapack_int const* m, lapack_int const* n,
+    double* A, lapack_int const* lda,
+    double* tau,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_stzrzf LAPACK_GLOBAL(stzrzf,STZRZF)
+void LAPACK_stzrzf(
+    lapack_int const* m, lapack_int const* n,
+    float* A, lapack_int const* lda,
+    float* tau,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_ztzrzf LAPACK_GLOBAL(ztzrzf,ZTZRZF)
+void LAPACK_ztzrzf(
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* tau,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cunbdb LAPACK_GLOBAL(cunbdb,CUNBDB)
+void LAPACK_cunbdb(
+    char const* trans, char const* signs,
+    lapack_int const* m, lapack_int const* p, lapack_int const* q,
+    lapack_complex_float* X11, lapack_int const* ldx11,
+    lapack_complex_float* X12, lapack_int const* ldx12,
+    lapack_complex_float* X21, lapack_int const* ldx21,
+    lapack_complex_float* X22, lapack_int const* ldx22,
+    float* theta,
+    float* phi,
+    lapack_complex_float* TAUP1,
+    lapack_complex_float* TAUP2,
+    lapack_complex_float* TAUQ1,
+    lapack_complex_float* TAUQ2,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zunbdb LAPACK_GLOBAL(zunbdb,ZUNBDB)
+void LAPACK_zunbdb(
+    char const* trans, char const* signs,
+    lapack_int const* m, lapack_int const* p, lapack_int const* q,
+    lapack_complex_double* X11, lapack_int const* ldx11,
+    lapack_complex_double* X12, lapack_int const* ldx12,
+    lapack_complex_double* X21, lapack_int const* ldx21,
+    lapack_complex_double* X22, lapack_int const* ldx22,
+    double* theta,
+    double* phi,
+    lapack_complex_double* TAUP1,
+    lapack_complex_double* TAUP2,
+    lapack_complex_double* TAUQ1,
+    lapack_complex_double* TAUQ2,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cuncsd LAPACK_GLOBAL(cuncsd,CUNCSD)
+void LAPACK_cuncsd(
+    char const* jobu1, char const* jobu2, char const* jobv1t, char const* jobv2t, char const* trans, char const* signs,
+    lapack_int const* m, lapack_int const* p, lapack_int const* q,
+    lapack_complex_float* X11, lapack_int const* ldx11,
+    lapack_complex_float* X12, lapack_int const* ldx12,
+    lapack_complex_float* X21, lapack_int const* ldx21,
+    lapack_complex_float* X22, lapack_int const* ldx22,
+    float* theta,
+    lapack_complex_float* U1, lapack_int const* ldu1,
+    lapack_complex_float* U2, lapack_int const* ldu2,
+    lapack_complex_float* V1T, lapack_int const* ldv1t,
+    lapack_complex_float* V2T, lapack_int const* ldv2t,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zuncsd LAPACK_GLOBAL(zuncsd,ZUNCSD)
+void LAPACK_zuncsd(
+    char const* jobu1, char const* jobu2, char const* jobv1t, char const* jobv2t, char const* trans, char const* signs,
+    lapack_int const* m, lapack_int const* p, lapack_int const* q,
+    lapack_complex_double* X11, lapack_int const* ldx11,
+    lapack_complex_double* X12, lapack_int const* ldx12,
+    lapack_complex_double* X21, lapack_int const* ldx21,
+    lapack_complex_double* X22, lapack_int const* ldx22,
+    double* theta,
+    lapack_complex_double* U1, lapack_int const* ldu1,
+    lapack_complex_double* U2, lapack_int const* ldu2,
+    lapack_complex_double* V1T, lapack_int const* ldv1t,
+    lapack_complex_double* V2T, lapack_int const* ldv2t,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_cuncsd2by1 LAPACK_GLOBAL(cuncsd2by1,CUNCSD2BY1)
+void LAPACK_cuncsd2by1(
+    char const* jobu1, char const* jobu2, char const* jobv1t,
+    lapack_int const* m, lapack_int const* p, lapack_int const* q,
+    lapack_complex_float* X11, lapack_int const* ldx11,
+    lapack_complex_float* X21, lapack_int const* ldx21,
+    float* theta,
+    lapack_complex_float* U1, lapack_int const* ldu1,
+    lapack_complex_float* U2, lapack_int const* ldu2,
+    lapack_complex_float* V1T, lapack_int const* ldv1t,
+    lapack_complex_float* work, lapack_int const* lwork,
+    float* rwork, lapack_int const* lrwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_zuncsd2by1 LAPACK_GLOBAL(zuncsd2by1,ZUNCSD2BY1)
+void LAPACK_zuncsd2by1(
+    char const* jobu1, char const* jobu2, char const* jobv1t,
+    lapack_int const* m, lapack_int const* p, lapack_int const* q,
+    lapack_complex_double* X11, lapack_int const* ldx11,
+    lapack_complex_double* X21, lapack_int const* ldx21,
+    double* theta,
+    lapack_complex_double* U1, lapack_int const* ldu1,
+    lapack_complex_double* U2, lapack_int const* ldu2,
+    lapack_complex_double* V1T, lapack_int const* ldv1t,
+    lapack_complex_double* work, lapack_int const* lwork,
+    double* rwork, lapack_int const* lrwork,
+    lapack_int* iwork,
+    lapack_int* info );
+
+#define LAPACK_cungbr LAPACK_GLOBAL(cungbr,CUNGBR)
+void LAPACK_cungbr(
+    char const* vect,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float const* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zungbr LAPACK_GLOBAL(zungbr,ZUNGBR)
+void LAPACK_zungbr(
+    char const* vect,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double const* tau,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cunghr LAPACK_GLOBAL(cunghr,CUNGHR)
+void LAPACK_cunghr(
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float const* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zunghr LAPACK_GLOBAL(zunghr,ZUNGHR)
+void LAPACK_zunghr(
+    lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double const* tau,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cunglq LAPACK_GLOBAL(cunglq,CUNGLQ)
+void LAPACK_cunglq(
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float const* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zunglq LAPACK_GLOBAL(zunglq,ZUNGLQ)
+void LAPACK_zunglq(
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double const* tau,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cungql LAPACK_GLOBAL(cungql,CUNGQL)
+void LAPACK_cungql(
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float const* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zungql LAPACK_GLOBAL(zungql,ZUNGQL)
+void LAPACK_zungql(
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double const* tau,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cungqr LAPACK_GLOBAL(cungqr,CUNGQR)
+void LAPACK_cungqr(
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float const* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zungqr LAPACK_GLOBAL(zungqr,ZUNGQR)
+void LAPACK_zungqr(
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double const* tau,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cungrq LAPACK_GLOBAL(cungrq,CUNGRQ)
+void LAPACK_cungrq(
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float const* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zungrq LAPACK_GLOBAL(zungrq,ZUNGRQ)
+void LAPACK_zungrq(
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double const* tau,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cungtr LAPACK_GLOBAL(cungtr,CUNGTR)
+void LAPACK_cungtr(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float const* tau,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zungtr LAPACK_GLOBAL(zungtr,ZUNGTR)
+void LAPACK_zungtr(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double const* tau,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cunmbr LAPACK_GLOBAL(cunmbr,CUNMBR)
+void LAPACK_cunmbr(
+    char const* vect, char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* tau,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zunmbr LAPACK_GLOBAL(zunmbr,ZUNMBR)
+void LAPACK_zunmbr(
+    char const* vect, char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* tau,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cunmhr LAPACK_GLOBAL(cunmhr,CUNMHR)
+void LAPACK_cunmhr(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* tau,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zunmhr LAPACK_GLOBAL(zunmhr,ZUNMHR)
+void LAPACK_zunmhr(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* ilo, lapack_int const* ihi,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* tau,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cunmlq LAPACK_GLOBAL(cunmlq,CUNMLQ)
+void LAPACK_cunmlq(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* tau,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zunmlq LAPACK_GLOBAL(zunmlq,ZUNMLQ)
+void LAPACK_zunmlq(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* tau,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cunmql LAPACK_GLOBAL(cunmql,CUNMQL)
+void LAPACK_cunmql(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* tau,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zunmql LAPACK_GLOBAL(zunmql,ZUNMQL)
+void LAPACK_zunmql(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* tau,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cunmqr LAPACK_GLOBAL(cunmqr,CUNMQR)
+void LAPACK_cunmqr(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* tau,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zunmqr LAPACK_GLOBAL(zunmqr,ZUNMQR)
+void LAPACK_zunmqr(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* tau,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cunmrq LAPACK_GLOBAL(cunmrq,CUNMRQ)
+void LAPACK_cunmrq(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* tau,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zunmrq LAPACK_GLOBAL(zunmrq,ZUNMRQ)
+void LAPACK_zunmrq(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* tau,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cunmrz LAPACK_GLOBAL(cunmrz,CUNMRZ)
+void LAPACK_cunmrz(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* tau,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zunmrz LAPACK_GLOBAL(zunmrz,ZUNMRZ)
+void LAPACK_zunmrz(
+    char const* side, char const* trans,
+    lapack_int const* m, lapack_int const* n, lapack_int const* k, lapack_int const* l,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* tau,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cunmtr LAPACK_GLOBAL(cunmtr,CUNMTR)
+void LAPACK_cunmtr(
+    char const* side, char const* uplo, char const* trans,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float const* A, lapack_int const* lda,
+    lapack_complex_float const* tau,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zunmtr LAPACK_GLOBAL(zunmtr,ZUNMTR)
+void LAPACK_zunmtr(
+    char const* side, char const* uplo, char const* trans,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double const* A, lapack_int const* lda,
+    lapack_complex_double const* tau,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_cupgtr LAPACK_GLOBAL(cupgtr,CUPGTR)
+void LAPACK_cupgtr(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_float const* AP,
+    lapack_complex_float const* tau,
+    lapack_complex_float* Q, lapack_int const* ldq,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_zupgtr LAPACK_GLOBAL(zupgtr,ZUPGTR)
+void LAPACK_zupgtr(
+    char const* uplo,
+    lapack_int const* n,
+    lapack_complex_double const* AP,
+    lapack_complex_double const* tau,
+    lapack_complex_double* Q, lapack_int const* ldq,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#define LAPACK_cupmtr LAPACK_GLOBAL(cupmtr,CUPMTR)
+void LAPACK_cupmtr(
+    char const* side, char const* uplo, char const* trans,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_float const* AP,
+    lapack_complex_float const* tau,
+    lapack_complex_float* C, lapack_int const* ldc,
+    lapack_complex_float* work,
+    lapack_int* info );
+
+#define LAPACK_zupmtr LAPACK_GLOBAL(zupmtr,ZUPMTR)
+void LAPACK_zupmtr(
+    char const* side, char const* uplo, char const* trans,
+    lapack_int const* m, lapack_int const* n,
+    lapack_complex_double const* AP,
+    lapack_complex_double const* tau,
+    lapack_complex_double* C, lapack_int const* ldc,
+    lapack_complex_double* work,
+    lapack_int* info );
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* LAPACK_H */
diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h
index c5ea465e0..6eb0b696b 100644
--- a/lapack-netlib/LAPACKE/include/lapacke.h
+++ b/lapack-netlib/LAPACKE/include/lapacke.h
@@ -34,81 +34,7 @@
 #ifndef _LAPACKE_H_
 #define _LAPACKE_H_
 
-/*
-*  Turn on HAVE_LAPACK_CONFIG_H to redefine C-LAPACK datatypes
-*/
-#ifdef HAVE_LAPACK_CONFIG_H
-#include "lapacke_config.h"
-#endif
-
-#include <stdlib.h>
-
-#ifndef lapack_int
-#define lapack_int     int
-#endif
-
-#ifndef lapack_logical
-#define lapack_logical lapack_int
-#endif
-
-/* Complex types are structures equivalent to the
-* Fortran complex types COMPLEX(4) and COMPLEX(8).
-*
-* One can also redefine the types with his own types
-* for example by including in the code definitions like
-*
-* #define lapack_complex_float std::complex<float>
-* #define lapack_complex_double std::complex<double>
-*
-* or define these types in the command line:
-*
-* -Dlapack_complex_float="std::complex<float>"
-* -Dlapack_complex_double="std::complex<double>"
-*/
-
-#ifndef LAPACK_COMPLEX_CUSTOM
-
-/* Complex type (single precision) */
-#ifndef lapack_complex_float
-#ifndef __cplusplus
-#include <complex.h>
-#else
-#include <complex>
-#endif
-#define lapack_complex_float    float _Complex
-#endif
-
-#ifndef lapack_complex_float_real
-#define lapack_complex_float_real(z)       (creal(z))
-#endif
-
-#ifndef lapack_complex_float_imag
-#define lapack_complex_float_imag(z)       (cimag(z))
-#endif
-
-lapack_complex_float lapack_make_complex_float( float re, float im );
-
-/* Complex type (double precision) */
-#ifndef lapack_complex_double
-#ifndef __cplusplus
-#include <complex.h>
-#else
-#include <complex>
-#endif
-#define lapack_complex_double   double _Complex
-#endif
-
-#ifndef lapack_complex_double_real
-#define lapack_complex_double_real(z)      (creal(z))
-#endif
-
-#ifndef lapack_complex_double_imag
-#define lapack_complex_double_imag(z)       (cimag(z))
-#endif
-
-lapack_complex_double lapack_make_complex_double( double re, double im );
-
-#endif
+#include "lapack.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -130,29 +56,8 @@ extern "C" {
 #define LAPACK_WORK_MEMORY_ERROR       -1010
 #define LAPACK_TRANSPOSE_MEMORY_ERROR  -1011
 
-/* Callback logical functions of one, two, or three arguments are used
-*  to select eigenvalues to sort to the top left of the Schur form.
-*  The value is selected if function returns TRUE (non-zero). */
-
-typedef lapack_logical (*LAPACK_S_SELECT2) ( const float*, const float* );
-typedef lapack_logical (*LAPACK_S_SELECT3)
-    ( const float*, const float*, const float* );
-typedef lapack_logical (*LAPACK_D_SELECT2) ( const double*, const double* );
-typedef lapack_logical (*LAPACK_D_SELECT3)
-    ( const double*, const double*, const double* );
-
-typedef lapack_logical (*LAPACK_C_SELECT1) ( const lapack_complex_float* );
-typedef lapack_logical (*LAPACK_C_SELECT2)
-    ( const lapack_complex_float*, const lapack_complex_float* );
-typedef lapack_logical (*LAPACK_Z_SELECT1) ( const lapack_complex_double* );
-typedef lapack_logical (*LAPACK_Z_SELECT2)
-    ( const lapack_complex_double*, const lapack_complex_double* );
-
-#include "lapacke_mangling.h"
-
-#define LAPACK_lsame LAPACK_GLOBAL(lsame,LSAME)
-lapack_logical LAPACK_lsame( char* ca,  char* cb,
-                              lapack_int lca, lapack_int lcb );
+lapack_complex_float lapack_make_complex_float( float re, float im );
+lapack_complex_double lapack_make_complex_double( double re, double im );
 
 /* C-LAPACK function prototypes */
 
@@ -1034,6 +939,25 @@ lapack_int LAPACKE_zgesvdx( int matrix_layout, char jobu, char jobvt, char range
                            lapack_complex_double* vt, lapack_int ldvt,
                            lapack_int* superb );
 
+lapack_int LAPACKE_sgesvdq( int matrix_layout, char joba, char jobp, char jobr, char jobu, char jobv,
+                           lapack_int m, lapack_int n, float* a, lapack_int lda,
+                           float* s, float* u, lapack_int ldu, float* v,
+                           lapack_int ldv, lapack_int* numrank );
+lapack_int LAPACKE_dgesvdq( int matrix_layout, char joba, char jobp, char jobr, char jobu, char jobv,
+                           lapack_int m, lapack_int n, double* a,
+                           lapack_int lda, double* s, double* u, lapack_int ldu,
+                           double* v, lapack_int ldv, lapack_int* numrank);
+lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp, char jobr, char jobu, char jobv,
+                           lapack_int m, lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, float* s, lapack_complex_float* u,
+                           lapack_int ldu, lapack_complex_float* v,
+                           lapack_int ldv, lapack_int* numrank );
+lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp, char jobr, char jobu, char jobv,
+                           lapack_int m, lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, double* s, lapack_complex_double* u,
+                           lapack_int ldu, lapack_complex_double* v,
+                           lapack_int ldv, lapack_int* numrank );
+                           
 lapack_int LAPACKE_sgesvj( int matrix_layout, char joba, char jobu, char jobv,
                            lapack_int m, lapack_int n, float* a, lapack_int lda,
                            float* sva, lapack_int mv, float* v, lapack_int ldv,
@@ -1943,11 +1867,11 @@ lapack_int LAPACKE_zheevx( int matrix_layout, char jobz, char range, char uplo,
 
 lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo,
                            lapack_int n, lapack_complex_float* a,
-                           lapack_int lda, const lapack_complex_float* b,
+                           lapack_int lda, lapack_complex_float* b,
                            lapack_int ldb );
 lapack_int LAPACKE_zhegst( int matrix_layout, lapack_int itype, char uplo,
                            lapack_int n, lapack_complex_double* a,
-                           lapack_int lda, const lapack_complex_double* b,
+                           lapack_int lda, lapack_complex_double* b,
                            lapack_int ldb );
 
 lapack_int LAPACKE_chegv( int matrix_layout, lapack_int itype, char jobz,
@@ -5824,6 +5748,45 @@ lapack_int LAPACKE_zgesvdx_work( int matrix_layout, char jobu, char jobvt, char
                                  lapack_complex_double* work, lapack_int lwork,
                                  double* rwork, lapack_int* iwork );
 
+lapack_int LAPACKE_sgesvdq_work( int matrix_layout, char joba, char jobp,
+                                char jobr, char jobu, char jobv,
+                                lapack_int m, lapack_int n, float* a,
+                                lapack_int lda, float* s, float* u,
+                                lapack_int ldu, float* v, lapack_int ldv,
+                                lapack_int* numrank,
+                                lapack_int* iwork, lapack_int liwork,
+                                float* work, lapack_int lwork,
+                                float* rwork, lapack_int lrwork);
+lapack_int LAPACKE_dgesvdq_work( int matrix_layout, char joba, char jobp,
+                                char jobr, char jobu, char jobv,
+                                lapack_int m, lapack_int n, double* a,
+                                lapack_int lda, double* s, double* u,
+                                lapack_int ldu, double* v, lapack_int ldv,
+                                lapack_int* numrank,
+                                lapack_int* iwork, lapack_int liwork,
+                                double* work, lapack_int lwork,
+                                double* rwork, lapack_int lrwork);
+lapack_int LAPACKE_cgesvdq_work( int matrix_layout, char joba, char jobp,
+                                char jobr, char jobu, char jobv,
+                                lapack_int m, lapack_int n,
+                                lapack_complex_float* a, lapack_int lda,
+                                float* s, lapack_complex_float* u,
+                                lapack_int ldu, lapack_complex_float* v,
+                                lapack_int ldv, lapack_int* numrank,
+                                lapack_int* iwork, lapack_int liwork,
+                                lapack_complex_float* cwork, lapack_int lcwork,
+                                float* rwork, lapack_int lrwork);
+lapack_int LAPACKE_zgesvdq_work( int matrix_layout, char joba, char jobp,
+                                char jobr, char jobu, char jobv,
+                                lapack_int m, lapack_int n,
+                                lapack_complex_double* a, lapack_int lda,
+                                double* s, lapack_complex_double* u,
+                                lapack_int ldu, lapack_complex_double* v,
+                                lapack_int ldv, lapack_int* numrank,
+                                lapack_int* iwork, lapack_int liwork,
+                                lapack_complex_double* cwork, lapack_int lcwork,
+                                double* rwork, lapack_int lrwork);
+                                
 lapack_int LAPACKE_sgesvj_work( int matrix_layout, char joba, char jobu,
                                 char jobv, lapack_int m, lapack_int n, float* a,
                                 lapack_int lda, float* sva, lapack_int mv,
@@ -6969,11 +6932,11 @@ lapack_int LAPACKE_zheevx_work( int matrix_layout, char jobz, char range,
 
 lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo,
                                 lapack_int n, lapack_complex_float* a,
-                                lapack_int lda, const lapack_complex_float* b,
+                                lapack_int lda, lapack_complex_float* b,
                                 lapack_int ldb );
 lapack_int LAPACKE_zhegst_work( int matrix_layout, lapack_int itype, char uplo,
                                 lapack_int n, lapack_complex_double* a,
-                                lapack_int lda, const lapack_complex_double* b,
+                                lapack_int lda, lapack_complex_double* b,
                                 lapack_int ldb );
 
 lapack_int LAPACKE_chegv_work( int matrix_layout, lapack_int itype, char jobz,
@@ -10590,11 +10553,11 @@ lapack_int LAPACKE_csytri2x_work( int matrix_layout, char uplo, lapack_int n,
                                   const lapack_int* ipiv,
                                   lapack_complex_float* work, lapack_int nb );
 lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, const lapack_complex_float* a,
+                            lapack_int nrhs, lapack_complex_float* a,
                             lapack_int lda, const lapack_int* ipiv,
                             lapack_complex_float* b, lapack_int ldb );
 lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, const lapack_complex_float* a,
+                                 lapack_int nrhs, lapack_complex_float* a,
                                  lapack_int lda, const lapack_int* ipiv,
                                  lapack_complex_float* b, lapack_int ldb,
                                  lapack_complex_float* work );
@@ -10755,10 +10718,10 @@ lapack_int LAPACKE_dsytri2x_work( int matrix_layout, char uplo, lapack_int n,
                                   const lapack_int* ipiv, double* work,
                                   lapack_int nb );
 lapack_int LAPACKE_dsytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, const double* a, lapack_int lda,
+                            lapack_int nrhs, double* a, lapack_int lda,
                             const lapack_int* ipiv, double* b, lapack_int ldb );
 lapack_int LAPACKE_dsytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, const double* a,
+                                 lapack_int nrhs, double* a,
                                  lapack_int lda, const lapack_int* ipiv,
                                  double* b, lapack_int ldb, double* work );
 lapack_int LAPACKE_sbbcsd( int matrix_layout, char jobu1, char jobu2,
@@ -10850,10 +10813,10 @@ lapack_int LAPACKE_ssytri2x_work( int matrix_layout, char uplo, lapack_int n,
                                   const lapack_int* ipiv, float* work,
                                   lapack_int nb );
 lapack_int LAPACKE_ssytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, const float* a, lapack_int lda,
+                            lapack_int nrhs, float* a, lapack_int lda,
                             const lapack_int* ipiv, float* b, lapack_int ldb );
 lapack_int LAPACKE_ssytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, const float* a,
+                                 lapack_int nrhs, float* a,
                                  lapack_int lda, const lapack_int* ipiv,
                                  float* b, lapack_int ldb, float* work );
 lapack_int LAPACKE_zbbcsd( int matrix_layout, char jobu1, char jobu2,
@@ -10935,11 +10898,11 @@ lapack_int LAPACKE_zsytri2x_work( int matrix_layout, char uplo, lapack_int n,
                                   const lapack_int* ipiv,
                                   lapack_complex_double* work, lapack_int nb );
 lapack_int LAPACKE_zsytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, const lapack_complex_double* a,
+                            lapack_int nrhs, lapack_complex_double* a,
                             lapack_int lda, const lapack_int* ipiv,
                             lapack_complex_double* b, lapack_int ldb );
 lapack_int LAPACKE_zsytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, const lapack_complex_double* a,
+                                 lapack_int nrhs, lapack_complex_double* a,
                                  lapack_int lda, const lapack_int* ipiv,
                                  lapack_complex_double* b, lapack_int ldb,
                                  lapack_complex_double* work );
@@ -12609,6848 +12572,6 @@ lapack_int LAPACKE_zhetrs_aa_2stage_work( int matrix_layout, char uplo, lapack_i
                                lapack_int lda, lapack_complex_double* tb,
                                lapack_int ltb, lapack_int* ipiv, lapack_int* ipiv2,
                                lapack_complex_double* b, lapack_int ldb );
-
-#define LAPACK_sgetrf LAPACK_GLOBAL(sgetrf,SGETRF)
-#define LAPACK_dgetrf LAPACK_GLOBAL(dgetrf,DGETRF)
-#define LAPACK_cgetrf LAPACK_GLOBAL(cgetrf,CGETRF)
-#define LAPACK_zgetrf LAPACK_GLOBAL(zgetrf,ZGETRF)
-#define LAPACK_sgetrf2 LAPACK_GLOBAL(sgetrf2,SGETRF2)
-#define LAPACK_dgetrf2 LAPACK_GLOBAL(dgetrf2,DGETRF2)
-#define LAPACK_cgetrf2 LAPACK_GLOBAL(cgetrf2,CGETRF2)
-#define LAPACK_zgetrf2 LAPACK_GLOBAL(zgetrf2,ZGETRF2)
-#define LAPACK_sgbtrf LAPACK_GLOBAL(sgbtrf,SGBTRF)
-#define LAPACK_dgbtrf LAPACK_GLOBAL(dgbtrf,DGBTRF)
-#define LAPACK_cgbtrf LAPACK_GLOBAL(cgbtrf,CGBTRF)
-#define LAPACK_zgbtrf LAPACK_GLOBAL(zgbtrf,ZGBTRF)
-#define LAPACK_sgttrf LAPACK_GLOBAL(sgttrf,SGTTRF)
-#define LAPACK_dgttrf LAPACK_GLOBAL(dgttrf,DGTTRF)
-#define LAPACK_cgttrf LAPACK_GLOBAL(cgttrf,CGTTRF)
-#define LAPACK_zgttrf LAPACK_GLOBAL(zgttrf,ZGTTRF)
-#define LAPACK_spotrf LAPACK_GLOBAL(spotrf,SPOTRF)
-#define LAPACK_dpotrf LAPACK_GLOBAL(dpotrf,DPOTRF)
-#define LAPACK_cpotrf LAPACK_GLOBAL(cpotrf,CPOTRF)
-#define LAPACK_zpotrf LAPACK_GLOBAL(zpotrf,ZPOTRF)
-#define LAPACK_spotrf2 LAPACK_GLOBAL(spotrf2,SPOTRF2)
-#define LAPACK_dpotrf2 LAPACK_GLOBAL(dpotrf2,DPOTRF2)
-#define LAPACK_cpotrf2 LAPACK_GLOBAL(cpotrf2,CPOTRF2)
-#define LAPACK_zpotrf2 LAPACK_GLOBAL(zpotrf2,ZPOTRF2)
-#define LAPACK_dpstrf LAPACK_GLOBAL(dpstrf,DPSTRF)
-#define LAPACK_spstrf LAPACK_GLOBAL(spstrf,SPSTRF)
-#define LAPACK_zpstrf LAPACK_GLOBAL(zpstrf,ZPSTRF)
-#define LAPACK_cpstrf LAPACK_GLOBAL(cpstrf,CPSTRF)
-#define LAPACK_dpftrf LAPACK_GLOBAL(dpftrf,DPFTRF)
-#define LAPACK_spftrf LAPACK_GLOBAL(spftrf,SPFTRF)
-#define LAPACK_zpftrf LAPACK_GLOBAL(zpftrf,ZPFTRF)
-#define LAPACK_cpftrf LAPACK_GLOBAL(cpftrf,CPFTRF)
-#define LAPACK_spptrf LAPACK_GLOBAL(spptrf,SPPTRF)
-#define LAPACK_dpptrf LAPACK_GLOBAL(dpptrf,DPPTRF)
-#define LAPACK_cpptrf LAPACK_GLOBAL(cpptrf,CPPTRF)
-#define LAPACK_zpptrf LAPACK_GLOBAL(zpptrf,ZPPTRF)
-#define LAPACK_spbtrf LAPACK_GLOBAL(spbtrf,SPBTRF)
-#define LAPACK_dpbtrf LAPACK_GLOBAL(dpbtrf,DPBTRF)
-#define LAPACK_cpbtrf LAPACK_GLOBAL(cpbtrf,CPBTRF)
-#define LAPACK_zpbtrf LAPACK_GLOBAL(zpbtrf,ZPBTRF)
-#define LAPACK_spttrf LAPACK_GLOBAL(spttrf,SPTTRF)
-#define LAPACK_dpttrf LAPACK_GLOBAL(dpttrf,DPTTRF)
-#define LAPACK_cpttrf LAPACK_GLOBAL(cpttrf,CPTTRF)
-#define LAPACK_zpttrf LAPACK_GLOBAL(zpttrf,ZPTTRF)
-#define LAPACK_ssytrf LAPACK_GLOBAL(ssytrf,SSYTRF)
-#define LAPACK_ssytrf_rook LAPACK_GLOBAL(ssytrf_rook,SSYTRF_ROOK)
-#define LAPACK_dsytrf LAPACK_GLOBAL(dsytrf,DSYTRF)
-#define LAPACK_dsytrf_rook LAPACK_GLOBAL(dsytrf_rook,DSYTRF_ROOK)
-#define LAPACK_csytrf LAPACK_GLOBAL(csytrf,CSYTRF)
-#define LAPACK_csytrf_rook LAPACK_GLOBAL(csytrf_rook,CSYTRF_ROOK)
-#define LAPACK_zsytrf LAPACK_GLOBAL(zsytrf,ZSYTRF)
-#define LAPACK_zsytrf_rook LAPACK_GLOBAL(zsytrf_rook,ZSYTRF_ROOK)
-#define LAPACK_chetrf LAPACK_GLOBAL(chetrf,CHETRF)
-#define LAPACK_chetrf_rook LAPACK_GLOBAL(chetrf_rook,CHETRF_ROOK)
-#define LAPACK_zhetrf LAPACK_GLOBAL(zhetrf,ZHETRF)
-#define LAPACK_zhetrf_rook LAPACK_GLOBAL(zhetrf_rook,ZHETRF_ROOK)
-#define LAPACK_ssptrf LAPACK_GLOBAL(ssptrf,SSPTRF)
-#define LAPACK_dsptrf LAPACK_GLOBAL(dsptrf,DSPTRF)
-#define LAPACK_csptrf LAPACK_GLOBAL(csptrf,CSPTRF)
-#define LAPACK_zsptrf LAPACK_GLOBAL(zsptrf,ZSPTRF)
-#define LAPACK_chptrf LAPACK_GLOBAL(chptrf,CHPTRF)
-#define LAPACK_zhptrf LAPACK_GLOBAL(zhptrf,ZHPTRF)
-#define LAPACK_sgetrs LAPACK_GLOBAL(sgetrs,SGETRS)
-#define LAPACK_dgetrs LAPACK_GLOBAL(dgetrs,DGETRS)
-#define LAPACK_cgetrs LAPACK_GLOBAL(cgetrs,CGETRS)
-#define LAPACK_zgetrs LAPACK_GLOBAL(zgetrs,ZGETRS)
-#define LAPACK_sgbtrs LAPACK_GLOBAL(sgbtrs,SGBTRS)
-#define LAPACK_dgbtrs LAPACK_GLOBAL(dgbtrs,DGBTRS)
-#define LAPACK_cgbtrs LAPACK_GLOBAL(cgbtrs,CGBTRS)
-#define LAPACK_zgbtrs LAPACK_GLOBAL(zgbtrs,ZGBTRS)
-#define LAPACK_sgttrs LAPACK_GLOBAL(sgttrs,SGTTRS)
-#define LAPACK_dgttrs LAPACK_GLOBAL(dgttrs,DGTTRS)
-#define LAPACK_cgttrs LAPACK_GLOBAL(cgttrs,CGTTRS)
-#define LAPACK_zgttrs LAPACK_GLOBAL(zgttrs,ZGTTRS)
-#define LAPACK_spotrs LAPACK_GLOBAL(spotrs,SPOTRS)
-#define LAPACK_dpotrs LAPACK_GLOBAL(dpotrs,DPOTRS)
-#define LAPACK_cpotrs LAPACK_GLOBAL(cpotrs,CPOTRS)
-#define LAPACK_zpotrs LAPACK_GLOBAL(zpotrs,ZPOTRS)
-#define LAPACK_dpftrs LAPACK_GLOBAL(dpftrs,DPFTRS)
-#define LAPACK_spftrs LAPACK_GLOBAL(spftrs,SPFTRS)
-#define LAPACK_zpftrs LAPACK_GLOBAL(zpftrs,ZPFTRS)
-#define LAPACK_cpftrs LAPACK_GLOBAL(cpftrs,CPFTRS)
-#define LAPACK_spptrs LAPACK_GLOBAL(spptrs,SPPTRS)
-#define LAPACK_dpptrs LAPACK_GLOBAL(dpptrs,DPPTRS)
-#define LAPACK_cpptrs LAPACK_GLOBAL(cpptrs,CPPTRS)
-#define LAPACK_zpptrs LAPACK_GLOBAL(zpptrs,ZPPTRS)
-#define LAPACK_spbtrs LAPACK_GLOBAL(spbtrs,SPBTRS)
-#define LAPACK_dpbtrs LAPACK_GLOBAL(dpbtrs,DPBTRS)
-#define LAPACK_cpbtrs LAPACK_GLOBAL(cpbtrs,CPBTRS)
-#define LAPACK_zpbtrs LAPACK_GLOBAL(zpbtrs,ZPBTRS)
-#define LAPACK_spttrs LAPACK_GLOBAL(spttrs,SPTTRS)
-#define LAPACK_dpttrs LAPACK_GLOBAL(dpttrs,DPTTRS)
-#define LAPACK_cpttrs LAPACK_GLOBAL(cpttrs,CPTTRS)
-#define LAPACK_zpttrs LAPACK_GLOBAL(zpttrs,ZPTTRS)
-#define LAPACK_ssytrs LAPACK_GLOBAL(ssytrs,SSYTRS)
-#define LAPACK_ssytrs_rook LAPACK_GLOBAL(ssytrs_rook,SSYTRS_ROOK)
-#define LAPACK_dsytrs LAPACK_GLOBAL(dsytrs,DSYTRS)
-#define LAPACK_dsytrs_rook LAPACK_GLOBAL(dsytrs_rook,DSYTRS_ROOK)
-#define LAPACK_csytrs LAPACK_GLOBAL(csytrs,CSYTRS)
-#define LAPACK_csytrs_rook LAPACK_GLOBAL(csytrs_rook,CSYTRS_ROOK)
-#define LAPACK_zsytrs LAPACK_GLOBAL(zsytrs,ZSYTRS)
-#define LAPACK_zsytrs_rook LAPACK_GLOBAL(zsytrs_rook,ZSYTRS_ROOK)
-#define LAPACK_chetrs LAPACK_GLOBAL(chetrs,CHETRS)
-#define LAPACK_chetrs_rook LAPACK_GLOBAL(chetrs_rook,CHETRS_ROOK)
-#define LAPACK_zhetrs LAPACK_GLOBAL(zhetrs,ZHETRS)
-#define LAPACK_zhetrs_rook LAPACK_GLOBAL(zhetrs_rook,ZHETRS_ROOK)
-#define LAPACK_ssptrs LAPACK_GLOBAL(ssptrs,SSPTRS)
-#define LAPACK_dsptrs LAPACK_GLOBAL(dsptrs,DSPTRS)
-#define LAPACK_csptrs LAPACK_GLOBAL(csptrs,CSPTRS)
-#define LAPACK_zsptrs LAPACK_GLOBAL(zsptrs,ZSPTRS)
-#define LAPACK_chptrs LAPACK_GLOBAL(chptrs,CHPTRS)
-#define LAPACK_zhptrs LAPACK_GLOBAL(zhptrs,ZHPTRS)
-#define LAPACK_strtrs LAPACK_GLOBAL(strtrs,STRTRS)
-#define LAPACK_dtrtrs LAPACK_GLOBAL(dtrtrs,DTRTRS)
-#define LAPACK_ctrtrs LAPACK_GLOBAL(ctrtrs,CTRTRS)
-#define LAPACK_ztrtrs LAPACK_GLOBAL(ztrtrs,ZTRTRS)
-#define LAPACK_stptrs LAPACK_GLOBAL(stptrs,STPTRS)
-#define LAPACK_dtptrs LAPACK_GLOBAL(dtptrs,DTPTRS)
-#define LAPACK_ctptrs LAPACK_GLOBAL(ctptrs,CTPTRS)
-#define LAPACK_ztptrs LAPACK_GLOBAL(ztptrs,ZTPTRS)
-#define LAPACK_stbtrs LAPACK_GLOBAL(stbtrs,STBTRS)
-#define LAPACK_dtbtrs LAPACK_GLOBAL(dtbtrs,DTBTRS)
-#define LAPACK_ctbtrs LAPACK_GLOBAL(ctbtrs,CTBTRS)
-#define LAPACK_ztbtrs LAPACK_GLOBAL(ztbtrs,ZTBTRS)
-#define LAPACK_sgecon LAPACK_GLOBAL(sgecon,SGECON)
-#define LAPACK_dgecon LAPACK_GLOBAL(dgecon,DGECON)
-#define LAPACK_cgecon LAPACK_GLOBAL(cgecon,CGECON)
-#define LAPACK_zgecon LAPACK_GLOBAL(zgecon,ZGECON)
-#define LAPACK_sgbcon LAPACK_GLOBAL(sgbcon,SGBCON)
-#define LAPACK_dgbcon LAPACK_GLOBAL(dgbcon,DGBCON)
-#define LAPACK_cgbcon LAPACK_GLOBAL(cgbcon,CGBCON)
-#define LAPACK_zgbcon LAPACK_GLOBAL(zgbcon,ZGBCON)
-#define LAPACK_sgtcon LAPACK_GLOBAL(sgtcon,SGTCON)
-#define LAPACK_dgtcon LAPACK_GLOBAL(dgtcon,DGTCON)
-#define LAPACK_cgtcon LAPACK_GLOBAL(cgtcon,CGTCON)
-#define LAPACK_zgtcon LAPACK_GLOBAL(zgtcon,ZGTCON)
-#define LAPACK_spocon LAPACK_GLOBAL(spocon,SPOCON)
-#define LAPACK_dpocon LAPACK_GLOBAL(dpocon,DPOCON)
-#define LAPACK_cpocon LAPACK_GLOBAL(cpocon,CPOCON)
-#define LAPACK_zpocon LAPACK_GLOBAL(zpocon,ZPOCON)
-#define LAPACK_sppcon LAPACK_GLOBAL(sppcon,SPPCON)
-#define LAPACK_dppcon LAPACK_GLOBAL(dppcon,DPPCON)
-#define LAPACK_cppcon LAPACK_GLOBAL(cppcon,CPPCON)
-#define LAPACK_zppcon LAPACK_GLOBAL(zppcon,ZPPCON)
-#define LAPACK_spbcon LAPACK_GLOBAL(spbcon,SPBCON)
-#define LAPACK_dpbcon LAPACK_GLOBAL(dpbcon,DPBCON)
-#define LAPACK_cpbcon LAPACK_GLOBAL(cpbcon,CPBCON)
-#define LAPACK_zpbcon LAPACK_GLOBAL(zpbcon,ZPBCON)
-#define LAPACK_sptcon LAPACK_GLOBAL(sptcon,SPTCON)
-#define LAPACK_dptcon LAPACK_GLOBAL(dptcon,DPTCON)
-#define LAPACK_cptcon LAPACK_GLOBAL(cptcon,CPTCON)
-#define LAPACK_zptcon LAPACK_GLOBAL(zptcon,ZPTCON)
-#define LAPACK_ssycon LAPACK_GLOBAL(ssycon,SSYCON)
-#define LAPACK_dsycon LAPACK_GLOBAL(dsycon,DSYCON)
-#define LAPACK_csycon LAPACK_GLOBAL(csycon,CSYCON)
-#define LAPACK_zsycon LAPACK_GLOBAL(zsycon,ZSYCON)
-#define LAPACK_checon LAPACK_GLOBAL(checon,CHECON)
-#define LAPACK_zhecon LAPACK_GLOBAL(zhecon,ZHECON)
-#define LAPACK_sspcon LAPACK_GLOBAL(sspcon,SSPCON)
-#define LAPACK_dspcon LAPACK_GLOBAL(dspcon,DSPCON)
-#define LAPACK_cspcon LAPACK_GLOBAL(cspcon,CSPCON)
-#define LAPACK_zspcon LAPACK_GLOBAL(zspcon,ZSPCON)
-#define LAPACK_chpcon LAPACK_GLOBAL(chpcon,CHPCON)
-#define LAPACK_zhpcon LAPACK_GLOBAL(zhpcon,ZHPCON)
-#define LAPACK_strcon LAPACK_GLOBAL(strcon,STRCON)
-#define LAPACK_dtrcon LAPACK_GLOBAL(dtrcon,DTRCON)
-#define LAPACK_ctrcon LAPACK_GLOBAL(ctrcon,CTRCON)
-#define LAPACK_ztrcon LAPACK_GLOBAL(ztrcon,ZTRCON)
-#define LAPACK_stpcon LAPACK_GLOBAL(stpcon,STPCON)
-#define LAPACK_dtpcon LAPACK_GLOBAL(dtpcon,DTPCON)
-#define LAPACK_ctpcon LAPACK_GLOBAL(ctpcon,CTPCON)
-#define LAPACK_ztpcon LAPACK_GLOBAL(ztpcon,ZTPCON)
-#define LAPACK_stbcon LAPACK_GLOBAL(stbcon,STBCON)
-#define LAPACK_dtbcon LAPACK_GLOBAL(dtbcon,DTBCON)
-#define LAPACK_ctbcon LAPACK_GLOBAL(ctbcon,CTBCON)
-#define LAPACK_ztbcon LAPACK_GLOBAL(ztbcon,ZTBCON)
-#define LAPACK_sgerfs LAPACK_GLOBAL(sgerfs,SGERFS)
-#define LAPACK_dgerfs LAPACK_GLOBAL(dgerfs,DGERFS)
-#define LAPACK_cgerfs LAPACK_GLOBAL(cgerfs,CGERFS)
-#define LAPACK_zgerfs LAPACK_GLOBAL(zgerfs,ZGERFS)
-#define LAPACK_dgerfsx LAPACK_GLOBAL(dgerfsx,DGERFSX)
-#define LAPACK_sgerfsx LAPACK_GLOBAL(sgerfsx,SGERFSX)
-#define LAPACK_zgerfsx LAPACK_GLOBAL(zgerfsx,ZGERFSX)
-#define LAPACK_cgerfsx LAPACK_GLOBAL(cgerfsx,CGERFSX)
-#define LAPACK_sgbrfs LAPACK_GLOBAL(sgbrfs,SGBRFS)
-#define LAPACK_dgbrfs LAPACK_GLOBAL(dgbrfs,DGBRFS)
-#define LAPACK_cgbrfs LAPACK_GLOBAL(cgbrfs,CGBRFS)
-#define LAPACK_zgbrfs LAPACK_GLOBAL(zgbrfs,ZGBRFS)
-#define LAPACK_dgbrfsx LAPACK_GLOBAL(dgbrfsx,DGBRFSX)
-#define LAPACK_sgbrfsx LAPACK_GLOBAL(sgbrfsx,SGBRFSX)
-#define LAPACK_zgbrfsx LAPACK_GLOBAL(zgbrfsx,ZGBRFSX)
-#define LAPACK_cgbrfsx LAPACK_GLOBAL(cgbrfsx,CGBRFSX)
-#define LAPACK_sgtrfs LAPACK_GLOBAL(sgtrfs,SGTRFS)
-#define LAPACK_dgtrfs LAPACK_GLOBAL(dgtrfs,DGTRFS)
-#define LAPACK_cgtrfs LAPACK_GLOBAL(cgtrfs,CGTRFS)
-#define LAPACK_zgtrfs LAPACK_GLOBAL(zgtrfs,ZGTRFS)
-#define LAPACK_sporfs LAPACK_GLOBAL(sporfs,SPORFS)
-#define LAPACK_dporfs LAPACK_GLOBAL(dporfs,DPORFS)
-#define LAPACK_cporfs LAPACK_GLOBAL(cporfs,CPORFS)
-#define LAPACK_zporfs LAPACK_GLOBAL(zporfs,ZPORFS)
-#define LAPACK_dporfsx LAPACK_GLOBAL(dporfsx,DPORFSX)
-#define LAPACK_sporfsx LAPACK_GLOBAL(sporfsx,SPORFSX)
-#define LAPACK_zporfsx LAPACK_GLOBAL(zporfsx,ZPORFSX)
-#define LAPACK_cporfsx LAPACK_GLOBAL(cporfsx,CPORFSX)
-#define LAPACK_spprfs LAPACK_GLOBAL(spprfs,SPPRFS)
-#define LAPACK_dpprfs LAPACK_GLOBAL(dpprfs,DPPRFS)
-#define LAPACK_cpprfs LAPACK_GLOBAL(cpprfs,CPPRFS)
-#define LAPACK_zpprfs LAPACK_GLOBAL(zpprfs,ZPPRFS)
-#define LAPACK_spbrfs LAPACK_GLOBAL(spbrfs,SPBRFS)
-#define LAPACK_dpbrfs LAPACK_GLOBAL(dpbrfs,DPBRFS)
-#define LAPACK_cpbrfs LAPACK_GLOBAL(cpbrfs,CPBRFS)
-#define LAPACK_zpbrfs LAPACK_GLOBAL(zpbrfs,ZPBRFS)
-#define LAPACK_sptrfs LAPACK_GLOBAL(sptrfs,SPTRFS)
-#define LAPACK_dptrfs LAPACK_GLOBAL(dptrfs,DPTRFS)
-#define LAPACK_cptrfs LAPACK_GLOBAL(cptrfs,CPTRFS)
-#define LAPACK_zptrfs LAPACK_GLOBAL(zptrfs,ZPTRFS)
-#define LAPACK_ssyrfs LAPACK_GLOBAL(ssyrfs,SSYRFS)
-#define LAPACK_dsyrfs LAPACK_GLOBAL(dsyrfs,DSYRFS)
-#define LAPACK_csyrfs LAPACK_GLOBAL(csyrfs,CSYRFS)
-#define LAPACK_zsyrfs LAPACK_GLOBAL(zsyrfs,ZSYRFS)
-#define LAPACK_dsyrfsx LAPACK_GLOBAL(dsyrfsx,DSYRFSX)
-#define LAPACK_ssyrfsx LAPACK_GLOBAL(ssyrfsx,SSYRFSX)
-#define LAPACK_zsyrfsx LAPACK_GLOBAL(zsyrfsx,ZSYRFSX)
-#define LAPACK_csyrfsx LAPACK_GLOBAL(csyrfsx,CSYRFSX)
-#define LAPACK_cherfs LAPACK_GLOBAL(cherfs,CHERFS)
-#define LAPACK_zherfs LAPACK_GLOBAL(zherfs,ZHERFS)
-#define LAPACK_zherfsx LAPACK_GLOBAL(zherfsx,ZHERFSX)
-#define LAPACK_cherfsx LAPACK_GLOBAL(cherfsx,CHERFSX)
-#define LAPACK_ssprfs LAPACK_GLOBAL(ssprfs,SSPRFS)
-#define LAPACK_dsprfs LAPACK_GLOBAL(dsprfs,DSPRFS)
-#define LAPACK_csprfs LAPACK_GLOBAL(csprfs,CSPRFS)
-#define LAPACK_zsprfs LAPACK_GLOBAL(zsprfs,ZSPRFS)
-#define LAPACK_chprfs LAPACK_GLOBAL(chprfs,CHPRFS)
-#define LAPACK_zhprfs LAPACK_GLOBAL(zhprfs,ZHPRFS)
-#define LAPACK_strrfs LAPACK_GLOBAL(strrfs,STRRFS)
-#define LAPACK_dtrrfs LAPACK_GLOBAL(dtrrfs,DTRRFS)
-#define LAPACK_ctrrfs LAPACK_GLOBAL(ctrrfs,CTRRFS)
-#define LAPACK_ztrrfs LAPACK_GLOBAL(ztrrfs,ZTRRFS)
-#define LAPACK_stprfs LAPACK_GLOBAL(stprfs,STPRFS)
-#define LAPACK_dtprfs LAPACK_GLOBAL(dtprfs,DTPRFS)
-#define LAPACK_ctprfs LAPACK_GLOBAL(ctprfs,CTPRFS)
-#define LAPACK_ztprfs LAPACK_GLOBAL(ztprfs,ZTPRFS)
-#define LAPACK_stbrfs LAPACK_GLOBAL(stbrfs,STBRFS)
-#define LAPACK_dtbrfs LAPACK_GLOBAL(dtbrfs,DTBRFS)
-#define LAPACK_ctbrfs LAPACK_GLOBAL(ctbrfs,CTBRFS)
-#define LAPACK_ztbrfs LAPACK_GLOBAL(ztbrfs,ZTBRFS)
-#define LAPACK_sgetri LAPACK_GLOBAL(sgetri,SGETRI)
-#define LAPACK_dgetri LAPACK_GLOBAL(dgetri,DGETRI)
-#define LAPACK_cgetri LAPACK_GLOBAL(cgetri,CGETRI)
-#define LAPACK_zgetri LAPACK_GLOBAL(zgetri,ZGETRI)
-#define LAPACK_spotri LAPACK_GLOBAL(spotri,SPOTRI)
-#define LAPACK_dpotri LAPACK_GLOBAL(dpotri,DPOTRI)
-#define LAPACK_cpotri LAPACK_GLOBAL(cpotri,CPOTRI)
-#define LAPACK_zpotri LAPACK_GLOBAL(zpotri,ZPOTRI)
-#define LAPACK_dpftri LAPACK_GLOBAL(dpftri,DPFTRI)
-#define LAPACK_spftri LAPACK_GLOBAL(spftri,SPFTRI)
-#define LAPACK_zpftri LAPACK_GLOBAL(zpftri,ZPFTRI)
-#define LAPACK_cpftri LAPACK_GLOBAL(cpftri,CPFTRI)
-#define LAPACK_spptri LAPACK_GLOBAL(spptri,SPPTRI)
-#define LAPACK_dpptri LAPACK_GLOBAL(dpptri,DPPTRI)
-#define LAPACK_cpptri LAPACK_GLOBAL(cpptri,CPPTRI)
-#define LAPACK_zpptri LAPACK_GLOBAL(zpptri,ZPPTRI)
-#define LAPACK_ssytri LAPACK_GLOBAL(ssytri,SSYTRI)
-#define LAPACK_dsytri LAPACK_GLOBAL(dsytri,DSYTRI)
-#define LAPACK_csytri LAPACK_GLOBAL(csytri,CSYTRI)
-#define LAPACK_zsytri LAPACK_GLOBAL(zsytri,ZSYTRI)
-#define LAPACK_chetri LAPACK_GLOBAL(chetri,CHETRI)
-#define LAPACK_zhetri LAPACK_GLOBAL(zhetri,ZHETRI)
-#define LAPACK_ssptri LAPACK_GLOBAL(ssptri,SSPTRI)
-#define LAPACK_dsptri LAPACK_GLOBAL(dsptri,DSPTRI)
-#define LAPACK_csptri LAPACK_GLOBAL(csptri,CSPTRI)
-#define LAPACK_zsptri LAPACK_GLOBAL(zsptri,ZSPTRI)
-#define LAPACK_chptri LAPACK_GLOBAL(chptri,CHPTRI)
-#define LAPACK_zhptri LAPACK_GLOBAL(zhptri,ZHPTRI)
-#define LAPACK_strtri LAPACK_GLOBAL(strtri,STRTRI)
-#define LAPACK_dtrtri LAPACK_GLOBAL(dtrtri,DTRTRI)
-#define LAPACK_ctrtri LAPACK_GLOBAL(ctrtri,CTRTRI)
-#define LAPACK_ztrtri LAPACK_GLOBAL(ztrtri,ZTRTRI)
-#define LAPACK_dtftri LAPACK_GLOBAL(dtftri,DTFTRI)
-#define LAPACK_stftri LAPACK_GLOBAL(stftri,STFTRI)
-#define LAPACK_ztftri LAPACK_GLOBAL(ztftri,ZTFTRI)
-#define LAPACK_ctftri LAPACK_GLOBAL(ctftri,CTFTRI)
-#define LAPACK_stptri LAPACK_GLOBAL(stptri,STPTRI)
-#define LAPACK_dtptri LAPACK_GLOBAL(dtptri,DTPTRI)
-#define LAPACK_ctptri LAPACK_GLOBAL(ctptri,CTPTRI)
-#define LAPACK_ztptri LAPACK_GLOBAL(ztptri,ZTPTRI)
-#define LAPACK_sgeequ LAPACK_GLOBAL(sgeequ,SGEEQU)
-#define LAPACK_dgeequ LAPACK_GLOBAL(dgeequ,DGEEQU)
-#define LAPACK_cgeequ LAPACK_GLOBAL(cgeequ,CGEEQU)
-#define LAPACK_zgeequ LAPACK_GLOBAL(zgeequ,ZGEEQU)
-#define LAPACK_dgeequb LAPACK_GLOBAL(dgeequb,DGEEQUB)
-#define LAPACK_sgeequb LAPACK_GLOBAL(sgeequb,SGEEQUB)
-#define LAPACK_zgeequb LAPACK_GLOBAL(zgeequb,ZGEEQUB)
-#define LAPACK_cgeequb LAPACK_GLOBAL(cgeequb,CGEEQUB)
-#define LAPACK_sgbequ LAPACK_GLOBAL(sgbequ,SGBEQU)
-#define LAPACK_dgbequ LAPACK_GLOBAL(dgbequ,DGBEQU)
-#define LAPACK_cgbequ LAPACK_GLOBAL(cgbequ,CGBEQU)
-#define LAPACK_zgbequ LAPACK_GLOBAL(zgbequ,ZGBEQU)
-#define LAPACK_dgbequb LAPACK_GLOBAL(dgbequb,DGBEQUB)
-#define LAPACK_sgbequb LAPACK_GLOBAL(sgbequb,SGBEQUB)
-#define LAPACK_zgbequb LAPACK_GLOBAL(zgbequb,ZGBEQUB)
-#define LAPACK_cgbequb LAPACK_GLOBAL(cgbequb,CGBEQUB)
-#define LAPACK_spoequ LAPACK_GLOBAL(spoequ,SPOEQU)
-#define LAPACK_dpoequ LAPACK_GLOBAL(dpoequ,DPOEQU)
-#define LAPACK_cpoequ LAPACK_GLOBAL(cpoequ,CPOEQU)
-#define LAPACK_zpoequ LAPACK_GLOBAL(zpoequ,ZPOEQU)
-#define LAPACK_dpoequb LAPACK_GLOBAL(dpoequb,DPOEQUB)
-#define LAPACK_spoequb LAPACK_GLOBAL(spoequb,SPOEQUB)
-#define LAPACK_zpoequb LAPACK_GLOBAL(zpoequb,ZPOEQUB)
-#define LAPACK_cpoequb LAPACK_GLOBAL(cpoequb,CPOEQUB)
-#define LAPACK_sppequ LAPACK_GLOBAL(sppequ,SPPEQU)
-#define LAPACK_dppequ LAPACK_GLOBAL(dppequ,DPPEQU)
-#define LAPACK_cppequ LAPACK_GLOBAL(cppequ,CPPEQU)
-#define LAPACK_zppequ LAPACK_GLOBAL(zppequ,ZPPEQU)
-#define LAPACK_spbequ LAPACK_GLOBAL(spbequ,SPBEQU)
-#define LAPACK_dpbequ LAPACK_GLOBAL(dpbequ,DPBEQU)
-#define LAPACK_cpbequ LAPACK_GLOBAL(cpbequ,CPBEQU)
-#define LAPACK_zpbequ LAPACK_GLOBAL(zpbequ,ZPBEQU)
-#define LAPACK_dsyequb LAPACK_GLOBAL(dsyequb,DSYEQUB)
-#define LAPACK_ssyequb LAPACK_GLOBAL(ssyequb,SSYEQUB)
-#define LAPACK_zsyequb LAPACK_GLOBAL(zsyequb,ZSYEQUB)
-#define LAPACK_csyequb LAPACK_GLOBAL(csyequb,CSYEQUB)
-#define LAPACK_zheequb LAPACK_GLOBAL(zheequb,ZHEEQUB)
-#define LAPACK_cheequb LAPACK_GLOBAL(cheequb,CHEEQUB)
-#define LAPACK_sgesv LAPACK_GLOBAL(sgesv,SGESV)
-#define LAPACK_dgesv LAPACK_GLOBAL(dgesv,DGESV)
-#define LAPACK_cgesv LAPACK_GLOBAL(cgesv,CGESV)
-#define LAPACK_zgesv LAPACK_GLOBAL(zgesv,ZGESV)
-#define LAPACK_dsgesv LAPACK_GLOBAL(dsgesv,DSGESV)
-#define LAPACK_zcgesv LAPACK_GLOBAL(zcgesv,ZCGESV)
-#define LAPACK_sgesvx LAPACK_GLOBAL(sgesvx,SGESVX)
-#define LAPACK_dgesvx LAPACK_GLOBAL(dgesvx,DGESVX)
-#define LAPACK_cgesvx LAPACK_GLOBAL(cgesvx,CGESVX)
-#define LAPACK_zgesvx LAPACK_GLOBAL(zgesvx,ZGESVX)
-#define LAPACK_dgesvxx LAPACK_GLOBAL(dgesvxx,DGESVXX)
-#define LAPACK_sgesvxx LAPACK_GLOBAL(sgesvxx,SGESVXX)
-#define LAPACK_zgesvxx LAPACK_GLOBAL(zgesvxx,ZGESVXX)
-#define LAPACK_cgesvxx LAPACK_GLOBAL(cgesvxx,CGESVXX)
-#define LAPACK_sgbsv LAPACK_GLOBAL(sgbsv,SGBSV)
-#define LAPACK_dgbsv LAPACK_GLOBAL(dgbsv,DGBSV)
-#define LAPACK_cgbsv LAPACK_GLOBAL(cgbsv,CGBSV)
-#define LAPACK_zgbsv LAPACK_GLOBAL(zgbsv,ZGBSV)
-#define LAPACK_sgbsvx LAPACK_GLOBAL(sgbsvx,SGBSVX)
-#define LAPACK_dgbsvx LAPACK_GLOBAL(dgbsvx,DGBSVX)
-#define LAPACK_cgbsvx LAPACK_GLOBAL(cgbsvx,CGBSVX)
-#define LAPACK_zgbsvx LAPACK_GLOBAL(zgbsvx,ZGBSVX)
-#define LAPACK_dgbsvxx LAPACK_GLOBAL(dgbsvxx,DGBSVXX)
-#define LAPACK_sgbsvxx LAPACK_GLOBAL(sgbsvxx,SGBSVXX)
-#define LAPACK_zgbsvxx LAPACK_GLOBAL(zgbsvxx,ZGBSVXX)
-#define LAPACK_cgbsvxx LAPACK_GLOBAL(cgbsvxx,CGBSVXX)
-#define LAPACK_sgtsv LAPACK_GLOBAL(sgtsv,SGTSV)
-#define LAPACK_dgtsv LAPACK_GLOBAL(dgtsv,DGTSV)
-#define LAPACK_cgtsv LAPACK_GLOBAL(cgtsv,CGTSV)
-#define LAPACK_zgtsv LAPACK_GLOBAL(zgtsv,ZGTSV)
-#define LAPACK_sgtsvx LAPACK_GLOBAL(sgtsvx,SGTSVX)
-#define LAPACK_dgtsvx LAPACK_GLOBAL(dgtsvx,DGTSVX)
-#define LAPACK_cgtsvx LAPACK_GLOBAL(cgtsvx,CGTSVX)
-#define LAPACK_zgtsvx LAPACK_GLOBAL(zgtsvx,ZGTSVX)
-#define LAPACK_sposv LAPACK_GLOBAL(sposv,SPOSV)
-#define LAPACK_dposv LAPACK_GLOBAL(dposv,DPOSV)
-#define LAPACK_cposv LAPACK_GLOBAL(cposv,CPOSV)
-#define LAPACK_zposv LAPACK_GLOBAL(zposv,ZPOSV)
-#define LAPACK_dsposv LAPACK_GLOBAL(dsposv,DSPOSV)
-#define LAPACK_zcposv LAPACK_GLOBAL(zcposv,ZCPOSV)
-#define LAPACK_sposvx LAPACK_GLOBAL(sposvx,SPOSVX)
-#define LAPACK_dposvx LAPACK_GLOBAL(dposvx,DPOSVX)
-#define LAPACK_cposvx LAPACK_GLOBAL(cposvx,CPOSVX)
-#define LAPACK_zposvx LAPACK_GLOBAL(zposvx,ZPOSVX)
-#define LAPACK_dposvxx LAPACK_GLOBAL(dposvxx,DPOSVXX)
-#define LAPACK_sposvxx LAPACK_GLOBAL(sposvxx,SPOSVXX)
-#define LAPACK_zposvxx LAPACK_GLOBAL(zposvxx,ZPOSVXX)
-#define LAPACK_cposvxx LAPACK_GLOBAL(cposvxx,CPOSVXX)
-#define LAPACK_sppsv LAPACK_GLOBAL(sppsv,SPPSV)
-#define LAPACK_dppsv LAPACK_GLOBAL(dppsv,DPPSV)
-#define LAPACK_cppsv LAPACK_GLOBAL(cppsv,CPPSV)
-#define LAPACK_zppsv LAPACK_GLOBAL(zppsv,ZPPSV)
-#define LAPACK_sppsvx LAPACK_GLOBAL(sppsvx,SPPSVX)
-#define LAPACK_dppsvx LAPACK_GLOBAL(dppsvx,DPPSVX)
-#define LAPACK_cppsvx LAPACK_GLOBAL(cppsvx,CPPSVX)
-#define LAPACK_zppsvx LAPACK_GLOBAL(zppsvx,ZPPSVX)
-#define LAPACK_spbsv LAPACK_GLOBAL(spbsv,SPBSV)
-#define LAPACK_dpbsv LAPACK_GLOBAL(dpbsv,DPBSV)
-#define LAPACK_cpbsv LAPACK_GLOBAL(cpbsv,CPBSV)
-#define LAPACK_zpbsv LAPACK_GLOBAL(zpbsv,ZPBSV)
-#define LAPACK_spbsvx LAPACK_GLOBAL(spbsvx,SPBSVX)
-#define LAPACK_dpbsvx LAPACK_GLOBAL(dpbsvx,DPBSVX)
-#define LAPACK_cpbsvx LAPACK_GLOBAL(cpbsvx,CPBSVX)
-#define LAPACK_zpbsvx LAPACK_GLOBAL(zpbsvx,ZPBSVX)
-#define LAPACK_sptsv LAPACK_GLOBAL(sptsv,SPTSV)
-#define LAPACK_dptsv LAPACK_GLOBAL(dptsv,DPTSV)
-#define LAPACK_cptsv LAPACK_GLOBAL(cptsv,CPTSV)
-#define LAPACK_zptsv LAPACK_GLOBAL(zptsv,ZPTSV)
-#define LAPACK_sptsvx LAPACK_GLOBAL(sptsvx,SPTSVX)
-#define LAPACK_dptsvx LAPACK_GLOBAL(dptsvx,DPTSVX)
-#define LAPACK_cptsvx LAPACK_GLOBAL(cptsvx,CPTSVX)
-#define LAPACK_zptsvx LAPACK_GLOBAL(zptsvx,ZPTSVX)
-#define LAPACK_ssysv LAPACK_GLOBAL(ssysv,SSYSV)
-#define LAPACK_dsysv LAPACK_GLOBAL(dsysv,DSYSV)
-#define LAPACK_csysv LAPACK_GLOBAL(csysv,CSYSV)
-#define LAPACK_zsysv LAPACK_GLOBAL(zsysv,ZSYSV)
-#define LAPACK_ssysvx LAPACK_GLOBAL(ssysvx,SSYSVX)
-#define LAPACK_dsysvx LAPACK_GLOBAL(dsysvx,DSYSVX)
-#define LAPACK_csysvx LAPACK_GLOBAL(csysvx,CSYSVX)
-#define LAPACK_zsysvx LAPACK_GLOBAL(zsysvx,ZSYSVX)
-#define LAPACK_dsysvxx LAPACK_GLOBAL(dsysvxx,DSYSVXX)
-#define LAPACK_ssysvxx LAPACK_GLOBAL(ssysvxx,SSYSVXX)
-#define LAPACK_zsysvxx LAPACK_GLOBAL(zsysvxx,ZSYSVXX)
-#define LAPACK_csysvxx LAPACK_GLOBAL(csysvxx,CSYSVXX)
-#define LAPACK_chesv LAPACK_GLOBAL(chesv,CHESV)
-#define LAPACK_zhesv LAPACK_GLOBAL(zhesv,ZHESV)
-#define LAPACK_chesvx LAPACK_GLOBAL(chesvx,CHESVX)
-#define LAPACK_zhesvx LAPACK_GLOBAL(zhesvx,ZHESVX)
-#define LAPACK_zhesvxx LAPACK_GLOBAL(zhesvxx,ZHESVXX)
-#define LAPACK_chesvxx LAPACK_GLOBAL(chesvxx,CHESVXX)
-#define LAPACK_sspsv LAPACK_GLOBAL(sspsv,SSPSV)
-#define LAPACK_dspsv LAPACK_GLOBAL(dspsv,DSPSV)
-#define LAPACK_cspsv LAPACK_GLOBAL(cspsv,CSPSV)
-#define LAPACK_zspsv LAPACK_GLOBAL(zspsv,ZSPSV)
-#define LAPACK_sspsvx LAPACK_GLOBAL(sspsvx,SSPSVX)
-#define LAPACK_dspsvx LAPACK_GLOBAL(dspsvx,DSPSVX)
-#define LAPACK_cspsvx LAPACK_GLOBAL(cspsvx,CSPSVX)
-#define LAPACK_zspsvx LAPACK_GLOBAL(zspsvx,ZSPSVX)
-#define LAPACK_chpsv LAPACK_GLOBAL(chpsv,CHPSV)
-#define LAPACK_zhpsv LAPACK_GLOBAL(zhpsv,ZHPSV)
-#define LAPACK_chpsvx LAPACK_GLOBAL(chpsvx,CHPSVX)
-#define LAPACK_zhpsvx LAPACK_GLOBAL(zhpsvx,ZHPSVX)
-#define LAPACK_sgeqrf LAPACK_GLOBAL(sgeqrf,SGEQRF)
-#define LAPACK_dgeqrf LAPACK_GLOBAL(dgeqrf,DGEQRF)
-#define LAPACK_cgeqrf LAPACK_GLOBAL(cgeqrf,CGEQRF)
-#define LAPACK_zgeqrf LAPACK_GLOBAL(zgeqrf,ZGEQRF)
-#define LAPACK_sgeqpf LAPACK_GLOBAL(sgeqpf,SGEQPF)
-#define LAPACK_dgeqpf LAPACK_GLOBAL(dgeqpf,DGEQPF)
-#define LAPACK_cgeqpf LAPACK_GLOBAL(cgeqpf,CGEQPF)
-#define LAPACK_zgeqpf LAPACK_GLOBAL(zgeqpf,ZGEQPF)
-#define LAPACK_sgeqp3 LAPACK_GLOBAL(sgeqp3,SGEQP3)
-#define LAPACK_dgeqp3 LAPACK_GLOBAL(dgeqp3,DGEQP3)
-#define LAPACK_cgeqp3 LAPACK_GLOBAL(cgeqp3,CGEQP3)
-#define LAPACK_zgeqp3 LAPACK_GLOBAL(zgeqp3,ZGEQP3)
-#define LAPACK_sorgqr LAPACK_GLOBAL(sorgqr,SORGQR)
-#define LAPACK_dorgqr LAPACK_GLOBAL(dorgqr,DORGQR)
-#define LAPACK_sormqr LAPACK_GLOBAL(sormqr,SORMQR)
-#define LAPACK_dormqr LAPACK_GLOBAL(dormqr,DORMQR)
-#define LAPACK_cungqr LAPACK_GLOBAL(cungqr,CUNGQR)
-#define LAPACK_zungqr LAPACK_GLOBAL(zungqr,ZUNGQR)
-#define LAPACK_cunmqr LAPACK_GLOBAL(cunmqr,CUNMQR)
-#define LAPACK_zunmqr LAPACK_GLOBAL(zunmqr,ZUNMQR)
-#define LAPACK_sgelqf LAPACK_GLOBAL(sgelqf,SGELQF)
-#define LAPACK_dgelqf LAPACK_GLOBAL(dgelqf,DGELQF)
-#define LAPACK_cgelqf LAPACK_GLOBAL(cgelqf,CGELQF)
-#define LAPACK_zgelqf LAPACK_GLOBAL(zgelqf,ZGELQF)
-#define LAPACK_sorglq LAPACK_GLOBAL(sorglq,SORGLQ)
-#define LAPACK_dorglq LAPACK_GLOBAL(dorglq,DORGLQ)
-#define LAPACK_sormlq LAPACK_GLOBAL(sormlq,SORMLQ)
-#define LAPACK_dormlq LAPACK_GLOBAL(dormlq,DORMLQ)
-#define LAPACK_cunglq LAPACK_GLOBAL(cunglq,CUNGLQ)
-#define LAPACK_zunglq LAPACK_GLOBAL(zunglq,ZUNGLQ)
-#define LAPACK_cunmlq LAPACK_GLOBAL(cunmlq,CUNMLQ)
-#define LAPACK_zunmlq LAPACK_GLOBAL(zunmlq,ZUNMLQ)
-#define LAPACK_sgeqlf LAPACK_GLOBAL(sgeqlf,SGEQLF)
-#define LAPACK_dgeqlf LAPACK_GLOBAL(dgeqlf,DGEQLF)
-#define LAPACK_cgeqlf LAPACK_GLOBAL(cgeqlf,CGEQLF)
-#define LAPACK_zgeqlf LAPACK_GLOBAL(zgeqlf,ZGEQLF)
-#define LAPACK_sorgql LAPACK_GLOBAL(sorgql,SORGQL)
-#define LAPACK_dorgql LAPACK_GLOBAL(dorgql,DORGQL)
-#define LAPACK_cungql LAPACK_GLOBAL(cungql,CUNGQL)
-#define LAPACK_zungql LAPACK_GLOBAL(zungql,ZUNGQL)
-#define LAPACK_sormql LAPACK_GLOBAL(sormql,SORMQL)
-#define LAPACK_dormql LAPACK_GLOBAL(dormql,DORMQL)
-#define LAPACK_cunmql LAPACK_GLOBAL(cunmql,CUNMQL)
-#define LAPACK_zunmql LAPACK_GLOBAL(zunmql,ZUNMQL)
-#define LAPACK_sgerqf LAPACK_GLOBAL(sgerqf,SGERQF)
-#define LAPACK_dgerqf LAPACK_GLOBAL(dgerqf,DGERQF)
-#define LAPACK_cgerqf LAPACK_GLOBAL(cgerqf,CGERQF)
-#define LAPACK_zgerqf LAPACK_GLOBAL(zgerqf,ZGERQF)
-#define LAPACK_sorgrq LAPACK_GLOBAL(sorgrq,SORGRQ)
-#define LAPACK_dorgrq LAPACK_GLOBAL(dorgrq,DORGRQ)
-#define LAPACK_cungrq LAPACK_GLOBAL(cungrq,CUNGRQ)
-#define LAPACK_zungrq LAPACK_GLOBAL(zungrq,ZUNGRQ)
-#define LAPACK_sormrq LAPACK_GLOBAL(sormrq,SORMRQ)
-#define LAPACK_dormrq LAPACK_GLOBAL(dormrq,DORMRQ)
-#define LAPACK_cunmrq LAPACK_GLOBAL(cunmrq,CUNMRQ)
-#define LAPACK_zunmrq LAPACK_GLOBAL(zunmrq,ZUNMRQ)
-#define LAPACK_stzrzf LAPACK_GLOBAL(stzrzf,STZRZF)
-#define LAPACK_dtzrzf LAPACK_GLOBAL(dtzrzf,DTZRZF)
-#define LAPACK_ctzrzf LAPACK_GLOBAL(ctzrzf,CTZRZF)
-#define LAPACK_ztzrzf LAPACK_GLOBAL(ztzrzf,ZTZRZF)
-#define LAPACK_sormrz LAPACK_GLOBAL(sormrz,SORMRZ)
-#define LAPACK_dormrz LAPACK_GLOBAL(dormrz,DORMRZ)
-#define LAPACK_cunmrz LAPACK_GLOBAL(cunmrz,CUNMRZ)
-#define LAPACK_zunmrz LAPACK_GLOBAL(zunmrz,ZUNMRZ)
-#define LAPACK_sggqrf LAPACK_GLOBAL(sggqrf,SGGQRF)
-#define LAPACK_dggqrf LAPACK_GLOBAL(dggqrf,DGGQRF)
-#define LAPACK_cggqrf LAPACK_GLOBAL(cggqrf,CGGQRF)
-#define LAPACK_zggqrf LAPACK_GLOBAL(zggqrf,ZGGQRF)
-#define LAPACK_sggrqf LAPACK_GLOBAL(sggrqf,SGGRQF)
-#define LAPACK_dggrqf LAPACK_GLOBAL(dggrqf,DGGRQF)
-#define LAPACK_cggrqf LAPACK_GLOBAL(cggrqf,CGGRQF)
-#define LAPACK_zggrqf LAPACK_GLOBAL(zggrqf,ZGGRQF)
-#define LAPACK_sgebrd LAPACK_GLOBAL(sgebrd,SGEBRD)
-#define LAPACK_dgebrd LAPACK_GLOBAL(dgebrd,DGEBRD)
-#define LAPACK_cgebrd LAPACK_GLOBAL(cgebrd,CGEBRD)
-#define LAPACK_zgebrd LAPACK_GLOBAL(zgebrd,ZGEBRD)
-#define LAPACK_sgbbrd LAPACK_GLOBAL(sgbbrd,SGBBRD)
-#define LAPACK_dgbbrd LAPACK_GLOBAL(dgbbrd,DGBBRD)
-#define LAPACK_cgbbrd LAPACK_GLOBAL(cgbbrd,CGBBRD)
-#define LAPACK_zgbbrd LAPACK_GLOBAL(zgbbrd,ZGBBRD)
-#define LAPACK_sorgbr LAPACK_GLOBAL(sorgbr,SORGBR)
-#define LAPACK_dorgbr LAPACK_GLOBAL(dorgbr,DORGBR)
-#define LAPACK_sormbr LAPACK_GLOBAL(sormbr,SORMBR)
-#define LAPACK_dormbr LAPACK_GLOBAL(dormbr,DORMBR)
-#define LAPACK_cungbr LAPACK_GLOBAL(cungbr,CUNGBR)
-#define LAPACK_zungbr LAPACK_GLOBAL(zungbr,ZUNGBR)
-#define LAPACK_cunmbr LAPACK_GLOBAL(cunmbr,CUNMBR)
-#define LAPACK_zunmbr LAPACK_GLOBAL(zunmbr,ZUNMBR)
-#define LAPACK_sbdsqr LAPACK_GLOBAL(sbdsqr,SBDSQR)
-#define LAPACK_dbdsqr LAPACK_GLOBAL(dbdsqr,DBDSQR)
-#define LAPACK_cbdsqr LAPACK_GLOBAL(cbdsqr,CBDSQR)
-#define LAPACK_zbdsqr LAPACK_GLOBAL(zbdsqr,ZBDSQR)
-#define LAPACK_sbdsdc LAPACK_GLOBAL(sbdsdc,SBDSDC)
-#define LAPACK_dbdsdc LAPACK_GLOBAL(dbdsdc,DBDSDC)
-#define LAPACK_sbdsvdx LAPACK_GLOBAL(sbdsvdx,SBDSVDX)
-#define LAPACK_dbdsvdx LAPACK_GLOBAL(dbdsvdx,DBDSVDX)
-#define LAPACK_ssytrd LAPACK_GLOBAL(ssytrd,SSYTRD)
-#define LAPACK_dsytrd LAPACK_GLOBAL(dsytrd,DSYTRD)
-#define LAPACK_sorgtr LAPACK_GLOBAL(sorgtr,SORGTR)
-#define LAPACK_dorgtr LAPACK_GLOBAL(dorgtr,DORGTR)
-#define LAPACK_sormtr LAPACK_GLOBAL(sormtr,SORMTR)
-#define LAPACK_dormtr LAPACK_GLOBAL(dormtr,DORMTR)
-#define LAPACK_chetrd LAPACK_GLOBAL(chetrd,CHETRD)
-#define LAPACK_zhetrd LAPACK_GLOBAL(zhetrd,ZHETRD)
-#define LAPACK_cungtr LAPACK_GLOBAL(cungtr,CUNGTR)
-#define LAPACK_zungtr LAPACK_GLOBAL(zungtr,ZUNGTR)
-#define LAPACK_cunmtr LAPACK_GLOBAL(cunmtr,CUNMTR)
-#define LAPACK_zunmtr LAPACK_GLOBAL(zunmtr,ZUNMTR)
-#define LAPACK_ssptrd LAPACK_GLOBAL(ssptrd,SSPTRD)
-#define LAPACK_dsptrd LAPACK_GLOBAL(dsptrd,DSPTRD)
-#define LAPACK_sopgtr LAPACK_GLOBAL(sopgtr,SOPGTR)
-#define LAPACK_dopgtr LAPACK_GLOBAL(dopgtr,DOPGTR)
-#define LAPACK_sopmtr LAPACK_GLOBAL(sopmtr,SOPMTR)
-#define LAPACK_dopmtr LAPACK_GLOBAL(dopmtr,DOPMTR)
-#define LAPACK_chptrd LAPACK_GLOBAL(chptrd,CHPTRD)
-#define LAPACK_zhptrd LAPACK_GLOBAL(zhptrd,ZHPTRD)
-#define LAPACK_cupgtr LAPACK_GLOBAL(cupgtr,CUPGTR)
-#define LAPACK_zupgtr LAPACK_GLOBAL(zupgtr,ZUPGTR)
-#define LAPACK_cupmtr LAPACK_GLOBAL(cupmtr,CUPMTR)
-#define LAPACK_zupmtr LAPACK_GLOBAL(zupmtr,ZUPMTR)
-#define LAPACK_ssbtrd LAPACK_GLOBAL(ssbtrd,SSBTRD)
-#define LAPACK_dsbtrd LAPACK_GLOBAL(dsbtrd,DSBTRD)
-#define LAPACK_chbtrd LAPACK_GLOBAL(chbtrd,CHBTRD)
-#define LAPACK_zhbtrd LAPACK_GLOBAL(zhbtrd,ZHBTRD)
-#define LAPACK_ssterf LAPACK_GLOBAL(ssterf,SSTERF)
-#define LAPACK_dsterf LAPACK_GLOBAL(dsterf,DSTERF)
-#define LAPACK_ssteqr LAPACK_GLOBAL(ssteqr,SSTEQR)
-#define LAPACK_dsteqr LAPACK_GLOBAL(dsteqr,DSTEQR)
-#define LAPACK_csteqr LAPACK_GLOBAL(csteqr,CSTEQR)
-#define LAPACK_zsteqr LAPACK_GLOBAL(zsteqr,ZSTEQR)
-#define LAPACK_sstemr LAPACK_GLOBAL(sstemr,SSTEMR)
-#define LAPACK_dstemr LAPACK_GLOBAL(dstemr,DSTEMR)
-#define LAPACK_cstemr LAPACK_GLOBAL(cstemr,CSTEMR)
-#define LAPACK_zstemr LAPACK_GLOBAL(zstemr,ZSTEMR)
-#define LAPACK_sstedc LAPACK_GLOBAL(sstedc,SSTEDC)
-#define LAPACK_dstedc LAPACK_GLOBAL(dstedc,DSTEDC)
-#define LAPACK_cstedc LAPACK_GLOBAL(cstedc,CSTEDC)
-#define LAPACK_zstedc LAPACK_GLOBAL(zstedc,ZSTEDC)
-#define LAPACK_sstegr LAPACK_GLOBAL(sstegr,SSTEGR)
-#define LAPACK_dstegr LAPACK_GLOBAL(dstegr,DSTEGR)
-#define LAPACK_cstegr LAPACK_GLOBAL(cstegr,CSTEGR)
-#define LAPACK_zstegr LAPACK_GLOBAL(zstegr,ZSTEGR)
-#define LAPACK_spteqr LAPACK_GLOBAL(spteqr,SPTEQR)
-#define LAPACK_dpteqr LAPACK_GLOBAL(dpteqr,DPTEQR)
-#define LAPACK_cpteqr LAPACK_GLOBAL(cpteqr,CPTEQR)
-#define LAPACK_zpteqr LAPACK_GLOBAL(zpteqr,ZPTEQR)
-#define LAPACK_sstebz LAPACK_GLOBAL(sstebz,SSTEBZ)
-#define LAPACK_dstebz LAPACK_GLOBAL(dstebz,DSTEBZ)
-#define LAPACK_sstein LAPACK_GLOBAL(sstein,SSTEIN)
-#define LAPACK_dstein LAPACK_GLOBAL(dstein,DSTEIN)
-#define LAPACK_cstein LAPACK_GLOBAL(cstein,CSTEIN)
-#define LAPACK_zstein LAPACK_GLOBAL(zstein,ZSTEIN)
-#define LAPACK_sdisna LAPACK_GLOBAL(sdisna,SDISNA)
-#define LAPACK_ddisna LAPACK_GLOBAL(ddisna,DDISNA)
-#define LAPACK_ssygst LAPACK_GLOBAL(ssygst,SSYGST)
-#define LAPACK_dsygst LAPACK_GLOBAL(dsygst,DSYGST)
-#define LAPACK_chegst LAPACK_GLOBAL(chegst,CHEGST)
-#define LAPACK_zhegst LAPACK_GLOBAL(zhegst,ZHEGST)
-#define LAPACK_sspgst LAPACK_GLOBAL(sspgst,SSPGST)
-#define LAPACK_dspgst LAPACK_GLOBAL(dspgst,DSPGST)
-#define LAPACK_chpgst LAPACK_GLOBAL(chpgst,CHPGST)
-#define LAPACK_zhpgst LAPACK_GLOBAL(zhpgst,ZHPGST)
-#define LAPACK_ssbgst LAPACK_GLOBAL(ssbgst,SSBGST)
-#define LAPACK_dsbgst LAPACK_GLOBAL(dsbgst,DSBGST)
-#define LAPACK_chbgst LAPACK_GLOBAL(chbgst,CHBGST)
-#define LAPACK_zhbgst LAPACK_GLOBAL(zhbgst,ZHBGST)
-#define LAPACK_spbstf LAPACK_GLOBAL(spbstf,SPBSTF)
-#define LAPACK_dpbstf LAPACK_GLOBAL(dpbstf,DPBSTF)
-#define LAPACK_cpbstf LAPACK_GLOBAL(cpbstf,CPBSTF)
-#define LAPACK_zpbstf LAPACK_GLOBAL(zpbstf,ZPBSTF)
-#define LAPACK_sgehrd LAPACK_GLOBAL(sgehrd,SGEHRD)
-#define LAPACK_dgehrd LAPACK_GLOBAL(dgehrd,DGEHRD)
-#define LAPACK_cgehrd LAPACK_GLOBAL(cgehrd,CGEHRD)
-#define LAPACK_zgehrd LAPACK_GLOBAL(zgehrd,ZGEHRD)
-#define LAPACK_sorghr LAPACK_GLOBAL(sorghr,SORGHR)
-#define LAPACK_dorghr LAPACK_GLOBAL(dorghr,DORGHR)
-#define LAPACK_sormhr LAPACK_GLOBAL(sormhr,SORMHR)
-#define LAPACK_dormhr LAPACK_GLOBAL(dormhr,DORMHR)
-#define LAPACK_cunghr LAPACK_GLOBAL(cunghr,CUNGHR)
-#define LAPACK_zunghr LAPACK_GLOBAL(zunghr,ZUNGHR)
-#define LAPACK_cunmhr LAPACK_GLOBAL(cunmhr,CUNMHR)
-#define LAPACK_zunmhr LAPACK_GLOBAL(zunmhr,ZUNMHR)
-#define LAPACK_sgebal LAPACK_GLOBAL(sgebal,SGEBAL)
-#define LAPACK_dgebal LAPACK_GLOBAL(dgebal,DGEBAL)
-#define LAPACK_cgebal LAPACK_GLOBAL(cgebal,CGEBAL)
-#define LAPACK_zgebal LAPACK_GLOBAL(zgebal,ZGEBAL)
-#define LAPACK_sgebak LAPACK_GLOBAL(sgebak,SGEBAK)
-#define LAPACK_dgebak LAPACK_GLOBAL(dgebak,DGEBAK)
-#define LAPACK_cgebak LAPACK_GLOBAL(cgebak,CGEBAK)
-#define LAPACK_zgebak LAPACK_GLOBAL(zgebak,ZGEBAK)
-#define LAPACK_shseqr LAPACK_GLOBAL(shseqr,SHSEQR)
-#define LAPACK_dhseqr LAPACK_GLOBAL(dhseqr,DHSEQR)
-#define LAPACK_chseqr LAPACK_GLOBAL(chseqr,CHSEQR)
-#define LAPACK_zhseqr LAPACK_GLOBAL(zhseqr,ZHSEQR)
-#define LAPACK_shsein LAPACK_GLOBAL(shsein,SHSEIN)
-#define LAPACK_dhsein LAPACK_GLOBAL(dhsein,DHSEIN)
-#define LAPACK_chsein LAPACK_GLOBAL(chsein,CHSEIN)
-#define LAPACK_zhsein LAPACK_GLOBAL(zhsein,ZHSEIN)
-#define LAPACK_strevc LAPACK_GLOBAL(strevc,STREVC)
-#define LAPACK_dtrevc LAPACK_GLOBAL(dtrevc,DTREVC)
-#define LAPACK_ctrevc LAPACK_GLOBAL(ctrevc,CTREVC)
-#define LAPACK_ztrevc LAPACK_GLOBAL(ztrevc,ZTREVC)
-#define LAPACK_strsna LAPACK_GLOBAL(strsna,STRSNA)
-#define LAPACK_dtrsna LAPACK_GLOBAL(dtrsna,DTRSNA)
-#define LAPACK_ctrsna LAPACK_GLOBAL(ctrsna,CTRSNA)
-#define LAPACK_ztrsna LAPACK_GLOBAL(ztrsna,ZTRSNA)
-#define LAPACK_strexc LAPACK_GLOBAL(strexc,STREXC)
-#define LAPACK_dtrexc LAPACK_GLOBAL(dtrexc,DTREXC)
-#define LAPACK_ctrexc LAPACK_GLOBAL(ctrexc,CTREXC)
-#define LAPACK_ztrexc LAPACK_GLOBAL(ztrexc,ZTREXC)
-#define LAPACK_strsen LAPACK_GLOBAL(strsen,STRSEN)
-#define LAPACK_dtrsen LAPACK_GLOBAL(dtrsen,DTRSEN)
-#define LAPACK_ctrsen LAPACK_GLOBAL(ctrsen,CTRSEN)
-#define LAPACK_ztrsen LAPACK_GLOBAL(ztrsen,ZTRSEN)
-#define LAPACK_strsyl LAPACK_GLOBAL(strsyl,STRSYL)
-#define LAPACK_dtrsyl LAPACK_GLOBAL(dtrsyl,DTRSYL)
-#define LAPACK_ctrsyl LAPACK_GLOBAL(ctrsyl,CTRSYL)
-#define LAPACK_ztrsyl LAPACK_GLOBAL(ztrsyl,ZTRSYL)
-#define LAPACK_sgghrd LAPACK_GLOBAL(sgghrd,SGGHRD)
-#define LAPACK_dgghrd LAPACK_GLOBAL(dgghrd,DGGHRD)
-#define LAPACK_cgghrd LAPACK_GLOBAL(cgghrd,CGGHRD)
-#define LAPACK_zgghrd LAPACK_GLOBAL(zgghrd,ZGGHRD)
-#define LAPACK_sgghd3 LAPACK_GLOBAL(sgghd3,SGGHD3)
-#define LAPACK_dgghd3 LAPACK_GLOBAL(dgghd3,DGGHD3)
-#define LAPACK_cgghd3 LAPACK_GLOBAL(cgghd3,CGGHD3)
-#define LAPACK_zgghd3 LAPACK_GLOBAL(zgghd3,ZGGHD3)
-#define LAPACK_sggbal LAPACK_GLOBAL(sggbal,SGGBAL)
-#define LAPACK_dggbal LAPACK_GLOBAL(dggbal,DGGBAL)
-#define LAPACK_cggbal LAPACK_GLOBAL(cggbal,CGGBAL)
-#define LAPACK_zggbal LAPACK_GLOBAL(zggbal,ZGGBAL)
-#define LAPACK_sggbak LAPACK_GLOBAL(sggbak,SGGBAK)
-#define LAPACK_dggbak LAPACK_GLOBAL(dggbak,DGGBAK)
-#define LAPACK_cggbak LAPACK_GLOBAL(cggbak,CGGBAK)
-#define LAPACK_zggbak LAPACK_GLOBAL(zggbak,ZGGBAK)
-#define LAPACK_shgeqz LAPACK_GLOBAL(shgeqz,SHGEQZ)
-#define LAPACK_dhgeqz LAPACK_GLOBAL(dhgeqz,DHGEQZ)
-#define LAPACK_chgeqz LAPACK_GLOBAL(chgeqz,CHGEQZ)
-#define LAPACK_zhgeqz LAPACK_GLOBAL(zhgeqz,ZHGEQZ)
-#define LAPACK_stgevc LAPACK_GLOBAL(stgevc,STGEVC)
-#define LAPACK_dtgevc LAPACK_GLOBAL(dtgevc,DTGEVC)
-#define LAPACK_ctgevc LAPACK_GLOBAL(ctgevc,CTGEVC)
-#define LAPACK_ztgevc LAPACK_GLOBAL(ztgevc,ZTGEVC)
-#define LAPACK_stgexc LAPACK_GLOBAL(stgexc,STGEXC)
-#define LAPACK_dtgexc LAPACK_GLOBAL(dtgexc,DTGEXC)
-#define LAPACK_ctgexc LAPACK_GLOBAL(ctgexc,CTGEXC)
-#define LAPACK_ztgexc LAPACK_GLOBAL(ztgexc,ZTGEXC)
-#define LAPACK_stgsen LAPACK_GLOBAL(stgsen,STGSEN)
-#define LAPACK_dtgsen LAPACK_GLOBAL(dtgsen,DTGSEN)
-#define LAPACK_ctgsen LAPACK_GLOBAL(ctgsen,CTGSEN)
-#define LAPACK_ztgsen LAPACK_GLOBAL(ztgsen,ZTGSEN)
-#define LAPACK_stgsyl LAPACK_GLOBAL(stgsyl,STGSYL)
-#define LAPACK_dtgsyl LAPACK_GLOBAL(dtgsyl,DTGSYL)
-#define LAPACK_ctgsyl LAPACK_GLOBAL(ctgsyl,CTGSYL)
-#define LAPACK_ztgsyl LAPACK_GLOBAL(ztgsyl,ZTGSYL)
-#define LAPACK_stgsna LAPACK_GLOBAL(stgsna,STGSNA)
-#define LAPACK_dtgsna LAPACK_GLOBAL(dtgsna,DTGSNA)
-#define LAPACK_ctgsna LAPACK_GLOBAL(ctgsna,CTGSNA)
-#define LAPACK_ztgsna LAPACK_GLOBAL(ztgsna,ZTGSNA)
-#define LAPACK_sggsvp LAPACK_GLOBAL(sggsvp,SGGSVP)
-#define LAPACK_dggsvp LAPACK_GLOBAL(dggsvp,DGGSVP)
-#define LAPACK_cggsvp LAPACK_GLOBAL(cggsvp,CGGSVP)
-#define LAPACK_zggsvp LAPACK_GLOBAL(zggsvp,ZGGSVP)
-#define LAPACK_sggsvp3 LAPACK_GLOBAL(sggsvp3,SGGSVP3)
-#define LAPACK_dggsvp3 LAPACK_GLOBAL(dggsvp3,DGGSVP3)
-#define LAPACK_cggsvp3 LAPACK_GLOBAL(cggsvp3,CGGSVP3)
-#define LAPACK_zggsvp3 LAPACK_GLOBAL(zggsvp3,ZGGSVP3)
-#define LAPACK_stgsja LAPACK_GLOBAL(stgsja,STGSJA)
-#define LAPACK_dtgsja LAPACK_GLOBAL(dtgsja,DTGSJA)
-#define LAPACK_ctgsja LAPACK_GLOBAL(ctgsja,CTGSJA)
-#define LAPACK_ztgsja LAPACK_GLOBAL(ztgsja,ZTGSJA)
-#define LAPACK_sgels LAPACK_GLOBAL(sgels,SGELS)
-#define LAPACK_dgels LAPACK_GLOBAL(dgels,DGELS)
-#define LAPACK_cgels LAPACK_GLOBAL(cgels,CGELS)
-#define LAPACK_zgels LAPACK_GLOBAL(zgels,ZGELS)
-#define LAPACK_sgelsy LAPACK_GLOBAL(sgelsy,SGELSY)
-#define LAPACK_dgelsy LAPACK_GLOBAL(dgelsy,DGELSY)
-#define LAPACK_cgelsy LAPACK_GLOBAL(cgelsy,CGELSY)
-#define LAPACK_zgelsy LAPACK_GLOBAL(zgelsy,ZGELSY)
-#define LAPACK_sgelss LAPACK_GLOBAL(sgelss,SGELSS)
-#define LAPACK_dgelss LAPACK_GLOBAL(dgelss,DGELSS)
-#define LAPACK_cgelss LAPACK_GLOBAL(cgelss,CGELSS)
-#define LAPACK_zgelss LAPACK_GLOBAL(zgelss,ZGELSS)
-#define LAPACK_sgelsd LAPACK_GLOBAL(sgelsd,SGELSD)
-#define LAPACK_dgelsd LAPACK_GLOBAL(dgelsd,DGELSD)
-#define LAPACK_cgelsd LAPACK_GLOBAL(cgelsd,CGELSD)
-#define LAPACK_zgelsd LAPACK_GLOBAL(zgelsd,ZGELSD)
-#define LAPACK_sgglse LAPACK_GLOBAL(sgglse,SGGLSE)
-#define LAPACK_dgglse LAPACK_GLOBAL(dgglse,DGGLSE)
-#define LAPACK_cgglse LAPACK_GLOBAL(cgglse,CGGLSE)
-#define LAPACK_zgglse LAPACK_GLOBAL(zgglse,ZGGLSE)
-#define LAPACK_sggglm LAPACK_GLOBAL(sggglm,SGGGLM)
-#define LAPACK_dggglm LAPACK_GLOBAL(dggglm,DGGGLM)
-#define LAPACK_cggglm LAPACK_GLOBAL(cggglm,CGGGLM)
-#define LAPACK_zggglm LAPACK_GLOBAL(zggglm,ZGGGLM)
-#define LAPACK_ssyev LAPACK_GLOBAL(ssyev,SSYEV)
-#define LAPACK_dsyev LAPACK_GLOBAL(dsyev,DSYEV)
-#define LAPACK_cheev LAPACK_GLOBAL(cheev,CHEEV)
-#define LAPACK_zheev LAPACK_GLOBAL(zheev,ZHEEV)
-#define LAPACK_ssyev_2stage LAPACK_GLOBAL(ssyev_2stage,SSYEV_2STAGE)
-#define LAPACK_dsyev_2stage LAPACK_GLOBAL(dsyev_2stage,DSYEV_2STAGE)
-#define LAPACK_cheev_2stage LAPACK_GLOBAL(cheev_2stage,CHEEV_2STAGE)
-#define LAPACK_zheev_2stage LAPACK_GLOBAL(zheev_2stage,ZHEEV_2STAGE)
-#define LAPACK_ssyevd LAPACK_GLOBAL(ssyevd,SSYEVD)
-#define LAPACK_dsyevd LAPACK_GLOBAL(dsyevd,DSYEVD)
-#define LAPACK_cheevd LAPACK_GLOBAL(cheevd,CHEEVD)
-#define LAPACK_zheevd LAPACK_GLOBAL(zheevd,ZHEEVD)
-#define LAPACK_ssyevd_2stage LAPACK_GLOBAL(ssyevd_2stage,SSYEVD_2STAGE)
-#define LAPACK_dsyevd_2stage LAPACK_GLOBAL(dsyevd_2stage,DSYEVD_2STAGE)
-#define LAPACK_cheevd_2stage LAPACK_GLOBAL(cheevd_2stage,CHEEVD_2STAGE)
-#define LAPACK_zheevd_2stage LAPACK_GLOBAL(zheevd_2stage,ZHEEVD_2STAGE)
-#define LAPACK_ssyevx LAPACK_GLOBAL(ssyevx,SSYEVX)
-#define LAPACK_dsyevx LAPACK_GLOBAL(dsyevx,DSYEVX)
-#define LAPACK_cheevx LAPACK_GLOBAL(cheevx,CHEEVX)
-#define LAPACK_zheevx LAPACK_GLOBAL(zheevx,ZHEEVX)
-#define LAPACK_ssyevx_2stage LAPACK_GLOBAL(ssyevx_2stage,SSYEVX_2STAGE)
-#define LAPACK_dsyevx_2stage LAPACK_GLOBAL(dsyevx_2stage,DSYEVX_2STAGE)
-#define LAPACK_cheevx_2stage LAPACK_GLOBAL(cheevx_2stage,CHEEVX_2STAGE)
-#define LAPACK_zheevx_2stage LAPACK_GLOBAL(zheevx_2stage,ZHEEVX_2STAGE)
-#define LAPACK_ssyevr LAPACK_GLOBAL(ssyevr,SSYEVR)
-#define LAPACK_dsyevr LAPACK_GLOBAL(dsyevr,DSYEVR)
-#define LAPACK_cheevr LAPACK_GLOBAL(cheevr,CHEEVR)
-#define LAPACK_zheevr LAPACK_GLOBAL(zheevr,ZHEEVR)
-#define LAPACK_ssyevr_2stage LAPACK_GLOBAL(ssyevr_2stage,SSYEVR_2STAGE)
-#define LAPACK_dsyevr_2stage LAPACK_GLOBAL(dsyevr_2stage,DSYEVR_2STAGE)
-#define LAPACK_cheevr_2stage LAPACK_GLOBAL(cheevr_2stage,CHEEVR_2STAGE)
-#define LAPACK_zheevr_2stage LAPACK_GLOBAL(zheevr_2stage,ZHEEVR_2STAGE)
-#define LAPACK_sspev LAPACK_GLOBAL(sspev,SSPEV)
-#define LAPACK_dspev LAPACK_GLOBAL(dspev,DSPEV)
-#define LAPACK_chpev LAPACK_GLOBAL(chpev,CHPEV)
-#define LAPACK_zhpev LAPACK_GLOBAL(zhpev,ZHPEV)
-#define LAPACK_sspevd LAPACK_GLOBAL(sspevd,SSPEVD)
-#define LAPACK_dspevd LAPACK_GLOBAL(dspevd,DSPEVD)
-#define LAPACK_chpevd LAPACK_GLOBAL(chpevd,CHPEVD)
-#define LAPACK_zhpevd LAPACK_GLOBAL(zhpevd,ZHPEVD)
-#define LAPACK_sspevx LAPACK_GLOBAL(sspevx,SSPEVX)
-#define LAPACK_dspevx LAPACK_GLOBAL(dspevx,DSPEVX)
-#define LAPACK_chpevx LAPACK_GLOBAL(chpevx,CHPEVX)
-#define LAPACK_zhpevx LAPACK_GLOBAL(zhpevx,ZHPEVX)
-#define LAPACK_ssbev LAPACK_GLOBAL(ssbev,SSBEV)
-#define LAPACK_dsbev LAPACK_GLOBAL(dsbev,DSBEV)
-#define LAPACK_chbev LAPACK_GLOBAL(chbev,CHBEV)
-#define LAPACK_zhbev LAPACK_GLOBAL(zhbev,ZHBEV)
-#define LAPACK_ssbev_2stage LAPACK_GLOBAL(ssbev_2stage,SSBEV_2STAGE)
-#define LAPACK_dsbev_2stage LAPACK_GLOBAL(dsbev_2stage,DSBEV_2STAGE)
-#define LAPACK_chbev_2stage LAPACK_GLOBAL(chbev_2stage,CHBEV_2STAGE)
-#define LAPACK_zhbev_2stage LAPACK_GLOBAL(zhbev_2stage,ZHBEV_2STAGE)
-#define LAPACK_ssbevd LAPACK_GLOBAL(ssbevd,SSBEVD)
-#define LAPACK_dsbevd LAPACK_GLOBAL(dsbevd,DSBEVD)
-#define LAPACK_chbevd LAPACK_GLOBAL(chbevd,CHBEVD)
-#define LAPACK_zhbevd LAPACK_GLOBAL(zhbevd,ZHBEVD)
-#define LAPACK_ssbevd_2stage LAPACK_GLOBAL(ssbevd_2stage,SSBEVD_2STAGE)
-#define LAPACK_dsbevd_2stage LAPACK_GLOBAL(dsbevd_2stage,DSBEVD_2STAGE)
-#define LAPACK_chbevd_2stage LAPACK_GLOBAL(chbevd_2stage,CHBEVD_2STAGE)
-#define LAPACK_zhbevd_2stage LAPACK_GLOBAL(zhbevd_2stage,ZHBEVD_2STAGE)
-#define LAPACK_ssbevx LAPACK_GLOBAL(ssbevx,SSBEVX)
-#define LAPACK_dsbevx LAPACK_GLOBAL(dsbevx,DSBEVX)
-#define LAPACK_chbevx LAPACK_GLOBAL(chbevx,CHBEVX)
-#define LAPACK_zhbevx LAPACK_GLOBAL(zhbevx,ZHBEVX)
-#define LAPACK_ssbevx_2stage LAPACK_GLOBAL(ssbevx_2stage,SSBEVX_2STAGE)
-#define LAPACK_dsbevx_2stage LAPACK_GLOBAL(dsbevx_2stage,DSBEVX_2STAGE)
-#define LAPACK_chbevx_2stage LAPACK_GLOBAL(chbevx_2stage,CHBEVX_2STAGE)
-#define LAPACK_zhbevx_2stage LAPACK_GLOBAL(zhbevx_2stage,ZHBEVX_2STAGE)
-#define LAPACK_sstev LAPACK_GLOBAL(sstev,SSTEV)
-#define LAPACK_dstev LAPACK_GLOBAL(dstev,DSTEV)
-#define LAPACK_sstevd LAPACK_GLOBAL(sstevd,SSTEVD)
-#define LAPACK_dstevd LAPACK_GLOBAL(dstevd,DSTEVD)
-#define LAPACK_sstevx LAPACK_GLOBAL(sstevx,SSTEVX)
-#define LAPACK_dstevx LAPACK_GLOBAL(dstevx,DSTEVX)
-#define LAPACK_sstevr LAPACK_GLOBAL(sstevr,SSTEVR)
-#define LAPACK_dstevr LAPACK_GLOBAL(dstevr,DSTEVR)
-#define LAPACK_sgees LAPACK_GLOBAL(sgees,SGEES)
-#define LAPACK_dgees LAPACK_GLOBAL(dgees,DGEES)
-#define LAPACK_cgees LAPACK_GLOBAL(cgees,CGEES)
-#define LAPACK_zgees LAPACK_GLOBAL(zgees,ZGEES)
-#define LAPACK_sgeesx LAPACK_GLOBAL(sgeesx,SGEESX)
-#define LAPACK_dgeesx LAPACK_GLOBAL(dgeesx,DGEESX)
-#define LAPACK_cgeesx LAPACK_GLOBAL(cgeesx,CGEESX)
-#define LAPACK_zgeesx LAPACK_GLOBAL(zgeesx,ZGEESX)
-#define LAPACK_sgeev LAPACK_GLOBAL(sgeev,SGEEV)
-#define LAPACK_dgeev LAPACK_GLOBAL(dgeev,DGEEV)
-#define LAPACK_cgeev LAPACK_GLOBAL(cgeev,CGEEV)
-#define LAPACK_zgeev LAPACK_GLOBAL(zgeev,ZGEEV)
-#define LAPACK_sgeevx LAPACK_GLOBAL(sgeevx,SGEEVX)
-#define LAPACK_dgeevx LAPACK_GLOBAL(dgeevx,DGEEVX)
-#define LAPACK_cgeevx LAPACK_GLOBAL(cgeevx,CGEEVX)
-#define LAPACK_zgeevx LAPACK_GLOBAL(zgeevx,ZGEEVX)
-#define LAPACK_sgesvd LAPACK_GLOBAL(sgesvd,SGESVD)
-#define LAPACK_dgesvd LAPACK_GLOBAL(dgesvd,DGESVD)
-#define LAPACK_cgesvd LAPACK_GLOBAL(cgesvd,CGESVD)
-#define LAPACK_zgesvd LAPACK_GLOBAL(zgesvd,ZGESVD)
-#define LAPACK_sgesvdx LAPACK_GLOBAL(sgesvdx,SGESVDX)
-#define LAPACK_dgesvdx LAPACK_GLOBAL(dgesvdx,DGESVDX)
-#define LAPACK_cgesvdx LAPACK_GLOBAL(cgesvdx,CGESVDX)
-#define LAPACK_zgesvdx LAPACK_GLOBAL(zgesvdx,ZGESVDX)
-#define LAPACK_sgesdd LAPACK_GLOBAL(sgesdd,SGESDD)
-#define LAPACK_dgesdd LAPACK_GLOBAL(dgesdd,DGESDD)
-#define LAPACK_cgesdd LAPACK_GLOBAL(cgesdd,CGESDD)
-#define LAPACK_zgesdd LAPACK_GLOBAL(zgesdd,ZGESDD)
-#define LAPACK_sgejsv LAPACK_GLOBAL(sgejsv,SGEJSV)
-#define LAPACK_dgejsv LAPACK_GLOBAL(dgejsv,DGEJSV)
-#define LAPACK_cgejsv LAPACK_GLOBAL(cgejsv,CGEJSV)
-#define LAPACK_zgejsv LAPACK_GLOBAL(zgejsv,ZGEJSV)
-#define LAPACK_sgesvj LAPACK_GLOBAL(sgesvj,SGESVJ)
-#define LAPACK_dgesvj LAPACK_GLOBAL(dgesvj,DGESVJ)
-#define LAPACK_cgesvj LAPACK_GLOBAL(cgesvj,CGESVJ)
-#define LAPACK_zgesvj LAPACK_GLOBAL(zgesvj,ZGESVJ)
-#define LAPACK_sggsvd LAPACK_GLOBAL(sggsvd,SGGSVD)
-#define LAPACK_dggsvd LAPACK_GLOBAL(dggsvd,DGGSVD)
-#define LAPACK_cggsvd LAPACK_GLOBAL(cggsvd,CGGSVD)
-#define LAPACK_zggsvd LAPACK_GLOBAL(zggsvd,ZGGSVD)
-#define LAPACK_ssygv LAPACK_GLOBAL(ssygv,SSYGV)
-#define LAPACK_dsygv LAPACK_GLOBAL(dsygv,DSYGV)
-#define LAPACK_chegv LAPACK_GLOBAL(chegv,CHEGV)
-#define LAPACK_zhegv LAPACK_GLOBAL(zhegv,ZHEGV)
-#define LAPACK_ssygv_2stage LAPACK_GLOBAL(ssygv_2stage,SSYGV_2STAGE)
-#define LAPACK_dsygv_2stage LAPACK_GLOBAL(dsygv_2stage,DSYGV_2STAGE)
-#define LAPACK_chegv_2stage LAPACK_GLOBAL(chegv_2stage,CHEGV_2STAGE)
-#define LAPACK_zhegv_2stage LAPACK_GLOBAL(zhegv_2stage,ZHEGV_2STAGE)
-#define LAPACK_ssygvd LAPACK_GLOBAL(ssygvd,SSYGVD)
-#define LAPACK_dsygvd LAPACK_GLOBAL(dsygvd,DSYGVD)
-#define LAPACK_chegvd LAPACK_GLOBAL(chegvd,CHEGVD)
-#define LAPACK_zhegvd LAPACK_GLOBAL(zhegvd,ZHEGVD)
-#define LAPACK_ssygvx LAPACK_GLOBAL(ssygvx,SSYGVX)
-#define LAPACK_dsygvx LAPACK_GLOBAL(dsygvx,DSYGVX)
-#define LAPACK_chegvx LAPACK_GLOBAL(chegvx,CHEGVX)
-#define LAPACK_zhegvx LAPACK_GLOBAL(zhegvx,ZHEGVX)
-#define LAPACK_sspgv LAPACK_GLOBAL(sspgv,SSPGV)
-#define LAPACK_dspgv LAPACK_GLOBAL(dspgv,DSPGV)
-#define LAPACK_chpgv LAPACK_GLOBAL(chpgv,CHPGV)
-#define LAPACK_zhpgv LAPACK_GLOBAL(zhpgv,ZHPGV)
-#define LAPACK_sspgvd LAPACK_GLOBAL(sspgvd,SSPGVD)
-#define LAPACK_dspgvd LAPACK_GLOBAL(dspgvd,DSPGVD)
-#define LAPACK_chpgvd LAPACK_GLOBAL(chpgvd,CHPGVD)
-#define LAPACK_zhpgvd LAPACK_GLOBAL(zhpgvd,ZHPGVD)
-#define LAPACK_sspgvx LAPACK_GLOBAL(sspgvx,SSPGVX)
-#define LAPACK_dspgvx LAPACK_GLOBAL(dspgvx,DSPGVX)
-#define LAPACK_chpgvx LAPACK_GLOBAL(chpgvx,CHPGVX)
-#define LAPACK_zhpgvx LAPACK_GLOBAL(zhpgvx,ZHPGVX)
-#define LAPACK_ssbgv LAPACK_GLOBAL(ssbgv,SSBGV)
-#define LAPACK_dsbgv LAPACK_GLOBAL(dsbgv,DSBGV)
-#define LAPACK_chbgv LAPACK_GLOBAL(chbgv,CHBGV)
-#define LAPACK_zhbgv LAPACK_GLOBAL(zhbgv,ZHBGV)
-#define LAPACK_ssbgvd LAPACK_GLOBAL(ssbgvd,SSBGVD)
-#define LAPACK_dsbgvd LAPACK_GLOBAL(dsbgvd,DSBGVD)
-#define LAPACK_chbgvd LAPACK_GLOBAL(chbgvd,CHBGVD)
-#define LAPACK_zhbgvd LAPACK_GLOBAL(zhbgvd,ZHBGVD)
-#define LAPACK_ssbgvx LAPACK_GLOBAL(ssbgvx,SSBGVX)
-#define LAPACK_dsbgvx LAPACK_GLOBAL(dsbgvx,DSBGVX)
-#define LAPACK_chbgvx LAPACK_GLOBAL(chbgvx,CHBGVX)
-#define LAPACK_zhbgvx LAPACK_GLOBAL(zhbgvx,ZHBGVX)
-#define LAPACK_sgges LAPACK_GLOBAL(sgges,SGGES)
-#define LAPACK_dgges LAPACK_GLOBAL(dgges,DGGES)
-#define LAPACK_cgges LAPACK_GLOBAL(cgges,CGGES)
-#define LAPACK_zgges LAPACK_GLOBAL(zgges,ZGGES)
-#define LAPACK_sgges3 LAPACK_GLOBAL(sgges3,SGGES3)
-#define LAPACK_dgges3 LAPACK_GLOBAL(dgges3,DGGES3)
-#define LAPACK_cgges3 LAPACK_GLOBAL(cgges3,CGGES3)
-#define LAPACK_zgges3 LAPACK_GLOBAL(zgges3,ZGGES3)
-#define LAPACK_sggesx LAPACK_GLOBAL(sggesx,SGGESX)
-#define LAPACK_dggesx LAPACK_GLOBAL(dggesx,DGGESX)
-#define LAPACK_cggesx LAPACK_GLOBAL(cggesx,CGGESX)
-#define LAPACK_zggesx LAPACK_GLOBAL(zggesx,ZGGESX)
-#define LAPACK_sggev LAPACK_GLOBAL(sggev,SGGEV)
-#define LAPACK_dggev LAPACK_GLOBAL(dggev,DGGEV)
-#define LAPACK_cggev LAPACK_GLOBAL(cggev,CGGEV)
-#define LAPACK_zggev LAPACK_GLOBAL(zggev,ZGGEV)
-#define LAPACK_sggev3 LAPACK_GLOBAL(sggev3,SGGEV3)
-#define LAPACK_dggev3 LAPACK_GLOBAL(dggev3,DGGEV3)
-#define LAPACK_cggev3 LAPACK_GLOBAL(cggev3,CGGEV3)
-#define LAPACK_zggev3 LAPACK_GLOBAL(zggev3,ZGGEV3)
-#define LAPACK_sggevx LAPACK_GLOBAL(sggevx,SGGEVX)
-#define LAPACK_dggevx LAPACK_GLOBAL(dggevx,DGGEVX)
-#define LAPACK_cggevx LAPACK_GLOBAL(cggevx,CGGEVX)
-#define LAPACK_zggevx LAPACK_GLOBAL(zggevx,ZGGEVX)
-#define LAPACK_dsfrk LAPACK_GLOBAL(dsfrk,DSFRK)
-#define LAPACK_ssfrk LAPACK_GLOBAL(ssfrk,SSFRK)
-#define LAPACK_zhfrk LAPACK_GLOBAL(zhfrk,ZHFRK)
-#define LAPACK_chfrk LAPACK_GLOBAL(chfrk,CHFRK)
-#define LAPACK_dtfsm LAPACK_GLOBAL(dtfsm,DTFSM)
-#define LAPACK_stfsm LAPACK_GLOBAL(stfsm,STFSM)
-#define LAPACK_ztfsm LAPACK_GLOBAL(ztfsm,ZTFSM)
-#define LAPACK_ctfsm LAPACK_GLOBAL(ctfsm,CTFSM)
-#define LAPACK_dtfttp LAPACK_GLOBAL(dtfttp,DTFTTP)
-#define LAPACK_stfttp LAPACK_GLOBAL(stfttp,STFTTP)
-#define LAPACK_ztfttp LAPACK_GLOBAL(ztfttp,ZTFTTP)
-#define LAPACK_ctfttp LAPACK_GLOBAL(ctfttp,CTFTTP)
-#define LAPACK_dtfttr LAPACK_GLOBAL(dtfttr,DTFTTR)
-#define LAPACK_stfttr LAPACK_GLOBAL(stfttr,STFTTR)
-#define LAPACK_ztfttr LAPACK_GLOBAL(ztfttr,ZTFTTR)
-#define LAPACK_ctfttr LAPACK_GLOBAL(ctfttr,CTFTTR)
-#define LAPACK_dtpttf LAPACK_GLOBAL(dtpttf,DTPTTF)
-#define LAPACK_stpttf LAPACK_GLOBAL(stpttf,STPTTF)
-#define LAPACK_ztpttf LAPACK_GLOBAL(ztpttf,ZTPTTF)
-#define LAPACK_ctpttf LAPACK_GLOBAL(ctpttf,CTPTTF)
-#define LAPACK_dtpttr LAPACK_GLOBAL(dtpttr,DTPTTR)
-#define LAPACK_stpttr LAPACK_GLOBAL(stpttr,STPTTR)
-#define LAPACK_ztpttr LAPACK_GLOBAL(ztpttr,ZTPTTR)
-#define LAPACK_ctpttr LAPACK_GLOBAL(ctpttr,CTPTTR)
-#define LAPACK_dtrttf LAPACK_GLOBAL(dtrttf,DTRTTF)
-#define LAPACK_strttf LAPACK_GLOBAL(strttf,STRTTF)
-#define LAPACK_ztrttf LAPACK_GLOBAL(ztrttf,ZTRTTF)
-#define LAPACK_ctrttf LAPACK_GLOBAL(ctrttf,CTRTTF)
-#define LAPACK_dtrttp LAPACK_GLOBAL(dtrttp,DTRTTP)
-#define LAPACK_strttp LAPACK_GLOBAL(strttp,STRTTP)
-#define LAPACK_ztrttp LAPACK_GLOBAL(ztrttp,ZTRTTP)
-#define LAPACK_ctrttp LAPACK_GLOBAL(ctrttp,CTRTTP)
-#define LAPACK_sgeqrfp LAPACK_GLOBAL(sgeqrfp,SGEQRFP)
-#define LAPACK_dgeqrfp LAPACK_GLOBAL(dgeqrfp,DGEQRFP)
-#define LAPACK_cgeqrfp LAPACK_GLOBAL(cgeqrfp,CGEQRFP)
-#define LAPACK_zgeqrfp LAPACK_GLOBAL(zgeqrfp,ZGEQRFP)
-#define LAPACK_clacgv LAPACK_GLOBAL(clacgv,CLACGV)
-#define LAPACK_zlacgv LAPACK_GLOBAL(zlacgv,ZLACGV)
-#define LAPACK_slarnv LAPACK_GLOBAL(slarnv,SLARNV)
-#define LAPACK_dlarnv LAPACK_GLOBAL(dlarnv,DLARNV)
-#define LAPACK_clarnv LAPACK_GLOBAL(clarnv,CLARNV)
-#define LAPACK_zlarnv LAPACK_GLOBAL(zlarnv,ZLARNV)
-#define LAPACK_sgeqr2 LAPACK_GLOBAL(sgeqr2,SGEQR2)
-#define LAPACK_dgeqr2 LAPACK_GLOBAL(dgeqr2,DGEQR2)
-#define LAPACK_cgeqr2 LAPACK_GLOBAL(cgeqr2,CGEQR2)
-#define LAPACK_zgeqr2 LAPACK_GLOBAL(zgeqr2,ZGEQR2)
-#define LAPACK_slacn2 LAPACK_GLOBAL(slacn2,SLACN2)
-#define LAPACK_dlacn2 LAPACK_GLOBAL(dlacn2,DLACN2)
-#define LAPACK_clacn2 LAPACK_GLOBAL(clacn2,CLACN2)
-#define LAPACK_zlacn2 LAPACK_GLOBAL(zlacn2,ZLACN2)
-#define LAPACK_slacpy LAPACK_GLOBAL(slacpy,SLACPY)
-#define LAPACK_dlacpy LAPACK_GLOBAL(dlacpy,DLACPY)
-#define LAPACK_clacpy LAPACK_GLOBAL(clacpy,CLACPY)
-#define LAPACK_zlacpy LAPACK_GLOBAL(zlacpy,ZLACPY)
-#define LAPACK_clacp2 LAPACK_GLOBAL(clacp2,CLACP2)
-#define LAPACK_zlacp2 LAPACK_GLOBAL(zlacp2,ZLACP2)
-#define LAPACK_sgetf2 LAPACK_GLOBAL(sgetf2,SGETF2)
-#define LAPACK_dgetf2 LAPACK_GLOBAL(dgetf2,DGETF2)
-#define LAPACK_cgetf2 LAPACK_GLOBAL(cgetf2,CGETF2)
-#define LAPACK_zgetf2 LAPACK_GLOBAL(zgetf2,ZGETF2)
-#define LAPACK_slaswp LAPACK_GLOBAL(slaswp,SLASWP)
-#define LAPACK_dlaswp LAPACK_GLOBAL(dlaswp,DLASWP)
-#define LAPACK_claswp LAPACK_GLOBAL(claswp,CLASWP)
-#define LAPACK_zlaswp LAPACK_GLOBAL(zlaswp,ZLASWP)
-#define LAPACK_slange LAPACK_GLOBAL(slange,SLANGE)
-#define LAPACK_dlange LAPACK_GLOBAL(dlange,DLANGE)
-#define LAPACK_clange LAPACK_GLOBAL(clange,CLANGE)
-#define LAPACK_zlange LAPACK_GLOBAL(zlange,ZLANGE)
-#define LAPACK_clanhe LAPACK_GLOBAL(clanhe,CLANHE)
-#define LAPACK_zlanhe LAPACK_GLOBAL(zlanhe,ZLANHE)
-#define LAPACK_clarcm LAPACK_GLOBAL(clarcm,CLARCM)
-#define LAPACK_zlarcm LAPACK_GLOBAL(zlarcm,ZLARCM)
-#define LAPACK_clacrm LAPACK_GLOBAL(clacrm,CLACRM)
-#define LAPACK_zlacrm LAPACK_GLOBAL(zlacrm,ZLACRM)
-#define LAPACK_slansy LAPACK_GLOBAL(slansy,SLANSY)
-#define LAPACK_dlansy LAPACK_GLOBAL(dlansy,DLANSY)
-#define LAPACK_clansy LAPACK_GLOBAL(clansy,CLANSY)
-#define LAPACK_zlansy LAPACK_GLOBAL(zlansy,ZLANSY)
-#define LAPACK_slantr LAPACK_GLOBAL(slantr,SLANTR)
-#define LAPACK_dlantr LAPACK_GLOBAL(dlantr,DLANTR)
-#define LAPACK_clantr LAPACK_GLOBAL(clantr,CLANTR)
-#define LAPACK_zlantr LAPACK_GLOBAL(zlantr,ZLANTR)
-#define LAPACK_slamch LAPACK_GLOBAL(slamch,SLAMCH)
-#define LAPACK_dlamch LAPACK_GLOBAL(dlamch,DLAMCH)
-#define LAPACK_sgelq2 LAPACK_GLOBAL(sgelq2,SGELQ2)
-#define LAPACK_dgelq2 LAPACK_GLOBAL(dgelq2,DGELQ2)
-#define LAPACK_cgelq2 LAPACK_GLOBAL(cgelq2,CGELQ2)
-#define LAPACK_zgelq2 LAPACK_GLOBAL(zgelq2,ZGELQ2)
-#define LAPACK_slarfb LAPACK_GLOBAL(slarfb,SLARFB)
-#define LAPACK_dlarfb LAPACK_GLOBAL(dlarfb,DLARFB)
-#define LAPACK_clarfb LAPACK_GLOBAL(clarfb,CLARFB)
-#define LAPACK_zlarfb LAPACK_GLOBAL(zlarfb,ZLARFB)
-#define LAPACK_slarfg LAPACK_GLOBAL(slarfg,SLARFG)
-#define LAPACK_dlarfg LAPACK_GLOBAL(dlarfg,DLARFG)
-#define LAPACK_clarfg LAPACK_GLOBAL(clarfg,CLARFG)
-#define LAPACK_zlarfg LAPACK_GLOBAL(zlarfg,ZLARFG)
-#define LAPACK_slassq LAPACK_GLOBAL(slassq,SLASSQ)
-#define LAPACK_dlassq LAPACK_GLOBAL(dlassq,DLASSQ)
-#define LAPACK_classq LAPACK_GLOBAL(classq,CLASSQ)
-#define LAPACK_zlassq LAPACK_GLOBAL(zlassq,ZLASSQ)
-#define LAPACK_slarft LAPACK_GLOBAL(slarft,SLARFT)
-#define LAPACK_dlarft LAPACK_GLOBAL(dlarft,DLARFT)
-#define LAPACK_clarft LAPACK_GLOBAL(clarft,CLARFT)
-#define LAPACK_zlarft LAPACK_GLOBAL(zlarft,ZLARFT)
-#define LAPACK_slarfx LAPACK_GLOBAL(slarfx,SLARFX)
-#define LAPACK_dlarfx LAPACK_GLOBAL(dlarfx,DLARFX)
-#define LAPACK_clarfx LAPACK_GLOBAL(clarfx,CLARFX)
-#define LAPACK_zlarfx LAPACK_GLOBAL(zlarfx,ZLARFX)
-#define LAPACK_slatms LAPACK_GLOBAL(slatms,SLATMS)
-#define LAPACK_dlatms LAPACK_GLOBAL(dlatms,DLATMS)
-#define LAPACK_clatms LAPACK_GLOBAL(clatms,CLATMS)
-#define LAPACK_zlatms LAPACK_GLOBAL(zlatms,ZLATMS)
-#define LAPACK_slag2d LAPACK_GLOBAL(slag2d,SLAG2D)
-#define LAPACK_dlag2s LAPACK_GLOBAL(dlag2s,DLAG2S)
-#define LAPACK_clag2z LAPACK_GLOBAL(clag2z,CLAG2Z)
-#define LAPACK_zlag2c LAPACK_GLOBAL(zlag2c,ZLAG2C)
-#define LAPACK_slauum LAPACK_GLOBAL(slauum,SLAUUM)
-#define LAPACK_dlauum LAPACK_GLOBAL(dlauum,DLAUUM)
-#define LAPACK_clauum LAPACK_GLOBAL(clauum,CLAUUM)
-#define LAPACK_zlauum LAPACK_GLOBAL(zlauum,ZLAUUM)
-#define LAPACK_slagge LAPACK_GLOBAL(slagge,SLAGGE)
-#define LAPACK_dlagge LAPACK_GLOBAL(dlagge,DLAGGE)
-#define LAPACK_clagge LAPACK_GLOBAL(clagge,CLAGGE)
-#define LAPACK_zlagge LAPACK_GLOBAL(zlagge,ZLAGGE)
-#define LAPACK_slascl LAPACK_GLOBAL(slascl,SLASCL)
-#define LAPACK_dlascl LAPACK_GLOBAL(dlascl,DLASCL)
-#define LAPACK_clascl LAPACK_GLOBAL(clascl,CLASCL)
-#define LAPACK_zlascl LAPACK_GLOBAL(zlascl,ZLASCL)
-#define LAPACK_slaset LAPACK_GLOBAL(slaset,SLASET)
-#define LAPACK_dlaset LAPACK_GLOBAL(dlaset,DLASET)
-#define LAPACK_claset LAPACK_GLOBAL(claset,CLASET)
-#define LAPACK_zlaset LAPACK_GLOBAL(zlaset,ZLASET)
-#define LAPACK_slasrt LAPACK_GLOBAL(slasrt,SLASRT)
-#define LAPACK_dlasrt LAPACK_GLOBAL(dlasrt,DLASRT)
-#define LAPACK_slagsy LAPACK_GLOBAL(slagsy,SLAGSY)
-#define LAPACK_dlagsy LAPACK_GLOBAL(dlagsy,DLAGSY)
-#define LAPACK_clagsy LAPACK_GLOBAL(clagsy,CLAGSY)
-#define LAPACK_zlagsy LAPACK_GLOBAL(zlagsy,ZLAGSY)
-#define LAPACK_claghe LAPACK_GLOBAL(claghe,CLAGHE)
-#define LAPACK_zlaghe LAPACK_GLOBAL(zlaghe,ZLAGHE)
-#define LAPACK_slapmr LAPACK_GLOBAL(slapmr,SLAPMR)
-#define LAPACK_dlapmr LAPACK_GLOBAL(dlapmr,DLAPMR)
-#define LAPACK_clapmr LAPACK_GLOBAL(clapmr,CLAPMR)
-#define LAPACK_zlapmr LAPACK_GLOBAL(zlapmr,ZLAPMR)
-#define LAPACK_slapmt LAPACK_GLOBAL(slapmt,SLAPMT)
-#define LAPACK_dlapmt LAPACK_GLOBAL(dlapmt,DLAPMT)
-#define LAPACK_clapmt LAPACK_GLOBAL(clapmt,CLAPMT)
-#define LAPACK_zlapmt LAPACK_GLOBAL(zlapmt,ZLAPMT)
-#define LAPACK_slapy2 LAPACK_GLOBAL(slapy2,SLAPY2)
-#define LAPACK_dlapy2 LAPACK_GLOBAL(dlapy2,DLAPY2)
-#define LAPACK_slapy3 LAPACK_GLOBAL(slapy3,SLAPY3)
-#define LAPACK_dlapy3 LAPACK_GLOBAL(dlapy3,DLAPY3)
-#define LAPACK_slartgp LAPACK_GLOBAL(slartgp,SLARTGP)
-#define LAPACK_dlartgp LAPACK_GLOBAL(dlartgp,DLARTGP)
-#define LAPACK_slartgs LAPACK_GLOBAL(slartgs,SLARTGS)
-#define LAPACK_dlartgs LAPACK_GLOBAL(dlartgs,DLARTGS)
-// LAPACK 3.3.0
-#define LAPACK_cbbcsd LAPACK_GLOBAL(cbbcsd,CBBCSD)
-#define LAPACK_cheswapr LAPACK_GLOBAL(cheswapr,CHESWAPR)
-#define LAPACK_chetri2 LAPACK_GLOBAL(chetri2,CHETRI2)
-#define LAPACK_chetri2x LAPACK_GLOBAL(chetri2x,CHETRI2X)
-#define LAPACK_chetrs2 LAPACK_GLOBAL(chetrs2,CHETRS2)
-#define LAPACK_csyconv LAPACK_GLOBAL(csyconv,CSYCONV)
-#define LAPACK_csyswapr LAPACK_GLOBAL(csyswapr,CSYSWAPR)
-#define LAPACK_csytri2 LAPACK_GLOBAL(csytri2,CSYTRI2)
-#define LAPACK_csytri2x LAPACK_GLOBAL(csytri2x,CSYTRI2X)
-#define LAPACK_csytrs2 LAPACK_GLOBAL(csytrs2,CSYTRS2)
-#define LAPACK_cunbdb LAPACK_GLOBAL(cunbdb,CUNBDB)
-#define LAPACK_cuncsd LAPACK_GLOBAL(cuncsd,CUNCSD)
-#define LAPACK_cuncsd2by1 LAPACK_GLOBAL(cuncsd2by1,CUNCSD2BY1)
-#define LAPACK_dbbcsd LAPACK_GLOBAL(dbbcsd,DBBCSD)
-#define LAPACK_dorbdb LAPACK_GLOBAL(dorbdb,DORBDB)
-#define LAPACK_dorcsd LAPACK_GLOBAL(dorcsd,DORCSD)
-#define LAPACK_dorcsd2by1 LAPACK_GLOBAL(dorcsd2by1,DORCSD2BY1)
-#define LAPACK_dsyconv LAPACK_GLOBAL(dsyconv,DSYCONV)
-#define LAPACK_dsyswapr LAPACK_GLOBAL(dsyswapr,DSYSWAPR)
-#define LAPACK_dsytri2 LAPACK_GLOBAL(dsytri2,DSYTRI2)
-#define LAPACK_dsytri2x LAPACK_GLOBAL(dsytri2x,DSYTRI2X)
-#define LAPACK_dsytrs2 LAPACK_GLOBAL(dsytrs2,DSYTRS2)
-#define LAPACK_sbbcsd LAPACK_GLOBAL(sbbcsd,SBBCSD)
-#define LAPACK_sorbdb LAPACK_GLOBAL(sorbdb,SORBDB)
-#define LAPACK_sorcsd LAPACK_GLOBAL(sorcsd,SORCSD)
-#define LAPACK_sorcsd2by1 LAPACK_GLOBAL(sorcsd2by1,SORCSD2BY1)
-#define LAPACK_ssyconv LAPACK_GLOBAL(ssyconv,SSYCONV)
-#define LAPACK_ssyswapr LAPACK_GLOBAL(ssyswapr,SSYSWAPR)
-#define LAPACK_ssytri2 LAPACK_GLOBAL(ssytri2,SSYTRI2)
-#define LAPACK_ssytri2x LAPACK_GLOBAL(ssytri2x,SSYTRI2X)
-#define LAPACK_ssytrs2 LAPACK_GLOBAL(ssytrs2,SSYTRS2)
-#define LAPACK_zbbcsd LAPACK_GLOBAL(zbbcsd,ZBBCSD)
-#define LAPACK_zheswapr LAPACK_GLOBAL(zheswapr,ZHESWAPR)
-#define LAPACK_zhetri2 LAPACK_GLOBAL(zhetri2,ZHETRI2)
-#define LAPACK_zhetri2x LAPACK_GLOBAL(zhetri2x,ZHETRI2X)
-#define LAPACK_zhetrs2 LAPACK_GLOBAL(zhetrs2,ZHETRS2)
-#define LAPACK_zsyconv LAPACK_GLOBAL(zsyconv,ZSYCONV)
-#define LAPACK_zsyswapr LAPACK_GLOBAL(zsyswapr,ZSYSWAPR)
-#define LAPACK_zsytri2 LAPACK_GLOBAL(zsytri2,ZSYTRI2)
-#define LAPACK_zsytri2x LAPACK_GLOBAL(zsytri2x,ZSYTRI2X)
-#define LAPACK_zsytrs2 LAPACK_GLOBAL(zsytrs2,ZSYTRS2)
-#define LAPACK_zunbdb LAPACK_GLOBAL(zunbdb,ZUNBDB)
-#define LAPACK_zuncsd LAPACK_GLOBAL(zuncsd,ZUNCSD)
-#define LAPACK_zuncsd2by1 LAPACK_GLOBAL(zuncsd2by1,ZUNCSD2BY1)
-// LAPACK 3.4.0
-#define LAPACK_sgemqrt LAPACK_GLOBAL(sgemqrt,SGEMQRT)
-#define LAPACK_dgemqrt LAPACK_GLOBAL(dgemqrt,DGEMQRT)
-#define LAPACK_cgemqrt LAPACK_GLOBAL(cgemqrt,CGEMQRT)
-#define LAPACK_zgemqrt LAPACK_GLOBAL(zgemqrt,ZGEMQRT)
-#define LAPACK_sgeqrt LAPACK_GLOBAL(sgeqrt,SGEQRT)
-#define LAPACK_dgeqrt LAPACK_GLOBAL(dgeqrt,DGEQRT)
-#define LAPACK_cgeqrt LAPACK_GLOBAL(cgeqrt,CGEQRT)
-#define LAPACK_zgeqrt LAPACK_GLOBAL(zgeqrt,ZGEQRT)
-#define LAPACK_sgeqrt2 LAPACK_GLOBAL(sgeqrt2,SGEQRT2)
-#define LAPACK_dgeqrt2 LAPACK_GLOBAL(dgeqrt2,DGEQRT2)
-#define LAPACK_cgeqrt2 LAPACK_GLOBAL(cgeqrt2,CGEQRT2)
-#define LAPACK_zgeqrt2 LAPACK_GLOBAL(zgeqrt2,ZGEQRT2)
-#define LAPACK_sgeqrt3 LAPACK_GLOBAL(sgeqrt3,SGEQRT3)
-#define LAPACK_dgeqrt3 LAPACK_GLOBAL(dgeqrt3,DGEQRT3)
-#define LAPACK_cgeqrt3 LAPACK_GLOBAL(cgeqrt3,CGEQRT3)
-#define LAPACK_zgeqrt3 LAPACK_GLOBAL(zgeqrt3,ZGEQRT3)
-#define LAPACK_stpmqrt LAPACK_GLOBAL(stpmqrt,STPMQRT)
-#define LAPACK_dtpmqrt LAPACK_GLOBAL(dtpmqrt,DTPMQRT)
-#define LAPACK_ctpmqrt LAPACK_GLOBAL(ctpmqrt,CTPMQRT)
-#define LAPACK_ztpmqrt LAPACK_GLOBAL(ztpmqrt,ZTPMQRT)
-#define LAPACK_stpqrt LAPACK_GLOBAL(stpqrt,STPQRT)
-#define LAPACK_dtpqrt LAPACK_GLOBAL(dtpqrt,DTPQRT)
-#define LAPACK_ctpqrt LAPACK_GLOBAL(ctpqrt,CTPQRT)
-#define LAPACK_ztpqrt LAPACK_GLOBAL(ztpqrt,ZTPQRT)
-#define LAPACK_stpqrt2 LAPACK_GLOBAL(stpqrt2,STPQRT2)
-#define LAPACK_dtpqrt2 LAPACK_GLOBAL(dtpqrt2,DTPQRT2)
-#define LAPACK_ctpqrt2 LAPACK_GLOBAL(ctpqrt2,CTPQRT2)
-#define LAPACK_ztpqrt2 LAPACK_GLOBAL(ztpqrt2,ZTPQRT2)
-#define LAPACK_stprfb LAPACK_GLOBAL(stprfb,STPRFB)
-#define LAPACK_dtprfb LAPACK_GLOBAL(dtprfb,DTPRFB)
-#define LAPACK_ctprfb LAPACK_GLOBAL(ctprfb,CTPRFB)
-#define LAPACK_ztprfb LAPACK_GLOBAL(ztprfb,ZTPRFB)
-// LAPACK 3.5.0
-#define LAPACK_ssysv_rook LAPACK_GLOBAL(ssysv_rook,SSYSV_ROOK)
-#define LAPACK_dsysv_rook LAPACK_GLOBAL(dsysv_rook,DSYSV_ROOK)
-#define LAPACK_csysv_rook LAPACK_GLOBAL(csysv_rook,CSYSV_ROOK)
-#define LAPACK_zsysv_rook LAPACK_GLOBAL(zsysv_rook,ZSYSV_ROOK)
-#define LAPACK_csyr LAPACK_GLOBAL(csyr,CSYR)
-#define LAPACK_zsyr LAPACK_GLOBAL(zsyr,ZSYR)
-#define LAPACK_ilaver LAPACK_GLOBAL(ilaver,ILAVER)
-// LAPACK 3.6.0
-#define LAPACK_sggsvd3 LAPACK_GLOBAL(sggsvd3,SGGSVD3)
-#define LAPACK_dggsvd3 LAPACK_GLOBAL(dggsvd3,DGGSVD3)
-#define LAPACK_cggsvd3 LAPACK_GLOBAL(cggsvd3,CGGSVD3)
-#define LAPACK_zggsvd3 LAPACK_GLOBAL(zggsvd3,ZGGSVD3)
-// LAPACK 3.7.0
-#define LAPACK_ssysv_aa LAPACK_GLOBAL(ssysv_aa,SSYSV_AA)
-#define LAPACK_dsysv_aa LAPACK_GLOBAL(dsysv_aa,DSYSV_AA)
-#define LAPACK_chesv_aa LAPACK_GLOBAL(chesv_aa,CHESV_AA)
-#define LAPACK_zsysv_aa LAPACK_GLOBAL(zsysv_aa,ZSYSV_AA)
-#define LAPACK_csysv_aa LAPACK_GLOBAL(csysv_aa,CSYSV_AA)
-#define LAPACK_zhesv_aa LAPACK_GLOBAL(zhesv_aa,ZHESV_AA)
-#define LAPACK_ssytrs_aa LAPACK_GLOBAL(ssytrs_aa,SSYTRS_AA)
-#define LAPACK_dsytrs_aa LAPACK_GLOBAL(dsytrs_aa,DSYTRS_AA)
-#define LAPACK_csytrs_aa LAPACK_GLOBAL(csytrs_aa,CSYTRS_AA)
-#define LAPACK_zsytrs_aa LAPACK_GLOBAL(zsytrs_aa,ZSYTRS_AA)
-#define LAPACK_chetrs_aa LAPACK_GLOBAL(chetrs_aa,CHETRS_AA)
-#define LAPACK_zhetrs_aa LAPACK_GLOBAL(zhetrs_aa,ZHETRS_AA)
-#define LAPACK_ssytrf_aa LAPACK_GLOBAL(ssytrf_aa,SSYTRF_AA)
-#define LAPACK_dsytrf_aa LAPACK_GLOBAL(dsytrf_aa,DSYTRF_AA)
-#define LAPACK_csytrf_aa LAPACK_GLOBAL(csytrf_aa,CSYTRF_AA)
-#define LAPACK_zsytrf_aa LAPACK_GLOBAL(zsytrf_aa,ZSYTRF_AA)
-#define LAPACK_chetrf_aa LAPACK_GLOBAL(chetrf_aa,CHETRF_AA)
-#define LAPACK_zhetrf_aa LAPACK_GLOBAL(zhetrf_aa,ZHETRF_AA)
-
-#define LAPACK_ssysv_rk LAPACK_GLOBAL(ssysv_rk,SSYSV_RK)
-#define LAPACK_dsysv_rk LAPACK_GLOBAL(dsysv_rk,DSYSV_RK)
-#define LAPACK_chesv_rk LAPACK_GLOBAL(chesv_rk,CHESV_RK)
-#define LAPACK_zsysv_rk LAPACK_GLOBAL(zsysv_rk,ZSYSV_RK)
-#define LAPACK_csysv_rk LAPACK_GLOBAL(csysv_rk,CSYSV_RK)
-#define LAPACK_zhesv_rk LAPACK_GLOBAL(zhesv_rk,ZHESV_RK)
-#define LAPACK_ssytrf_rk LAPACK_GLOBAL(ssytrf_rk,SSYTRF_RK)
-#define LAPACK_dsytrf_rk LAPACK_GLOBAL(dsytrf_rk,DSYTRF_RK)
-#define LAPACK_csytrf_rk LAPACK_GLOBAL(csytrf_rk,CSYTRF_RK)
-#define LAPACK_zsytrf_rk LAPACK_GLOBAL(zsytrf_rk,ZSYTRF_RK)
-#define LAPACK_chetrf_rk LAPACK_GLOBAL(chetrf_rk,CHETRF_RK)
-#define LAPACK_zhetrf_rk LAPACK_GLOBAL(zhetrf_rk,ZHETRF_RK)
-#define LAPACK_ssytrs_3 LAPACK_GLOBAL(ssytrs_3,SSYTRS_3)
-#define LAPACK_dsytrs_3 LAPACK_GLOBAL(dsytrs_3,DSYTRS_3)
-#define LAPACK_csytrs_3 LAPACK_GLOBAL(csytrs_3,CSYTRS_3)
-#define LAPACK_zsytrs_3 LAPACK_GLOBAL(zsytrs_3,ZSYTRS_3)
-#define LAPACK_chetrs_3 LAPACK_GLOBAL(chetrs_3,CHETRS_3)
-#define LAPACK_zhetrs_3 LAPACK_GLOBAL(zhetrs_3,ZHETRS_3)
-#define LAPACK_ssytri_3 LAPACK_GLOBAL(ssytri_3,SSYTRI_3)
-#define LAPACK_dsytri_3 LAPACK_GLOBAL(dsytri_3,DSYTRI_3)
-#define LAPACK_csytri_3 LAPACK_GLOBAL(csytri_3,CSYTRI_3)
-#define LAPACK_zsytri_3 LAPACK_GLOBAL(zsytri_3,ZSYTRI_3)
-#define LAPACK_chetri_3 LAPACK_GLOBAL(chetri_3,CHETRI_3)
-#define LAPACK_zhetri_3 LAPACK_GLOBAL(zhetri_3,ZHETRI_3)
-#define LAPACK_ssycon_3 LAPACK_GLOBAL(ssycon_3,SSYCON_3)
-#define LAPACK_dsycon_3 LAPACK_GLOBAL(dsycon_3,DSYCON_3)
-#define LAPACK_csycon_3 LAPACK_GLOBAL(csycon_3,CSYCON_3)
-#define LAPACK_zsycon_3 LAPACK_GLOBAL(zsycon_3,ZSYCON_3)
-#define LAPACK_checon_3 LAPACK_GLOBAL(checon_3,CHECON_3)
-#define LAPACK_zhecon_3 LAPACK_GLOBAL(zhecon_3,ZHECON_3)
-#define LAPACK_sgelq LAPACK_GLOBAL(sgelq,SGELQ)
-#define LAPACK_dgelq LAPACK_GLOBAL(dgelq,DGELQ)
-#define LAPACK_cgelq LAPACK_GLOBAL(cgelq,CGELQ)
-#define LAPACK_zgelq LAPACK_GLOBAL(zgelq,ZGELQ)
-#define LAPACK_sgemlq LAPACK_GLOBAL(sgemlq,SGEMLQ)
-#define LAPACK_dgemlq LAPACK_GLOBAL(dgemlq,DGEMLQ)
-#define LAPACK_cgemlq LAPACK_GLOBAL(cgemlq,CGEMLQ)
-#define LAPACK_zgemlq LAPACK_GLOBAL(zgemlq,ZGEMLQ)
-#define LAPACK_sgeqr LAPACK_GLOBAL(sgeqr,SGEQR)
-#define LAPACK_dgeqr LAPACK_GLOBAL(dgeqr,DGEQR)
-#define LAPACK_cgeqr LAPACK_GLOBAL(cgeqr,CGEQR)
-#define LAPACK_zgeqr LAPACK_GLOBAL(zgeqr,ZGEQR)
-#define LAPACK_sgemqr LAPACK_GLOBAL(sgemqr,SGEMQR)
-#define LAPACK_dgemqr LAPACK_GLOBAL(dgemqr,DGEMQR)
-#define LAPACK_cgemqr LAPACK_GLOBAL(cgemqr,CGEMQR)
-#define LAPACK_zgemqr LAPACK_GLOBAL(zgemqr,ZGEMQR)
-#define LAPACK_sgetsls LAPACK_GLOBAL(sgetsls,SGETSLS)
-#define LAPACK_dgetsls LAPACK_GLOBAL(dgetsls,DGETSLS)
-#define LAPACK_cgetsls LAPACK_GLOBAL(cgetsls,CGETSLS)
-#define LAPACK_zgetsls LAPACK_GLOBAL(zgetsls,ZGETSLS)
-
-// LAPACK 3.8.0
-#define LAPACK_ssysv_aa_2stage LAPACK_GLOBAL(ssysv_aa_2stage,SSYSV_AA_2STAGE)
-#define LAPACK_dsysv_aa_2stage LAPACK_GLOBAL(dsysv_aa_2stage,DSYSV_AA_2STAGE)
-#define LAPACK_chesv_aa_2stage LAPACK_GLOBAL(chesv_aa_2stage,CHESV_AA_2STAGE)
-#define LAPACK_zsysv_aa_2stage LAPACK_GLOBAL(zsysv_aa_2stage,ZSYSV_AA_2STAGE)
-#define LAPACK_csysv_aa_2stage LAPACK_GLOBAL(csysv_aa_2stage,CSYSV_AA_2STAGE)
-#define LAPACK_zhesv_aa_2stage LAPACK_GLOBAL(zhesv_aa_2stage,ZHESV_AA_2STAGE)
-#define LAPACK_ssytrs_aa_2stage LAPACK_GLOBAL(ssytrs_aa_2stage,SSYTRS_AA_2STAGE)
-#define LAPACK_dsytrs_aa_2stage LAPACK_GLOBAL(dsytrs_aa_2stage,DSYTRS_AA_2STAGE)
-#define LAPACK_csytrs_aa_2stage LAPACK_GLOBAL(csytrs_aa_2stage,CSYTRS_AA_2STAGE)
-#define LAPACK_zsytrs_aa_2stage LAPACK_GLOBAL(zsytrs_aa_2stage,ZSYTRS_AA_2STAGE)
-#define LAPACK_chetrs_aa_2stage LAPACK_GLOBAL(chetrs_aa_2stage,CHETRS_AA_2STAGE)
-#define LAPACK_zhetrs_aa_2stage LAPACK_GLOBAL(zhetrs_aa_2stage,ZHETRS_AA_2STAGE)
-#define LAPACK_ssytrf_aa_2stage LAPACK_GLOBAL(ssytrf_aa_2stage,SSYTRF_AA_2STAGE)
-#define LAPACK_dsytrf_aa_2stage LAPACK_GLOBAL(dsytrf_aa_2stage,DSYTRF_AA_2STAGE)
-#define LAPACK_csytrf_aa_2stage LAPACK_GLOBAL(csytrf_aa_2stage,CSYTRF_AA_2STAGE)
-#define LAPACK_zsytrf_aa_2stage LAPACK_GLOBAL(zsytrf_aa_2stage,ZSYTRF_AA_2STAGE)
-#define LAPACK_chetrf_aa_2stage LAPACK_GLOBAL(chetrf_aa_2stage,CHETRF_AA_2STAGE)
-#define LAPACK_zhetrf_aa_2stage LAPACK_GLOBAL(zhetrf_aa_2stage,ZHETRF_AA_2STAGE)
-
-
-void LAPACK_sgetrf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_dgetrf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_cgetrf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
-void LAPACK_zgetrf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
-void LAPACK_sgetrf2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_dgetrf2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_cgetrf2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
-void LAPACK_zgetrf2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
-void LAPACK_sgbtrf( lapack_int* m, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, float* ab, lapack_int* ldab,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_dgbtrf( lapack_int* m, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, double* ab, lapack_int* ldab,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_cgbtrf( lapack_int* m, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, lapack_complex_float* ab, lapack_int* ldab,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_zgbtrf( lapack_int* m, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, lapack_complex_double* ab, lapack_int* ldab,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_sgttrf( lapack_int* n, float* dl, float* d, float* du, float* du2,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_dgttrf( lapack_int* n, double* dl, double* d, double* du,
-                    double* du2, lapack_int* ipiv, lapack_int *info );
-void LAPACK_cgttrf( lapack_int* n, lapack_complex_float* dl,
-                    lapack_complex_float* d, lapack_complex_float* du,
-                    lapack_complex_float* du2, lapack_int* ipiv,
-                    lapack_int *info );
-void LAPACK_zgttrf( lapack_int* n, lapack_complex_double* dl,
-                    lapack_complex_double* d, lapack_complex_double* du,
-                    lapack_complex_double* du2, lapack_int* ipiv,
-                    lapack_int *info );
-void LAPACK_spotrf2( char* uplo, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_dpotrf2( char* uplo, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_cpotrf2( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_zpotrf2( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_spotrf( char* uplo, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_dpotrf( char* uplo, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_cpotrf( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_zpotrf( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_dpstrf( char* uplo, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int* piv, lapack_int* rank, double* tol,
-                    double* work, lapack_int *info );
-void LAPACK_spstrf( char* uplo, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int* piv, lapack_int* rank, float* tol, float* work,
-                    lapack_int *info );
-void LAPACK_zpstrf( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* piv, lapack_int* rank,
-                    double* tol, double* work, lapack_int *info );
-void LAPACK_cpstrf( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* piv, lapack_int* rank,
-                    float* tol, float* work, lapack_int *info );
-void LAPACK_dpftrf( char* transr, char* uplo, lapack_int* n, double* a,
-                    lapack_int *info );
-void LAPACK_spftrf( char* transr, char* uplo, lapack_int* n, float* a,
-                    lapack_int *info );
-void LAPACK_zpftrf( char* transr, char* uplo, lapack_int* n,
-                    lapack_complex_double* a, lapack_int *info );
-void LAPACK_cpftrf( char* transr, char* uplo, lapack_int* n,
-                    lapack_complex_float* a, lapack_int *info );
-void LAPACK_spptrf( char* uplo, lapack_int* n, float* ap, lapack_int *info );
-void LAPACK_dpptrf( char* uplo, lapack_int* n, double* ap, lapack_int *info );
-void LAPACK_cpptrf( char* uplo, lapack_int* n, lapack_complex_float* ap,
-                    lapack_int *info );
-void LAPACK_zpptrf( char* uplo, lapack_int* n, lapack_complex_double* ap,
-                    lapack_int *info );
-void LAPACK_spbtrf( char* uplo, lapack_int* n, lapack_int* kd, float* ab,
-                    lapack_int* ldab, lapack_int *info );
-void LAPACK_dpbtrf( char* uplo, lapack_int* n, lapack_int* kd, double* ab,
-                    lapack_int* ldab, lapack_int *info );
-void LAPACK_cpbtrf( char* uplo, lapack_int* n, lapack_int* kd,
-                    lapack_complex_float* ab, lapack_int* ldab,
-                    lapack_int *info );
-void LAPACK_zpbtrf( char* uplo, lapack_int* n, lapack_int* kd,
-                    lapack_complex_double* ab, lapack_int* ldab,
-                    lapack_int *info );
-void LAPACK_spttrf( lapack_int* n, float* d, float* e, lapack_int *info );
-void LAPACK_dpttrf( lapack_int* n, double* d, double* e, lapack_int *info );
-void LAPACK_cpttrf( lapack_int* n, float* d, lapack_complex_float* e,
-                    lapack_int *info );
-void LAPACK_zpttrf( lapack_int* n, double* d, lapack_complex_double* e,
-                    lapack_int *info );
-void LAPACK_ssytrf( char* uplo, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int* ipiv, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dsytrf( char* uplo, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int* ipiv, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_csytrf( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* ipiv,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zsytrf( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* ipiv,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_chetrf( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* ipiv,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zhetrf( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* ipiv,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_ssptrf( char* uplo, lapack_int* n, float* ap, lapack_int* ipiv,
-                    lapack_int *info );
-void LAPACK_dsptrf( char* uplo, lapack_int* n, double* ap, lapack_int* ipiv,
-                    lapack_int *info );
-void LAPACK_csptrf( char* uplo, lapack_int* n, lapack_complex_float* ap,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_zsptrf( char* uplo, lapack_int* n, lapack_complex_double* ap,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_chptrf( char* uplo, lapack_int* n, lapack_complex_float* ap,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_zhptrf( char* uplo, lapack_int* n, lapack_complex_double* ap,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_sgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const float* a, lapack_int* lda, const lapack_int* ipiv,
-                    float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_dgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const double* a, lapack_int* lda, const lapack_int* ipiv,
-                    double* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_cgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_zgetrs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_sgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    lapack_int* nrhs, const float* ab, lapack_int* ldab,
-                    const lapack_int* ipiv, float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_dgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    lapack_int* nrhs, const double* ab, lapack_int* ldab,
-                    const lapack_int* ipiv, double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_cgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    lapack_int* nrhs, const lapack_complex_float* ab,
-                    lapack_int* ldab, const lapack_int* ipiv,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_zgbtrs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    lapack_int* nrhs, const lapack_complex_double* ab,
-                    lapack_int* ldab, const lapack_int* ipiv,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_sgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const float* dl, const float* d, const float* du,
-                    const float* du2, const lapack_int* ipiv, float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_dgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const double* dl, const double* d, const double* du,
-                    const double* du2, const lapack_int* ipiv, double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_cgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* dl,
-                    const lapack_complex_float* d,
-                    const lapack_complex_float* du,
-                    const lapack_complex_float* du2, const lapack_int* ipiv,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_zgttrs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* dl,
-                    const lapack_complex_double* d,
-                    const lapack_complex_double* du,
-                    const lapack_complex_double* du2, const lapack_int* ipiv,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_spotrs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
-                    lapack_int* lda, float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_dpotrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* a, lapack_int* lda, double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_cpotrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_zpotrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_dpftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* a, double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_spftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const float* a, float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_zpftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_cpftrs( char* transr, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_spptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const float* ap, float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_dpptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* ap, double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_cpptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* ap, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_zpptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* ap, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_spbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                    const float* ab, lapack_int* ldab, float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_dpbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                    const double* ab, lapack_int* ldab, double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_cpbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                    const lapack_complex_float* ab, lapack_int* ldab,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_zpbtrs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                    const lapack_complex_double* ab, lapack_int* ldab,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_spttrs( lapack_int* n, lapack_int* nrhs, const float* d,
-                    const float* e, float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_dpttrs( lapack_int* n, lapack_int* nrhs, const double* d,
-                    const double* e, double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_cpttrs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* d,
-                    const lapack_complex_float* e, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_zpttrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* d, const lapack_complex_double* e,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_ssytrs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
-                    lapack_int* lda, const lapack_int* ipiv, float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_dsytrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* a, lapack_int* lda, const lapack_int* ipiv,
-                    double* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_csytrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_zsytrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_chetrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_zhetrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_ssptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const float* ap, const lapack_int* ipiv, float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_dsptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* ap, const lapack_int* ipiv, double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_csptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* ap, const lapack_int* ipiv,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_zsptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* ap, const lapack_int* ipiv,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_chptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* ap, const lapack_int* ipiv,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_zhptrs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* ap, const lapack_int* ipiv,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_strtrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const float* a, lapack_int* lda, float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_dtrtrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const double* a, lapack_int* lda,
-                    double* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_ctrtrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const lapack_complex_float* a,
-                    lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_ztrtrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const lapack_complex_double* a,
-                    lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_stptrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const float* ap, float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_dtptrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const double* ap, double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_ctptrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const lapack_complex_float* ap,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_ztptrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const lapack_complex_double* ap,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_stbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* kd, lapack_int* nrhs, const float* ab,
-                    lapack_int* ldab, float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_dtbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* kd, lapack_int* nrhs, const double* ab,
-                    lapack_int* ldab, double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_ctbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* kd, lapack_int* nrhs,
-                    const lapack_complex_float* ab, lapack_int* ldab,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_ztbtrs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* kd, lapack_int* nrhs,
-                    const lapack_complex_double* ab, lapack_int* ldab,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_sgecon( char* norm, lapack_int* n, const float* a, lapack_int* lda,
-                    float* anorm, float* rcond, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dgecon( char* norm, lapack_int* n, const double* a, lapack_int* lda,
-                    double* anorm, double* rcond, double* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_cgecon( char* norm, lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, float* anorm, float* rcond,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zgecon( char* norm, lapack_int* n, const lapack_complex_double* a,
-                    lapack_int* lda, double* anorm, double* rcond,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_sgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    const float* ab, lapack_int* ldab, const lapack_int* ipiv,
-                    float* anorm, float* rcond, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    const double* ab, lapack_int* ldab, const lapack_int* ipiv,
-                    double* anorm, double* rcond, double* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_cgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    const lapack_complex_float* ab, lapack_int* ldab,
-                    const lapack_int* ipiv, float* anorm, float* rcond,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zgbcon( char* norm, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    const lapack_complex_double* ab, lapack_int* ldab,
-                    const lapack_int* ipiv, double* anorm, double* rcond,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_sgtcon( char* norm, lapack_int* n, const float* dl, const float* d,
-                    const float* du, const float* du2, const lapack_int* ipiv,
-                    float* anorm, float* rcond, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dgtcon( char* norm, lapack_int* n, const double* dl,
-                    const double* d, const double* du, const double* du2,
-                    const lapack_int* ipiv, double* anorm, double* rcond,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_cgtcon( char* norm, lapack_int* n, const lapack_complex_float* dl,
-                    const lapack_complex_float* d,
-                    const lapack_complex_float* du,
-                    const lapack_complex_float* du2, const lapack_int* ipiv,
-                    float* anorm, float* rcond, lapack_complex_float* work,
-                    lapack_int *info );
-void LAPACK_zgtcon( char* norm, lapack_int* n, const lapack_complex_double* dl,
-                    const lapack_complex_double* d,
-                    const lapack_complex_double* du,
-                    const lapack_complex_double* du2, const lapack_int* ipiv,
-                    double* anorm, double* rcond, lapack_complex_double* work,
-                    lapack_int *info );
-void LAPACK_spocon( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
-                    float* anorm, float* rcond, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dpocon( char* uplo, lapack_int* n, const double* a, lapack_int* lda,
-                    double* anorm, double* rcond, double* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_cpocon( char* uplo, lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, float* anorm, float* rcond,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zpocon( char* uplo, lapack_int* n, const lapack_complex_double* a,
-                    lapack_int* lda, double* anorm, double* rcond,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_sppcon( char* uplo, lapack_int* n, const float* ap, float* anorm,
-                    float* rcond, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dppcon( char* uplo, lapack_int* n, const double* ap, double* anorm,
-                    double* rcond, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_cppcon( char* uplo, lapack_int* n, const lapack_complex_float* ap,
-                    float* anorm, float* rcond, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_zppcon( char* uplo, lapack_int* n, const lapack_complex_double* ap,
-                    double* anorm, double* rcond, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_spbcon( char* uplo, lapack_int* n, lapack_int* kd, const float* ab,
-                    lapack_int* ldab, float* anorm, float* rcond, float* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_dpbcon( char* uplo, lapack_int* n, lapack_int* kd, const double* ab,
-                    lapack_int* ldab, double* anorm, double* rcond,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_cpbcon( char* uplo, lapack_int* n, lapack_int* kd,
-                    const lapack_complex_float* ab, lapack_int* ldab,
-                    float* anorm, float* rcond, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_zpbcon( char* uplo, lapack_int* n, lapack_int* kd,
-                    const lapack_complex_double* ab, lapack_int* ldab,
-                    double* anorm, double* rcond, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_sptcon( lapack_int* n, const float* d, const float* e, float* anorm,
-                    float* rcond, float* work, lapack_int *info );
-void LAPACK_dptcon( lapack_int* n, const double* d, const double* e,
-                    double* anorm, double* rcond, double* work,
-                    lapack_int *info );
-void LAPACK_cptcon( lapack_int* n, const float* d,
-                    const lapack_complex_float* e, float* anorm, float* rcond,
-                    float* work, lapack_int *info );
-void LAPACK_zptcon( lapack_int* n, const double* d,
-                    const lapack_complex_double* e, double* anorm,
-                    double* rcond, double* work, lapack_int *info );
-void LAPACK_ssycon( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
-                    const lapack_int* ipiv, float* anorm, float* rcond,
-                    float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_dsycon( char* uplo, lapack_int* n, const double* a, lapack_int* lda,
-                    const lapack_int* ipiv, double* anorm, double* rcond,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_csycon( char* uplo, lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_int* ipiv, float* anorm,
-                    float* rcond, lapack_complex_float* work,
-                    lapack_int *info );
-void LAPACK_zsycon( char* uplo, lapack_int* n, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_int* ipiv, double* anorm,
-                    double* rcond, lapack_complex_double* work,
-                    lapack_int *info );
-void LAPACK_checon( char* uplo, lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_int* ipiv, float* anorm,
-                    float* rcond, lapack_complex_float* work,
-                    lapack_int *info );
-void LAPACK_zhecon( char* uplo, lapack_int* n, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_int* ipiv, double* anorm,
-                    double* rcond, lapack_complex_double* work,
-                    lapack_int *info );
-void LAPACK_sspcon( char* uplo, lapack_int* n, const float* ap,
-                    const lapack_int* ipiv, float* anorm, float* rcond,
-                    float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_dspcon( char* uplo, lapack_int* n, const double* ap,
-                    const lapack_int* ipiv, double* anorm, double* rcond,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_cspcon( char* uplo, lapack_int* n, const lapack_complex_float* ap,
-                    const lapack_int* ipiv, float* anorm, float* rcond,
-                    lapack_complex_float* work, lapack_int *info );
-void LAPACK_zspcon( char* uplo, lapack_int* n, const lapack_complex_double* ap,
-                    const lapack_int* ipiv, double* anorm, double* rcond,
-                    lapack_complex_double* work, lapack_int *info );
-void LAPACK_chpcon( char* uplo, lapack_int* n, const lapack_complex_float* ap,
-                    const lapack_int* ipiv, float* anorm, float* rcond,
-                    lapack_complex_float* work, lapack_int *info );
-void LAPACK_zhpcon( char* uplo, lapack_int* n, const lapack_complex_double* ap,
-                    const lapack_int* ipiv, double* anorm, double* rcond,
-                    lapack_complex_double* work, lapack_int *info );
-void LAPACK_strcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    const float* a, lapack_int* lda, float* rcond, float* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_dtrcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    const double* a, lapack_int* lda, double* rcond,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_ctrcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    float* rcond, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_ztrcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    double* rcond, lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_stpcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    const float* ap, float* rcond, float* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_dtpcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    const double* ap, double* rcond, double* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_ctpcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    const lapack_complex_float* ap, float* rcond,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_ztpcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    const lapack_complex_double* ap, double* rcond,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_stbcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    lapack_int* kd, const float* ab, lapack_int* ldab,
-                    float* rcond, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dtbcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    lapack_int* kd, const double* ab, lapack_int* ldab,
-                    double* rcond, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_ctbcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    lapack_int* kd, const lapack_complex_float* ab,
-                    lapack_int* ldab, float* rcond, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_ztbcon( char* norm, char* uplo, char* diag, lapack_int* n,
-                    lapack_int* kd, const lapack_complex_double* ab,
-                    lapack_int* ldab, double* rcond,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_sgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const float* a, lapack_int* lda, const float* af,
-                    lapack_int* ldaf, const lapack_int* ipiv, const float* b,
-                    lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
-                    float* berr, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const double* a, lapack_int* lda, const double* af,
-                    lapack_int* ldaf, const lapack_int* ipiv, const double* b,
-                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
-                    double* berr, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_cgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                    float* ferr, float* berr, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_zgerfs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                    double* ferr, double* berr, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_dgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
-                     const double* a, lapack_int* lda, const double* af,
-                     lapack_int* ldaf, const lapack_int* ipiv, const double* r,
-                     const double* c, const double* b, lapack_int* ldb,
-                     double* x, lapack_int* ldx, double* rcond, double* berr,
-                     lapack_int* n_err_bnds, double* err_bnds_norm,
-                     double* err_bnds_comp, lapack_int* nparams, double* params,
-                     double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_sgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
-                     const float* a, lapack_int* lda, const float* af,
-                     lapack_int* ldaf, const lapack_int* ipiv, const float* r,
-                     const float* c, const float* b, lapack_int* ldb, float* x,
-                     lapack_int* ldx, float* rcond, float* berr,
-                     lapack_int* n_err_bnds, float* err_bnds_norm,
-                     float* err_bnds_comp, lapack_int* nparams, float* params,
-                     float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_zgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
-                     const lapack_complex_double* a, lapack_int* lda,
-                     const lapack_complex_double* af, lapack_int* ldaf,
-                     const lapack_int* ipiv, const double* r, const double* c,
-                     const lapack_complex_double* b, lapack_int* ldb,
-                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                     double* berr, lapack_int* n_err_bnds,
-                     double* err_bnds_norm, double* err_bnds_comp,
-                     lapack_int* nparams, double* params,
-                     lapack_complex_double* work, double* rwork,
-                     lapack_int *info );
-void LAPACK_cgerfsx( char* trans, char* equed, lapack_int* n, lapack_int* nrhs,
-                     const lapack_complex_float* a, lapack_int* lda,
-                     const lapack_complex_float* af, lapack_int* ldaf,
-                     const lapack_int* ipiv, const float* r, const float* c,
-                     const lapack_complex_float* b, lapack_int* ldb,
-                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
-                     float* err_bnds_comp, lapack_int* nparams, float* params,
-                     lapack_complex_float* work, float* rwork,
-                     lapack_int *info );
-void LAPACK_sgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    lapack_int* nrhs, const float* ab, lapack_int* ldab,
-                    const float* afb, lapack_int* ldafb, const lapack_int* ipiv,
-                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                    float* ferr, float* berr, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    lapack_int* nrhs, const double* ab, lapack_int* ldab,
-                    const double* afb, lapack_int* ldafb,
-                    const lapack_int* ipiv, const double* b, lapack_int* ldb,
-                    double* x, lapack_int* ldx, double* ferr, double* berr,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_cgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    lapack_int* nrhs, const lapack_complex_float* ab,
-                    lapack_int* ldab, const lapack_complex_float* afb,
-                    lapack_int* ldafb, const lapack_int* ipiv,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
-                    float* berr, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zgbrfs( char* trans, lapack_int* n, lapack_int* kl, lapack_int* ku,
-                    lapack_int* nrhs, const lapack_complex_double* ab,
-                    lapack_int* ldab, const lapack_complex_double* afb,
-                    lapack_int* ldafb, const lapack_int* ipiv,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
-                    double* berr, lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_dgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
-                     lapack_int* ku, lapack_int* nrhs, const double* ab,
-                     lapack_int* ldab, const double* afb, lapack_int* ldafb,
-                     const lapack_int* ipiv, const double* r, const double* c,
-                     const double* b, lapack_int* ldb, double* x,
-                     lapack_int* ldx, double* rcond, double* berr,
-                     lapack_int* n_err_bnds, double* err_bnds_norm,
-                     double* err_bnds_comp, lapack_int* nparams, double* params,
-                     double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_sgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
-                     lapack_int* ku, lapack_int* nrhs, const float* ab,
-                     lapack_int* ldab, const float* afb, lapack_int* ldafb,
-                     const lapack_int* ipiv, const float* r, const float* c,
-                     const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                     float* rcond, float* berr, lapack_int* n_err_bnds,
-                     float* err_bnds_norm, float* err_bnds_comp,
-                     lapack_int* nparams, float* params, float* work,
-                     lapack_int* iwork, lapack_int *info );
-void LAPACK_zgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
-                     lapack_int* ku, lapack_int* nrhs,
-                     const lapack_complex_double* ab, lapack_int* ldab,
-                     const lapack_complex_double* afb, lapack_int* ldafb,
-                     const lapack_int* ipiv, const double* r, const double* c,
-                     const lapack_complex_double* b, lapack_int* ldb,
-                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                     double* berr, lapack_int* n_err_bnds,
-                     double* err_bnds_norm, double* err_bnds_comp,
-                     lapack_int* nparams, double* params,
-                     lapack_complex_double* work, double* rwork,
-                     lapack_int *info );
-void LAPACK_cgbrfsx( char* trans, char* equed, lapack_int* n, lapack_int* kl,
-                     lapack_int* ku, lapack_int* nrhs,
-                     const lapack_complex_float* ab, lapack_int* ldab,
-                     const lapack_complex_float* afb, lapack_int* ldafb,
-                     const lapack_int* ipiv, const float* r, const float* c,
-                     const lapack_complex_float* b, lapack_int* ldb,
-                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
-                     float* err_bnds_comp, lapack_int* nparams, float* params,
-                     lapack_complex_float* work, float* rwork,
-                     lapack_int *info );
-void LAPACK_sgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const float* dl, const float* d, const float* du,
-                    const float* dlf, const float* df, const float* duf,
-                    const float* du2, const lapack_int* ipiv, const float* b,
-                    lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
-                    float* berr, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const double* dl, const double* d, const double* du,
-                    const double* dlf, const double* df, const double* duf,
-                    const double* du2, const lapack_int* ipiv, const double* b,
-                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
-                    double* berr, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_cgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* dl,
-                    const lapack_complex_float* d,
-                    const lapack_complex_float* du,
-                    const lapack_complex_float* dlf,
-                    const lapack_complex_float* df,
-                    const lapack_complex_float* duf,
-                    const lapack_complex_float* du2, const lapack_int* ipiv,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
-                    float* berr, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zgtrfs( char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* dl,
-                    const lapack_complex_double* d,
-                    const lapack_complex_double* du,
-                    const lapack_complex_double* dlf,
-                    const lapack_complex_double* df,
-                    const lapack_complex_double* duf,
-                    const lapack_complex_double* du2, const lapack_int* ipiv,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
-                    double* berr, lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_sporfs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
-                    lapack_int* lda, const float* af, lapack_int* ldaf,
-                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                    float* ferr, float* berr, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dporfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* a, lapack_int* lda, const double* af,
-                    lapack_int* ldaf, const double* b, lapack_int* ldb,
-                    double* x, lapack_int* ldx, double* ferr, double* berr,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_cporfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* af, lapack_int* ldaf,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
-                    float* berr, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zporfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* af, lapack_int* ldaf,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
-                    double* berr, lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_dporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                     const double* a, lapack_int* lda, const double* af,
-                     lapack_int* ldaf, const double* s, const double* b,
-                     lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
-                     double* berr, lapack_int* n_err_bnds,
-                     double* err_bnds_norm, double* err_bnds_comp,
-                     lapack_int* nparams, double* params, double* work,
-                     lapack_int* iwork, lapack_int *info );
-void LAPACK_sporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                     const float* a, lapack_int* lda, const float* af,
-                     lapack_int* ldaf, const float* s, const float* b,
-                     lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
-                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
-                     float* err_bnds_comp, lapack_int* nparams, float* params,
-                     float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_zporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                     const lapack_complex_double* a, lapack_int* lda,
-                     const lapack_complex_double* af, lapack_int* ldaf,
-                     const double* s, const lapack_complex_double* b,
-                     lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                     double* rcond, double* berr, lapack_int* n_err_bnds,
-                     double* err_bnds_norm, double* err_bnds_comp,
-                     lapack_int* nparams, double* params,
-                     lapack_complex_double* work, double* rwork,
-                     lapack_int *info );
-void LAPACK_cporfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                     const lapack_complex_float* a, lapack_int* lda,
-                     const lapack_complex_float* af, lapack_int* ldaf,
-                     const float* s, const lapack_complex_float* b,
-                     lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                     float* rcond, float* berr, lapack_int* n_err_bnds,
-                     float* err_bnds_norm, float* err_bnds_comp,
-                     lapack_int* nparams, float* params,
-                     lapack_complex_float* work, float* rwork,
-                     lapack_int *info );
-void LAPACK_spprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const float* ap, const float* afp, const float* b,
-                    lapack_int* ldb, float* x, lapack_int* ldx, float* ferr,
-                    float* berr, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dpprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* ap, const double* afp, const double* b,
-                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
-                    double* berr, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_cpprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* ap,
-                    const lapack_complex_float* afp,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
-                    float* berr, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zpprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* ap,
-                    const lapack_complex_double* afp,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
-                    double* berr, lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_spbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                    const float* ab, lapack_int* ldab, const float* afb,
-                    lapack_int* ldafb, const float* b, lapack_int* ldb,
-                    float* x, lapack_int* ldx, float* ferr, float* berr,
-                    float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_dpbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                    const double* ab, lapack_int* ldab, const double* afb,
-                    lapack_int* ldafb, const double* b, lapack_int* ldb,
-                    double* x, lapack_int* ldx, double* ferr, double* berr,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_cpbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                    const lapack_complex_float* ab, lapack_int* ldab,
-                    const lapack_complex_float* afb, lapack_int* ldafb,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
-                    float* berr, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zpbrfs( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                    const lapack_complex_double* ab, lapack_int* ldab,
-                    const lapack_complex_double* afb, lapack_int* ldafb,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
-                    double* berr, lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_sptrfs( lapack_int* n, lapack_int* nrhs, const float* d,
-                    const float* e, const float* df, const float* ef,
-                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                    float* ferr, float* berr, float* work, lapack_int *info );
-void LAPACK_dptrfs( lapack_int* n, lapack_int* nrhs, const double* d,
-                    const double* e, const double* df, const double* ef,
-                    const double* b, lapack_int* ldb, double* x,
-                    lapack_int* ldx, double* ferr, double* berr, double* work,
-                    lapack_int *info );
-void LAPACK_cptrfs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* d,
-                    const lapack_complex_float* e, const float* df,
-                    const lapack_complex_float* ef,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
-                    float* berr, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zptrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* d, const lapack_complex_double* e,
-                    const double* df, const lapack_complex_double* ef,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
-                    double* berr, lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_ssyrfs( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
-                    lapack_int* lda, const float* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const float* b, lapack_int* ldb,
-                    float* x, lapack_int* ldx, float* ferr, float* berr,
-                    float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_dsyrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* a, lapack_int* lda, const double* af,
-                    lapack_int* ldaf, const lapack_int* ipiv, const double* b,
-                    lapack_int* ldb, double* x, lapack_int* ldx, double* ferr,
-                    double* berr, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_csyrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                    float* ferr, float* berr, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_zsyrfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                    double* ferr, double* berr, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_dsyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                     const double* a, lapack_int* lda, const double* af,
-                     lapack_int* ldaf, const lapack_int* ipiv, const double* s,
-                     const double* b, lapack_int* ldb, double* x,
-                     lapack_int* ldx, double* rcond, double* berr,
-                     lapack_int* n_err_bnds, double* err_bnds_norm,
-                     double* err_bnds_comp, lapack_int* nparams, double* params,
-                     double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_ssyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                     const float* a, lapack_int* lda, const float* af,
-                     lapack_int* ldaf, const lapack_int* ipiv, const float* s,
-                     const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                     float* rcond, float* berr, lapack_int* n_err_bnds,
-                     float* err_bnds_norm, float* err_bnds_comp,
-                     lapack_int* nparams, float* params, float* work,
-                     lapack_int* iwork, lapack_int *info );
-void LAPACK_zsyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                     const lapack_complex_double* a, lapack_int* lda,
-                     const lapack_complex_double* af, lapack_int* ldaf,
-                     const lapack_int* ipiv, const double* s,
-                     const lapack_complex_double* b, lapack_int* ldb,
-                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                     double* berr, lapack_int* n_err_bnds,
-                     double* err_bnds_norm, double* err_bnds_comp,
-                     lapack_int* nparams, double* params,
-                     lapack_complex_double* work, double* rwork,
-                     lapack_int *info );
-void LAPACK_csyrfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                     const lapack_complex_float* a, lapack_int* lda,
-                     const lapack_complex_float* af, lapack_int* ldaf,
-                     const lapack_int* ipiv, const float* s,
-                     const lapack_complex_float* b, lapack_int* ldb,
-                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
-                     float* err_bnds_comp, lapack_int* nparams, float* params,
-                     lapack_complex_float* work, float* rwork,
-                     lapack_int *info );
-void LAPACK_cherfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                    float* ferr, float* berr, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_zherfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* af, lapack_int* ldaf,
-                    const lapack_int* ipiv, const lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                    double* ferr, double* berr, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_zherfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                     const lapack_complex_double* a, lapack_int* lda,
-                     const lapack_complex_double* af, lapack_int* ldaf,
-                     const lapack_int* ipiv, const double* s,
-                     const lapack_complex_double* b, lapack_int* ldb,
-                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                     double* berr, lapack_int* n_err_bnds,
-                     double* err_bnds_norm, double* err_bnds_comp,
-                     lapack_int* nparams, double* params,
-                     lapack_complex_double* work, double* rwork,
-                     lapack_int *info );
-void LAPACK_cherfsx( char* uplo, char* equed, lapack_int* n, lapack_int* nrhs,
-                     const lapack_complex_float* a, lapack_int* lda,
-                     const lapack_complex_float* af, lapack_int* ldaf,
-                     const lapack_int* ipiv, const float* s,
-                     const lapack_complex_float* b, lapack_int* ldb,
-                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                     float* berr, lapack_int* n_err_bnds, float* err_bnds_norm,
-                     float* err_bnds_comp, lapack_int* nparams, float* params,
-                     lapack_complex_float* work, float* rwork,
-                     lapack_int *info );
-void LAPACK_ssprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const float* ap, const float* afp, const lapack_int* ipiv,
-                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                    float* ferr, float* berr, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dsprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* ap, const double* afp, const lapack_int* ipiv,
-                    const double* b, lapack_int* ldb, double* x,
-                    lapack_int* ldx, double* ferr, double* berr, double* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_csprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* ap,
-                    const lapack_complex_float* afp, const lapack_int* ipiv,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
-                    float* berr, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zsprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* ap,
-                    const lapack_complex_double* afp, const lapack_int* ipiv,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
-                    double* berr, lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_chprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* ap,
-                    const lapack_complex_float* afp, const lapack_int* ipiv,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* ferr,
-                    float* berr, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zhprfs( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* ap,
-                    const lapack_complex_double* afp, const lapack_int* ipiv,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* ferr,
-                    double* berr, lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_strrfs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const float* a, lapack_int* lda,
-                    const float* b, lapack_int* ldb, const float* x,
-                    lapack_int* ldx, float* ferr, float* berr, float* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_dtrrfs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const double* a, lapack_int* lda,
-                    const double* b, lapack_int* ldb, const double* x,
-                    lapack_int* ldx, double* ferr, double* berr, double* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_ctrrfs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* b,
-                    lapack_int* ldb, const lapack_complex_float* x,
-                    lapack_int* ldx, float* ferr, float* berr,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_ztrrfs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_complex_double* b,
-                    lapack_int* ldb, const lapack_complex_double* x,
-                    lapack_int* ldx, double* ferr, double* berr,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_stprfs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const float* ap, const float* b,
-                    lapack_int* ldb, const float* x, lapack_int* ldx,
-                    float* ferr, float* berr, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dtprfs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const double* ap, const double* b,
-                    lapack_int* ldb, const double* x, lapack_int* ldx,
-                    double* ferr, double* berr, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_ctprfs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const lapack_complex_float* ap,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    const lapack_complex_float* x, lapack_int* ldx, float* ferr,
-                    float* berr, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_ztprfs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* nrhs, const lapack_complex_double* ap,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    const lapack_complex_double* x, lapack_int* ldx,
-                    double* ferr, double* berr, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_stbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* kd, lapack_int* nrhs, const float* ab,
-                    lapack_int* ldab, const float* b, lapack_int* ldb,
-                    const float* x, lapack_int* ldx, float* ferr, float* berr,
-                    float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_dtbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* kd, lapack_int* nrhs, const double* ab,
-                    lapack_int* ldab, const double* b, lapack_int* ldb,
-                    const double* x, lapack_int* ldx, double* ferr,
-                    double* berr, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_ctbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* kd, lapack_int* nrhs,
-                    const lapack_complex_float* ab, lapack_int* ldab,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    const lapack_complex_float* x, lapack_int* ldx, float* ferr,
-                    float* berr, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_ztbrfs( char* uplo, char* trans, char* diag, lapack_int* n,
-                    lapack_int* kd, lapack_int* nrhs,
-                    const lapack_complex_double* ab, lapack_int* ldab,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    const lapack_complex_double* x, lapack_int* ldx,
-                    double* ferr, double* berr, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_sgetri( lapack_int* n, float* a, lapack_int* lda,
-                    const lapack_int* ipiv, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dgetri( lapack_int* n, double* a, lapack_int* lda,
-                    const lapack_int* ipiv, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cgetri( lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_zgetri( lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_spotri( char* uplo, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_dpotri( char* uplo, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_cpotri( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_zpotri( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_dpftri( char* transr, char* uplo, lapack_int* n, double* a,
-                    lapack_int *info );
-void LAPACK_spftri( char* transr, char* uplo, lapack_int* n, float* a,
-                    lapack_int *info );
-void LAPACK_zpftri( char* transr, char* uplo, lapack_int* n,
-                    lapack_complex_double* a, lapack_int *info );
-void LAPACK_cpftri( char* transr, char* uplo, lapack_int* n,
-                    lapack_complex_float* a, lapack_int *info );
-void LAPACK_spptri( char* uplo, lapack_int* n, float* ap, lapack_int *info );
-void LAPACK_dpptri( char* uplo, lapack_int* n, double* ap, lapack_int *info );
-void LAPACK_cpptri( char* uplo, lapack_int* n, lapack_complex_float* ap,
-                    lapack_int *info );
-void LAPACK_zpptri( char* uplo, lapack_int* n, lapack_complex_double* ap,
-                    lapack_int *info );
-void LAPACK_ssytri( char* uplo, lapack_int* n, float* a, lapack_int* lda,
-                    const lapack_int* ipiv, float* work, lapack_int *info );
-void LAPACK_dsytri( char* uplo, lapack_int* n, double* a, lapack_int* lda,
-                    const lapack_int* ipiv, double* work, lapack_int *info );
-void LAPACK_csytri( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, const lapack_int* ipiv,
-                    lapack_complex_float* work, lapack_int *info );
-void LAPACK_zsytri( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, const lapack_int* ipiv,
-                    lapack_complex_double* work, lapack_int *info );
-void LAPACK_chetri( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, const lapack_int* ipiv,
-                    lapack_complex_float* work, lapack_int *info );
-void LAPACK_zhetri( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, const lapack_int* ipiv,
-                    lapack_complex_double* work, lapack_int *info );
-void LAPACK_ssptri( char* uplo, lapack_int* n, float* ap,
-                    const lapack_int* ipiv, float* work, lapack_int *info );
-void LAPACK_dsptri( char* uplo, lapack_int* n, double* ap,
-                    const lapack_int* ipiv, double* work, lapack_int *info );
-void LAPACK_csptri( char* uplo, lapack_int* n, lapack_complex_float* ap,
-                    const lapack_int* ipiv, lapack_complex_float* work,
-                    lapack_int *info );
-void LAPACK_zsptri( char* uplo, lapack_int* n, lapack_complex_double* ap,
-                    const lapack_int* ipiv, lapack_complex_double* work,
-                    lapack_int *info );
-void LAPACK_chptri( char* uplo, lapack_int* n, lapack_complex_float* ap,
-                    const lapack_int* ipiv, lapack_complex_float* work,
-                    lapack_int *info );
-void LAPACK_zhptri( char* uplo, lapack_int* n, lapack_complex_double* ap,
-                    const lapack_int* ipiv, lapack_complex_double* work,
-                    lapack_int *info );
-void LAPACK_strtri( char* uplo, char* diag, lapack_int* n, float* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_dtrtri( char* uplo, char* diag, lapack_int* n, double* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_ctrtri( char* uplo, char* diag, lapack_int* n,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_ztrtri( char* uplo, char* diag, lapack_int* n,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_dtftri( char* transr, char* uplo, char* diag, lapack_int* n,
-                    double* a, lapack_int *info );
-void LAPACK_stftri( char* transr, char* uplo, char* diag, lapack_int* n,
-                    float* a, lapack_int *info );
-void LAPACK_ztftri( char* transr, char* uplo, char* diag, lapack_int* n,
-                    lapack_complex_double* a, lapack_int *info );
-void LAPACK_ctftri( char* transr, char* uplo, char* diag, lapack_int* n,
-                    lapack_complex_float* a, lapack_int *info );
-void LAPACK_stptri( char* uplo, char* diag, lapack_int* n, float* ap,
-                    lapack_int *info );
-void LAPACK_dtptri( char* uplo, char* diag, lapack_int* n, double* ap,
-                    lapack_int *info );
-void LAPACK_ctptri( char* uplo, char* diag, lapack_int* n,
-                    lapack_complex_float* ap, lapack_int *info );
-void LAPACK_ztptri( char* uplo, char* diag, lapack_int* n,
-                    lapack_complex_double* ap, lapack_int *info );
-void LAPACK_sgeequ( lapack_int* m, lapack_int* n, const float* a,
-                    lapack_int* lda, float* r, float* c, float* rowcnd,
-                    float* colcnd, float* amax, lapack_int *info );
-void LAPACK_dgeequ( lapack_int* m, lapack_int* n, const double* a,
-                    lapack_int* lda, double* r, double* c, double* rowcnd,
-                    double* colcnd, double* amax, lapack_int *info );
-void LAPACK_cgeequ( lapack_int* m, lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, float* r, float* c, float* rowcnd,
-                    float* colcnd, float* amax, lapack_int *info );
-void LAPACK_zgeequ( lapack_int* m, lapack_int* n,
-                    const lapack_complex_double* a, lapack_int* lda, double* r,
-                    double* c, double* rowcnd, double* colcnd, double* amax,
-                    lapack_int *info );
-void LAPACK_dgeequb( lapack_int* m, lapack_int* n, const double* a,
-                     lapack_int* lda, double* r, double* c, double* rowcnd,
-                     double* colcnd, double* amax, lapack_int *info );
-void LAPACK_sgeequb( lapack_int* m, lapack_int* n, const float* a,
-                     lapack_int* lda, float* r, float* c, float* rowcnd,
-                     float* colcnd, float* amax, lapack_int *info );
-void LAPACK_zgeequb( lapack_int* m, lapack_int* n,
-                     const lapack_complex_double* a, lapack_int* lda, double* r,
-                     double* c, double* rowcnd, double* colcnd, double* amax,
-                     lapack_int *info );
-void LAPACK_cgeequb( lapack_int* m, lapack_int* n,
-                     const lapack_complex_float* a, lapack_int* lda, float* r,
-                     float* c, float* rowcnd, float* colcnd, float* amax,
-                     lapack_int *info );
-void LAPACK_sgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, const float* ab, lapack_int* ldab, float* r,
-                    float* c, float* rowcnd, float* colcnd, float* amax,
-                    lapack_int *info );
-void LAPACK_dgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, const double* ab, lapack_int* ldab,
-                    double* r, double* c, double* rowcnd, double* colcnd,
-                    double* amax, lapack_int *info );
-void LAPACK_cgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, const lapack_complex_float* ab,
-                    lapack_int* ldab, float* r, float* c, float* rowcnd,
-                    float* colcnd, float* amax, lapack_int *info );
-void LAPACK_zgbequ( lapack_int* m, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, const lapack_complex_double* ab,
-                    lapack_int* ldab, double* r, double* c, double* rowcnd,
-                    double* colcnd, double* amax, lapack_int *info );
-void LAPACK_dgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
-                     lapack_int* ku, const double* ab, lapack_int* ldab,
-                     double* r, double* c, double* rowcnd, double* colcnd,
-                     double* amax, lapack_int *info );
-void LAPACK_sgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
-                     lapack_int* ku, const float* ab, lapack_int* ldab,
-                     float* r, float* c, float* rowcnd, float* colcnd,
-                     float* amax, lapack_int *info );
-void LAPACK_zgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
-                     lapack_int* ku, const lapack_complex_double* ab,
-                     lapack_int* ldab, double* r, double* c, double* rowcnd,
-                     double* colcnd, double* amax, lapack_int *info );
-void LAPACK_cgbequb( lapack_int* m, lapack_int* n, lapack_int* kl,
-                     lapack_int* ku, const lapack_complex_float* ab,
-                     lapack_int* ldab, float* r, float* c, float* rowcnd,
-                     float* colcnd, float* amax, lapack_int *info );
-void LAPACK_spoequ( lapack_int* n, const float* a, lapack_int* lda, float* s,
-                    float* scond, float* amax, lapack_int *info );
-void LAPACK_dpoequ( lapack_int* n, const double* a, lapack_int* lda, double* s,
-                    double* scond, double* amax, lapack_int *info );
-void LAPACK_cpoequ( lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, float* s, float* scond, float* amax,
-                    lapack_int *info );
-void LAPACK_zpoequ( lapack_int* n, const lapack_complex_double* a,
-                    lapack_int* lda, double* s, double* scond, double* amax,
-                    lapack_int *info );
-void LAPACK_dpoequb( lapack_int* n, const double* a, lapack_int* lda, double* s,
-                     double* scond, double* amax, lapack_int *info );
-void LAPACK_spoequb( lapack_int* n, const float* a, lapack_int* lda, float* s,
-                     float* scond, float* amax, lapack_int *info );
-void LAPACK_zpoequb( lapack_int* n, const lapack_complex_double* a,
-                     lapack_int* lda, double* s, double* scond, double* amax,
-                     lapack_int *info );
-void LAPACK_cpoequb( lapack_int* n, const lapack_complex_float* a,
-                     lapack_int* lda, float* s, float* scond, float* amax,
-                     lapack_int *info );
-void LAPACK_sppequ( char* uplo, lapack_int* n, const float* ap, float* s,
-                    float* scond, float* amax, lapack_int *info );
-void LAPACK_dppequ( char* uplo, lapack_int* n, const double* ap, double* s,
-                    double* scond, double* amax, lapack_int *info );
-void LAPACK_cppequ( char* uplo, lapack_int* n, const lapack_complex_float* ap,
-                    float* s, float* scond, float* amax, lapack_int *info );
-void LAPACK_zppequ( char* uplo, lapack_int* n, const lapack_complex_double* ap,
-                    double* s, double* scond, double* amax, lapack_int *info );
-void LAPACK_spbequ( char* uplo, lapack_int* n, lapack_int* kd, const float* ab,
-                    lapack_int* ldab, float* s, float* scond, float* amax,
-                    lapack_int *info );
-void LAPACK_dpbequ( char* uplo, lapack_int* n, lapack_int* kd, const double* ab,
-                    lapack_int* ldab, double* s, double* scond, double* amax,
-                    lapack_int *info );
-void LAPACK_cpbequ( char* uplo, lapack_int* n, lapack_int* kd,
-                    const lapack_complex_float* ab, lapack_int* ldab, float* s,
-                    float* scond, float* amax, lapack_int *info );
-void LAPACK_zpbequ( char* uplo, lapack_int* n, lapack_int* kd,
-                    const lapack_complex_double* ab, lapack_int* ldab,
-                    double* s, double* scond, double* amax, lapack_int *info );
-void LAPACK_dsyequb( char* uplo, lapack_int* n, const double* a,
-                     lapack_int* lda, double* s, double* scond, double* amax,
-                     double* work, lapack_int *info );
-void LAPACK_ssyequb( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
-                     float* s, float* scond, float* amax, float* work,
-                     lapack_int *info );
-void LAPACK_zsyequb( char* uplo, lapack_int* n, const lapack_complex_double* a,
-                     lapack_int* lda, double* s, double* scond, double* amax,
-                     lapack_complex_double* work, lapack_int *info );
-void LAPACK_csyequb( char* uplo, lapack_int* n, const lapack_complex_float* a,
-                     lapack_int* lda, float* s, float* scond, float* amax,
-                     lapack_complex_float* work, lapack_int *info );
-void LAPACK_zheequb( char* uplo, lapack_int* n, const lapack_complex_double* a,
-                     lapack_int* lda, double* s, double* scond, double* amax,
-                     lapack_complex_double* work, lapack_int *info );
-void LAPACK_cheequb( char* uplo, lapack_int* n, const lapack_complex_float* a,
-                     lapack_int* lda, float* s, float* scond, float* amax,
-                     lapack_complex_float* work, lapack_int *info );
-void LAPACK_sgesv( lapack_int* n, lapack_int* nrhs, float* a, lapack_int* lda,
-                   lapack_int* ipiv, float* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_dgesv( lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda,
-                   lapack_int* ipiv, double* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_cgesv( lapack_int* n, lapack_int* nrhs, lapack_complex_float* a,
-                   lapack_int* lda, lapack_int* ipiv, lapack_complex_float* b,
-                   lapack_int* ldb, lapack_int *info );
-void LAPACK_zgesv( lapack_int* n, lapack_int* nrhs, lapack_complex_double* a,
-                   lapack_int* lda, lapack_int* ipiv, lapack_complex_double* b,
-                   lapack_int* ldb, lapack_int *info );
-void LAPACK_dsgesv( lapack_int* n, lapack_int* nrhs, double* a, lapack_int* lda,
-                    lapack_int* ipiv, double* b, lapack_int* ldb, double* x,
-                    lapack_int* ldx, double* work, float* swork,
-                    lapack_int* iter, lapack_int *info );
-void LAPACK_zcgesv( lapack_int* n, lapack_int* nrhs, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* ipiv, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                    lapack_complex_double* work, lapack_complex_float* swork,
-                    double* rwork, lapack_int* iter, lapack_int *info );
-void LAPACK_sgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
-                    float* a, lapack_int* lda, float* af, lapack_int* ldaf,
-                    lapack_int* ipiv, char* equed, float* r, float* c, float* b,
-                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
-                    float* ferr, float* berr, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
-                    double* a, lapack_int* lda, double* af, lapack_int* ldaf,
-                    lapack_int* ipiv, char* equed, double* r, double* c,
-                    double* b, lapack_int* ldb, double* x, lapack_int* ldx,
-                    double* rcond, double* ferr, double* berr, double* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_cgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* af, lapack_int* ldaf,
-                    lapack_int* ipiv, char* equed, float* r, float* c,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                    float* ferr, float* berr, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_zgesvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* af, lapack_int* ldaf,
-                    lapack_int* ipiv, char* equed, double* r, double* c,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                    double* ferr, double* berr, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_dgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
-                     double* a, lapack_int* lda, double* af, lapack_int* ldaf,
-                     lapack_int* ipiv, char* equed, double* r, double* c,
-                     double* b, lapack_int* ldb, double* x, lapack_int* ldx,
-                     double* rcond, double* rpvgrw, double* berr,
-                     lapack_int* n_err_bnds, double* err_bnds_norm,
-                     double* err_bnds_comp, lapack_int* nparams, double* params,
-                     double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_sgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
-                     float* a, lapack_int* lda, float* af, lapack_int* ldaf,
-                     lapack_int* ipiv, char* equed, float* r, float* c,
-                     float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                     float* rcond, float* rpvgrw, float* berr,
-                     lapack_int* n_err_bnds, float* err_bnds_norm,
-                     float* err_bnds_comp, lapack_int* nparams, float* params,
-                     float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_zgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
-                     lapack_complex_double* a, lapack_int* lda,
-                     lapack_complex_double* af, lapack_int* ldaf,
-                     lapack_int* ipiv, char* equed, double* r, double* c,
-                     lapack_complex_double* b, lapack_int* ldb,
-                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
-                     double* err_bnds_norm, double* err_bnds_comp,
-                     lapack_int* nparams, double* params,
-                     lapack_complex_double* work, double* rwork,
-                     lapack_int *info );
-void LAPACK_cgesvxx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
-                     lapack_complex_float* a, lapack_int* lda,
-                     lapack_complex_float* af, lapack_int* ldaf,
-                     lapack_int* ipiv, char* equed, float* r, float* c,
-                     lapack_complex_float* b, lapack_int* ldb,
-                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
-                     float* err_bnds_norm, float* err_bnds_comp,
-                     lapack_int* nparams, float* params,
-                     lapack_complex_float* work, float* rwork,
-                     lapack_int *info );
-void LAPACK_sgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
-                   lapack_int* nrhs, float* ab, lapack_int* ldab,
-                   lapack_int* ipiv, float* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_dgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
-                   lapack_int* nrhs, double* ab, lapack_int* ldab,
-                   lapack_int* ipiv, double* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_cgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
-                   lapack_int* nrhs, lapack_complex_float* ab, lapack_int* ldab,
-                   lapack_int* ipiv, lapack_complex_float* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_zgbsv( lapack_int* n, lapack_int* kl, lapack_int* ku,
-                   lapack_int* nrhs, lapack_complex_double* ab,
-                   lapack_int* ldab, lapack_int* ipiv, lapack_complex_double* b,
-                   lapack_int* ldb, lapack_int *info );
-void LAPACK_sgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, lapack_int* nrhs, float* ab,
-                    lapack_int* ldab, float* afb, lapack_int* ldafb,
-                    lapack_int* ipiv, char* equed, float* r, float* c, float* b,
-                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
-                    float* ferr, float* berr, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, lapack_int* nrhs, double* ab,
-                    lapack_int* ldab, double* afb, lapack_int* ldafb,
-                    lapack_int* ipiv, char* equed, double* r, double* c,
-                    double* b, lapack_int* ldb, double* x, lapack_int* ldx,
-                    double* rcond, double* ferr, double* berr, double* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_cgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, lapack_int* nrhs, lapack_complex_float* ab,
-                    lapack_int* ldab, lapack_complex_float* afb,
-                    lapack_int* ldafb, lapack_int* ipiv, char* equed, float* r,
-                    float* c, lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                    float* ferr, float* berr, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_zgbsvx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, lapack_int* nrhs, lapack_complex_double* ab,
-                    lapack_int* ldab, lapack_complex_double* afb,
-                    lapack_int* ldafb, lapack_int* ipiv, char* equed, double* r,
-                    double* c, lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                    double* ferr, double* berr, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_dgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
-                     lapack_int* ku, lapack_int* nrhs, double* ab,
-                     lapack_int* ldab, double* afb, lapack_int* ldafb,
-                     lapack_int* ipiv, char* equed, double* r, double* c,
-                     double* b, lapack_int* ldb, double* x, lapack_int* ldx,
-                     double* rcond, double* rpvgrw, double* berr,
-                     lapack_int* n_err_bnds, double* err_bnds_norm,
-                     double* err_bnds_comp, lapack_int* nparams, double* params,
-                     double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_sgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
-                     lapack_int* ku, lapack_int* nrhs, float* ab,
-                     lapack_int* ldab, float* afb, lapack_int* ldafb,
-                     lapack_int* ipiv, char* equed, float* r, float* c,
-                     float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                     float* rcond, float* rpvgrw, float* berr,
-                     lapack_int* n_err_bnds, float* err_bnds_norm,
-                     float* err_bnds_comp, lapack_int* nparams, float* params,
-                     float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_zgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
-                     lapack_int* ku, lapack_int* nrhs,
-                     lapack_complex_double* ab, lapack_int* ldab,
-                     lapack_complex_double* afb, lapack_int* ldafb,
-                     lapack_int* ipiv, char* equed, double* r, double* c,
-                     lapack_complex_double* b, lapack_int* ldb,
-                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
-                     double* err_bnds_norm, double* err_bnds_comp,
-                     lapack_int* nparams, double* params,
-                     lapack_complex_double* work, double* rwork,
-                     lapack_int *info );
-void LAPACK_cgbsvxx( char* fact, char* trans, lapack_int* n, lapack_int* kl,
-                     lapack_int* ku, lapack_int* nrhs, lapack_complex_float* ab,
-                     lapack_int* ldab, lapack_complex_float* afb,
-                     lapack_int* ldafb, lapack_int* ipiv, char* equed, float* r,
-                     float* c, lapack_complex_float* b, lapack_int* ldb,
-                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
-                     float* err_bnds_norm, float* err_bnds_comp,
-                     lapack_int* nparams, float* params,
-                     lapack_complex_float* work, float* rwork,
-                     lapack_int *info );
-void LAPACK_sgtsv( lapack_int* n, lapack_int* nrhs, float* dl, float* d,
-                   float* du, float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_dgtsv( lapack_int* n, lapack_int* nrhs, double* dl, double* d,
-                   double* du, double* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_cgtsv( lapack_int* n, lapack_int* nrhs, lapack_complex_float* dl,
-                   lapack_complex_float* d, lapack_complex_float* du,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_zgtsv( lapack_int* n, lapack_int* nrhs, lapack_complex_double* dl,
-                   lapack_complex_double* d, lapack_complex_double* du,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_sgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
-                    const float* dl, const float* d, const float* du,
-                    float* dlf, float* df, float* duf, float* du2,
-                    lapack_int* ipiv, const float* b, lapack_int* ldb, float* x,
-                    lapack_int* ldx, float* rcond, float* ferr, float* berr,
-                    float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_dgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
-                    const double* dl, const double* d, const double* du,
-                    double* dlf, double* df, double* duf, double* du2,
-                    lapack_int* ipiv, const double* b, lapack_int* ldb,
-                    double* x, lapack_int* ldx, double* rcond, double* ferr,
-                    double* berr, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_cgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* dl,
-                    const lapack_complex_float* d,
-                    const lapack_complex_float* du, lapack_complex_float* dlf,
-                    lapack_complex_float* df, lapack_complex_float* duf,
-                    lapack_complex_float* du2, lapack_int* ipiv,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                    float* ferr, float* berr, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_zgtsvx( char* fact, char* trans, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* dl,
-                    const lapack_complex_double* d,
-                    const lapack_complex_double* du, lapack_complex_double* dlf,
-                    lapack_complex_double* df, lapack_complex_double* duf,
-                    lapack_complex_double* du2, lapack_int* ipiv,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                    double* ferr, double* berr, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_sposv( char* uplo, lapack_int* n, lapack_int* nrhs, float* a,
-                   lapack_int* lda, float* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_dposv( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
-                   lapack_int* lda, double* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_cposv( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_zposv( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_dsposv( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
-                    lapack_int* lda, double* b, lapack_int* ldb, double* x,
-                    lapack_int* ldx, double* work, float* swork,
-                    lapack_int* iter, lapack_int *info );
-void LAPACK_zcposv( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx,
-                    lapack_complex_double* work, lapack_complex_float* swork,
-                    double* rwork, lapack_int* iter, lapack_int *info );
-void LAPACK_sposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    float* a, lapack_int* lda, float* af, lapack_int* ldaf,
-                    char* equed, float* s, float* b, lapack_int* ldb, float* x,
-                    lapack_int* ldx, float* rcond, float* ferr, float* berr,
-                    float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_dposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    double* a, lapack_int* lda, double* af, lapack_int* ldaf,
-                    char* equed, double* s, double* b, lapack_int* ldb,
-                    double* x, lapack_int* ldx, double* rcond, double* ferr,
-                    double* berr, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_cposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* af, lapack_int* ldaf, char* equed,
-                    float* s, lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                    float* ferr, float* berr, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_zposvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* af, lapack_int* ldaf, char* equed,
-                    double* s, lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                    double* ferr, double* berr, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_dposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                     double* a, lapack_int* lda, double* af, lapack_int* ldaf,
-                     char* equed, double* s, double* b, lapack_int* ldb,
-                     double* x, lapack_int* ldx, double* rcond, double* rpvgrw,
-                     double* berr, lapack_int* n_err_bnds,
-                     double* err_bnds_norm, double* err_bnds_comp,
-                     lapack_int* nparams, double* params, double* work,
-                     lapack_int* iwork, lapack_int *info );
-void LAPACK_sposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                     float* a, lapack_int* lda, float* af, lapack_int* ldaf,
-                     char* equed, float* s, float* b, lapack_int* ldb, float* x,
-                     lapack_int* ldx, float* rcond, float* rpvgrw, float* berr,
-                     lapack_int* n_err_bnds, float* err_bnds_norm,
-                     float* err_bnds_comp, lapack_int* nparams, float* params,
-                     float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_zposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                     lapack_complex_double* a, lapack_int* lda,
-                     lapack_complex_double* af, lapack_int* ldaf, char* equed,
-                     double* s, lapack_complex_double* b, lapack_int* ldb,
-                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
-                     double* err_bnds_norm, double* err_bnds_comp,
-                     lapack_int* nparams, double* params,
-                     lapack_complex_double* work, double* rwork,
-                     lapack_int *info );
-void LAPACK_cposvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                     lapack_complex_float* a, lapack_int* lda,
-                     lapack_complex_float* af, lapack_int* ldaf, char* equed,
-                     float* s, lapack_complex_float* b, lapack_int* ldb,
-                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
-                     float* err_bnds_norm, float* err_bnds_comp,
-                     lapack_int* nparams, float* params,
-                     lapack_complex_float* work, float* rwork,
-                     lapack_int *info );
-void LAPACK_sppsv( char* uplo, lapack_int* n, lapack_int* nrhs, float* ap,
-                   float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_dppsv( char* uplo, lapack_int* n, lapack_int* nrhs, double* ap,
-                   double* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_cppsv( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* ap, lapack_complex_float* b,
-                   lapack_int* ldb, lapack_int *info );
-void LAPACK_zppsv( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* ap, lapack_complex_double* b,
-                   lapack_int* ldb, lapack_int *info );
-void LAPACK_sppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    float* ap, float* afp, char* equed, float* s, float* b,
-                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
-                    float* ferr, float* berr, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    double* ap, double* afp, char* equed, double* s, double* b,
-                    lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
-                    double* ferr, double* berr, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_cppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    lapack_complex_float* ap, lapack_complex_float* afp,
-                    char* equed, float* s, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                    float* rcond, float* ferr, float* berr,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zppsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    lapack_complex_double* ap, lapack_complex_double* afp,
-                    char* equed, double* s, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                    double* rcond, double* ferr, double* berr,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_spbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                   float* ab, lapack_int* ldab, float* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_dpbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                   double* ab, lapack_int* ldab, double* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_cpbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                   lapack_complex_float* ab, lapack_int* ldab,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_zpbsv( char* uplo, lapack_int* n, lapack_int* kd, lapack_int* nrhs,
-                   lapack_complex_double* ab, lapack_int* ldab,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_spbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
-                    lapack_int* nrhs, float* ab, lapack_int* ldab, float* afb,
-                    lapack_int* ldafb, char* equed, float* s, float* b,
-                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
-                    float* ferr, float* berr, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dpbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
-                    lapack_int* nrhs, double* ab, lapack_int* ldab, double* afb,
-                    lapack_int* ldafb, char* equed, double* s, double* b,
-                    lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
-                    double* ferr, double* berr, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_cpbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
-                    lapack_int* nrhs, lapack_complex_float* ab,
-                    lapack_int* ldab, lapack_complex_float* afb,
-                    lapack_int* ldafb, char* equed, float* s,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                    float* ferr, float* berr, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_zpbsvx( char* fact, char* uplo, lapack_int* n, lapack_int* kd,
-                    lapack_int* nrhs, lapack_complex_double* ab,
-                    lapack_int* ldab, lapack_complex_double* afb,
-                    lapack_int* ldafb, char* equed, double* s,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                    double* ferr, double* berr, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_sptsv( lapack_int* n, lapack_int* nrhs, float* d, float* e,
-                   float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_dptsv( lapack_int* n, lapack_int* nrhs, double* d, double* e,
-                   double* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_cptsv( lapack_int* n, lapack_int* nrhs, float* d,
-                   lapack_complex_float* e, lapack_complex_float* b,
-                   lapack_int* ldb, lapack_int *info );
-void LAPACK_zptsv( lapack_int* n, lapack_int* nrhs, double* d,
-                   lapack_complex_double* e, lapack_complex_double* b,
-                   lapack_int* ldb, lapack_int *info );
-void LAPACK_sptsvx( char* fact, lapack_int* n, lapack_int* nrhs, const float* d,
-                    const float* e, float* df, float* ef, const float* b,
-                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
-                    float* ferr, float* berr, float* work, lapack_int *info );
-void LAPACK_dptsvx( char* fact, lapack_int* n, lapack_int* nrhs,
-                    const double* d, const double* e, double* df, double* ef,
-                    const double* b, lapack_int* ldb, double* x,
-                    lapack_int* ldx, double* rcond, double* ferr, double* berr,
-                    double* work, lapack_int *info );
-void LAPACK_cptsvx( char* fact, lapack_int* n, lapack_int* nrhs, const float* d,
-                    const lapack_complex_float* e, float* df,
-                    lapack_complex_float* ef, const lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                    float* rcond, float* ferr, float* berr,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zptsvx( char* fact, lapack_int* n, lapack_int* nrhs,
-                    const double* d, const lapack_complex_double* e, double* df,
-                    lapack_complex_double* ef, const lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                    double* rcond, double* ferr, double* berr,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_ssysv( char* uplo, lapack_int* n, lapack_int* nrhs, float* a,
-                   lapack_int* lda, lapack_int* ipiv, float* b, lapack_int* ldb,
-                   float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_dsysv( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
-                   lapack_int* lda, lapack_int* ipiv, double* b,
-                   lapack_int* ldb, double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_csysv( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_zsysv( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_ssysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const float* a, lapack_int* lda, float* af,
-                    lapack_int* ldaf, lapack_int* ipiv, const float* b,
-                    lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
-                    float* ferr, float* berr, float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_dsysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* a, lapack_int* lda, double* af,
-                    lapack_int* ldaf, lapack_int* ipiv, const double* b,
-                    lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
-                    double* ferr, double* berr, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_csysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* af, lapack_int* ldaf,
-                    lapack_int* ipiv, const lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                    float* rcond, float* ferr, float* berr,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int *info );
-void LAPACK_zsysvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* af, lapack_int* ldaf,
-                    lapack_int* ipiv, const lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                    double* rcond, double* ferr, double* berr,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int *info );
-void LAPACK_dsysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                     double* a, lapack_int* lda, double* af, lapack_int* ldaf,
-                     lapack_int* ipiv, char* equed, double* s, double* b,
-                     lapack_int* ldb, double* x, lapack_int* ldx, double* rcond,
-                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
-                     double* err_bnds_norm, double* err_bnds_comp,
-                     lapack_int* nparams, double* params, double* work,
-                     lapack_int* iwork, lapack_int *info );
-void LAPACK_ssysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                     float* a, lapack_int* lda, float* af, lapack_int* ldaf,
-                     lapack_int* ipiv, char* equed, float* s, float* b,
-                     lapack_int* ldb, float* x, lapack_int* ldx, float* rcond,
-                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
-                     float* err_bnds_norm, float* err_bnds_comp,
-                     lapack_int* nparams, float* params, float* work,
-                     lapack_int* iwork, lapack_int *info );
-void LAPACK_zsysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                     lapack_complex_double* a, lapack_int* lda,
-                     lapack_complex_double* af, lapack_int* ldaf,
-                     lapack_int* ipiv, char* equed, double* s,
-                     lapack_complex_double* b, lapack_int* ldb,
-                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
-                     double* err_bnds_norm, double* err_bnds_comp,
-                     lapack_int* nparams, double* params,
-                     lapack_complex_double* work, double* rwork,
-                     lapack_int *info );
-void LAPACK_csysvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                     lapack_complex_float* a, lapack_int* lda,
-                     lapack_complex_float* af, lapack_int* ldaf,
-                     lapack_int* ipiv, char* equed, float* s,
-                     lapack_complex_float* b, lapack_int* ldb,
-                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
-                     float* err_bnds_norm, float* err_bnds_comp,
-                     lapack_int* nparams, float* params,
-                     lapack_complex_float* work, float* rwork,
-                     lapack_int *info );
-void LAPACK_chesv( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_zhesv( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_chesvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* af, lapack_int* ldaf,
-                    lapack_int* ipiv, const lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                    float* rcond, float* ferr, float* berr,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int *info );
-void LAPACK_zhesvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* af, lapack_int* ldaf,
-                    lapack_int* ipiv, const lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                    double* rcond, double* ferr, double* berr,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int *info );
-void LAPACK_zhesvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                     lapack_complex_double* a, lapack_int* lda,
-                     lapack_complex_double* af, lapack_int* ldaf,
-                     lapack_int* ipiv, char* equed, double* s,
-                     lapack_complex_double* b, lapack_int* ldb,
-                     lapack_complex_double* x, lapack_int* ldx, double* rcond,
-                     double* rpvgrw, double* berr, lapack_int* n_err_bnds,
-                     double* err_bnds_norm, double* err_bnds_comp,
-                     lapack_int* nparams, double* params,
-                     lapack_complex_double* work, double* rwork,
-                     lapack_int *info );
-void LAPACK_chesvxx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                     lapack_complex_float* a, lapack_int* lda,
-                     lapack_complex_float* af, lapack_int* ldaf,
-                     lapack_int* ipiv, char* equed, float* s,
-                     lapack_complex_float* b, lapack_int* ldb,
-                     lapack_complex_float* x, lapack_int* ldx, float* rcond,
-                     float* rpvgrw, float* berr, lapack_int* n_err_bnds,
-                     float* err_bnds_norm, float* err_bnds_comp,
-                     lapack_int* nparams, float* params,
-                     lapack_complex_float* work, float* rwork,
-                     lapack_int *info );
-void LAPACK_sspsv( char* uplo, lapack_int* n, lapack_int* nrhs, float* ap,
-                   lapack_int* ipiv, float* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_dspsv( char* uplo, lapack_int* n, lapack_int* nrhs, double* ap,
-                   lapack_int* ipiv, double* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_cspsv( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* ap, lapack_int* ipiv,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_zspsv( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* ap, lapack_int* ipiv,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_sspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const float* ap, float* afp, lapack_int* ipiv,
-                    const float* b, lapack_int* ldb, float* x, lapack_int* ldx,
-                    float* rcond, float* ferr, float* berr, float* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_dspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* ap, double* afp, lapack_int* ipiv,
-                    const double* b, lapack_int* ldb, double* x,
-                    lapack_int* ldx, double* rcond, double* ferr, double* berr,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_cspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* ap, lapack_complex_float* afp,
-                    lapack_int* ipiv, const lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                    float* rcond, float* ferr, float* berr,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zspsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* ap, lapack_complex_double* afp,
-                    lapack_int* ipiv, const lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                    double* rcond, double* ferr, double* berr,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_chpsv( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* ap, lapack_int* ipiv,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_zhpsv( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* ap, lapack_int* ipiv,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_chpsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* ap, lapack_complex_float* afp,
-                    lapack_int* ipiv, const lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* x, lapack_int* ldx,
-                    float* rcond, float* ferr, float* berr,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zhpsvx( char* fact, char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* ap, lapack_complex_double* afp,
-                    lapack_int* ipiv, const lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* x, lapack_int* ldx,
-                    double* rcond, double* ferr, double* berr,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_sgeqrf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                    float* tau, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dgeqrf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                    double* tau, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cgeqrf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_complex_float* tau,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zgeqrf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_complex_double* tau,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_sgeqpf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int* jpvt, float* tau, float* work,
-                    lapack_int *info );
-void LAPACK_dgeqpf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int* jpvt, double* tau, double* work,
-                    lapack_int *info );
-void LAPACK_cgeqpf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* jpvt,
-                    lapack_complex_float* tau, lapack_complex_float* work,
-                    float* rwork, lapack_int *info );
-void LAPACK_zgeqpf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* jpvt,
-                    lapack_complex_double* tau, lapack_complex_double* work,
-                    double* rwork, lapack_int *info );
-void LAPACK_sgeqp3( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int* jpvt, float* tau, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dgeqp3( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int* jpvt, double* tau, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_cgeqp3( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* jpvt,
-                    lapack_complex_float* tau, lapack_complex_float* work,
-                    lapack_int* lwork, float* rwork, lapack_int *info );
-void LAPACK_zgeqp3( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* jpvt,
-                    lapack_complex_double* tau, lapack_complex_double* work,
-                    lapack_int* lwork, double* rwork, lapack_int *info );
-void LAPACK_sorgqr( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
-                    lapack_int* lda, const float* tau, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dorgqr( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
-                    lapack_int* lda, const double* tau, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_sormqr( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, const float* a, lapack_int* lda,
-                    const float* tau, float* c, lapack_int* ldc, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dormqr( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, const double* a, lapack_int* lda,
-                    const double* tau, double* c, lapack_int* ldc, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_cungqr( lapack_int* m, lapack_int* n, lapack_int* k,
-                    lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* tau, lapack_complex_float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_zungqr( lapack_int* m, lapack_int* n, lapack_int* k,
-                    lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* tau,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cunmqr( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* tau,
-                    lapack_complex_float* c, lapack_int* ldc,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zunmqr( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_complex_double* tau,
-                    lapack_complex_double* c, lapack_int* ldc,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_sgelqf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                    float* tau, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dgelqf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                    double* tau, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cgelqf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_complex_float* tau,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zgelqf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_complex_double* tau,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_sorglq( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
-                    lapack_int* lda, const float* tau, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dorglq( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
-                    lapack_int* lda, const double* tau, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_sormlq( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, const float* a, lapack_int* lda,
-                    const float* tau, float* c, lapack_int* ldc, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dormlq( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, const double* a, lapack_int* lda,
-                    const double* tau, double* c, lapack_int* ldc, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_cunglq( lapack_int* m, lapack_int* n, lapack_int* k,
-                    lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* tau, lapack_complex_float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_zunglq( lapack_int* m, lapack_int* n, lapack_int* k,
-                    lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* tau,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cunmlq( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* tau,
-                    lapack_complex_float* c, lapack_int* ldc,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zunmlq( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_complex_double* tau,
-                    lapack_complex_double* c, lapack_int* ldc,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_sgeqlf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                    float* tau, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dgeqlf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                    double* tau, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cgeqlf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_complex_float* tau,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zgeqlf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_complex_double* tau,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_sorgql( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
-                    lapack_int* lda, const float* tau, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dorgql( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
-                    lapack_int* lda, const double* tau, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_cungql( lapack_int* m, lapack_int* n, lapack_int* k,
-                    lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* tau, lapack_complex_float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_zungql( lapack_int* m, lapack_int* n, lapack_int* k,
-                    lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* tau,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_sormql( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, const float* a, lapack_int* lda,
-                    const float* tau, float* c, lapack_int* ldc, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dormql( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, const double* a, lapack_int* lda,
-                    const double* tau, double* c, lapack_int* ldc, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_cunmql( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* tau,
-                    lapack_complex_float* c, lapack_int* ldc,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zunmql( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_complex_double* tau,
-                    lapack_complex_double* c, lapack_int* ldc,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_sgerqf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                    float* tau, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dgerqf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                    double* tau, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cgerqf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_complex_float* tau,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zgerqf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_complex_double* tau,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_sorgrq( lapack_int* m, lapack_int* n, lapack_int* k, float* a,
-                    lapack_int* lda, const float* tau, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dorgrq( lapack_int* m, lapack_int* n, lapack_int* k, double* a,
-                    lapack_int* lda, const double* tau, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_cungrq( lapack_int* m, lapack_int* n, lapack_int* k,
-                    lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* tau, lapack_complex_float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_zungrq( lapack_int* m, lapack_int* n, lapack_int* k,
-                    lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* tau,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_sormrq( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, const float* a, lapack_int* lda,
-                    const float* tau, float* c, lapack_int* ldc, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dormrq( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, const double* a, lapack_int* lda,
-                    const double* tau, double* c, lapack_int* ldc, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_cunmrq( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* tau,
-                    lapack_complex_float* c, lapack_int* ldc,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zunmrq( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_complex_double* tau,
-                    lapack_complex_double* c, lapack_int* ldc,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_stzrzf( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                    float* tau, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dtzrzf( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                    double* tau, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_ctzrzf( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_complex_float* tau,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_ztzrzf( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_complex_double* tau,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_sormrz( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, lapack_int* l, const float* a,
-                    lapack_int* lda, const float* tau, float* c,
-                    lapack_int* ldc, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dormrz( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, lapack_int* l, const double* a,
-                    lapack_int* lda, const double* tau, double* c,
-                    lapack_int* ldc, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cunmrz( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, lapack_int* l, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* tau,
-                    lapack_complex_float* c, lapack_int* ldc,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zunmrz( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* k, lapack_int* l,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* tau, lapack_complex_double* c,
-                    lapack_int* ldc, lapack_complex_double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_sggqrf( lapack_int* n, lapack_int* m, lapack_int* p, float* a,
-                    lapack_int* lda, float* taua, float* b, lapack_int* ldb,
-                    float* taub, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dggqrf( lapack_int* n, lapack_int* m, lapack_int* p, double* a,
-                    lapack_int* lda, double* taua, double* b, lapack_int* ldb,
-                    double* taub, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cggqrf( lapack_int* n, lapack_int* m, lapack_int* p,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* taua, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* taub,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zggqrf( lapack_int* n, lapack_int* m, lapack_int* p,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* taua, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* taub,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_sggrqf( lapack_int* m, lapack_int* p, lapack_int* n, float* a,
-                    lapack_int* lda, float* taua, float* b, lapack_int* ldb,
-                    float* taub, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dggrqf( lapack_int* m, lapack_int* p, lapack_int* n, double* a,
-                    lapack_int* lda, double* taua, double* b, lapack_int* ldb,
-                    double* taub, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cggrqf( lapack_int* m, lapack_int* p, lapack_int* n,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* taua, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* taub,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zggrqf( lapack_int* m, lapack_int* p, lapack_int* n,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* taua, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* taub,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_sgebrd( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                    float* d, float* e, float* tauq, float* taup, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dgebrd( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                    double* d, double* e, double* tauq, double* taup,
-                    double* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_cgebrd( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, float* d, float* e,
-                    lapack_complex_float* tauq, lapack_complex_float* taup,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zgebrd( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, double* d, double* e,
-                    lapack_complex_double* tauq, lapack_complex_double* taup,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_sgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc,
-                    lapack_int* kl, lapack_int* ku, float* ab, lapack_int* ldab,
-                    float* d, float* e, float* q, lapack_int* ldq, float* pt,
-                    lapack_int* ldpt, float* c, lapack_int* ldc, float* work,
-                    lapack_int *info );
-void LAPACK_dgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc,
-                    lapack_int* kl, lapack_int* ku, double* ab,
-                    lapack_int* ldab, double* d, double* e, double* q,
-                    lapack_int* ldq, double* pt, lapack_int* ldpt, double* c,
-                    lapack_int* ldc, double* work, lapack_int *info );
-void LAPACK_cgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc,
-                    lapack_int* kl, lapack_int* ku, lapack_complex_float* ab,
-                    lapack_int* ldab, float* d, float* e,
-                    lapack_complex_float* q, lapack_int* ldq,
-                    lapack_complex_float* pt, lapack_int* ldpt,
-                    lapack_complex_float* c, lapack_int* ldc,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zgbbrd( char* vect, lapack_int* m, lapack_int* n, lapack_int* ncc,
-                    lapack_int* kl, lapack_int* ku, lapack_complex_double* ab,
-                    lapack_int* ldab, double* d, double* e,
-                    lapack_complex_double* q, lapack_int* ldq,
-                    lapack_complex_double* pt, lapack_int* ldpt,
-                    lapack_complex_double* c, lapack_int* ldc,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_sorgbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k,
-                    float* a, lapack_int* lda, const float* tau, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dorgbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k,
-                    double* a, lapack_int* lda, const double* tau, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_sormbr( char* vect, char* side, char* trans, lapack_int* m,
-                    lapack_int* n, lapack_int* k, const float* a,
-                    lapack_int* lda, const float* tau, float* c,
-                    lapack_int* ldc, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dormbr( char* vect, char* side, char* trans, lapack_int* m,
-                    lapack_int* n, lapack_int* k, const double* a,
-                    lapack_int* lda, const double* tau, double* c,
-                    lapack_int* ldc, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cungbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k,
-                    lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* tau, lapack_complex_float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_zungbr( char* vect, lapack_int* m, lapack_int* n, lapack_int* k,
-                    lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* tau,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cunmbr( char* vect, char* side, char* trans, lapack_int* m,
-                    lapack_int* n, lapack_int* k, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* tau,
-                    lapack_complex_float* c, lapack_int* ldc,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zunmbr( char* vect, char* side, char* trans, lapack_int* m,
-                    lapack_int* n, lapack_int* k,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* tau, lapack_complex_double* c,
-                    lapack_int* ldc, lapack_complex_double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_sbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt,
-                    lapack_int* nru, lapack_int* ncc, float* d, float* e,
-                    float* vt, lapack_int* ldvt, float* u, lapack_int* ldu,
-                    float* c, lapack_int* ldc, float* work, lapack_int *info );
-void LAPACK_dbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt,
-                    lapack_int* nru, lapack_int* ncc, double* d, double* e,
-                    double* vt, lapack_int* ldvt, double* u, lapack_int* ldu,
-                    double* c, lapack_int* ldc, double* work,
-                    lapack_int *info );
-void LAPACK_cbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt,
-                    lapack_int* nru, lapack_int* ncc, float* d, float* e,
-                    lapack_complex_float* vt, lapack_int* ldvt,
-                    lapack_complex_float* u, lapack_int* ldu,
-                    lapack_complex_float* c, lapack_int* ldc, float* work,
-                    lapack_int *info );
-void LAPACK_zbdsqr( char* uplo, lapack_int* n, lapack_int* ncvt,
-                    lapack_int* nru, lapack_int* ncc, double* d, double* e,
-                    lapack_complex_double* vt, lapack_int* ldvt,
-                    lapack_complex_double* u, lapack_int* ldu,
-                    lapack_complex_double* c, lapack_int* ldc, double* work,
-                    lapack_int *info );
-void LAPACK_sbdsdc( char* uplo, char* compq, lapack_int* n, float* d, float* e,
-                    float* u, lapack_int* ldu, float* vt, lapack_int* ldvt,
-                    float* q, lapack_int* iq, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dbdsdc( char* uplo, char* compq, lapack_int* n, double* d,
-                    double* e, double* u, lapack_int* ldu, double* vt,
-                    lapack_int* ldvt, double* q, lapack_int* iq, double* work,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_sbdsvdx( char* uplo, char* jobz, char* range,
-                     lapack_int* n, float* d, float* e,
-                     float* vl, float* vu,
-                     lapack_int* il, lapack_int* iu, lapack_int* ns,
-                     float* s, float* z, lapack_int* ldz,
-                     float* work, lapack_int *iwork, lapack_int *info );
-void LAPACK_dbdsvdx( char* uplo, char* jobz, char* range,
-                     lapack_int* n, double* d, double* e,
-                     double* vl, double* vu,
-                     lapack_int* il, lapack_int* iu, lapack_int* ns,
-                     double* s, double* z, lapack_int* ldz,
-                     double* work, lapack_int *iwork, lapack_int *info );
-void LAPACK_ssytrd( char* uplo, lapack_int* n, float* a, lapack_int* lda,
-                    float* d, float* e, float* tau, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dsytrd( char* uplo, lapack_int* n, double* a, lapack_int* lda,
-                    double* d, double* e, double* tau, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_sorgtr( char* uplo, lapack_int* n, float* a, lapack_int* lda,
-                    const float* tau, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dorgtr( char* uplo, lapack_int* n, double* a, lapack_int* lda,
-                    const double* tau, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_sormtr( char* side, char* uplo, char* trans, lapack_int* m,
-                    lapack_int* n, const float* a, lapack_int* lda,
-                    const float* tau, float* c, lapack_int* ldc, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dormtr( char* side, char* uplo, char* trans, lapack_int* m,
-                    lapack_int* n, const double* a, lapack_int* lda,
-                    const double* tau, double* c, lapack_int* ldc, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_chetrd( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, float* d, float* e,
-                    lapack_complex_float* tau, lapack_complex_float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_zhetrd( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, double* d, double* e,
-                    lapack_complex_double* tau, lapack_complex_double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_cungtr( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* tau,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zungtr( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, const lapack_complex_double* tau,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cunmtr( char* side, char* uplo, char* trans, lapack_int* m,
-                    lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* tau,
-                    lapack_complex_float* c, lapack_int* ldc,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zunmtr( char* side, char* uplo, char* trans, lapack_int* m,
-                    lapack_int* n, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_complex_double* tau,
-                    lapack_complex_double* c, lapack_int* ldc,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_ssptrd( char* uplo, lapack_int* n, float* ap, float* d, float* e,
-                    float* tau, lapack_int *info );
-void LAPACK_dsptrd( char* uplo, lapack_int* n, double* ap, double* d, double* e,
-                    double* tau, lapack_int *info );
-void LAPACK_sopgtr( char* uplo, lapack_int* n, const float* ap,
-                    const float* tau, float* q, lapack_int* ldq, float* work,
-                    lapack_int *info );
-void LAPACK_dopgtr( char* uplo, lapack_int* n, const double* ap,
-                    const double* tau, double* q, lapack_int* ldq, double* work,
-                    lapack_int *info );
-void LAPACK_sopmtr( char* side, char* uplo, char* trans, lapack_int* m,
-                    lapack_int* n, const float* ap, const float* tau, float* c,
-                    lapack_int* ldc, float* work, lapack_int *info );
-void LAPACK_dopmtr( char* side, char* uplo, char* trans, lapack_int* m,
-                    lapack_int* n, const double* ap, const double* tau,
-                    double* c, lapack_int* ldc, double* work,
-                    lapack_int *info );
-void LAPACK_chptrd( char* uplo, lapack_int* n, lapack_complex_float* ap,
-                    float* d, float* e, lapack_complex_float* tau,
-                    lapack_int *info );
-void LAPACK_zhptrd( char* uplo, lapack_int* n, lapack_complex_double* ap,
-                    double* d, double* e, lapack_complex_double* tau,
-                    lapack_int *info );
-void LAPACK_cupgtr( char* uplo, lapack_int* n, const lapack_complex_float* ap,
-                    const lapack_complex_float* tau, lapack_complex_float* q,
-                    lapack_int* ldq, lapack_complex_float* work,
-                    lapack_int *info );
-void LAPACK_zupgtr( char* uplo, lapack_int* n, const lapack_complex_double* ap,
-                    const lapack_complex_double* tau, lapack_complex_double* q,
-                    lapack_int* ldq, lapack_complex_double* work,
-                    lapack_int *info );
-void LAPACK_cupmtr( char* side, char* uplo, char* trans, lapack_int* m,
-                    lapack_int* n, const lapack_complex_float* ap,
-                    const lapack_complex_float* tau, lapack_complex_float* c,
-                    lapack_int* ldc, lapack_complex_float* work,
-                    lapack_int *info );
-void LAPACK_zupmtr( char* side, char* uplo, char* trans, lapack_int* m,
-                    lapack_int* n, const lapack_complex_double* ap,
-                    const lapack_complex_double* tau, lapack_complex_double* c,
-                    lapack_int* ldc, lapack_complex_double* work,
-                    lapack_int *info );
-void LAPACK_ssbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd,
-                    float* ab, lapack_int* ldab, float* d, float* e, float* q,
-                    lapack_int* ldq, float* work, lapack_int *info );
-void LAPACK_dsbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd,
-                    double* ab, lapack_int* ldab, double* d, double* e,
-                    double* q, lapack_int* ldq, double* work,
-                    lapack_int *info );
-void LAPACK_chbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd,
-                    lapack_complex_float* ab, lapack_int* ldab, float* d,
-                    float* e, lapack_complex_float* q, lapack_int* ldq,
-                    lapack_complex_float* work, lapack_int *info );
-void LAPACK_zhbtrd( char* vect, char* uplo, lapack_int* n, lapack_int* kd,
-                    lapack_complex_double* ab, lapack_int* ldab, double* d,
-                    double* e, lapack_complex_double* q, lapack_int* ldq,
-                    lapack_complex_double* work, lapack_int *info );
-void LAPACK_ssterf( lapack_int* n, float* d, float* e, lapack_int *info );
-void LAPACK_dsterf( lapack_int* n, double* d, double* e, lapack_int *info );
-void LAPACK_ssteqr( char* compz, lapack_int* n, float* d, float* e, float* z,
-                    lapack_int* ldz, float* work, lapack_int *info );
-void LAPACK_dsteqr( char* compz, lapack_int* n, double* d, double* e, double* z,
-                    lapack_int* ldz, double* work, lapack_int *info );
-void LAPACK_csteqr( char* compz, lapack_int* n, float* d, float* e,
-                    lapack_complex_float* z, lapack_int* ldz, float* work,
-                    lapack_int *info );
-void LAPACK_zsteqr( char* compz, lapack_int* n, double* d, double* e,
-                    lapack_complex_double* z, lapack_int* ldz, double* work,
-                    lapack_int *info );
-void LAPACK_sstemr( char* jobz, char* range, lapack_int* n, float* d, float* e,
-                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
-                    lapack_int* m, float* w, float* z, lapack_int* ldz,
-                    lapack_int* nzc, lapack_int* isuppz, lapack_logical* tryrac,
-                    float* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_dstemr( char* jobz, char* range, lapack_int* n, double* d,
-                    double* e, double* vl, double* vu, lapack_int* il,
-                    lapack_int* iu, lapack_int* m, double* w, double* z,
-                    lapack_int* ldz, lapack_int* nzc, lapack_int* isuppz,
-                    lapack_logical* tryrac, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_cstemr( char* jobz, char* range, lapack_int* n, float* d, float* e,
-                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
-                    lapack_int* m, float* w, lapack_complex_float* z,
-                    lapack_int* ldz, lapack_int* nzc, lapack_int* isuppz,
-                    lapack_logical* tryrac, float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_zstemr( char* jobz, char* range, lapack_int* n, double* d,
-                    double* e, double* vl, double* vu, lapack_int* il,
-                    lapack_int* iu, lapack_int* m, double* w,
-                    lapack_complex_double* z, lapack_int* ldz, lapack_int* nzc,
-                    lapack_int* isuppz, lapack_logical* tryrac, double* work,
-                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                    lapack_int *info );
-void LAPACK_sstedc( char* compz, lapack_int* n, float* d, float* e, float* z,
-                    lapack_int* ldz, float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_dstedc( char* compz, lapack_int* n, double* d, double* e, double* z,
-                    lapack_int* ldz, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_cstedc( char* compz, lapack_int* n, float* d, float* e,
-                    lapack_complex_float* z, lapack_int* ldz,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                    lapack_int *info );
-void LAPACK_zstedc( char* compz, lapack_int* n, double* d, double* e,
-                    lapack_complex_double* z, lapack_int* ldz,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_sstegr( char* jobz, char* range, lapack_int* n, float* d, float* e,
-                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
-                    float* abstol, lapack_int* m, float* w, float* z,
-                    lapack_int* ldz, lapack_int* isuppz, float* work,
-                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                    lapack_int *info );
-void LAPACK_dstegr( char* jobz, char* range, lapack_int* n, double* d,
-                    double* e, double* vl, double* vu, lapack_int* il,
-                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                    double* z, lapack_int* ldz, lapack_int* isuppz,
-                    double* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_cstegr( char* jobz, char* range, lapack_int* n, float* d, float* e,
-                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
-                    float* abstol, lapack_int* m, float* w,
-                    lapack_complex_float* z, lapack_int* ldz,
-                    lapack_int* isuppz, float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_zstegr( char* jobz, char* range, lapack_int* n, double* d,
-                    double* e, double* vl, double* vu, lapack_int* il,
-                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                    lapack_complex_double* z, lapack_int* ldz,
-                    lapack_int* isuppz, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_spteqr( char* compz, lapack_int* n, float* d, float* e, float* z,
-                    lapack_int* ldz, float* work, lapack_int *info );
-void LAPACK_dpteqr( char* compz, lapack_int* n, double* d, double* e, double* z,
-                    lapack_int* ldz, double* work, lapack_int *info );
-void LAPACK_cpteqr( char* compz, lapack_int* n, float* d, float* e,
-                    lapack_complex_float* z, lapack_int* ldz, float* work,
-                    lapack_int *info );
-void LAPACK_zpteqr( char* compz, lapack_int* n, double* d, double* e,
-                    lapack_complex_double* z, lapack_int* ldz, double* work,
-                    lapack_int *info );
-void LAPACK_sstebz( char* range, char* order, lapack_int* n, float* vl,
-                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
-                    const float* d, const float* e, lapack_int* m,
-                    lapack_int* nsplit, float* w, lapack_int* iblock,
-                    lapack_int* isplit, float* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dstebz( char* range, char* order, lapack_int* n, double* vl,
-                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
-                    const double* d, const double* e, lapack_int* m,
-                    lapack_int* nsplit, double* w, lapack_int* iblock,
-                    lapack_int* isplit, double* work, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_sstein( lapack_int* n, const float* d, const float* e,
-                    lapack_int* m, const float* w, const lapack_int* iblock,
-                    const lapack_int* isplit, float* z, lapack_int* ldz,
-                    float* work, lapack_int* iwork, lapack_int* ifailv,
-                    lapack_int *info );
-void LAPACK_dstein( lapack_int* n, const double* d, const double* e,
-                    lapack_int* m, const double* w, const lapack_int* iblock,
-                    const lapack_int* isplit, double* z, lapack_int* ldz,
-                    double* work, lapack_int* iwork, lapack_int* ifailv,
-                    lapack_int *info );
-void LAPACK_cstein( lapack_int* n, const float* d, const float* e,
-                    lapack_int* m, const float* w, const lapack_int* iblock,
-                    const lapack_int* isplit, lapack_complex_float* z,
-                    lapack_int* ldz, float* work, lapack_int* iwork,
-                    lapack_int* ifailv, lapack_int *info );
-void LAPACK_zstein( lapack_int* n, const double* d, const double* e,
-                    lapack_int* m, const double* w, const lapack_int* iblock,
-                    const lapack_int* isplit, lapack_complex_double* z,
-                    lapack_int* ldz, double* work, lapack_int* iwork,
-                    lapack_int* ifailv, lapack_int *info );
-void LAPACK_sdisna( char* job, lapack_int* m, lapack_int* n, const float* d,
-                    float* sep, lapack_int *info );
-void LAPACK_ddisna( char* job, lapack_int* m, lapack_int* n, const double* d,
-                    double* sep, lapack_int *info );
-void LAPACK_ssygst( lapack_int* itype, char* uplo, lapack_int* n, float* a,
-                    lapack_int* lda, const float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_dsygst( lapack_int* itype, char* uplo, lapack_int* n, double* a,
-                    lapack_int* lda, const double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_chegst( lapack_int* itype, char* uplo, lapack_int* n,
-                    lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_zhegst( lapack_int* itype, char* uplo, lapack_int* n,
-                    lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int *info );
-void LAPACK_sspgst( lapack_int* itype, char* uplo, lapack_int* n, float* ap,
-                    const float* bp, lapack_int *info );
-void LAPACK_dspgst( lapack_int* itype, char* uplo, lapack_int* n, double* ap,
-                    const double* bp, lapack_int *info );
-void LAPACK_chpgst( lapack_int* itype, char* uplo, lapack_int* n,
-                    lapack_complex_float* ap, const lapack_complex_float* bp,
-                    lapack_int *info );
-void LAPACK_zhpgst( lapack_int* itype, char* uplo, lapack_int* n,
-                    lapack_complex_double* ap, const lapack_complex_double* bp,
-                    lapack_int *info );
-void LAPACK_ssbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka,
-                    lapack_int* kb, float* ab, lapack_int* ldab,
-                    const float* bb, lapack_int* ldbb, float* x,
-                    lapack_int* ldx, float* work, lapack_int *info );
-void LAPACK_dsbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka,
-                    lapack_int* kb, double* ab, lapack_int* ldab,
-                    const double* bb, lapack_int* ldbb, double* x,
-                    lapack_int* ldx, double* work, lapack_int *info );
-void LAPACK_chbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka,
-                    lapack_int* kb, lapack_complex_float* ab, lapack_int* ldab,
-                    const lapack_complex_float* bb, lapack_int* ldbb,
-                    lapack_complex_float* x, lapack_int* ldx,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_zhbgst( char* vect, char* uplo, lapack_int* n, lapack_int* ka,
-                    lapack_int* kb, lapack_complex_double* ab, lapack_int* ldab,
-                    const lapack_complex_double* bb, lapack_int* ldbb,
-                    lapack_complex_double* x, lapack_int* ldx,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_spbstf( char* uplo, lapack_int* n, lapack_int* kb, float* bb,
-                    lapack_int* ldbb, lapack_int *info );
-void LAPACK_dpbstf( char* uplo, lapack_int* n, lapack_int* kb, double* bb,
-                    lapack_int* ldbb, lapack_int *info );
-void LAPACK_cpbstf( char* uplo, lapack_int* n, lapack_int* kb,
-                    lapack_complex_float* bb, lapack_int* ldbb,
-                    lapack_int *info );
-void LAPACK_zpbstf( char* uplo, lapack_int* n, lapack_int* kb,
-                    lapack_complex_double* bb, lapack_int* ldbb,
-                    lapack_int *info );
-void LAPACK_sgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* a,
-                    lapack_int* lda, float* tau, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* a,
-                    lapack_int* lda, double* tau, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_cgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* tau, lapack_complex_float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_zgehrd( lapack_int* n, lapack_int* ilo, lapack_int* ihi,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* tau, lapack_complex_double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_sorghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi, float* a,
-                    lapack_int* lda, const float* tau, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dorghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi, double* a,
-                    lapack_int* lda, const double* tau, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_sormhr( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* ilo, lapack_int* ihi, const float* a,
-                    lapack_int* lda, const float* tau, float* c,
-                    lapack_int* ldc, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dormhr( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* ilo, lapack_int* ihi, const double* a,
-                    lapack_int* lda, const double* tau, double* c,
-                    lapack_int* ldc, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cunghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi,
-                    lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* tau, lapack_complex_float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_zunghr( lapack_int* n, lapack_int* ilo, lapack_int* ihi,
-                    lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* tau,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cunmhr( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* ilo, lapack_int* ihi,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* tau, lapack_complex_float* c,
-                    lapack_int* ldc, lapack_complex_float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_zunmhr( char* side, char* trans, lapack_int* m, lapack_int* n,
-                    lapack_int* ilo, lapack_int* ihi,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* tau, lapack_complex_double* c,
-                    lapack_int* ldc, lapack_complex_double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_sgebal( char* job, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int* ilo, lapack_int* ihi, float* scale,
-                    lapack_int *info );
-void LAPACK_dgebal( char* job, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int* ilo, lapack_int* ihi, double* scale,
-                    lapack_int *info );
-void LAPACK_cgebal( char* job, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* ilo, lapack_int* ihi,
-                    float* scale, lapack_int *info );
-void LAPACK_zgebal( char* job, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* ilo, lapack_int* ihi,
-                    double* scale, lapack_int *info );
-void LAPACK_sgebak( char* job, char* side, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, const float* scale, lapack_int* m,
-                    float* v, lapack_int* ldv, lapack_int *info );
-void LAPACK_dgebak( char* job, char* side, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, const double* scale, lapack_int* m,
-                    double* v, lapack_int* ldv, lapack_int *info );
-void LAPACK_cgebak( char* job, char* side, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, const float* scale, lapack_int* m,
-                    lapack_complex_float* v, lapack_int* ldv,
-                    lapack_int *info );
-void LAPACK_zgebak( char* job, char* side, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, const double* scale, lapack_int* m,
-                    lapack_complex_double* v, lapack_int* ldv,
-                    lapack_int *info );
-void LAPACK_shseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, float* h, lapack_int* ldh, float* wr,
-                    float* wi, float* z, lapack_int* ldz, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dhseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, double* h, lapack_int* ldh, double* wr,
-                    double* wi, double* z, lapack_int* ldz, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_chseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, lapack_complex_float* h, lapack_int* ldh,
-                    lapack_complex_float* w, lapack_complex_float* z,
-                    lapack_int* ldz, lapack_complex_float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_zhseqr( char* job, char* compz, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, lapack_complex_double* h, lapack_int* ldh,
-                    lapack_complex_double* w, lapack_complex_double* z,
-                    lapack_int* ldz, lapack_complex_double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_shsein( char* job, char* eigsrc, char* initv,
-                    lapack_logical* select, lapack_int* n, const float* h,
-                    lapack_int* ldh, float* wr, const float* wi, float* vl,
-                    lapack_int* ldvl, float* vr, lapack_int* ldvr,
-                    lapack_int* mm, lapack_int* m, float* work,
-                    lapack_int* ifaill, lapack_int* ifailr, lapack_int *info );
-void LAPACK_dhsein( char* job, char* eigsrc, char* initv,
-                    lapack_logical* select, lapack_int* n, const double* h,
-                    lapack_int* ldh, double* wr, const double* wi, double* vl,
-                    lapack_int* ldvl, double* vr, lapack_int* ldvr,
-                    lapack_int* mm, lapack_int* m, double* work,
-                    lapack_int* ifaill, lapack_int* ifailr, lapack_int *info );
-void LAPACK_chsein( char* job, char* eigsrc, char* initv,
-                    const lapack_logical* select, lapack_int* n,
-                    const lapack_complex_float* h, lapack_int* ldh,
-                    lapack_complex_float* w, lapack_complex_float* vl,
-                    lapack_int* ldvl, lapack_complex_float* vr,
-                    lapack_int* ldvr, lapack_int* mm, lapack_int* m,
-                    lapack_complex_float* work, float* rwork,
-                    lapack_int* ifaill, lapack_int* ifailr, lapack_int *info );
-void LAPACK_zhsein( char* job, char* eigsrc, char* initv,
-                    const lapack_logical* select, lapack_int* n,
-                    const lapack_complex_double* h, lapack_int* ldh,
-                    lapack_complex_double* w, lapack_complex_double* vl,
-                    lapack_int* ldvl, lapack_complex_double* vr,
-                    lapack_int* ldvr, lapack_int* mm, lapack_int* m,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int* ifaill, lapack_int* ifailr, lapack_int *info );
-void LAPACK_strevc( char* side, char* howmny, lapack_logical* select,
-                    lapack_int* n, const float* t, lapack_int* ldt, float* vl,
-                    lapack_int* ldvl, float* vr, lapack_int* ldvr,
-                    lapack_int* mm, lapack_int* m, float* work,
-                    lapack_int *info );
-void LAPACK_dtrevc( char* side, char* howmny, lapack_logical* select,
-                    lapack_int* n, const double* t, lapack_int* ldt, double* vl,
-                    lapack_int* ldvl, double* vr, lapack_int* ldvr,
-                    lapack_int* mm, lapack_int* m, double* work,
-                    lapack_int *info );
-void LAPACK_ctrevc( char* side, char* howmny, const lapack_logical* select,
-                    lapack_int* n, lapack_complex_float* t, lapack_int* ldt,
-                    lapack_complex_float* vl, lapack_int* ldvl,
-                    lapack_complex_float* vr, lapack_int* ldvr, lapack_int* mm,
-                    lapack_int* m, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_ztrevc( char* side, char* howmny, const lapack_logical* select,
-                    lapack_int* n, lapack_complex_double* t, lapack_int* ldt,
-                    lapack_complex_double* vl, lapack_int* ldvl,
-                    lapack_complex_double* vr, lapack_int* ldvr, lapack_int* mm,
-                    lapack_int* m, lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_strsna( char* job, char* howmny, const lapack_logical* select,
-                    lapack_int* n, const float* t, lapack_int* ldt,
-                    const float* vl, lapack_int* ldvl, const float* vr,
-                    lapack_int* ldvr, float* s, float* sep, lapack_int* mm,
-                    lapack_int* m, float* work, lapack_int* ldwork,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_dtrsna( char* job, char* howmny, const lapack_logical* select,
-                    lapack_int* n, const double* t, lapack_int* ldt,
-                    const double* vl, lapack_int* ldvl, const double* vr,
-                    lapack_int* ldvr, double* s, double* sep, lapack_int* mm,
-                    lapack_int* m, double* work, lapack_int* ldwork,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_ctrsna( char* job, char* howmny, const lapack_logical* select,
-                    lapack_int* n, const lapack_complex_float* t,
-                    lapack_int* ldt, const lapack_complex_float* vl,
-                    lapack_int* ldvl, const lapack_complex_float* vr,
-                    lapack_int* ldvr, float* s, float* sep, lapack_int* mm,
-                    lapack_int* m, lapack_complex_float* work,
-                    lapack_int* ldwork, float* rwork, lapack_int *info );
-void LAPACK_ztrsna( char* job, char* howmny, const lapack_logical* select,
-                    lapack_int* n, const lapack_complex_double* t,
-                    lapack_int* ldt, const lapack_complex_double* vl,
-                    lapack_int* ldvl, const lapack_complex_double* vr,
-                    lapack_int* ldvr, double* s, double* sep, lapack_int* mm,
-                    lapack_int* m, lapack_complex_double* work,
-                    lapack_int* ldwork, double* rwork, lapack_int *info );
-void LAPACK_strexc( char* compq, lapack_int* n, float* t, lapack_int* ldt,
-                    float* q, lapack_int* ldq, lapack_int* ifst,
-                    lapack_int* ilst, float* work, lapack_int *info );
-void LAPACK_dtrexc( char* compq, lapack_int* n, double* t, lapack_int* ldt,
-                    double* q, lapack_int* ldq, lapack_int* ifst,
-                    lapack_int* ilst, double* work, lapack_int *info );
-void LAPACK_ctrexc( char* compq, lapack_int* n, lapack_complex_float* t,
-                    lapack_int* ldt, lapack_complex_float* q, lapack_int* ldq,
-                    lapack_int* ifst, lapack_int* ilst, lapack_int *info );
-void LAPACK_ztrexc( char* compq, lapack_int* n, lapack_complex_double* t,
-                    lapack_int* ldt, lapack_complex_double* q, lapack_int* ldq,
-                    lapack_int* ifst, lapack_int* ilst, lapack_int *info );
-void LAPACK_strsen( char* job, char* compq, const lapack_logical* select,
-                    lapack_int* n, float* t, lapack_int* ldt, float* q,
-                    lapack_int* ldq, float* wr, float* wi, lapack_int* m,
-                    float* s, float* sep, float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_dtrsen( char* job, char* compq, const lapack_logical* select,
-                    lapack_int* n, double* t, lapack_int* ldt, double* q,
-                    lapack_int* ldq, double* wr, double* wi, lapack_int* m,
-                    double* s, double* sep, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_ctrsen( char* job, char* compq, const lapack_logical* select,
-                    lapack_int* n, lapack_complex_float* t, lapack_int* ldt,
-                    lapack_complex_float* q, lapack_int* ldq,
-                    lapack_complex_float* w, lapack_int* m, float* s,
-                    float* sep, lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_ztrsen( char* job, char* compq, const lapack_logical* select,
-                    lapack_int* n, lapack_complex_double* t, lapack_int* ldt,
-                    lapack_complex_double* q, lapack_int* ldq,
-                    lapack_complex_double* w, lapack_int* m, double* s,
-                    double* sep, lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_strsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m,
-                    lapack_int* n, const float* a, lapack_int* lda,
-                    const float* b, lapack_int* ldb, float* c, lapack_int* ldc,
-                    float* scale, lapack_int *info );
-void LAPACK_dtrsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m,
-                    lapack_int* n, const double* a, lapack_int* lda,
-                    const double* b, lapack_int* ldb, double* c,
-                    lapack_int* ldc, double* scale, lapack_int *info );
-void LAPACK_ctrsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m,
-                    lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* c, lapack_int* ldc,
-                    float* scale, lapack_int *info );
-void LAPACK_ztrsyl( char* trana, char* tranb, lapack_int* isgn, lapack_int* m,
-                    lapack_int* n, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* c, lapack_int* ldc,
-                    double* scale, lapack_int *info );
-void LAPACK_sgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, float* a, lapack_int* lda, float* b,
-                    lapack_int* ldb, float* q, lapack_int* ldq, float* z,
-                    lapack_int* ldz, lapack_int *info );
-void LAPACK_dgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, double* a, lapack_int* lda, double* b,
-                    lapack_int* ldb, double* q, lapack_int* ldq, double* z,
-                    lapack_int* ldz, lapack_int *info );
-void LAPACK_cgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* q, lapack_int* ldq,
-                    lapack_complex_float* z, lapack_int* ldz,
-                    lapack_int *info );
-void LAPACK_zgghrd( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* q, lapack_int* ldq,
-                    lapack_complex_double* z, lapack_int* ldz,
-                    lapack_int *info );
-void LAPACK_sgghd3( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, float* a, lapack_int* lda, float* b,
-                    lapack_int* ldb, float* q, lapack_int* ldq, float* z,
-                    lapack_int* ldz, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dgghd3( char* compq, char* compz, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, double* a, lapack_int* lda, double* b,
-                    lapack_int* ldb, double* q, lapack_int* ldq, double* z,
-                    lapack_int* ldz, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cgghd3( char* compq, char* compz, lapack_int* n,
-                    lapack_int* ilo, lapack_int* ihi,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* q, lapack_int* ldq,
-                    lapack_complex_float* z, lapack_int* ldz,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zgghd3( char* compq, char* compz, lapack_int* n,
-                    lapack_int* ilo, lapack_int* ihi,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* q, lapack_int* ldq,
-                    lapack_complex_double* z, lapack_int* ldz,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_sggbal( char* job, lapack_int* n, float* a, lapack_int* lda,
-                    float* b, lapack_int* ldb, lapack_int* ilo, lapack_int* ihi,
-                    float* lscale, float* rscale, float* work,
-                    lapack_int *info );
-void LAPACK_dggbal( char* job, lapack_int* n, double* a, lapack_int* lda,
-                    double* b, lapack_int* ldb, lapack_int* ilo,
-                    lapack_int* ihi, double* lscale, double* rscale,
-                    double* work, lapack_int *info );
-void LAPACK_cggbal( char* job, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int* ilo, lapack_int* ihi, float* lscale,
-                    float* rscale, float* work, lapack_int *info );
-void LAPACK_zggbal( char* job, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
-                    lapack_int* ilo, lapack_int* ihi, double* lscale,
-                    double* rscale, double* work, lapack_int *info );
-void LAPACK_sggbak( char* job, char* side, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, const float* lscale, const float* rscale,
-                    lapack_int* m, float* v, lapack_int* ldv,
-                    lapack_int *info );
-void LAPACK_dggbak( char* job, char* side, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, const double* lscale, const double* rscale,
-                    lapack_int* m, double* v, lapack_int* ldv,
-                    lapack_int *info );
-void LAPACK_cggbak( char* job, char* side, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, const float* lscale, const float* rscale,
-                    lapack_int* m, lapack_complex_float* v, lapack_int* ldv,
-                    lapack_int *info );
-void LAPACK_zggbak( char* job, char* side, lapack_int* n, lapack_int* ilo,
-                    lapack_int* ihi, const double* lscale, const double* rscale,
-                    lapack_int* m, lapack_complex_double* v, lapack_int* ldv,
-                    lapack_int *info );
-void LAPACK_shgeqz( char* job, char* compq, char* compz, lapack_int* n,
-                    lapack_int* ilo, lapack_int* ihi, float* h, lapack_int* ldh,
-                    float* t, lapack_int* ldt, float* alphar, float* alphai,
-                    float* beta, float* q, lapack_int* ldq, float* z,
-                    lapack_int* ldz, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dhgeqz( char* job, char* compq, char* compz, lapack_int* n,
-                    lapack_int* ilo, lapack_int* ihi, double* h,
-                    lapack_int* ldh, double* t, lapack_int* ldt, double* alphar,
-                    double* alphai, double* beta, double* q, lapack_int* ldq,
-                    double* z, lapack_int* ldz, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_chgeqz( char* job, char* compq, char* compz, lapack_int* n,
-                    lapack_int* ilo, lapack_int* ihi, lapack_complex_float* h,
-                    lapack_int* ldh, lapack_complex_float* t, lapack_int* ldt,
-                    lapack_complex_float* alpha, lapack_complex_float* beta,
-                    lapack_complex_float* q, lapack_int* ldq,
-                    lapack_complex_float* z, lapack_int* ldz,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int *info );
-void LAPACK_zhgeqz( char* job, char* compq, char* compz, lapack_int* n,
-                    lapack_int* ilo, lapack_int* ihi, lapack_complex_double* h,
-                    lapack_int* ldh, lapack_complex_double* t, lapack_int* ldt,
-                    lapack_complex_double* alpha, lapack_complex_double* beta,
-                    lapack_complex_double* q, lapack_int* ldq,
-                    lapack_complex_double* z, lapack_int* ldz,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int *info );
-void LAPACK_stgevc( char* side, char* howmny, const lapack_logical* select,
-                    lapack_int* n, const float* s, lapack_int* lds,
-                    const float* p, lapack_int* ldp, float* vl,
-                    lapack_int* ldvl, float* vr, lapack_int* ldvr,
-                    lapack_int* mm, lapack_int* m, float* work,
-                    lapack_int *info );
-void LAPACK_dtgevc( char* side, char* howmny, const lapack_logical* select,
-                    lapack_int* n, const double* s, lapack_int* lds,
-                    const double* p, lapack_int* ldp, double* vl,
-                    lapack_int* ldvl, double* vr, lapack_int* ldvr,
-                    lapack_int* mm, lapack_int* m, double* work,
-                    lapack_int *info );
-void LAPACK_ctgevc( char* side, char* howmny, const lapack_logical* select,
-                    lapack_int* n, const lapack_complex_float* s,
-                    lapack_int* lds, const lapack_complex_float* p,
-                    lapack_int* ldp, lapack_complex_float* vl, lapack_int* ldvl,
-                    lapack_complex_float* vr, lapack_int* ldvr, lapack_int* mm,
-                    lapack_int* m, lapack_complex_float* work, float* rwork,
-                    lapack_int *info );
-void LAPACK_ztgevc( char* side, char* howmny, const lapack_logical* select,
-                    lapack_int* n, const lapack_complex_double* s,
-                    lapack_int* lds, const lapack_complex_double* p,
-                    lapack_int* ldp, lapack_complex_double* vl,
-                    lapack_int* ldvl, lapack_complex_double* vr,
-                    lapack_int* ldvr, lapack_int* mm, lapack_int* m,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int *info );
-void LAPACK_stgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n,
-                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                    float* q, lapack_int* ldq, float* z, lapack_int* ldz,
-                    lapack_int* ifst, lapack_int* ilst, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dtgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n,
-                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                    double* q, lapack_int* ldq, double* z, lapack_int* ldz,
-                    lapack_int* ifst, lapack_int* ilst, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_ctgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* q, lapack_int* ldq,
-                    lapack_complex_float* z, lapack_int* ldz, lapack_int* ifst,
-                    lapack_int* ilst, lapack_int *info );
-void LAPACK_ztgexc( lapack_logical* wantq, lapack_logical* wantz, lapack_int* n,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* q, lapack_int* ldq,
-                    lapack_complex_double* z, lapack_int* ldz, lapack_int* ifst,
-                    lapack_int* ilst, lapack_int *info );
-void LAPACK_stgsen( lapack_int* ijob, lapack_logical* wantq,
-                    lapack_logical* wantz, const lapack_logical* select,
-                    lapack_int* n, float* a, lapack_int* lda, float* b,
-                    lapack_int* ldb, float* alphar, float* alphai, float* beta,
-                    float* q, lapack_int* ldq, float* z, lapack_int* ldz,
-                    lapack_int* m, float* pl, float* pr, float* dif,
-                    float* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_dtgsen( lapack_int* ijob, lapack_logical* wantq,
-                    lapack_logical* wantz, const lapack_logical* select,
-                    lapack_int* n, double* a, lapack_int* lda, double* b,
-                    lapack_int* ldb, double* alphar, double* alphai,
-                    double* beta, double* q, lapack_int* ldq, double* z,
-                    lapack_int* ldz, lapack_int* m, double* pl, double* pr,
-                    double* dif, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_ctgsen( lapack_int* ijob, lapack_logical* wantq,
-                    lapack_logical* wantz, const lapack_logical* select,
-                    lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* alpha, lapack_complex_float* beta,
-                    lapack_complex_float* q, lapack_int* ldq,
-                    lapack_complex_float* z, lapack_int* ldz, lapack_int* m,
-                    float* pl, float* pr, float* dif,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_ztgsen( lapack_int* ijob, lapack_logical* wantq,
-                    lapack_logical* wantz, const lapack_logical* select,
-                    lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* alpha, lapack_complex_double* beta,
-                    lapack_complex_double* q, lapack_int* ldq,
-                    lapack_complex_double* z, lapack_int* ldz, lapack_int* m,
-                    double* pl, double* pr, double* dif,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_stgsyl( char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
-                    const float* a, lapack_int* lda, const float* b,
-                    lapack_int* ldb, float* c, lapack_int* ldc, const float* d,
-                    lapack_int* ldd, const float* e, lapack_int* lde, float* f,
-                    lapack_int* ldf, float* scale, float* dif, float* work,
-                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
-void LAPACK_dtgsyl( char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
-                    const double* a, lapack_int* lda, const double* b,
-                    lapack_int* ldb, double* c, lapack_int* ldc,
-                    const double* d, lapack_int* ldd, const double* e,
-                    lapack_int* lde, double* f, lapack_int* ldf, double* scale,
-                    double* dif, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_ctgsyl( char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* c, lapack_int* ldc,
-                    const lapack_complex_float* d, lapack_int* ldd,
-                    const lapack_complex_float* e, lapack_int* lde,
-                    lapack_complex_float* f, lapack_int* ldf, float* scale,
-                    float* dif, lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_ztgsyl( char* trans, lapack_int* ijob, lapack_int* m, lapack_int* n,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* c, lapack_int* ldc,
-                    const lapack_complex_double* d, lapack_int* ldd,
-                    const lapack_complex_double* e, lapack_int* lde,
-                    lapack_complex_double* f, lapack_int* ldf, double* scale,
-                    double* dif, lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_stgsna( char* job, char* howmny, const lapack_logical* select,
-                    lapack_int* n, const float* a, lapack_int* lda,
-                    const float* b, lapack_int* ldb, const float* vl,
-                    lapack_int* ldvl, const float* vr, lapack_int* ldvr,
-                    float* s, float* dif, lapack_int* mm, lapack_int* m,
-                    float* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dtgsna( char* job, char* howmny, const lapack_logical* select,
-                    lapack_int* n, const double* a, lapack_int* lda,
-                    const double* b, lapack_int* ldb, const double* vl,
-                    lapack_int* ldvl, const double* vr, lapack_int* ldvr,
-                    double* s, double* dif, lapack_int* mm, lapack_int* m,
-                    double* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_ctgsna( char* job, char* howmny, const lapack_logical* select,
-                    lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* b,
-                    lapack_int* ldb, const lapack_complex_float* vl,
-                    lapack_int* ldvl, const lapack_complex_float* vr,
-                    lapack_int* ldvr, float* s, float* dif, lapack_int* mm,
-                    lapack_int* m, lapack_complex_float* work,
-                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
-void LAPACK_ztgsna( char* job, char* howmny, const lapack_logical* select,
-                    lapack_int* n, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_complex_double* b,
-                    lapack_int* ldb, const lapack_complex_double* vl,
-                    lapack_int* ldvl, const lapack_complex_double* vr,
-                    lapack_int* ldvr, double* s, double* dif, lapack_int* mm,
-                    lapack_int* m, lapack_complex_double* work,
-                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
-void LAPACK_sggsvp( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                    lapack_int* p, lapack_int* n, float* a, lapack_int* lda,
-                    float* b, lapack_int* ldb, float* tola, float* tolb,
-                    lapack_int* k, lapack_int* l, float* u, lapack_int* ldu,
-                    float* v, lapack_int* ldv, float* q, lapack_int* ldq,
-                    lapack_int* iwork, float* tau, float* work,
-                    lapack_int *info );
-void LAPACK_dggsvp( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                    lapack_int* p, lapack_int* n, double* a, lapack_int* lda,
-                    double* b, lapack_int* ldb, double* tola, double* tolb,
-                    lapack_int* k, lapack_int* l, double* u, lapack_int* ldu,
-                    double* v, lapack_int* ldv, double* q, lapack_int* ldq,
-                    lapack_int* iwork, double* tau, double* work,
-                    lapack_int *info );
-void LAPACK_cggsvp( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                    lapack_int* p, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
-                    float* tola, float* tolb, lapack_int* k, lapack_int* l,
-                    lapack_complex_float* u, lapack_int* ldu,
-                    lapack_complex_float* v, lapack_int* ldv,
-                    lapack_complex_float* q, lapack_int* ldq, lapack_int* iwork,
-                    float* rwork, lapack_complex_float* tau,
-                    lapack_complex_float* work, lapack_int *info );
-void LAPACK_zggsvp( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                    lapack_int* p, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
-                    double* tola, double* tolb, lapack_int* k, lapack_int* l,
-                    lapack_complex_double* u, lapack_int* ldu,
-                    lapack_complex_double* v, lapack_int* ldv,
-                    lapack_complex_double* q, lapack_int* ldq,
-                    lapack_int* iwork, double* rwork,
-                    lapack_complex_double* tau, lapack_complex_double* work,
-                    lapack_int *info );
-void LAPACK_sggsvp3( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                     lapack_int* p, lapack_int* n, float* a, lapack_int* lda,
-                     float* b, lapack_int* ldb, float* tola, float* tolb,
-                     lapack_int* k, lapack_int* l, float* u, lapack_int* ldu,
-                     float* v, lapack_int* ldv, float* q, lapack_int* ldq,
-                     lapack_int* iwork, float* tau, float* work,
-                     lapack_int* lwork, lapack_int *info );
-void LAPACK_dggsvp3( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                     lapack_int* p, lapack_int* n, double* a, lapack_int* lda,
-                     double* b, lapack_int* ldb, double* tola, double* tolb,
-                     lapack_int* k, lapack_int* l, double* u, lapack_int* ldu,
-                     double* v, lapack_int* ldv, double* q, lapack_int* ldq,
-                     lapack_int* iwork, double* tau, double* work,
-                     lapack_int* lwork, lapack_int *info );
-void LAPACK_cggsvp3( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                     lapack_int* p, lapack_int* n, lapack_complex_float* a,
-                     lapack_int* lda, lapack_complex_float* b, lapack_int* ldb,
-                     float* tola, float* tolb, lapack_int* k, lapack_int* l,
-                     lapack_complex_float* u, lapack_int* ldu,
-                     lapack_complex_float* v, lapack_int* ldv,
-                     lapack_complex_float* q, lapack_int* ldq, lapack_int* iwork,
-                     float* rwork, lapack_complex_float* tau,
-                     lapack_complex_float* work, lapack_int* lwork,
-                     lapack_int *info );
-void LAPACK_zggsvp3( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                     lapack_int* p, lapack_int* n, lapack_complex_double* a,
-                     lapack_int* lda, lapack_complex_double* b, lapack_int* ldb,
-                     double* tola, double* tolb, lapack_int* k, lapack_int* l,
-                     lapack_complex_double* u, lapack_int* ldu,
-                     lapack_complex_double* v, lapack_int* ldv,
-                     lapack_complex_double* q, lapack_int* ldq,
-                     lapack_int* iwork, double* rwork,
-                     lapack_complex_double* tau, lapack_complex_double* work,
-                     lapack_int* lwork, lapack_int *info );
-void LAPACK_stgsja( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                    lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
-                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                    float* tola, float* tolb, float* alpha, float* beta,
-                    float* u, lapack_int* ldu, float* v, lapack_int* ldv,
-                    float* q, lapack_int* ldq, float* work, lapack_int* ncycle,
-                    lapack_int *info );
-void LAPACK_dtgsja( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                    lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
-                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                    double* tola, double* tolb, double* alpha, double* beta,
-                    double* u, lapack_int* ldu, double* v, lapack_int* ldv,
-                    double* q, lapack_int* ldq, double* work,
-                    lapack_int* ncycle, lapack_int *info );
-void LAPACK_ctgsja( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                    lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb, float* tola,
-                    float* tolb, float* alpha, float* beta,
-                    lapack_complex_float* u, lapack_int* ldu,
-                    lapack_complex_float* v, lapack_int* ldv,
-                    lapack_complex_float* q, lapack_int* ldq,
-                    lapack_complex_float* work, lapack_int* ncycle,
-                    lapack_int *info );
-void LAPACK_ztgsja( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                    lapack_int* p, lapack_int* n, lapack_int* k, lapack_int* l,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb, double* tola,
-                    double* tolb, double* alpha, double* beta,
-                    lapack_complex_double* u, lapack_int* ldu,
-                    lapack_complex_double* v, lapack_int* ldv,
-                    lapack_complex_double* q, lapack_int* ldq,
-                    lapack_complex_double* work, lapack_int* ncycle,
-                    lapack_int *info );
-void LAPACK_sgels( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                   float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                   float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_dgels( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                   double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                   double* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_cgels( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_zgels( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_sgelsy( lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a,
-                    lapack_int* lda, float* b, lapack_int* ldb,
-                    lapack_int* jpvt, float* rcond, lapack_int* rank,
-                    float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_dgelsy( lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a,
-                    lapack_int* lda, double* b, lapack_int* ldb,
-                    lapack_int* jpvt, double* rcond, lapack_int* rank,
-                    double* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_cgelsy( lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb, lapack_int* jpvt,
-                    float* rcond, lapack_int* rank, lapack_complex_float* work,
-                    lapack_int* lwork, float* rwork, lapack_int *info );
-void LAPACK_zgelsy( lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb, lapack_int* jpvt,
-                    double* rcond, lapack_int* rank,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int *info );
-void LAPACK_sgelss( lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a,
-                    lapack_int* lda, float* b, lapack_int* ldb, float* s,
-                    float* rcond, lapack_int* rank, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dgelss( lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a,
-                    lapack_int* lda, double* b, lapack_int* ldb, double* s,
-                    double* rcond, lapack_int* rank, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_cgelss( lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb, float* s,
-                    float* rcond, lapack_int* rank, lapack_complex_float* work,
-                    lapack_int* lwork, float* rwork, lapack_int *info );
-void LAPACK_zgelss( lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb, double* s,
-                    double* rcond, lapack_int* rank,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int *info );
-void LAPACK_sgelsd( lapack_int* m, lapack_int* n, lapack_int* nrhs, float* a,
-                    lapack_int* lda, float* b, lapack_int* ldb, float* s,
-                    float* rcond, lapack_int* rank, float* work,
-                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
-void LAPACK_dgelsd( lapack_int* m, lapack_int* n, lapack_int* nrhs, double* a,
-                    lapack_int* lda, double* b, lapack_int* ldb, double* s,
-                    double* rcond, lapack_int* rank, double* work,
-                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
-void LAPACK_cgelsd( lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb, float* s,
-                    float* rcond, lapack_int* rank, lapack_complex_float* work,
-                    lapack_int* lwork, float* rwork, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_zgelsd( lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb, double* s,
-                    double* rcond, lapack_int* rank,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int* iwork, lapack_int *info );
-void LAPACK_sgglse( lapack_int* m, lapack_int* n, lapack_int* p, float* a,
-                    lapack_int* lda, float* b, lapack_int* ldb, float* c,
-                    float* d, float* x, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dgglse( lapack_int* m, lapack_int* n, lapack_int* p, double* a,
-                    lapack_int* lda, double* b, lapack_int* ldb, double* c,
-                    double* d, double* x, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cgglse( lapack_int* m, lapack_int* n, lapack_int* p,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* c, lapack_complex_float* d,
-                    lapack_complex_float* x, lapack_complex_float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_zgglse( lapack_int* m, lapack_int* n, lapack_int* p,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* c, lapack_complex_double* d,
-                    lapack_complex_double* x, lapack_complex_double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_sggglm( lapack_int* n, lapack_int* m, lapack_int* p, float* a,
-                    lapack_int* lda, float* b, lapack_int* ldb, float* d,
-                    float* x, float* y, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dggglm( lapack_int* n, lapack_int* m, lapack_int* p, double* a,
-                    lapack_int* lda, double* b, lapack_int* ldb, double* d,
-                    double* x, double* y, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_cggglm( lapack_int* n, lapack_int* m, lapack_int* p,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* d, lapack_complex_float* x,
-                    lapack_complex_float* y, lapack_complex_float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_zggglm( lapack_int* n, lapack_int* m, lapack_int* p,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* d, lapack_complex_double* x,
-                    lapack_complex_double* y, lapack_complex_double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_ssyev( char* jobz, char* uplo, lapack_int* n, float* a,
-                   lapack_int* lda, float* w, float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_dsyev( char* jobz, char* uplo, lapack_int* n, double* a,
-                   lapack_int* lda, double* w, double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_cheev( char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda, float* w,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int *info );
-void LAPACK_zheev( char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda, double* w,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int *info );
-void LAPACK_ssyevd( char* jobz, char* uplo, lapack_int* n, float* a,
-                    lapack_int* lda, float* w, float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_dsyevd( char* jobz, char* uplo, lapack_int* n, double* a,
-                    lapack_int* lda, double* w, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_cheevd( char* jobz, char* uplo, lapack_int* n,
-                    lapack_complex_float* a, lapack_int* lda, float* w,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                    lapack_int *info );
-void LAPACK_zheevd( char* jobz, char* uplo, lapack_int* n,
-                    lapack_complex_double* a, lapack_int* lda, double* w,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_ssyevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                    float* a, lapack_int* lda, float* vl, float* vu,
-                    lapack_int* il, lapack_int* iu, float* abstol,
-                    lapack_int* m, float* w, float* z, lapack_int* ldz,
-                    float* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_dsyevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                    double* a, lapack_int* lda, double* vl, double* vu,
-                    lapack_int* il, lapack_int* iu, double* abstol,
-                    lapack_int* m, double* w, double* z, lapack_int* ldz,
-                    double* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_cheevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_complex_float* a, lapack_int* lda, float* vl,
-                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
-                    lapack_int* m, float* w, lapack_complex_float* z,
-                    lapack_int* ldz, lapack_complex_float* work,
-                    lapack_int* lwork, float* rwork, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_zheevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_complex_double* a, lapack_int* lda, double* vl,
-                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
-                    lapack_int* m, double* w, lapack_complex_double* z,
-                    lapack_int* ldz, lapack_complex_double* work,
-                    lapack_int* lwork, double* rwork, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_ssyevr( char* jobz, char* range, char* uplo, lapack_int* n,
-                    float* a, lapack_int* lda, float* vl, float* vu,
-                    lapack_int* il, lapack_int* iu, float* abstol,
-                    lapack_int* m, float* w, float* z, lapack_int* ldz,
-                    lapack_int* isuppz, float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_dsyevr( char* jobz, char* range, char* uplo, lapack_int* n,
-                    double* a, lapack_int* lda, double* vl, double* vu,
-                    lapack_int* il, lapack_int* iu, double* abstol,
-                    lapack_int* m, double* w, double* z, lapack_int* ldz,
-                    lapack_int* isuppz, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_cheevr( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_complex_float* a, lapack_int* lda, float* vl,
-                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
-                    lapack_int* m, float* w, lapack_complex_float* z,
-                    lapack_int* ldz, lapack_int* isuppz,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                    lapack_int *info );
-void LAPACK_zheevr( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_complex_double* a, lapack_int* lda, double* vl,
-                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
-                    lapack_int* m, double* w, lapack_complex_double* z,
-                    lapack_int* ldz, lapack_int* isuppz,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_sspev( char* jobz, char* uplo, lapack_int* n, float* ap, float* w,
-                   float* z, lapack_int* ldz, float* work, lapack_int *info );
-void LAPACK_dspev( char* jobz, char* uplo, lapack_int* n, double* ap, double* w,
-                   double* z, lapack_int* ldz, double* work, lapack_int *info );
-void LAPACK_chpev( char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_float* ap, float* w, lapack_complex_float* z,
-                   lapack_int* ldz, lapack_complex_float* work, float* rwork,
-                   lapack_int *info );
-void LAPACK_zhpev( char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_double* ap, double* w,
-                   lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, double* rwork,
-                   lapack_int *info );
-void LAPACK_sspevd( char* jobz, char* uplo, lapack_int* n, float* ap, float* w,
-                    float* z, lapack_int* ldz, float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_dspevd( char* jobz, char* uplo, lapack_int* n, double* ap,
-                    double* w, double* z, lapack_int* ldz, double* work,
-                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                    lapack_int *info );
-void LAPACK_chpevd( char* jobz, char* uplo, lapack_int* n,
-                    lapack_complex_float* ap, float* w, lapack_complex_float* z,
-                    lapack_int* ldz, lapack_complex_float* work,
-                    lapack_int* lwork, float* rwork, lapack_int* lrwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_zhpevd( char* jobz, char* uplo, lapack_int* n,
-                    lapack_complex_double* ap, double* w,
-                    lapack_complex_double* z, lapack_int* ldz,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_sspevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                    float* ap, float* vl, float* vu, lapack_int* il,
-                    lapack_int* iu, float* abstol, lapack_int* m, float* w,
-                    float* z, lapack_int* ldz, float* work, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_dspevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                    double* ap, double* vl, double* vu, lapack_int* il,
-                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                    double* z, lapack_int* ldz, double* work, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_chpevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_complex_float* ap, float* vl, float* vu,
-                    lapack_int* il, lapack_int* iu, float* abstol,
-                    lapack_int* m, float* w, lapack_complex_float* z,
-                    lapack_int* ldz, lapack_complex_float* work, float* rwork,
-                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_zhpevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_complex_double* ap, double* vl, double* vu,
-                    lapack_int* il, lapack_int* iu, double* abstol,
-                    lapack_int* m, double* w, lapack_complex_double* z,
-                    lapack_int* ldz, lapack_complex_double* work, double* rwork,
-                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_ssbev( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                   float* ab, lapack_int* ldab, float* w, float* z,
-                   lapack_int* ldz, float* work, lapack_int *info );
-void LAPACK_dsbev( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                   double* ab, lapack_int* ldab, double* w, double* z,
-                   lapack_int* ldz, double* work, lapack_int *info );
-void LAPACK_chbev( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                   lapack_complex_float* ab, lapack_int* ldab, float* w,
-                   lapack_complex_float* z, lapack_int* ldz,
-                   lapack_complex_float* work, float* rwork, lapack_int *info );
-void LAPACK_zhbev( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                   lapack_complex_double* ab, lapack_int* ldab, double* w,
-                   lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, double* rwork,
-                   lapack_int *info );
-void LAPACK_ssbevd( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                    float* ab, lapack_int* ldab, float* w, float* z,
-                    lapack_int* ldz, float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_dsbevd( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                    double* ab, lapack_int* ldab, double* w, double* z,
-                    lapack_int* ldz, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_chbevd( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                    lapack_complex_float* ab, lapack_int* ldab, float* w,
-                    lapack_complex_float* z, lapack_int* ldz,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                    lapack_int *info );
-void LAPACK_zhbevd( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                    lapack_complex_double* ab, lapack_int* ldab, double* w,
-                    lapack_complex_double* z, lapack_int* ldz,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_ssbevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_int* kd, float* ab, lapack_int* ldab, float* q,
-                    lapack_int* ldq, float* vl, float* vu, lapack_int* il,
-                    lapack_int* iu, float* abstol, lapack_int* m, float* w,
-                    float* z, lapack_int* ldz, float* work,
-                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_dsbevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_int* kd, double* ab, lapack_int* ldab, double* q,
-                    lapack_int* ldq, double* vl, double* vu, lapack_int* il,
-                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                    double* z, lapack_int* ldz, double* work,
-                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_chbevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_int* kd, lapack_complex_float* ab, lapack_int* ldab,
-                    lapack_complex_float* q, lapack_int* ldq, float* vl,
-                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
-                    lapack_int* m, float* w, lapack_complex_float* z,
-                    lapack_int* ldz, lapack_complex_float* work,
-                    float* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_zhbevx( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_int* kd, lapack_complex_double* ab, lapack_int* ldab,
-                    lapack_complex_double* q, lapack_int* ldq, double* vl,
-                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
-                    lapack_int* m, double* w, lapack_complex_double* z,
-                    lapack_int* ldz, lapack_complex_double* work,
-                    double* rwork, lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_sstev( char* jobz, lapack_int* n, float* d, float* e, float* z,
-                   lapack_int* ldz, float* work, lapack_int *info );
-void LAPACK_dstev( char* jobz, lapack_int* n, double* d, double* e, double* z,
-                   lapack_int* ldz, double* work, lapack_int *info );
-void LAPACK_sstevd( char* jobz, lapack_int* n, float* d, float* e, float* z,
-                    lapack_int* ldz, float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_dstevd( char* jobz, lapack_int* n, double* d, double* e, double* z,
-                    lapack_int* ldz, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_sstevx( char* jobz, char* range, lapack_int* n, float* d, float* e,
-                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
-                    float* abstol, lapack_int* m, float* w, float* z,
-                    lapack_int* ldz, float* work, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_dstevx( char* jobz, char* range, lapack_int* n, double* d,
-                    double* e, double* vl, double* vu, lapack_int* il,
-                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                    double* z, lapack_int* ldz, double* work, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_sstevr( char* jobz, char* range, lapack_int* n, float* d, float* e,
-                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
-                    float* abstol, lapack_int* m, float* w, float* z,
-                    lapack_int* ldz, lapack_int* isuppz, float* work,
-                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                    lapack_int *info );
-void LAPACK_dstevr( char* jobz, char* range, lapack_int* n, double* d,
-                    double* e, double* vl, double* vu, lapack_int* il,
-                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                    double* z, lapack_int* ldz, lapack_int* isuppz,
-                    double* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_sgees( char* jobvs, char* sort, LAPACK_S_SELECT2 select,
-                   lapack_int* n, float* a, lapack_int* lda, lapack_int* sdim,
-                   float* wr, float* wi, float* vs, lapack_int* ldvs,
-                   float* work, lapack_int* lwork, lapack_logical* bwork,
-                   lapack_int *info );
-void LAPACK_dgees( char* jobvs, char* sort, LAPACK_D_SELECT2 select,
-                   lapack_int* n, double* a, lapack_int* lda, lapack_int* sdim,
-                   double* wr, double* wi, double* vs, lapack_int* ldvs,
-                   double* work, lapack_int* lwork, lapack_logical* bwork,
-                   lapack_int *info );
-void LAPACK_cgees( char* jobvs, char* sort, LAPACK_C_SELECT1 select,
-                   lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                   lapack_int* sdim, lapack_complex_float* w,
-                   lapack_complex_float* vs, lapack_int* ldvs,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_logical* bwork, lapack_int *info );
-void LAPACK_zgees( char* jobvs, char* sort, LAPACK_Z_SELECT1 select,
-                   lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                   lapack_int* sdim, lapack_complex_double* w,
-                   lapack_complex_double* vs, lapack_int* ldvs,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_logical* bwork, lapack_int *info );
-void LAPACK_sgeesx( char* jobvs, char* sort, LAPACK_S_SELECT2 select,
-                    char* sense, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int* sdim, float* wr, float* wi, float* vs,
-                    lapack_int* ldvs, float* rconde, float* rcondv, float* work,
-                    lapack_int* lwork, lapack_int* iwork, lapack_int* liwork,
-                    lapack_logical* bwork, lapack_int *info );
-void LAPACK_dgeesx( char* jobvs, char* sort, LAPACK_D_SELECT2 select,
-                    char* sense, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int* sdim, double* wr, double* wi, double* vs,
-                    lapack_int* ldvs, double* rconde, double* rcondv,
-                    double* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_logical* bwork,
-                    lapack_int *info );
-void LAPACK_cgeesx( char* jobvs, char* sort, LAPACK_C_SELECT1 select,
-                    char* sense, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* sdim, lapack_complex_float* w,
-                    lapack_complex_float* vs, lapack_int* ldvs, float* rconde,
-                    float* rcondv, lapack_complex_float* work,
-                    lapack_int* lwork, float* rwork, lapack_logical* bwork,
-                    lapack_int *info );
-void LAPACK_zgeesx( char* jobvs, char* sort, LAPACK_Z_SELECT1 select,
-                    char* sense, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* sdim, lapack_complex_double* w,
-                    lapack_complex_double* vs, lapack_int* ldvs, double* rconde,
-                    double* rcondv, lapack_complex_double* work,
-                    lapack_int* lwork, double* rwork, lapack_logical* bwork,
-                    lapack_int *info );
-void LAPACK_sgeev( char* jobvl, char* jobvr, lapack_int* n, float* a,
-                   lapack_int* lda, float* wr, float* wi, float* vl,
-                   lapack_int* ldvl, float* vr, lapack_int* ldvr, float* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_dgeev( char* jobvl, char* jobvr, lapack_int* n, double* a,
-                   lapack_int* lda, double* wr, double* wi, double* vl,
-                   lapack_int* ldvl, double* vr, lapack_int* ldvr, double* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_cgeev( char* jobvl, char* jobvr, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* w, lapack_complex_float* vl,
-                   lapack_int* ldvl, lapack_complex_float* vr, lapack_int* ldvr,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int *info );
-void LAPACK_zgeev( char* jobvl, char* jobvr, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* w, lapack_complex_double* vl,
-                   lapack_int* ldvl, lapack_complex_double* vr,
-                   lapack_int* ldvr, lapack_complex_double* work,
-                   lapack_int* lwork, double* rwork, lapack_int *info );
-void LAPACK_sgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
-                    lapack_int* n, float* a, lapack_int* lda, float* wr,
-                    float* wi, float* vl, lapack_int* ldvl, float* vr,
-                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
-                    float* scale, float* abnrm, float* rconde, float* rcondv,
-                    float* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_dgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
-                    lapack_int* n, double* a, lapack_int* lda, double* wr,
-                    double* wi, double* vl, lapack_int* ldvl, double* vr,
-                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
-                    double* scale, double* abnrm, double* rconde,
-                    double* rcondv, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_cgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
-                    lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* w, lapack_complex_float* vl,
-                    lapack_int* ldvl, lapack_complex_float* vr,
-                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
-                    float* scale, float* abnrm, float* rconde, float* rcondv,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int *info );
-void LAPACK_zgeevx( char* balanc, char* jobvl, char* jobvr, char* sense,
-                    lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* w, lapack_complex_double* vl,
-                    lapack_int* ldvl, lapack_complex_double* vr,
-                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
-                    double* scale, double* abnrm, double* rconde,
-                    double* rcondv, lapack_complex_double* work,
-                    lapack_int* lwork, double* rwork, lapack_int *info );
-void LAPACK_sgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
-                    float* a, lapack_int* lda, float* s, float* u,
-                    lapack_int* ldu, float* vt, lapack_int* ldvt, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_dgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
-                    double* a, lapack_int* lda, double* s, double* u,
-                    lapack_int* ldu, double* vt, lapack_int* ldvt, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_cgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
-                    lapack_complex_float* a, lapack_int* lda, float* s,
-                    lapack_complex_float* u, lapack_int* ldu,
-                    lapack_complex_float* vt, lapack_int* ldvt,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int *info );
-void LAPACK_zgesvd( char* jobu, char* jobvt, lapack_int* m, lapack_int* n,
-                    lapack_complex_double* a, lapack_int* lda, double* s,
-                    lapack_complex_double* u, lapack_int* ldu,
-                    lapack_complex_double* vt, lapack_int* ldvt,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int *info );
-void LAPACK_sgesvdx( char* jobu, char* jobvt, char* range, lapack_int* m, lapack_int* n,
-                    float* a, lapack_int* lda, float* vl, float* vu,
-                    lapack_int* il, lapack_int* iu, lapack_int* ns, float* s, float* u,
-                    lapack_int* ldu, float* vt, lapack_int* ldvt, float* work,
-                    lapack_int* lwork, lapack_int *iwork, lapack_int *info );
-void LAPACK_dgesvdx( char* jobu, char* jobvt, char* range, lapack_int* m, lapack_int* n,
-                    double* a, lapack_int* lda, double* vl, double* vu,
-                    lapack_int* il, lapack_int* iu, lapack_int* ns, double* s, double* u,
-                    lapack_int* ldu, double* vt, lapack_int* ldvt, double* work,
-                    lapack_int* lwork, lapack_int *iwork, lapack_int *info );
-void LAPACK_cgesvdx( char* jobu, char* jobvt, char* range, lapack_int* m, lapack_int* n,
-                    lapack_complex_float* a, lapack_int* lda, float* vl, float* vu,
-                    lapack_int* il, lapack_int* iu, lapack_int* ns, float* s,
-                    lapack_complex_float* u, lapack_int* ldu,
-                    lapack_complex_float* vt, lapack_int* ldvt,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int *iwork, lapack_int *info );
-void LAPACK_zgesvdx( char* jobu, char* jobvt, char* range, lapack_int* m, lapack_int* n,
-                    lapack_complex_double* a, lapack_int* lda, double* vl, double* vu,
-                    lapack_int* il, lapack_int* iu, lapack_int* ns, double* s,
-                    lapack_complex_double* u, lapack_int* ldu,
-                    lapack_complex_double* vt, lapack_int* ldvt,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int *iwork, lapack_int *info );
-void LAPACK_sgesdd( char* jobz, lapack_int* m, lapack_int* n, float* a,
-                    lapack_int* lda, float* s, float* u, lapack_int* ldu,
-                    float* vt, lapack_int* ldvt, float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_dgesdd( char* jobz, lapack_int* m, lapack_int* n, double* a,
-                    lapack_int* lda, double* s, double* u, lapack_int* ldu,
-                    double* vt, lapack_int* ldvt, double* work,
-                    lapack_int* lwork, lapack_int* iwork, lapack_int *info );
-void LAPACK_cgesdd( char* jobz, lapack_int* m, lapack_int* n,
-                    lapack_complex_float* a, lapack_int* lda, float* s,
-                    lapack_complex_float* u, lapack_int* ldu,
-                    lapack_complex_float* vt, lapack_int* ldvt,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_zgesdd( char* jobz, lapack_int* m, lapack_int* n,
-                    lapack_complex_double* a, lapack_int* lda, double* s,
-                    lapack_complex_double* u, lapack_int* ldu,
-                    lapack_complex_double* vt, lapack_int* ldvt,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int* iwork, lapack_int *info );
-void LAPACK_dgejsv( char* joba, char* jobu, char* jobv, char* jobr, char* jobt,
-                    char* jobp, lapack_int* m, lapack_int* n, double* a,
-                    lapack_int* lda, double* sva, double* u, lapack_int* ldu,
-                    double* v, lapack_int* ldv, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_sgejsv( char* joba, char* jobu, char* jobv, char* jobr, char* jobt,
-                    char* jobp, lapack_int* m, lapack_int* n, float* a,
-                    lapack_int* lda, float* sva, float* u, lapack_int* ldu,
-                    float* v, lapack_int* ldv, float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_cgejsv( char* joba, char* jobu, char* jobv, char* jobr, char* jobt,
-                    char* jobp, lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, float* sva, lapack_complex_float* u, lapack_int* ldu,
-                    lapack_complex_float* v, lapack_int* ldv, lapack_complex_float* cwork,
-                    lapack_int* lwork, float* work, lapack_int* lrwork,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_zgejsv( char* joba, char* jobu, char* jobv, char* jobr, char* jobt,
-                    char* jobp, lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, double* sva, lapack_complex_double* u, lapack_int* ldu,
-                    lapack_complex_double* v, lapack_int* ldv, lapack_complex_double* cwork,
-                    lapack_int* lwork, double* work, lapack_int* lrwork,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_dgesvj( char* joba, char* jobu, char* jobv, lapack_int* m,
-                    lapack_int* n, double* a, lapack_int* lda, double* sva,
-                    lapack_int* mv, double* v, lapack_int* ldv, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_sgesvj( char* joba, char* jobu, char* jobv, lapack_int* m,
-                    lapack_int* n, float* a, lapack_int* lda, float* sva,
-                    lapack_int* mv, float* v, lapack_int* ldv, float* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_cgesvj( char* joba, char* jobu, char* jobv, lapack_int* m,
-                    lapack_int* n, lapack_complex_float* a, lapack_int* lda, float* sva,
-                    lapack_int* mv, lapack_complex_float* v, lapack_int* ldv,
-                    lapack_complex_float* cwork, lapack_int* lwork, float* rwork,
-                    lapack_int* lrwork, lapack_int *info );
-void LAPACK_zgesvj( char* joba, char* jobu, char* jobv, lapack_int* m,
-                    lapack_int* n, lapack_complex_double* a, lapack_int* lda, double* sva,
-                    lapack_int* mv, lapack_complex_double* v, lapack_int* ldv,
-                    lapack_complex_double* cwork, lapack_int* lwork, double* rwork,
-                    lapack_int* lrwork, lapack_int *info );
-void LAPACK_sggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                    lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
-                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                    float* alpha, float* beta, float* u, lapack_int* ldu,
-                    float* v, lapack_int* ldv, float* q, lapack_int* ldq,
-                    float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_dggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                    lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
-                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                    double* alpha, double* beta, double* u, lapack_int* ldu,
-                    double* v, lapack_int* ldv, double* q, lapack_int* ldq,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_cggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                    lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb, float* alpha,
-                    float* beta, lapack_complex_float* u, lapack_int* ldu,
-                    lapack_complex_float* v, lapack_int* ldv,
-                    lapack_complex_float* q, lapack_int* ldq,
-                    lapack_complex_float* work, float* rwork, lapack_int* iwork,
-                    lapack_int *info );
-void LAPACK_zggsvd( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                    lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb, double* alpha,
-                    double* beta, lapack_complex_double* u, lapack_int* ldu,
-                    lapack_complex_double* v, lapack_int* ldv,
-                    lapack_complex_double* q, lapack_int* ldq,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int* iwork, lapack_int *info );
-void LAPACK_sggsvd3( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                     lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
-                     float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                     float* alpha, float* beta, float* u, lapack_int* ldu,
-                     float* v, lapack_int* ldv, float* q, lapack_int* ldq,
-                     float* work, lapack_int* lwork, lapack_int* iwork,
-                     lapack_int *info );
-void LAPACK_dggsvd3( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                     lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
-                     double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                     double* alpha, double* beta, double* u, lapack_int* ldu,
-                     double* v, lapack_int* ldv, double* q, lapack_int* ldq,
-                     double* work, lapack_int* lwork, lapack_int* iwork,
-                     lapack_int *info );
-void LAPACK_cggsvd3( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                     lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
-                     lapack_complex_float* a, lapack_int* lda,
-                     lapack_complex_float* b, lapack_int* ldb, float* alpha,
-                     float* beta, lapack_complex_float* u, lapack_int* ldu,
-                     lapack_complex_float* v, lapack_int* ldv,
-                     lapack_complex_float* q, lapack_int* ldq,
-                     lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                     lapack_int* iwork, lapack_int *info );
-void LAPACK_zggsvd3( char* jobu, char* jobv, char* jobq, lapack_int* m,
-                     lapack_int* n, lapack_int* p, lapack_int* k, lapack_int* l,
-                     lapack_complex_double* a, lapack_int* lda,
-                     lapack_complex_double* b, lapack_int* ldb, double* alpha,
-                     double* beta, lapack_complex_double* u, lapack_int* ldu,
-                     lapack_complex_double* v, lapack_int* ldv,
-                     lapack_complex_double* q, lapack_int* ldq,
-                     lapack_complex_double* work, lapack_int* lwork,
-                     double* rwork, lapack_int* iwork, lapack_int *info );
-void LAPACK_ssygv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                   float* w, float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_dsygv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                   double* w, double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_chegv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, float* w,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int *info );
-void LAPACK_zhegv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, double* w,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int *info );
-void LAPACK_ssygvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                    float* w, float* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_dsygvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                    double* w, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_chegvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb, float* w,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                    lapack_int *info );
-void LAPACK_zhegvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb, double* w,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_ssygvx( lapack_int* itype, char* jobz, char* range, char* uplo,
-                    lapack_int* n, float* a, lapack_int* lda, float* b,
-                    lapack_int* ldb, float* vl, float* vu, lapack_int* il,
-                    lapack_int* iu, float* abstol, lapack_int* m, float* w,
-                    float* z, lapack_int* ldz, float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_dsygvx( lapack_int* itype, char* jobz, char* range, char* uplo,
-                    lapack_int* n, double* a, lapack_int* lda, double* b,
-                    lapack_int* ldb, double* vl, double* vu, lapack_int* il,
-                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                    double* z, lapack_int* ldz, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_chegvx( lapack_int* itype, char* jobz, char* range, char* uplo,
-                    lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb, float* vl,
-                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
-                    lapack_int* m, float* w, lapack_complex_float* z,
-                    lapack_int* ldz, lapack_complex_float* work,
-                    lapack_int* lwork, float* rwork, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_zhegvx( lapack_int* itype, char* jobz, char* range, char* uplo,
-                    lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb, double* vl,
-                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
-                    lapack_int* m, double* w, lapack_complex_double* z,
-                    lapack_int* ldz, lapack_complex_double* work,
-                    lapack_int* lwork, double* rwork, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_sspgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   float* ap, float* bp, float* w, float* z, lapack_int* ldz,
-                   float* work, lapack_int *info );
-void LAPACK_dspgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   double* ap, double* bp, double* w, double* z,
-                   lapack_int* ldz, double* work, lapack_int *info );
-void LAPACK_chpgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_float* ap, lapack_complex_float* bp, float* w,
-                   lapack_complex_float* z, lapack_int* ldz,
-                   lapack_complex_float* work, float* rwork, lapack_int *info );
-void LAPACK_zhpgv( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_double* ap, lapack_complex_double* bp,
-                   double* w, lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, double* rwork,
-                   lapack_int *info );
-void LAPACK_sspgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                    float* ap, float* bp, float* w, float* z, lapack_int* ldz,
-                    float* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_dspgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                    double* ap, double* bp, double* w, double* z,
-                    lapack_int* ldz, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_chpgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                    lapack_complex_float* ap, lapack_complex_float* bp,
-                    float* w, lapack_complex_float* z, lapack_int* ldz,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                    lapack_int *info );
-void LAPACK_zhpgvd( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                    lapack_complex_double* ap, lapack_complex_double* bp,
-                    double* w, lapack_complex_double* z, lapack_int* ldz,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_sspgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
-                    lapack_int* n, float* ap, float* bp, float* vl, float* vu,
-                    lapack_int* il, lapack_int* iu, float* abstol,
-                    lapack_int* m, float* w, float* z, lapack_int* ldz,
-                    float* work, lapack_int* iwork, lapack_int* ifail,
-                    lapack_int *info );
-void LAPACK_dspgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
-                    lapack_int* n, double* ap, double* bp, double* vl,
-                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
-                    lapack_int* m, double* w, double* z, lapack_int* ldz,
-                    double* work, lapack_int* iwork, lapack_int* ifail,
-                    lapack_int *info );
-void LAPACK_chpgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
-                    lapack_int* n, lapack_complex_float* ap,
-                    lapack_complex_float* bp, float* vl, float* vu,
-                    lapack_int* il, lapack_int* iu, float* abstol,
-                    lapack_int* m, float* w, lapack_complex_float* z,
-                    lapack_int* ldz, lapack_complex_float* work, float* rwork,
-                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_zhpgvx( lapack_int* itype, char* jobz, char* range, char* uplo,
-                    lapack_int* n, lapack_complex_double* ap,
-                    lapack_complex_double* bp, double* vl, double* vu,
-                    lapack_int* il, lapack_int* iu, double* abstol,
-                    lapack_int* m, double* w, lapack_complex_double* z,
-                    lapack_int* ldz, lapack_complex_double* work, double* rwork,
-                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_ssbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
-                   lapack_int* kb, float* ab, lapack_int* ldab, float* bb,
-                   lapack_int* ldbb, float* w, float* z, lapack_int* ldz,
-                   float* work, lapack_int *info );
-void LAPACK_dsbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
-                   lapack_int* kb, double* ab, lapack_int* ldab, double* bb,
-                   lapack_int* ldbb, double* w, double* z, lapack_int* ldz,
-                   double* work, lapack_int *info );
-void LAPACK_chbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
-                   lapack_int* kb, lapack_complex_float* ab, lapack_int* ldab,
-                   lapack_complex_float* bb, lapack_int* ldbb, float* w,
-                   lapack_complex_float* z, lapack_int* ldz,
-                   lapack_complex_float* work, float* rwork, lapack_int *info );
-void LAPACK_zhbgv( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
-                   lapack_int* kb, lapack_complex_double* ab, lapack_int* ldab,
-                   lapack_complex_double* bb, lapack_int* ldbb, double* w,
-                   lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, double* rwork,
-                   lapack_int *info );
-void LAPACK_ssbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
-                    lapack_int* kb, float* ab, lapack_int* ldab, float* bb,
-                    lapack_int* ldbb, float* w, float* z, lapack_int* ldz,
-                    float* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_dsbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
-                    lapack_int* kb, double* ab, lapack_int* ldab, double* bb,
-                    lapack_int* ldbb, double* w, double* z, lapack_int* ldz,
-                    double* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_chbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
-                    lapack_int* kb, lapack_complex_float* ab, lapack_int* ldab,
-                    lapack_complex_float* bb, lapack_int* ldbb, float* w,
-                    lapack_complex_float* z, lapack_int* ldz,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                    lapack_int *info );
-void LAPACK_zhbgvd( char* jobz, char* uplo, lapack_int* n, lapack_int* ka,
-                    lapack_int* kb, lapack_complex_double* ab, lapack_int* ldab,
-                    lapack_complex_double* bb, lapack_int* ldbb, double* w,
-                    lapack_complex_double* z, lapack_int* ldz,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_ssbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_int* ka, lapack_int* kb, float* ab, lapack_int* ldab,
-                    float* bb, lapack_int* ldbb, float* q, lapack_int* ldq,
-                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
-                    float* abstol, lapack_int* m, float* w, float* z,
-                    lapack_int* ldz, float* work, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_dsbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_int* ka, lapack_int* kb, double* ab,
-                    lapack_int* ldab, double* bb, lapack_int* ldbb, double* q,
-                    lapack_int* ldq, double* vl, double* vu, lapack_int* il,
-                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                    double* z, lapack_int* ldz, double* work, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_chbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_int* ka, lapack_int* kb, lapack_complex_float* ab,
-                    lapack_int* ldab, lapack_complex_float* bb,
-                    lapack_int* ldbb, lapack_complex_float* q, lapack_int* ldq,
-                    float* vl, float* vu, lapack_int* il, lapack_int* iu,
-                    float* abstol, lapack_int* m, float* w,
-                    lapack_complex_float* z, lapack_int* ldz,
-                    lapack_complex_float* work, float* rwork, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_zhbgvx( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_int* ka, lapack_int* kb, lapack_complex_double* ab,
-                    lapack_int* ldab, lapack_complex_double* bb,
-                    lapack_int* ldbb, lapack_complex_double* q, lapack_int* ldq,
-                    double* vl, double* vu, lapack_int* il, lapack_int* iu,
-                    double* abstol, lapack_int* m, double* w,
-                    lapack_complex_double* z, lapack_int* ldz,
-                    lapack_complex_double* work, double* rwork,
-                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_sgges( char* jobvsl, char* jobvsr, char* sort,
-                   LAPACK_S_SELECT3 selctg, lapack_int* n, float* a,
-                   lapack_int* lda, float* b, lapack_int* ldb, lapack_int* sdim,
-                   float* alphar, float* alphai, float* beta, float* vsl,
-                   lapack_int* ldvsl, float* vsr, lapack_int* ldvsr,
-                   float* work, lapack_int* lwork, lapack_logical* bwork,
-                   lapack_int *info );
-void LAPACK_dgges( char* jobvsl, char* jobvsr, char* sort,
-                   LAPACK_D_SELECT3 selctg, lapack_int* n, double* a,
-                   lapack_int* lda, double* b, lapack_int* ldb,
-                   lapack_int* sdim, double* alphar, double* alphai,
-                   double* beta, double* vsl, lapack_int* ldvsl, double* vsr,
-                   lapack_int* ldvsr, double* work, lapack_int* lwork,
-                   lapack_logical* bwork, lapack_int *info );
-void LAPACK_cgges( char* jobvsl, char* jobvsr, char* sort,
-                   LAPACK_C_SELECT2 selctg, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, lapack_int* sdim,
-                   lapack_complex_float* alpha, lapack_complex_float* beta,
-                   lapack_complex_float* vsl, lapack_int* ldvsl,
-                   lapack_complex_float* vsr, lapack_int* ldvsr,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_logical* bwork, lapack_int *info );
-void LAPACK_zgges( char* jobvsl, char* jobvsr, char* sort,
-                   LAPACK_Z_SELECT2 selctg, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, lapack_int* sdim,
-                   lapack_complex_double* alpha, lapack_complex_double* beta,
-                   lapack_complex_double* vsl, lapack_int* ldvsl,
-                   lapack_complex_double* vsr, lapack_int* ldvsr,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_logical* bwork, lapack_int *info );
-void LAPACK_sgges3( char* jobvsl, char* jobvsr, char* sort,
-                    LAPACK_S_SELECT3 selctg, lapack_int* n,
-                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                    lapack_int* sdim, float* alphar, float* alphai,
-                    float* beta, float* vsl, lapack_int* ldvsl,
-                    float* vsr, lapack_int* ldvsr,
-                    float* work, lapack_int* lwork, lapack_logical* bwork,
-                    lapack_int *info );
-void LAPACK_dgges3( char* jobvsl, char* jobvsr, char* sort,
-                    LAPACK_D_SELECT3 selctg, lapack_int* n, double* a,
-                    lapack_int* lda, double* b, lapack_int* ldb,
-                    lapack_int* sdim, double* alphar, double* alphai,
-                    double* beta, double* vsl, lapack_int* ldvsl, double* vsr,
-                    lapack_int* ldvsr, double* work, lapack_int* lwork,
-                    lapack_logical* bwork, lapack_int *info );
-void LAPACK_cgges3( char* jobvsl, char* jobvsr, char* sort,
-                    LAPACK_C_SELECT2 selctg, lapack_int* n,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_int* sdim,
-                    lapack_complex_float* alpha, lapack_complex_float* beta,
-                    lapack_complex_float* vsl, lapack_int* ldvsl,
-                    lapack_complex_float* vsr, lapack_int* ldvsr,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_logical* bwork, lapack_int *info );
-void LAPACK_zgges3( char* jobvsl, char* jobvsr, char* sort,
-                    LAPACK_Z_SELECT2 selctg, lapack_int* n,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb, lapack_int* sdim,
-                    lapack_complex_double* alpha, lapack_complex_double* beta,
-                    lapack_complex_double* vsl, lapack_int* ldvsl,
-                    lapack_complex_double* vsr, lapack_int* ldvsr,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_logical* bwork, lapack_int *info );
-void LAPACK_sggesx( char* jobvsl, char* jobvsr, char* sort,
-                    LAPACK_S_SELECT3 selctg, char* sense, lapack_int* n,
-                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                    lapack_int* sdim, float* alphar, float* alphai, float* beta,
-                    float* vsl, lapack_int* ldvsl, float* vsr,
-                    lapack_int* ldvsr, float* rconde, float* rcondv,
-                    float* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_logical* bwork,
-                    lapack_int *info );
-void LAPACK_dggesx( char* jobvsl, char* jobvsr, char* sort,
-                    LAPACK_D_SELECT3 selctg, char* sense, lapack_int* n,
-                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                    lapack_int* sdim, double* alphar, double* alphai,
-                    double* beta, double* vsl, lapack_int* ldvsl, double* vsr,
-                    lapack_int* ldvsr, double* rconde, double* rcondv,
-                    double* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_logical* bwork,
-                    lapack_int *info );
-void LAPACK_cggesx( char* jobvsl, char* jobvsr, char* sort,
-                    LAPACK_C_SELECT2 selctg, char* sense, lapack_int* n,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb, lapack_int* sdim,
-                    lapack_complex_float* alpha, lapack_complex_float* beta,
-                    lapack_complex_float* vsl, lapack_int* ldvsl,
-                    lapack_complex_float* vsr, lapack_int* ldvsr, float* rconde,
-                    float* rcondv, lapack_complex_float* work,
-                    lapack_int* lwork, float* rwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_logical* bwork,
-                    lapack_int *info );
-void LAPACK_zggesx( char* jobvsl, char* jobvsr, char* sort,
-                    LAPACK_Z_SELECT2 selctg, char* sense, lapack_int* n,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb, lapack_int* sdim,
-                    lapack_complex_double* alpha, lapack_complex_double* beta,
-                    lapack_complex_double* vsl, lapack_int* ldvsl,
-                    lapack_complex_double* vsr, lapack_int* ldvsr,
-                    double* rconde, double* rcondv, lapack_complex_double* work,
-                    lapack_int* lwork, double* rwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_logical* bwork,
-                    lapack_int *info );
-void LAPACK_sggev( char* jobvl, char* jobvr, lapack_int* n, float* a,
-                   lapack_int* lda, float* b, lapack_int* ldb, float* alphar,
-                   float* alphai, float* beta, float* vl, lapack_int* ldvl,
-                   float* vr, lapack_int* ldvr, float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_dggev( char* jobvl, char* jobvr, lapack_int* n, double* a,
-                   lapack_int* lda, double* b, lapack_int* ldb, double* alphar,
-                   double* alphai, double* beta, double* vl, lapack_int* ldvl,
-                   double* vr, lapack_int* ldvr, double* work,
-                   lapack_int* lwork, lapack_int *info );
-void LAPACK_cggev( char* jobvl, char* jobvr, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* alpha, lapack_complex_float* beta,
-                   lapack_complex_float* vl, lapack_int* ldvl,
-                   lapack_complex_float* vr, lapack_int* ldvr,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int *info );
-void LAPACK_zggev( char* jobvl, char* jobvr, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* alpha, lapack_complex_double* beta,
-                   lapack_complex_double* vl, lapack_int* ldvl,
-                   lapack_complex_double* vr, lapack_int* ldvr,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int *info );
-void LAPACK_sggev3( char* jobvl, char* jobvr, lapack_int* n, float* a,
-                    lapack_int* lda, float* b, lapack_int* ldb, float* alphar,
-                    float* alphai, float* beta, float* vl, lapack_int* ldvl,
-                    float* vr, lapack_int* ldvr, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dggev3( char* jobvl, char* jobvr, lapack_int* n, double* a,
-                    lapack_int* lda, double* b, lapack_int* ldb, double* alphar,
-                    double* alphai, double* beta, double* vl, lapack_int* ldvl,
-                    double* vr, lapack_int* ldvr, double* work,
-                    lapack_int* lwork, lapack_int *info );
-void LAPACK_cggev3( char* jobvl, char* jobvr, lapack_int* n,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* alpha, lapack_complex_float* beta,
-                    lapack_complex_float* vl, lapack_int* ldvl,
-                    lapack_complex_float* vr, lapack_int* ldvr,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int *info );
-void LAPACK_zggev3( char* jobvl, char* jobvr, lapack_int* n,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* alpha, lapack_complex_double* beta,
-                    lapack_complex_double* vl, lapack_int* ldvl,
-                    lapack_complex_double* vr, lapack_int* ldvr,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int *info );
-void LAPACK_sggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
-                    lapack_int* n, float* a, lapack_int* lda, float* b,
-                    lapack_int* ldb, float* alphar, float* alphai, float* beta,
-                    float* vl, lapack_int* ldvl, float* vr, lapack_int* ldvr,
-                    lapack_int* ilo, lapack_int* ihi, float* lscale,
-                    float* rscale, float* abnrm, float* bbnrm, float* rconde,
-                    float* rcondv, float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_logical* bwork,
-                    lapack_int *info );
-void LAPACK_dggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
-                    lapack_int* n, double* a, lapack_int* lda, double* b,
-                    lapack_int* ldb, double* alphar, double* alphai,
-                    double* beta, double* vl, lapack_int* ldvl, double* vr,
-                    lapack_int* ldvr, lapack_int* ilo, lapack_int* ihi,
-                    double* lscale, double* rscale, double* abnrm,
-                    double* bbnrm, double* rconde, double* rcondv, double* work,
-                    lapack_int* lwork, lapack_int* iwork, lapack_logical* bwork,
-                    lapack_int *info );
-void LAPACK_cggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
-                    lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* alpha, lapack_complex_float* beta,
-                    lapack_complex_float* vl, lapack_int* ldvl,
-                    lapack_complex_float* vr, lapack_int* ldvr, lapack_int* ilo,
-                    lapack_int* ihi, float* lscale, float* rscale, float* abnrm,
-                    float* bbnrm, float* rconde, float* rcondv,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int* iwork, lapack_logical* bwork,
-                    lapack_int *info );
-void LAPACK_zggevx( char* balanc, char* jobvl, char* jobvr, char* sense,
-                    lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* alpha, lapack_complex_double* beta,
-                    lapack_complex_double* vl, lapack_int* ldvl,
-                    lapack_complex_double* vr, lapack_int* ldvr,
-                    lapack_int* ilo, lapack_int* ihi, double* lscale,
-                    double* rscale, double* abnrm, double* bbnrm,
-                    double* rconde, double* rcondv, lapack_complex_double* work,
-                    lapack_int* lwork, double* rwork, lapack_int* iwork,
-                    lapack_logical* bwork, lapack_int *info );
-void LAPACK_dsfrk( char* transr, char* uplo, char* trans, lapack_int* n,
-                   lapack_int* k, double* alpha, const double* a,
-                   lapack_int* lda, double* beta, double* c );
-void LAPACK_ssfrk( char* transr, char* uplo, char* trans, lapack_int* n,
-                   lapack_int* k, float* alpha, const float* a, lapack_int* lda,
-                   float* beta, float* c );
-void LAPACK_zhfrk( char* transr, char* uplo, char* trans, lapack_int* n,
-                   lapack_int* k, double* alpha, const lapack_complex_double* a,
-                   lapack_int* lda, double* beta, lapack_complex_double* c );
-void LAPACK_chfrk( char* transr, char* uplo, char* trans, lapack_int* n,
-                   lapack_int* k, float* alpha, const lapack_complex_float* a,
-                   lapack_int* lda, float* beta, lapack_complex_float* c );
-void LAPACK_dtfsm( char* transr, char* side, char* uplo, char* trans,
-                   char* diag, lapack_int* m, lapack_int* n, double* alpha,
-                   const double* a, double* b, lapack_int* ldb );
-void LAPACK_stfsm( char* transr, char* side, char* uplo, char* trans,
-                   char* diag, lapack_int* m, lapack_int* n, float* alpha,
-                   const float* a, float* b, lapack_int* ldb );
-void LAPACK_ztfsm( char* transr, char* side, char* uplo, char* trans,
-                   char* diag, lapack_int* m, lapack_int* n,
-                   lapack_complex_double* alpha, const lapack_complex_double* a,
-                   lapack_complex_double* b, lapack_int* ldb );
-void LAPACK_ctfsm( char* transr, char* side, char* uplo, char* trans,
-                   char* diag, lapack_int* m, lapack_int* n,
-                   lapack_complex_float* alpha, const lapack_complex_float* a,
-                   lapack_complex_float* b, lapack_int* ldb );
-void LAPACK_dtfttp( char* transr, char* uplo, lapack_int* n, const double* arf,
-                    double* ap, lapack_int *info );
-void LAPACK_stfttp( char* transr, char* uplo, lapack_int* n, const float* arf,
-                    float* ap, lapack_int *info );
-void LAPACK_ztfttp( char* transr, char* uplo, lapack_int* n,
-                    const lapack_complex_double* arf, lapack_complex_double* ap,
-                    lapack_int *info );
-void LAPACK_ctfttp( char* transr, char* uplo, lapack_int* n,
-                    const lapack_complex_float* arf, lapack_complex_float* ap,
-                    lapack_int *info );
-void LAPACK_dtfttr( char* transr, char* uplo, lapack_int* n, const double* arf,
-                    double* a, lapack_int* lda, lapack_int *info );
-void LAPACK_stfttr( char* transr, char* uplo, lapack_int* n, const float* arf,
-                    float* a, lapack_int* lda, lapack_int *info );
-void LAPACK_ztfttr( char* transr, char* uplo, lapack_int* n,
-                    const lapack_complex_double* arf, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_ctfttr( char* transr, char* uplo, lapack_int* n,
-                    const lapack_complex_float* arf, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_dtpttf( char* transr, char* uplo, lapack_int* n, const double* ap,
-                    double* arf, lapack_int *info );
-void LAPACK_stpttf( char* transr, char* uplo, lapack_int* n, const float* ap,
-                    float* arf, lapack_int *info );
-void LAPACK_ztpttf( char* transr, char* uplo, lapack_int* n,
-                    const lapack_complex_double* ap, lapack_complex_double* arf,
-                    lapack_int *info );
-void LAPACK_ctpttf( char* transr, char* uplo, lapack_int* n,
-                    const lapack_complex_float* ap, lapack_complex_float* arf,
-                    lapack_int *info );
-void LAPACK_dtpttr( char* uplo, lapack_int* n, const double* ap, double* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_stpttr( char* uplo, lapack_int* n, const float* ap, float* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_ztpttr( char* uplo, lapack_int* n, const lapack_complex_double* ap,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_ctpttr( char* uplo, lapack_int* n, const lapack_complex_float* ap,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_dtrttf( char* transr, char* uplo, lapack_int* n, const double* a,
-                    lapack_int* lda, double* arf, lapack_int *info );
-void LAPACK_strttf( char* transr, char* uplo, lapack_int* n, const float* a,
-                    lapack_int* lda, float* arf, lapack_int *info );
-void LAPACK_ztrttf( char* transr, char* uplo, lapack_int* n,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* arf, lapack_int *info );
-void LAPACK_ctrttf( char* transr, char* uplo, lapack_int* n,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* arf, lapack_int *info );
-void LAPACK_dtrttp( char* uplo, lapack_int* n, const double* a, lapack_int* lda,
-                    double* ap, lapack_int *info );
-void LAPACK_strttp( char* uplo, lapack_int* n, const float* a, lapack_int* lda,
-                    float* ap, lapack_int *info );
-void LAPACK_ztrttp( char* uplo, lapack_int* n, const lapack_complex_double* a,
-                    lapack_int* lda, lapack_complex_double* ap,
-                    lapack_int *info );
-void LAPACK_ctrttp( char* uplo, lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, lapack_complex_float* ap,
-                    lapack_int *info );
-void LAPACK_sgeqrfp( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                     float* tau, float* work, lapack_int* lwork,
-                     lapack_int *info );
-void LAPACK_dgeqrfp( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                     double* tau, double* work, lapack_int* lwork,
-                     lapack_int *info );
-void LAPACK_cgeqrfp( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                     lapack_int* lda, lapack_complex_float* tau,
-                     lapack_complex_float* work, lapack_int* lwork,
-                     lapack_int *info );
-void LAPACK_zgeqrfp( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                     lapack_int* lda, lapack_complex_double* tau,
-                     lapack_complex_double* work, lapack_int* lwork,
-                     lapack_int *info );
-void LAPACK_clacgv( lapack_int* n, lapack_complex_float* x, lapack_int* incx );
-void LAPACK_zlacgv( lapack_int* n, lapack_complex_double* x, lapack_int* incx );
-void LAPACK_slarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n,
-                    float* x );
-void LAPACK_dlarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n,
-                    double* x );
-void LAPACK_clarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n,
-                    lapack_complex_float* x );
-void LAPACK_zlarnv( lapack_int* idist, lapack_int* iseed, lapack_int* n,
-                    lapack_complex_double* x );
-void LAPACK_sgeqr2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                    float* tau, float* work, lapack_int *info );
-void LAPACK_dgeqr2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                    double* tau, double* work, lapack_int *info );
-void LAPACK_cgeqr2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_complex_float* tau,
-                    lapack_complex_float* work, lapack_int *info );
-void LAPACK_zgeqr2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_complex_double* tau,
-                    lapack_complex_double* work, lapack_int *info );
-void LAPACK_slacn2( lapack_int* n, float* v, float* x, lapack_int* isgn,
-                    float* est, lapack_int* kase, lapack_int* isave );
-void LAPACK_dlacn2( lapack_int* n, double* v, double* x, lapack_int* isgn,
-                    double* est, lapack_int* kase, lapack_int* isave );
-void LAPACK_clacn2( lapack_int* n, lapack_complex_float* v,
-                    lapack_complex_float* x, float* est,
-                    lapack_int* kase, lapack_int* isave );
-void LAPACK_zlacn2( lapack_int* n, lapack_complex_double* v,
-                    lapack_complex_double* x, double* est,
-                    lapack_int* kase, lapack_int* isave );
-void LAPACK_slacpy( char* uplo, lapack_int* m, lapack_int* n, const float* a,
-                    lapack_int* lda, float* b, lapack_int* ldb );
-void LAPACK_dlacpy( char* uplo, lapack_int* m, lapack_int* n, const double* a,
-                    lapack_int* lda, double* b, lapack_int* ldb );
-void LAPACK_clacpy( char* uplo, lapack_int* m, lapack_int* n,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb );
-void LAPACK_zlacpy( char* uplo, lapack_int* m, lapack_int* n,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb );
-
-void LAPACK_clacp2( char* uplo, lapack_int* m, lapack_int* n, const float* a,
-                    lapack_int* lda, lapack_complex_float* b, lapack_int* ldb );
-void LAPACK_zlacp2( char* uplo, lapack_int* m, lapack_int* n, const double* a,
-                    lapack_int* lda, lapack_complex_double* b,
-                    lapack_int* ldb );
-
-void LAPACK_sgetf2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_dgetf2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int* ipiv, lapack_int *info );
-void LAPACK_cgetf2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
-void LAPACK_zgetf2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* ipiv, lapack_int *info );
-void LAPACK_slaswp( lapack_int* n, float* a, lapack_int* lda, lapack_int* k1,
-                    lapack_int* k2, const lapack_int* ipiv, lapack_int* incx );
-void LAPACK_dlaswp( lapack_int* n, double* a, lapack_int* lda, lapack_int* k1,
-                    lapack_int* k2, const lapack_int* ipiv, lapack_int* incx );
-void LAPACK_claswp( lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                    lapack_int* k1, lapack_int* k2, const lapack_int* ipiv,
-                    lapack_int* incx );
-void LAPACK_zlaswp( lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                    lapack_int* k1, lapack_int* k2, const lapack_int* ipiv,
-                    lapack_int* incx );
-float LAPACK_slange( char* norm, lapack_int* m, lapack_int* n, const float* a,
-                    lapack_int* lda, float* work );
-double LAPACK_dlange( char* norm, lapack_int* m, lapack_int* n, const double* a,
-                    lapack_int* lda, double* work );
-float LAPACK_clange( char* norm, lapack_int* m, lapack_int* n,
-                    const lapack_complex_float* a, lapack_int* lda, float* work );
-double LAPACK_zlange( char* norm, lapack_int* m, lapack_int* n,
-                    const lapack_complex_double* a, lapack_int* lda, double* work );
-float LAPACK_clanhe( char* norm, char* uplo, lapack_int* n,
-                    const lapack_complex_float* a, lapack_int* lda, float* work );
-double LAPACK_zlanhe( char* norm, char* uplo, lapack_int* n,
-                    const lapack_complex_double* a, lapack_int* lda, double* work );
-void LAPACK_clarcm( lapack_int* m, lapack_int* n, const float* a,
-                    lapack_int* lda, const lapack_complex_float* b,
-                    lapack_int* ldb, lapack_complex_float* c,
-                    lapack_int* ldc, float* work );
-void LAPACK_zlarcm( lapack_int* m, lapack_int* n, const double* a,
-                    lapack_int* lda, const lapack_complex_double* b,
-                    lapack_int* ldb, lapack_complex_double* c,
-                    lapack_int* ldc, double* work );
-void LAPACK_clacrm( lapack_int* m, lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, const float* b,
-                    lapack_int* ldb, lapack_complex_float* c,
-                    lapack_int* ldc, float* work );
-void LAPACK_zlacrm( lapack_int* m, lapack_int* n, const lapack_complex_double* a,
-                    lapack_int* lda, const double* b,
-                    lapack_int* ldb, lapack_complex_double* c,
-                    lapack_int* ldc, double* work );
-float LAPACK_slansy( char* norm, char* uplo, lapack_int* n, const float* a,
-                    lapack_int* lda, float* work );
-double LAPACK_dlansy( char* norm, char* uplo, lapack_int* n, const double* a,
-                    lapack_int* lda, double* work );
-float LAPACK_clansy( char* norm, char* uplo, lapack_int* n,
-                    const lapack_complex_float* a, lapack_int* lda, float* work );
-double LAPACK_zlansy( char* norm, char* uplo, lapack_int* n,
-                    const lapack_complex_double* a, lapack_int* lda, double* work );
-float LAPACK_slantr( char* norm, char* uplo, char* diag, lapack_int* m,
-                    lapack_int* n, const float* a, lapack_int* lda, float* work );
-double LAPACK_dlantr( char* norm, char* uplo, char* diag, lapack_int* m,
-                    lapack_int* n, const double* a, lapack_int* lda, double* work );
-float LAPACK_clantr( char* norm, char* uplo, char* diag, lapack_int* m,
-                    lapack_int* n, const lapack_complex_float* a, lapack_int* lda,
-                    float* work );
-double LAPACK_zlantr( char* norm, char* uplo, char* diag, lapack_int* m,
-                    lapack_int* n, const lapack_complex_double* a, lapack_int* lda,
-                    double* work );
-float LAPACK_slamch( char* cmach );
-double LAPACK_dlamch( char* cmach );
-void LAPACK_sgelq2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                    float* tau, float* work, lapack_int *info );
-void LAPACK_dgelq2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                    double* tau, double* work, lapack_int *info );
-void LAPACK_cgelq2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_complex_float* tau,
-                    lapack_complex_float* work, lapack_int *info );
-void LAPACK_zgelq2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_complex_double* tau,
-                    lapack_complex_double* work, lapack_int *info );
-void LAPACK_slarfb( char* side, char* trans, char* direct, char* storev,
-                    lapack_int* m, lapack_int* n, lapack_int* k, const float* v,
-                    lapack_int* ldv, const float* t, lapack_int* ldt, float* c,
-                    lapack_int* ldc, float* work, lapack_int* ldwork );
-void LAPACK_dlarfb( char* side, char* trans, char* direct, char* storev,
-                    lapack_int* m, lapack_int* n, lapack_int* k,
-                    const double* v, lapack_int* ldv, const double* t,
-                    lapack_int* ldt, double* c, lapack_int* ldc, double* work,
-                    lapack_int* ldwork );
-void LAPACK_clarfb( char* side, char* trans, char* direct, char* storev,
-                    lapack_int* m, lapack_int* n, lapack_int* k,
-                    const lapack_complex_float* v, lapack_int* ldv,
-                    const lapack_complex_float* t, lapack_int* ldt,
-                    lapack_complex_float* c, lapack_int* ldc,
-                    lapack_complex_float* work, lapack_int* ldwork );
-void LAPACK_zlarfb( char* side, char* trans, char* direct, char* storev,
-                    lapack_int* m, lapack_int* n, lapack_int* k,
-                    const lapack_complex_double* v, lapack_int* ldv,
-                    const lapack_complex_double* t, lapack_int* ldt,
-                    lapack_complex_double* c, lapack_int* ldc,
-                    lapack_complex_double* work, lapack_int* ldwork );
-void LAPACK_slarfg( lapack_int* n, float* alpha, float* x, lapack_int* incx,
-                    float* tau );
-void LAPACK_dlarfg( lapack_int* n, double* alpha, double* x, lapack_int* incx,
-                    double* tau );
-void LAPACK_clarfg( lapack_int* n, lapack_complex_float* alpha,
-                    lapack_complex_float* x, lapack_int* incx,
-                    lapack_complex_float* tau );
-void LAPACK_zlarfg( lapack_int* n, lapack_complex_double* alpha,
-                    lapack_complex_double* x, lapack_int* incx,
-                    lapack_complex_double* tau );
-void LAPACK_slassq( lapack_int *n,                 float* x, lapack_int *incx,  float* scale,  float* sumsq );
-void LAPACK_dlassq( lapack_int *n,                double* x, lapack_int *incx, double* scale, double* sumsq );
-void LAPACK_classq( lapack_int *n,  lapack_complex_float* x, lapack_int *incx,  float* scale,  float* sumsq );
-void LAPACK_zlassq( lapack_int *n, lapack_complex_double* x, lapack_int *incx, double* scale, double* sumsq );
-void LAPACK_slarft( char* direct, char* storev, lapack_int* n, lapack_int* k,
-                    const float* v, lapack_int* ldv, const float* tau, float* t,
-                    lapack_int* ldt );
-void LAPACK_dlarft( char* direct, char* storev, lapack_int* n, lapack_int* k,
-                    const double* v, lapack_int* ldv, const double* tau,
-                    double* t, lapack_int* ldt );
-void LAPACK_clarft( char* direct, char* storev, lapack_int* n, lapack_int* k,
-                    const lapack_complex_float* v, lapack_int* ldv,
-                    const lapack_complex_float* tau, lapack_complex_float* t,
-                    lapack_int* ldt );
-void LAPACK_zlarft( char* direct, char* storev, lapack_int* n, lapack_int* k,
-                    const lapack_complex_double* v, lapack_int* ldv,
-                    const lapack_complex_double* tau, lapack_complex_double* t,
-                    lapack_int* ldt );
-void LAPACK_slarfx( char* side, lapack_int* m, lapack_int* n, const float* v,
-                    float* tau, float* c, lapack_int* ldc, float* work );
-void LAPACK_dlarfx( char* side, lapack_int* m, lapack_int* n, const double* v,
-                    double* tau, double* c, lapack_int* ldc, double* work );
-void LAPACK_clarfx( char* side, lapack_int* m, lapack_int* n,
-                    const lapack_complex_float* v, lapack_complex_float* tau,
-                    lapack_complex_float* c, lapack_int* ldc,
-                    lapack_complex_float* work );
-void LAPACK_zlarfx( char* side, lapack_int* m, lapack_int* n,
-                    const lapack_complex_double* v, lapack_complex_double* tau,
-                    lapack_complex_double* c, lapack_int* ldc,
-                    lapack_complex_double* work );
-void LAPACK_slatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed,
-                    char* sym, float* d, lapack_int* mode, float* cond,
-                    float* dmax, lapack_int* kl, lapack_int* ku, char* pack,
-                    float* a, lapack_int* lda, float* work, lapack_int *info );
-void LAPACK_dlatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed,
-                    char* sym, double* d, lapack_int* mode, double* cond,
-                    double* dmax, lapack_int* kl, lapack_int* ku, char* pack,
-                    double* a, lapack_int* lda, double* work,
-                    lapack_int *info );
-void LAPACK_clatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed,
-                    char* sym, float* d, lapack_int* mode, float* cond,
-                    float* dmax, lapack_int* kl, lapack_int* ku, char* pack,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* work, lapack_int *info );
-void LAPACK_zlatms( lapack_int* m, lapack_int* n, char* dist, lapack_int* iseed,
-                    char* sym, double* d, lapack_int* mode, double* cond,
-                    double* dmax, lapack_int* kl, lapack_int* ku, char* pack,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* work, lapack_int *info );
-void LAPACK_slag2d( lapack_int* m, lapack_int* n, const float* sa,
-                    lapack_int* ldsa, double* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_dlag2s( lapack_int* m, lapack_int* n, const double* a,
-                    lapack_int* lda, float* sa, lapack_int* ldsa,
-                    lapack_int *info );
-void LAPACK_clag2z( lapack_int* m, lapack_int* n,
-                    const lapack_complex_float* sa, lapack_int* ldsa,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_zlag2c( lapack_int* m, lapack_int* n,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_float* sa, lapack_int* ldsa,
-                    lapack_int *info );
-void LAPACK_slauum( char* uplo, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_dlauum( char* uplo, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int *info );
-void LAPACK_clauum( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_zlauum( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_slagge( lapack_int* m, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, const float* d, float* a, lapack_int* lda,
-                    lapack_int* iseed, float* work, lapack_int *info );
-void LAPACK_dlagge( lapack_int* m, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, const double* d, double* a, lapack_int* lda,
-                    lapack_int* iseed, double* work, lapack_int *info );
-void LAPACK_clagge( lapack_int* m, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, const float* d, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* iseed,
-                    lapack_complex_float* work, lapack_int *info );
-void LAPACK_zlagge( lapack_int* m, lapack_int* n, lapack_int* kl,
-                    lapack_int* ku, const double* d, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* iseed,
-                    lapack_complex_double* work, lapack_int *info );
-void LAPACK_slascl( char* type, lapack_int* kl, lapack_int* ku, float* cfrom,
-                    float* cto, lapack_int* m, lapack_int* n, float* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_dlascl( char* type, lapack_int* kl, lapack_int* ku, double* cfrom,
-                    double* cto, lapack_int* m, lapack_int* n, double* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_clascl( char* type, lapack_int* kl, lapack_int* ku, float* cfrom,
-                    float* cto, lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_zlascl( char* type, lapack_int* kl, lapack_int* ku, double* cfrom,
-                    double* cto, lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int *info );
-void LAPACK_slaset( char* uplo, lapack_int* m, lapack_int* n, float* alpha,
-                    float* beta, float* a, lapack_int* lda );
-void LAPACK_dlaset( char* uplo, lapack_int* m, lapack_int* n, double* alpha,
-                    double* beta, double* a, lapack_int* lda );
-void LAPACK_claset( char* uplo, lapack_int* m, lapack_int* n,
-                    lapack_complex_float* alpha, lapack_complex_float* beta,
-                    lapack_complex_float* a, lapack_int* lda );
-void LAPACK_zlaset( char* uplo, lapack_int* m, lapack_int* n,
-                    lapack_complex_double* alpha, lapack_complex_double* beta,
-                    lapack_complex_double* a, lapack_int* lda );
-void LAPACK_slasrt( char* id, lapack_int* n, float* d, lapack_int *info );
-void LAPACK_dlasrt( char* id, lapack_int* n, double* d, lapack_int *info );
-void LAPACK_claghe( lapack_int* n, lapack_int* k, const float* d,
-                    lapack_complex_float* a, lapack_int* lda, lapack_int* iseed,
-                    lapack_complex_float* work, lapack_int *info );
-void LAPACK_zlaghe( lapack_int* n, lapack_int* k, const double* d,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_int* iseed, lapack_complex_double* work,
-                    lapack_int *info );
-void LAPACK_slagsy( lapack_int* n, lapack_int* k, const float* d, float* a,
-                    lapack_int* lda, lapack_int* iseed, float* work,
-                    lapack_int *info );
-void LAPACK_dlagsy( lapack_int* n, lapack_int* k, const double* d, double* a,
-                    lapack_int* lda, lapack_int* iseed, double* work,
-                    lapack_int *info );
-void LAPACK_clagsy( lapack_int* n, lapack_int* k, const float* d,
-                    lapack_complex_float* a, lapack_int* lda, lapack_int* iseed,
-                    lapack_complex_float* work, lapack_int *info );
-void LAPACK_zlagsy( lapack_int* n, lapack_int* k, const double* d,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_int* iseed, lapack_complex_double* work,
-                    lapack_int *info );
-void LAPACK_slapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
-                    float* x, lapack_int* ldx, lapack_int* k );
-void LAPACK_dlapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
-                    double* x, lapack_int* ldx, lapack_int* k );
-void LAPACK_clapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
-                    lapack_complex_float* x, lapack_int* ldx, lapack_int* k );
-void LAPACK_zlapmr( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
-                    lapack_complex_double* x, lapack_int* ldx, lapack_int* k );
-void LAPACK_slapmt( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
-                    float* x, lapack_int* ldx, lapack_int* k );
-void LAPACK_dlapmt( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
-                    double* x, lapack_int* ldx, lapack_int* k );
-void LAPACK_clapmt( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
-                    lapack_complex_float* x, lapack_int* ldx, lapack_int* k );
-void LAPACK_zlapmt( lapack_logical* forwrd, lapack_int* m, lapack_int* n,
-                    lapack_complex_double* x, lapack_int* ldx, lapack_int* k );
-float LAPACK_slapy2( float* x, float* y );
-double LAPACK_dlapy2( double* x, double* y );
-float LAPACK_slapy3( float* x, float* y, float* z );
-double LAPACK_dlapy3( double* x, double* y, double* z );
-void LAPACK_slartgp( float* f, float* g, float* cs, float* sn, float* r );
-void LAPACK_dlartgp( double* f, double* g, double* cs, double* sn, double* r );
-void LAPACK_slartgs( float* x, float* y, float* sigma, float* cs, float* sn );
-void LAPACK_dlartgs( double* x, double* y, double* sigma, double* cs,
-                     double* sn );
-// LAPACK 3.3.0
-void LAPACK_cbbcsd( char* jobu1, char* jobu2,
-                    char* jobv1t, char* jobv2t, char* trans,
-                    lapack_int* m, lapack_int* p, lapack_int* q,
-                    float* theta, float* phi,
-                    lapack_complex_float* u1, lapack_int* ldu1,
-                    lapack_complex_float* u2, lapack_int* ldu2,
-                    lapack_complex_float* v1t, lapack_int* ldv1t,
-                    lapack_complex_float* v2t, lapack_int* ldv2t,
-                    float* b11d, float* b11e, float* b12d,
-                    float* b12e, float* b21d, float* b21e,
-                    float* b22d, float* b22e, float* rwork,
-                    lapack_int* lrwork , lapack_int *info );
-void LAPACK_cheswapr( char* uplo, lapack_int* n, lapack_complex_float* a,
-                      lapack_int* lda, lapack_int* i1, lapack_int* i2 );
-void LAPACK_chetri2( char* uplo, lapack_int* n,
-                     lapack_complex_float* a, lapack_int* lda,
-                     const lapack_int* ipiv,
-                     lapack_complex_float* work, lapack_int* lwork , lapack_int *info );
-void LAPACK_chetri2x( char* uplo, lapack_int* n,
-                      lapack_complex_float* a, lapack_int* lda,
-                      const lapack_int* ipiv,
-                      lapack_complex_float* work, lapack_int* nb , lapack_int *info );
-void LAPACK_chetrs2( char* uplo, lapack_int* n,
-                     lapack_int* nrhs, const lapack_complex_float* a,
-                     lapack_int* lda, const lapack_int* ipiv,
-                     lapack_complex_float* b, lapack_int* ldb,
-                     lapack_complex_float* work , lapack_int *info );
-void LAPACK_csyconv( char* uplo, char* way,
-                     lapack_int* n, lapack_complex_float* a,
-                     lapack_int* lda, const lapack_int* ipiv,
-                     lapack_complex_float* e , lapack_int *info );
-void LAPACK_csyswapr( char* uplo, lapack_int* n,
-                      lapack_complex_float* a, lapack_int* lda,
-                      lapack_int* i1, lapack_int* i2 );
-void LAPACK_csytri2( char* uplo, lapack_int* n,
-                     lapack_complex_float* a, lapack_int* lda,
-                     const lapack_int* ipiv,
-                     lapack_complex_float* work, lapack_int* lwork , lapack_int *info );
-void LAPACK_csytri2x( char* uplo, lapack_int* n,
-                      lapack_complex_float* a, lapack_int* lda,
-                      const lapack_int* ipiv,
-                      lapack_complex_float* work, lapack_int* nb , lapack_int *info );
-void LAPACK_csytrs2( char* uplo, lapack_int* n,
-                     lapack_int* nrhs, const lapack_complex_float* a,
-                     lapack_int* lda, const lapack_int* ipiv,
-                     lapack_complex_float* b, lapack_int* ldb,
-                     lapack_complex_float* work , lapack_int *info );
-void LAPACK_cunbdb( char* trans, char* signs,
-                    lapack_int* m, lapack_int* p, lapack_int* q,
-                    lapack_complex_float* x11, lapack_int* ldx11,
-                    lapack_complex_float* x12, lapack_int* ldx12,
-                    lapack_complex_float* x21, lapack_int* ldx21,
-                    lapack_complex_float* x22, lapack_int* ldx22,
-                    float* theta, float* phi,
-                    lapack_complex_float* taup1,
-                    lapack_complex_float* taup2,
-                    lapack_complex_float* tauq1,
-                    lapack_complex_float* tauq2,
-                    lapack_complex_float* work, lapack_int* lwork , lapack_int *info );
-void LAPACK_cuncsd( char* jobu1, char* jobu2,
-                    char* jobv1t, char* jobv2t, char* trans,
-                    char* signs, lapack_int* m, lapack_int* p,
-                    lapack_int* q, lapack_complex_float* x11,
-                    lapack_int* ldx11, lapack_complex_float* x12,
-                    lapack_int* ldx12, lapack_complex_float* x21,
-                    lapack_int* ldx21, lapack_complex_float* x22,
-                    lapack_int* ldx22, float* theta,
-                    lapack_complex_float* u1, lapack_int* ldu1,
-                    lapack_complex_float* u2, lapack_int* ldu2,
-                    lapack_complex_float* v1t, lapack_int* ldv1t,
-                    lapack_complex_float* v2t, lapack_int* ldv2t,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    float* rwork, lapack_int* lrwork,
-                    lapack_int* iwork , lapack_int *info );
-void LAPACK_cuncsd2by1( char* jobu1, char* jobu2,
-                    char* jobv1t, lapack_int* m, lapack_int* p,
-                    lapack_int* q, lapack_complex_float* x11,
-                    lapack_int* ldx11, lapack_complex_float* x21,
-                    lapack_int* ldx21, float* theta,
-                    lapack_complex_float* u1, lapack_int* ldu1,
-                    lapack_complex_float* u2, lapack_int* ldu2,
-                    lapack_complex_float* v1t, lapack_int* ldv1t,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    float* rwork, lapack_int* lrwork,
-                    lapack_int* iwork , lapack_int *info );
-void LAPACK_dbbcsd( char* jobu1, char* jobu2,
-                    char* jobv1t, char* jobv2t, char* trans,
-                    lapack_int* m, lapack_int* p, lapack_int* q,
-                    double* theta, double* phi, double* u1,
-                    lapack_int* ldu1, double* u2, lapack_int* ldu2,
-                    double* v1t, lapack_int* ldv1t, double* v2t,
-                    lapack_int* ldv2t, double* b11d, double* b11e,
-                    double* b12d, double* b12e, double* b21d,
-                    double* b21e, double* b22d, double* b22e,
-                    double* work, lapack_int* lwork , lapack_int *info );
-void LAPACK_dorbdb( char* trans, char* signs,
-                    lapack_int* m, lapack_int* p, lapack_int* q,
-                    double* x11, lapack_int* ldx11, double* x12,
-                    lapack_int* ldx12, double* x21, lapack_int* ldx21,
-                    double* x22, lapack_int* ldx22, double* theta,
-                    double* phi, double* taup1, double* taup2,
-                    double* tauq1, double* tauq2, double* work,
-                    lapack_int* lwork , lapack_int *info );
-void LAPACK_dorcsd( char* jobu1, char* jobu2,
-                    char* jobv1t, char* jobv2t, char* trans,
-                    char* signs, lapack_int* m, lapack_int* p,
-                    lapack_int* q, double* x11, lapack_int* ldx11,
-                    double* x12, lapack_int* ldx12, double* x21,
-                    lapack_int* ldx21, double* x22, lapack_int* ldx22,
-                    double* theta, double* u1, lapack_int* ldu1,
-                    double* u2, lapack_int* ldu2, double* v1t,
-                    lapack_int* ldv1t, double* v2t, lapack_int* ldv2t,
-                    double* work, lapack_int* lwork,
-                    lapack_int* iwork , lapack_int *info );
-void LAPACK_dorcsd2by1( char* jobu1, char* jobu2,
-                    char* jobv1t, lapack_int* m, lapack_int* p,
-                    lapack_int* q, double* x11, lapack_int* ldx11,
-                    double* x21, lapack_int* ldx21,
-                    double* theta, double* u1, lapack_int* ldu1,
-                    double* u2, lapack_int* ldu2, double* v1t,
-                    lapack_int* ldv1t, double* work, lapack_int* lwork,
-                    lapack_int* iwork , lapack_int *info );
-void LAPACK_dsyconv( char* uplo, char* way,
-                     lapack_int* n, double* a, lapack_int* lda,
-                     const lapack_int* ipiv, double* e , lapack_int *info );
-void LAPACK_dsyswapr( char* uplo, lapack_int* n, double* a,
-                      lapack_int* lda, lapack_int* i1, lapack_int* i2 );
-void LAPACK_dsytri2( char* uplo, lapack_int* n,
-                     double* a, lapack_int* lda,
-                     const lapack_int* ipiv,
-                     double* work, lapack_int* lwork , lapack_int *info );
-void LAPACK_dsytri2x( char* uplo, lapack_int* n,
-                      double* a, lapack_int* lda,
-                      const lapack_int* ipiv, double* work,
-                      lapack_int* nb , lapack_int *info );
-void LAPACK_dsytrs2( char* uplo, lapack_int* n,
-                     lapack_int* nrhs, const double* a,
-                     lapack_int* lda, const lapack_int* ipiv,
-                     double* b, lapack_int* ldb, double* work , lapack_int *info );
-void LAPACK_sbbcsd( char* jobu1, char* jobu2,
-                    char* jobv1t, char* jobv2t, char* trans,
-                    lapack_int* m, lapack_int* p, lapack_int* q,
-                    float* theta, float* phi, float* u1,
-                    lapack_int* ldu1, float* u2, lapack_int* ldu2,
-                    float* v1t, lapack_int* ldv1t, float* v2t,
-                    lapack_int* ldv2t, float* b11d, float* b11e,
-                    float* b12d, float* b12e, float* b21d,
-                    float* b21e, float* b22d, float* b22e,
-                    float* work, lapack_int* lwork , lapack_int *info );
-void LAPACK_sorbdb( char* trans, char* signs,
-                    lapack_int* m, lapack_int* p, lapack_int* q,
-                    float* x11, lapack_int* ldx11, float* x12,
-                    lapack_int* ldx12, float* x21, lapack_int* ldx21,
-                    float* x22, lapack_int* ldx22, float* theta,
-                    float* phi, float* taup1, float* taup2,
-                    float* tauq1, float* tauq2, float* work,
-                    lapack_int* lwork , lapack_int *info );
-void LAPACK_sorcsd( char* jobu1, char* jobu2,
-                    char* jobv1t, char* jobv2t, char* trans,
-                    char* signs, lapack_int* m, lapack_int* p,
-                    lapack_int* q, float* x11, lapack_int* ldx11,
-                    float* x12, lapack_int* ldx12, float* x21,
-                    lapack_int* ldx21, float* x22, lapack_int* ldx22,
-                    float* theta, float* u1, lapack_int* ldu1,
-                    float* u2, lapack_int* ldu2, float* v1t,
-                    lapack_int* ldv1t, float* v2t, lapack_int* ldv2t,
-                    float* work, lapack_int* lwork,
-                    lapack_int* iwork , lapack_int *info );
-void LAPACK_sorcsd2by1( char* jobu1, char* jobu2,
-                    char* jobv1t, lapack_int* m, lapack_int* p,
-                    lapack_int* q, float* x11, lapack_int* ldx11,
-                    float* x21, lapack_int* ldx21,
-                    float* theta, float* u1, lapack_int* ldu1,
-                    float* u2, lapack_int* ldu2, float* v1t,
-                    lapack_int* ldv1t, float* work, lapack_int* lwork,
-                    lapack_int* iwork , lapack_int *info );
-void LAPACK_ssyconv( char* uplo, char* way,
-                     lapack_int* n, float* a, lapack_int* lda,
-                     const lapack_int* ipiv, float* e , lapack_int *info );
-void LAPACK_ssyswapr( char* uplo, lapack_int* n, float* a,
-                      lapack_int* lda, lapack_int* i1, lapack_int* i2 );
-void LAPACK_ssytri2( char* uplo, lapack_int* n,
-                     float* a, lapack_int* lda,
-                     const lapack_int* ipiv,
-                     float* work, lapack_int* lwork , lapack_int *info );
-void LAPACK_ssytri2x( char* uplo, lapack_int* n,
-                      float* a, lapack_int* lda,
-                      const lapack_int* ipiv, float* work,
-                      lapack_int* nb , lapack_int *info );
-void LAPACK_ssytrs2( char* uplo, lapack_int* n,
-                     lapack_int* nrhs, const float* a,
-                     lapack_int* lda, const lapack_int* ipiv,
-                     float* b, lapack_int* ldb, float* work , lapack_int *info );
-void LAPACK_zbbcsd( char* jobu1, char* jobu2,
-                    char* jobv1t, char* jobv2t, char* trans,
-                    lapack_int* m, lapack_int* p, lapack_int* q,
-                    double* theta, double* phi,
-                    lapack_complex_double* u1, lapack_int* ldu1,
-                    lapack_complex_double* u2, lapack_int* ldu2,
-                    lapack_complex_double* v1t, lapack_int* ldv1t,
-                    lapack_complex_double* v2t, lapack_int* ldv2t,
-                    double* b11d, double* b11e, double* b12d,
-                    double* b12e, double* b21d, double* b21e,
-                    double* b22d, double* b22e, double* rwork,
-                    lapack_int* lrwork , lapack_int *info );
-void LAPACK_zheswapr( char* uplo, lapack_int* n, lapack_complex_double* a,
-                      lapack_int* lda, lapack_int* i1, lapack_int* i2 );
-void LAPACK_zhetri2( char* uplo, lapack_int* n,
-                     lapack_complex_double* a, lapack_int* lda,
-                     const lapack_int* ipiv,
-                     lapack_complex_double* work, lapack_int* lwork , lapack_int *info );
-void LAPACK_zhetri2x( char* uplo, lapack_int* n,
-                      lapack_complex_double* a, lapack_int* lda,
-                      const lapack_int* ipiv,
-                      lapack_complex_double* work, lapack_int* nb , lapack_int *info );
-void LAPACK_zhetrs2( char* uplo, lapack_int* n,
-                     lapack_int* nrhs,
-                     const lapack_complex_double* a, lapack_int* lda,
-                     const lapack_int* ipiv,
-                     lapack_complex_double* b, lapack_int* ldb,
-                     lapack_complex_double* work , lapack_int *info );
-void LAPACK_zsyconv( char* uplo, char* way,
-                     lapack_int* n, lapack_complex_double* a,
-                     lapack_int* lda, const lapack_int* ipiv,
-                     lapack_complex_double* e , lapack_int *info );
-void LAPACK_zsyswapr( char* uplo, lapack_int* n, lapack_complex_double* a,
-                      lapack_int* lda, lapack_int* i1,
-                      lapack_int* i2 );
-void LAPACK_zsytri2( char* uplo, lapack_int* n,
-                     lapack_complex_double* a, lapack_int* lda,
-                     const lapack_int* ipiv,
-                     lapack_complex_double* work, lapack_int* lwork , lapack_int *info );
-void LAPACK_zsytri2x( char* uplo, lapack_int* n,
-                      lapack_complex_double* a, lapack_int* lda,
-                      const lapack_int* ipiv,
-                      lapack_complex_double* work, lapack_int* nb , lapack_int *info );
-void LAPACK_zsytrs2( char* uplo, lapack_int* n,
-                     lapack_int* nrhs,
-                     const lapack_complex_double* a, lapack_int* lda,
-                     const lapack_int* ipiv,
-                     lapack_complex_double* b, lapack_int* ldb,
-                     lapack_complex_double* work , lapack_int *info );
-void LAPACK_zunbdb( char* trans, char* signs,
-                    lapack_int* m, lapack_int* p, lapack_int* q,
-                    lapack_complex_double* x11, lapack_int* ldx11,
-                    lapack_complex_double* x12, lapack_int* ldx12,
-                    lapack_complex_double* x21, lapack_int* ldx21,
-                    lapack_complex_double* x22, lapack_int* ldx22,
-                    double* theta, double* phi,
-                    lapack_complex_double* taup1,
-                    lapack_complex_double* taup2,
-                    lapack_complex_double* tauq1,
-                    lapack_complex_double* tauq2,
-                    lapack_complex_double* work, lapack_int* lwork , lapack_int *info );
-void LAPACK_zuncsd( char* jobu1, char* jobu2,
-                    char* jobv1t, char* jobv2t, char* trans,
-                    char* signs, lapack_int* m, lapack_int* p,
-                    lapack_int* q, lapack_complex_double* x11,
-                    lapack_int* ldx11, lapack_complex_double* x12,
-                    lapack_int* ldx12, lapack_complex_double* x21,
-                    lapack_int* ldx21, lapack_complex_double* x22,
-                    lapack_int* ldx22, double* theta,
-                    lapack_complex_double* u1, lapack_int* ldu1,
-                    lapack_complex_double* u2, lapack_int* ldu2,
-                    lapack_complex_double* v1t, lapack_int* ldv1t,
-                    lapack_complex_double* v2t, lapack_int* ldv2t,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int* lrwork,
-                    lapack_int* iwork , lapack_int *info );
-void LAPACK_zuncsd2by1( char* jobu1, char* jobu2,
-                    char* jobv1t, lapack_int* m, lapack_int* p,
-                    lapack_int* q, lapack_complex_double* x11,
-                    lapack_int* ldx11, lapack_complex_double* x21,
-                    lapack_int* ldx21, double* theta,
-                    lapack_complex_double* u1, lapack_int* ldu1,
-                    lapack_complex_double* u2, lapack_int* ldu2,
-                    lapack_complex_double* v1t, lapack_int* ldv1t,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int* lrwork,
-                    lapack_int* iwork , lapack_int *info );
-// LAPACK 3.4.0
-void LAPACK_sgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
-                     lapack_int* k, lapack_int* nb, const float* v,
-                     lapack_int* ldv, const float* t, lapack_int* ldt, float* c,
-                     lapack_int* ldc, float* work, lapack_int *info );
-void LAPACK_dgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
-                     lapack_int* k, lapack_int* nb, const double* v,
-                     lapack_int* ldv, const double* t, lapack_int* ldt,
-                     double* c, lapack_int* ldc, double* work,
-                     lapack_int *info );
-void LAPACK_cgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
-                     lapack_int* k, lapack_int* nb,
-                     const lapack_complex_float* v, lapack_int* ldv,
-                     const lapack_complex_float* t, lapack_int* ldt,
-                     lapack_complex_float* c, lapack_int* ldc,
-                     lapack_complex_float* work, lapack_int *info );
-void LAPACK_zgemqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
-                     lapack_int* k, lapack_int* nb,
-                     const lapack_complex_double* v, lapack_int* ldv,
-                     const lapack_complex_double* t, lapack_int* ldt,
-                     lapack_complex_double* c, lapack_int* ldc,
-                     lapack_complex_double* work, lapack_int *info );
-void LAPACK_sgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb, float* a,
-                    lapack_int* lda, float* t, lapack_int* ldt, float* work,
-                    lapack_int *info );
-void LAPACK_dgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb, double* a,
-                    lapack_int* lda, double* t, lapack_int* ldt, double* work,
-                    lapack_int *info );
-void LAPACK_cgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* t, lapack_int* ldt,
-                    lapack_complex_float* work, lapack_int *info );
-void LAPACK_zgeqrt( lapack_int* m, lapack_int* n, lapack_int* nb,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* t, lapack_int* ldt,
-                    lapack_complex_double* work, lapack_int *info );
-void LAPACK_sgeqrt2( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                     float* t, lapack_int* ldt, lapack_int *info );
-void LAPACK_dgeqrt2( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                     double* t, lapack_int* ldt, lapack_int *info );
-void LAPACK_cgeqrt2( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                     lapack_int* lda, lapack_complex_float* t, lapack_int* ldt,
-                     lapack_int *info );
-void LAPACK_zgeqrt2( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                     lapack_int* lda, lapack_complex_double* t, lapack_int* ldt,
-                     lapack_int *info );
-void LAPACK_sgeqrt3( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                     float* t, lapack_int* ldt, lapack_int *info );
-void LAPACK_dgeqrt3( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                     double* t, lapack_int* ldt, lapack_int *info );
-void LAPACK_cgeqrt3( lapack_int* m, lapack_int* n, lapack_complex_float* a,
-                     lapack_int* lda, lapack_complex_float* t, lapack_int* ldt,
-                     lapack_int *info );
-void LAPACK_zgeqrt3( lapack_int* m, lapack_int* n, lapack_complex_double* a,
-                     lapack_int* lda, lapack_complex_double* t, lapack_int* ldt,
-                     lapack_int *info );
-void LAPACK_stpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
-                     lapack_int* k, lapack_int* l, lapack_int* nb,
-                     const float* v, lapack_int* ldv, const float* t,
-                     lapack_int* ldt, float* a, lapack_int* lda, float* b,
-                     lapack_int* ldb, float* work, lapack_int *info );
-void LAPACK_dtpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
-                     lapack_int* k, lapack_int* l, lapack_int* nb,
-                     const double* v, lapack_int* ldv, const double* t,
-                     lapack_int* ldt, double* a, lapack_int* lda, double* b,
-                     lapack_int* ldb, double* work, lapack_int *info );
-void LAPACK_ctpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
-                     lapack_int* k, lapack_int* l, lapack_int* nb,
-                     const lapack_complex_float* v, lapack_int* ldv,
-                     const lapack_complex_float* t, lapack_int* ldt,
-                     lapack_complex_float* a, lapack_int* lda,
-                     lapack_complex_float* b, lapack_int* ldb,
-                     lapack_complex_float* work, lapack_int *info );
-void LAPACK_ztpmqrt( char* side, char* trans, lapack_int* m, lapack_int* n,
-                     lapack_int* k, lapack_int* l, lapack_int* nb,
-                     const lapack_complex_double* v, lapack_int* ldv,
-                     const lapack_complex_double* t, lapack_int* ldt,
-                     lapack_complex_double* a, lapack_int* lda,
-                     lapack_complex_double* b, lapack_int* ldb,
-                     lapack_complex_double* work, lapack_int *info );
-void LAPACK_stpqrt( lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb,
-                    float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                    float* t, lapack_int* ldt, float* work, lapack_int *info );
-void LAPACK_dtpqrt( lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb,
-                    double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                    double* t, lapack_int* ldt, double* work,
-                    lapack_int *info );
-void LAPACK_ctpqrt( lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* t, lapack_int* ldt,
-                    lapack_complex_float* work, lapack_int *info );
-void LAPACK_ztpqrt( lapack_int* m, lapack_int* n, lapack_int* l, lapack_int* nb,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* t, lapack_int* ldt,
-                    lapack_complex_double* work, lapack_int *info );
-void LAPACK_stpqrt2( lapack_int* m, lapack_int* n, lapack_int* l,
-                     float* a, lapack_int* lda,
-                     float* b, lapack_int* ldb,
-                     float* t, lapack_int* ldt,
-                     lapack_int *info );
-void LAPACK_dtpqrt2( lapack_int* m, lapack_int* n, lapack_int* l,
-                     double* a, lapack_int* lda,
-                     double* b, lapack_int* ldb,
-                     double* t, lapack_int* ldt,
-                     lapack_int *info );
-void LAPACK_ctpqrt2( lapack_int* m, lapack_int* n, lapack_int* l,
-                     lapack_complex_float* a, lapack_int* lda,
-                     lapack_complex_float* b, lapack_int* ldb,
-                     lapack_complex_float* t, lapack_int* ldt,
-                     lapack_int *info );
-void LAPACK_ztpqrt2( lapack_int* m, lapack_int* n, lapack_int* l,
-                     lapack_complex_double* a, lapack_int* lda,
-                     lapack_complex_double* b, lapack_int* ldb,
-                     lapack_complex_double* t, lapack_int* ldt,
-                     lapack_int *info );
-void LAPACK_stprfb( char* side, char* trans, char* direct, char* storev,
-                    lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
-                    const float* v, lapack_int* ldv, const float* t,
-                    lapack_int* ldt, float* a, lapack_int* lda, float* b,
-                    lapack_int* ldb, const float* work,
-                    lapack_int* ldwork );
-void LAPACK_dtprfb( char* side, char* trans, char* direct, char* storev,
-                    lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
-                    const double* v, lapack_int* ldv, const double* t,
-                    lapack_int* ldt, double* a, lapack_int* lda, double* b,
-                    lapack_int* ldb, const double* work,
-                    lapack_int* ldwork );
-void LAPACK_ctprfb( char* side, char* trans, char* direct, char* storev,
-                    lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
-                    const lapack_complex_float* v, lapack_int* ldv,
-                    const lapack_complex_float* t, lapack_int* ldt,
-                    lapack_complex_float* a, lapack_int* lda,
-                    lapack_complex_float* b, lapack_int* ldb,
-                    lapack_complex_float* work, lapack_int* ldwork );
-void LAPACK_ztprfb( char* side, char* trans, char* direct, char* storev,
-                    lapack_int* m, lapack_int* n, lapack_int* k, lapack_int* l,
-                    const lapack_complex_double* v, lapack_int* ldv,
-                    const lapack_complex_double* t, lapack_int* ldt,
-                    lapack_complex_double* a, lapack_int* lda,
-                    lapack_complex_double* b, lapack_int* ldb,
-                    lapack_complex_double* work, lapack_int* ldwork );
-// LAPACK 3.5.0
-void LAPACK_ssysv_rook( char* uplo, lapack_int* n, lapack_int* nrhs, float* a,
-                        lapack_int* lda, lapack_int* ipiv, float* b,
-                        lapack_int* ldb, float* work, lapack_int* lwork,
-                        lapack_int *info );
-void LAPACK_ssytrf_rook( char* uplo, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int* ipiv, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dsysv_rook( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
-                        lapack_int* lda, lapack_int* ipiv, double* b,
-                        lapack_int* ldb, double* work, lapack_int* lwork,
-                        lapack_int *info );
-void LAPACK_dsytrf_rook( char* uplo, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int* ipiv, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_csysv_rook( char* uplo, lapack_int* n, lapack_int* nrhs,
-                        lapack_complex_float* a, lapack_int* lda,
-                        lapack_int* ipiv, lapack_complex_float* b,
-                        lapack_int* ldb, lapack_complex_float* work,
-                        lapack_int* lwork, lapack_int *info );
-void LAPACK_csytrf_rook( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* ipiv,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zsysv_rook( char* uplo, lapack_int* n, lapack_int* nrhs,
-                        lapack_complex_double* a, lapack_int* lda,
-                        lapack_int* ipiv, lapack_complex_double* b,
-                        lapack_int* ldb, lapack_complex_double* work,
-                        lapack_int* lwork, lapack_int *info );
-void LAPACK_zsytrf_rook( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* ipiv,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_ssytrs_rook( char* uplo, lapack_int* n, lapack_int* nrhs, const float* a,
-                    lapack_int* lda, const lapack_int* ipiv, float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_dsytrs_rook( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const double* a, lapack_int* lda, const lapack_int* ipiv,
-                    double* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_csytrs_rook( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_zsytrs_rook( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_chetrf_rook( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* ipiv,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zhetrf_rook( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* ipiv,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_chetrs_rook( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_float* b,
-                    lapack_int* ldb, lapack_int *info );
-void LAPACK_zhetrs_rook( char* uplo, lapack_int* n, lapack_int* nrhs,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_int* ipiv, lapack_complex_double* b,
-                    lapack_int* ldb, lapack_int *info );
-
-void LAPACK_csyr( char* uplo, lapack_int* n, lapack_complex_float* alpha,
-                      const lapack_complex_float* x, lapack_int* incx,
-                      lapack_complex_float* a, lapack_int* lda );
-void LAPACK_zsyr( char* uplo, lapack_int* n, lapack_complex_double* alpha,
-                      const lapack_complex_double* x, lapack_int* incx,
-                      lapack_complex_double* a, lapack_int* lda );
-void LAPACK_ilaver( const lapack_int* vers_major, const lapack_int* vers_minor,
-                     const lapack_int* vers_patch );
-
-// LAPACK 3.7.0
-void LAPACK_ssysv_aa( char* uplo, lapack_int* n, lapack_int* nrhs, float* a,
-                   lapack_int* lda, lapack_int* ipiv, float* b, lapack_int* ldb,
-                   float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_dsysv_aa( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
-                   lapack_int* lda, lapack_int* ipiv, double* b,
-                   lapack_int* ldb, double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_csysv_aa( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_zsysv_aa( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_chesv_aa( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* a, lapack_int* lda, lapack_int* ipiv,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_zhesv_aa( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* a, lapack_int* lda, lapack_int* ipiv,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int *info );
-
-void LAPACK_ssytrf_aa( char* uplo, lapack_int* n, float* a, lapack_int* lda,
-                    lapack_int* ipiv, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dsytrf_aa( char* uplo, lapack_int* n, double* a, lapack_int* lda,
-                    lapack_int* ipiv, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_csytrf_aa( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* ipiv,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zsytrf_aa( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* ipiv,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_chetrf_aa( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_int* ipiv,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zhetrf_aa( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_int* ipiv,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-
-void LAPACK_ssytrs_aa( char* uplo, lapack_int* n,
-                     lapack_int* nrhs, const float* a,
-                     lapack_int* lda, const lapack_int* ipiv,
-                     float* b, lapack_int* ldb, float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_dsytrs_aa( char* uplo, lapack_int* n,
-                     lapack_int* nrhs, const double* a,
-                     lapack_int* lda, const lapack_int* ipiv,
-                     double* b, lapack_int* ldb, double* work, lapack_int* lwork,  lapack_int *info );
-void LAPACK_csytrs_aa( char* uplo, lapack_int* n,
-                     lapack_int* nrhs, const lapack_complex_float* a,
-                     lapack_int* lda, const lapack_int* ipiv,
-                     lapack_complex_float* b, lapack_int* ldb,
-                     lapack_complex_float* work , lapack_int* lwork, lapack_int *info );
-void LAPACK_zsytrs_aa( char* uplo, lapack_int* n,
-                     lapack_int* nrhs,
-                     const lapack_complex_double* a, lapack_int* lda,
-                     const lapack_int* ipiv,
-                     lapack_complex_double* b, lapack_int* ldb,
-                     lapack_complex_double* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_chetrs_aa( char* uplo, lapack_int* n,
-                     lapack_int* nrhs, const lapack_complex_float* a,
-                     lapack_int* lda, const lapack_int* ipiv,
-                     lapack_complex_float* b, lapack_int* ldb,
-                     lapack_complex_float* work , lapack_int* lwork, lapack_int *info );
-void LAPACK_zhetrs_aa( char* uplo, lapack_int* n,
-                     lapack_int* nrhs,
-                     const lapack_complex_double* a, lapack_int* lda,
-                     const lapack_int* ipiv,
-                     lapack_complex_double* b, lapack_int* ldb,
-                     lapack_complex_double* work, lapack_int* lwork, lapack_int *info );
-
-void LAPACK_ssysv_rk( char* uplo, lapack_int* n, lapack_int* nrhs, float* a,
-                   lapack_int* lda, float* e, lapack_int* ipiv, float* b, lapack_int* ldb,
-                   float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_dsysv_rk( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
-                   lapack_int* lda, double* e, lapack_int* ipiv, double* b,
-                   lapack_int* ldb, double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_csysv_rk( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* e, lapack_int* ipiv,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_zsysv_rk( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* e, lapack_int* ipiv,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_chesv_rk( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* e, lapack_int* ipiv,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_zhesv_rk( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* e, lapack_int* ipiv,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int *info );
-
-void LAPACK_ssytrf_rk( char* uplo, lapack_int* n, float* a, lapack_int* lda,
-                    float* e, lapack_int* ipiv, float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_dsytrf_rk( char* uplo, lapack_int* n, double* a, lapack_int* lda,
-                    double* e, lapack_int* ipiv, double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_csytrf_rk( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_complex_float* e, lapack_int* ipiv,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zsytrf_rk( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_complex_double* e, lapack_int* ipiv,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_chetrf_rk( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, lapack_complex_float* e, lapack_int* ipiv,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int *info );
-void LAPACK_zhetrf_rk( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, lapack_complex_double* e, lapack_int* ipiv,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int *info );
-
-void LAPACK_ssytrs_3( char* uplo, lapack_int* n,
-                     lapack_int* nrhs, const float* a,
-                     lapack_int* lda, const float* e, const lapack_int* ipiv,
-                     float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_dsytrs_3( char* uplo, lapack_int* n,
-                     lapack_int* nrhs, const double* a,
-                     lapack_int* lda, const double* e, const lapack_int* ipiv,
-                     double* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_csytrs_3( char* uplo, lapack_int* n,
-                     lapack_int* nrhs, const lapack_complex_float* a,
-                     lapack_int* lda, const lapack_complex_float* e,
-                     const lapack_int* ipiv,
-                     lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_zsytrs_3( char* uplo, lapack_int* n,
-                     lapack_int* nrhs,
-                     const lapack_complex_double* a, lapack_int* lda,
-                     const lapack_complex_double* e, const lapack_int* ipiv,
-                     lapack_complex_double* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_chetrs_3( char* uplo, lapack_int* n,
-                     lapack_int* nrhs, const lapack_complex_float* a,
-                     lapack_int* lda, const lapack_complex_float* e,
-                     const lapack_int* ipiv,
-                     lapack_complex_float* b, lapack_int* ldb, lapack_int *info );
-void LAPACK_zhetrs_3( char* uplo, lapack_int* n,
-                     lapack_int* nrhs,
-                     const lapack_complex_double* a, lapack_int* lda,
-                     const lapack_complex_double* e,
-                     const lapack_int* ipiv,
-                     lapack_complex_double* b, lapack_int* ldb,  lapack_int *info );
-
-void LAPACK_ssytri_3( char* uplo, lapack_int* n, float* a, lapack_int* lda, const float* e,
-                    const lapack_int* ipiv, float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_dsytri_3( char* uplo, lapack_int* n, double* a, lapack_int* lda, const double* e,
-                    const lapack_int* ipiv, double* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_csytri_3( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* e, const lapack_int* ipiv,
-                    lapack_complex_float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_zsytri_3( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, const lapack_complex_double* e, const lapack_int* ipiv,
-                    lapack_complex_double* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_chetri_3( char* uplo, lapack_int* n, lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* e, const lapack_int* ipiv,
-                    lapack_complex_float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_zhetri_3( char* uplo, lapack_int* n, lapack_complex_double* a,
-                    lapack_int* lda, const lapack_complex_double* e, const lapack_int* ipiv,
-                    lapack_complex_double* work, lapack_int* lwork, lapack_int *info );
-
-void LAPACK_ssycon_3( char* uplo, lapack_int* n, const float* a, lapack_int* lda, const float* e,
-                    const lapack_int* ipiv, float* anorm, float* rcond,
-                    float* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_dsycon_3( char* uplo, lapack_int* n, const double* a, lapack_int* lda, const double* e,
-                    const lapack_int* ipiv, double* anorm, double* rcond,
-                    double* work, lapack_int* iwork, lapack_int *info );
-void LAPACK_csycon_3( char* uplo, lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* e, const lapack_int* ipiv, float* anorm,
-                    float* rcond, lapack_complex_float* work,
-                    lapack_int *info );
-void LAPACK_zsycon_3( char* uplo, lapack_int* n, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_complex_double* e, const lapack_int* ipiv, double* anorm,
-                    double* rcond, lapack_complex_double* work,
-                    lapack_int *info );
-void LAPACK_checon_3( char* uplo, lapack_int* n, const lapack_complex_float* a,
-                    lapack_int* lda, const lapack_complex_float* e, const lapack_int* ipiv, float* anorm,
-                    float* rcond, lapack_complex_float* work,
-                    lapack_int *info );
-void LAPACK_zhecon_3( char* uplo, lapack_int* n, const lapack_complex_double* a,
-                    lapack_int* lda, const lapack_complex_double* e, const lapack_int* ipiv, double* anorm,
-                    double* rcond, lapack_complex_double* work,
-                    lapack_int *info );
-
-void LAPACK_sgelq( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                   float* t, lapack_int* tsize, float* work, lapack_int* lwork,
-                   lapack_int* info );
-void LAPACK_dgelq( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                   double* t, lapack_int* tsize, double* work, lapack_int* lwork,
-                   lapack_int* info );
-void LAPACK_cgelq( lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* t, lapack_int* tsize, lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int* info );
-void LAPACK_zgelq( lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* t, lapack_int* tsize, lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int* info );
-
-void LAPACK_sgemlq( char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k,
-                    const float* a, lapack_int* lda,
-                    const float* t, lapack_int* tsize,
-                    float* c, lapack_int* ldc,
-                    float* work, lapack_int* lwork,
-                    lapack_int* info );
-void LAPACK_dgemlq( char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k,
-                    const double* a, lapack_int* lda,
-                    const double* t, lapack_int* tsize,
-                    double* c, lapack_int* ldc,
-                    double* work, lapack_int* lwork,
-                    lapack_int* info );
-void LAPACK_cgemlq( char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* t, lapack_int* tsize,
-                    lapack_complex_float* c, lapack_int* ldc,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int* info );
-void LAPACK_zgemlq( char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* t, lapack_int* tsize,
-                    lapack_complex_double* c, lapack_int* ldc,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int* info );
-
-void LAPACK_sgeqr( lapack_int* m, lapack_int* n, float* a, lapack_int* lda,
-                   float* t, lapack_int* tsize, float* work, lapack_int* lwork,
-                   lapack_int* info );
-void LAPACK_dgeqr( lapack_int* m, lapack_int* n, double* a, lapack_int* lda,
-                   double* t, lapack_int* tsize, double* work, lapack_int* lwork,
-                   lapack_int* info );
-void LAPACK_cgeqr( lapack_int* m, lapack_int* n, lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* t, lapack_int* tsize, lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int* info );
-void LAPACK_zgeqr( lapack_int* m, lapack_int* n, lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* t, lapack_int* tsize, lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int* info );
-
-void LAPACK_sgemqr( char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k,
-                    const float* a, lapack_int* lda,
-                    const float* t, lapack_int* tsize,
-                    float* c, lapack_int* ldc,
-                    float* work, lapack_int* lwork,
-                    lapack_int* info );
-void LAPACK_dgemqr( char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k,
-                    const double* a, lapack_int* lda,
-                    const double* t, lapack_int* tsize,
-                    double* c, lapack_int* ldc,
-                    double* work, lapack_int* lwork,
-                    lapack_int* info );
-void LAPACK_cgemqr( char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k,
-                    const lapack_complex_float* a, lapack_int* lda,
-                    const lapack_complex_float* t, lapack_int* tsize,
-                    lapack_complex_float* c, lapack_int* ldc,
-                    lapack_complex_float* work, lapack_int* lwork,
-                    lapack_int* info );
-void LAPACK_zgemqr( char* side, char* trans, lapack_int* m, lapack_int* n, lapack_int* k,
-                    const lapack_complex_double* a, lapack_int* lda,
-                    const lapack_complex_double* t, lapack_int* tsize,
-                    lapack_complex_double* c, lapack_int* ldc,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    lapack_int* info );
-
-void LAPACK_sgetsls( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                     float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                     float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_dgetsls( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                     double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                     double* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_cgetsls( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                     lapack_complex_float* a, lapack_int* lda,
-                     lapack_complex_float* b, lapack_int* ldb,
-                     lapack_complex_float* work, lapack_int* lwork,
-                     lapack_int *info );
-void LAPACK_zgetsls( char* trans, lapack_int* m, lapack_int* n, lapack_int* nrhs,
-                     lapack_complex_double* a, lapack_int* lda,
-                     lapack_complex_double* b, lapack_int* ldb,
-                     lapack_complex_double* work, lapack_int* lwork,
-                     lapack_int *info );
-
-void LAPACK_ssyev_2stage( char* jobz, char* uplo, lapack_int* n, float* a,
-                   lapack_int* lda, float* w, float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_dsyev_2stage( char* jobz, char* uplo, lapack_int* n, double* a,
-                   lapack_int* lda, double* w, double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_cheev_2stage( char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda, float* w,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int *info );
-void LAPACK_zheev_2stage( char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda, double* w,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int *info );
-void LAPACK_ssyevd_2stage( char* jobz, char* uplo, lapack_int* n, float* a,
-                    lapack_int* lda, float* w, float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_dsyevd_2stage( char* jobz, char* uplo, lapack_int* n, double* a,
-                    lapack_int* lda, double* w, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_cheevd_2stage( char* jobz, char* uplo, lapack_int* n,
-                    lapack_complex_float* a, lapack_int* lda, float* w,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                    lapack_int *info );
-void LAPACK_zheevd_2stage( char* jobz, char* uplo, lapack_int* n,
-                    lapack_complex_double* a, lapack_int* lda, double* w,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_ssyevx_2stage( char* jobz, char* range, char* uplo, lapack_int* n,
-                    float* a, lapack_int* lda, float* vl, float* vu,
-                    lapack_int* il, lapack_int* iu, float* abstol,
-                    lapack_int* m, float* w, float* z, lapack_int* ldz,
-                    float* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_dsyevx_2stage( char* jobz, char* range, char* uplo, lapack_int* n,
-                    double* a, lapack_int* lda, double* vl, double* vu,
-                    lapack_int* il, lapack_int* iu, double* abstol,
-                    lapack_int* m, double* w, double* z, lapack_int* ldz,
-                    double* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_cheevx_2stage( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_complex_float* a, lapack_int* lda, float* vl,
-                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
-                    lapack_int* m, float* w, lapack_complex_float* z,
-                    lapack_int* ldz, lapack_complex_float* work,
-                    lapack_int* lwork, float* rwork, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_zheevx_2stage( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_complex_double* a, lapack_int* lda, double* vl,
-                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
-                    lapack_int* m, double* w, lapack_complex_double* z,
-                    lapack_int* ldz, lapack_complex_double* work,
-                    lapack_int* lwork, double* rwork, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_ssyevr_2stage( char* jobz, char* range, char* uplo, lapack_int* n,
-                    float* a, lapack_int* lda, float* vl, float* vu,
-                    lapack_int* il, lapack_int* iu, float* abstol,
-                    lapack_int* m, float* w, float* z, lapack_int* ldz,
-                    lapack_int* isuppz, float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_dsyevr_2stage( char* jobz, char* range, char* uplo, lapack_int* n,
-                    double* a, lapack_int* lda, double* vl, double* vu,
-                    lapack_int* il, lapack_int* iu, double* abstol,
-                    lapack_int* m, double* w, double* z, lapack_int* ldz,
-                    lapack_int* isuppz, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_cheevr_2stage( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_complex_float* a, lapack_int* lda, float* vl,
-                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
-                    lapack_int* m, float* w, lapack_complex_float* z,
-                    lapack_int* ldz, lapack_int* isuppz,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                    lapack_int *info );
-void LAPACK_zheevr_2stage( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_complex_double* a, lapack_int* lda, double* vl,
-                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
-                    lapack_int* m, double* w, lapack_complex_double* z,
-                    lapack_int* ldz, lapack_int* isuppz,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_ssbev_2stage( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                   float* ab, lapack_int* ldab, float* w, float* z,
-                   lapack_int* ldz, float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_dsbev_2stage( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                   double* ab, lapack_int* ldab, double* w, double* z,
-                   lapack_int* ldz, double* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_chbev_2stage( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                   lapack_complex_float* ab, lapack_int* ldab, float* w,
-                   lapack_complex_float* z, lapack_int* ldz,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork, lapack_int *info );
-void LAPACK_zhbev_2stage( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                   lapack_complex_double* ab, lapack_int* ldab, double* w,
-                   lapack_complex_double* z, lapack_int* ldz,
-                   lapack_complex_double* work, lapack_int* lwork, double* rwork,
-                   lapack_int *info );
-void LAPACK_ssbevd_2stage( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                    float* ab, lapack_int* ldab, float* w, float* z,
-                    lapack_int* ldz, float* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_dsbevd_2stage( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                    double* ab, lapack_int* ldab, double* w, double* z,
-                    lapack_int* ldz, double* work, lapack_int* lwork,
-                    lapack_int* iwork, lapack_int* liwork, lapack_int *info );
-void LAPACK_chbevd_2stage( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                    lapack_complex_float* ab, lapack_int* ldab, float* w,
-                    lapack_complex_float* z, lapack_int* ldz,
-                    lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int* lrwork, lapack_int* iwork, lapack_int* liwork,
-                    lapack_int *info );
-void LAPACK_zhbevd_2stage( char* jobz, char* uplo, lapack_int* n, lapack_int* kd,
-                    lapack_complex_double* ab, lapack_int* ldab, double* w,
-                    lapack_complex_double* z, lapack_int* ldz,
-                    lapack_complex_double* work, lapack_int* lwork,
-                    double* rwork, lapack_int* lrwork, lapack_int* iwork,
-                    lapack_int* liwork, lapack_int *info );
-void LAPACK_ssbevx_2stage( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_int* kd, float* ab, lapack_int* ldab, float* q,
-                    lapack_int* ldq, float* vl, float* vu, lapack_int* il,
-                    lapack_int* iu, float* abstol, lapack_int* m, float* w,
-                    float* z, lapack_int* ldz, float* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_dsbevx_2stage( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_int* kd, double* ab, lapack_int* ldab, double* q,
-                    lapack_int* ldq, double* vl, double* vu, lapack_int* il,
-                    lapack_int* iu, double* abstol, lapack_int* m, double* w,
-                    double* z, lapack_int* ldz, double* work, lapack_int* lwork, lapack_int* iwork,
-                    lapack_int* ifail, lapack_int *info );
-void LAPACK_chbevx_2stage( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_int* kd, lapack_complex_float* ab, lapack_int* ldab,
-                    lapack_complex_float* q, lapack_int* ldq, float* vl,
-                    float* vu, lapack_int* il, lapack_int* iu, float* abstol,
-                    lapack_int* m, float* w, lapack_complex_float* z,
-                    lapack_int* ldz, lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_zhbevx_2stage( char* jobz, char* range, char* uplo, lapack_int* n,
-                    lapack_int* kd, lapack_complex_double* ab, lapack_int* ldab,
-                    lapack_complex_double* q, lapack_int* ldq, double* vl,
-                    double* vu, lapack_int* il, lapack_int* iu, double* abstol,
-                    lapack_int* m, double* w, lapack_complex_double* z,
-                    lapack_int* ldz, lapack_complex_double* work, lapack_int* lwork, double* rwork,
-                    lapack_int* iwork, lapack_int* ifail, lapack_int *info );
-void LAPACK_ssygv_2stage( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   float* a, lapack_int* lda, float* b, lapack_int* ldb,
-                   float* w, float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_dsygv_2stage( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   double* a, lapack_int* lda, double* b, lapack_int* ldb,
-                   double* w, double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_chegv_2stage( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* b, lapack_int* ldb, float* w,
-                   lapack_complex_float* work, lapack_int* lwork, float* rwork,
-                   lapack_int *info );
-void LAPACK_zhegv_2stage( lapack_int* itype, char* jobz, char* uplo, lapack_int* n,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* b, lapack_int* ldb, double* w,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   double* rwork, lapack_int *info );
-
-//LAPACK 3.8.0
-
-void LAPACK_ssysv_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs, 
-				   float* a, lapack_int* lda, float* tb, lapack_int* ltb,
-				   lapack_int* ipiv, lapack_int* ipiv2, float* b, lapack_int* ldb,
-                   float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_dsysv_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
-                   lapack_int* lda, double* tb, lapack_int* ltb,
-                   lapack_int* ipiv, lapack_int* ipiv2, double* b,
-                   lapack_int* ldb, double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_csysv_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* a, lapack_int* lda, 
-                   lapack_complex_float* tb, lapack_int* ltb, 
-                   lapack_int* ipiv, lapack_int* ipiv2,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_zsysv_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* tb, lapack_int* ltb,
-                   lapack_int* ipiv, lapack_int* ipiv2,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_chesv_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* tb, lapack_int* ltb,
-                   lapack_int* ipiv, lapack_int* ipiv2,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_zhesv_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* tb, lapack_int* ltb,
-                   lapack_int* ipiv, lapack_int* ipiv2,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int *info );
-
-void LAPACK_ssytrf_aa_2stage( char* uplo, lapack_int* n,
-				   float* a, lapack_int* lda, float* tb, lapack_int* ltb,
-				   lapack_int* ipiv, lapack_int* ipiv2, 
-                   float* work, lapack_int* lwork, lapack_int *info );
-void LAPACK_dsytrf_aa_2stage( char* uplo, lapack_int* n, double* a,
-                   lapack_int* lda, double* tb, lapack_int* ltb,
-                   lapack_int* ipiv, lapack_int* ipiv2,
-                   double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_csytrf_aa_2stage( char* uplo, lapack_int* n, 
-                   lapack_complex_float* a, lapack_int* lda, 
-                   lapack_complex_float* tb, lapack_int* ltb, 
-                   lapack_int* ipiv, lapack_int* ipiv2,
-                   lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_zsytrf_aa_2stage( char* uplo, lapack_int* n, 
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* tb, lapack_int* ltb,
-                   lapack_int* ipiv, lapack_int* ipiv2,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_chetrf_aa_2stage( char* uplo, lapack_int* n, 
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* tb, lapack_int* ltb,
-                   lapack_int* ipiv, lapack_int* ipiv2,
-                   lapack_complex_float* work, lapack_int* lwork,
-                   lapack_int *info );
-void LAPACK_zhetrf_aa_2stage( char* uplo, lapack_int* n, 
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* tb, lapack_int* ltb,
-                   lapack_int* ipiv, lapack_int* ipiv2,
-                   lapack_complex_double* work, lapack_int* lwork,
-                   lapack_int *info );
-
-void LAPACK_ssytrs_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs, 
-				   float* a, lapack_int* lda, float* tb, lapack_int* ltb,
-				   lapack_int* ipiv, lapack_int* ipiv2, float* b, lapack_int* ldb,
-                lapack_int *info );
-void LAPACK_dsytrs_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs, double* a,
-                   lapack_int* lda, double* tb, lapack_int* ltb,
-                   lapack_int* ipiv, lapack_int* ipiv2, double* b,
-                   lapack_int* ldb, lapack_int *info );
-void LAPACK_csytrs_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* a, lapack_int* lda, 
-                   lapack_complex_float* tb, lapack_int* ltb, 
-                   lapack_int* ipiv, lapack_int* ipiv2,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_zsytrs_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* tb, lapack_int* ltb,
-                   lapack_int* ipiv, lapack_int* ipiv2,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_chetrs_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_float* a, lapack_int* lda,
-                   lapack_complex_float* tb, lapack_int* ltb,
-                   lapack_int* ipiv, lapack_int* ipiv2,
-                   lapack_complex_float* b, lapack_int* ldb,
-                   lapack_int *info );
-void LAPACK_zhetrs_aa_2stage( char* uplo, lapack_int* n, lapack_int* nrhs,
-                   lapack_complex_double* a, lapack_int* lda,
-                   lapack_complex_double* tb, lapack_int* ltb,
-                   lapack_int* ipiv, lapack_int* ipiv2,
-                   lapack_complex_double* b, lapack_int* ldb,
-                   lapack_int *info );
                    
 /* APIs for set/get nancheck flags */
 void LAPACKE_set_nancheck( int flag );
diff --git a/lapack-netlib/LAPACKE/src/CMakeLists.txt b/lapack-netlib/LAPACKE/src/CMakeLists.txt
index 26e52acfa..4c13dce0b 100644
--- a/lapack-netlib/LAPACKE/src/CMakeLists.txt
+++ b/lapack-netlib/LAPACKE/src/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(SOURCES
+set(SOURCES_COMPLEX
 lapacke_cbbcsd.c
 lapacke_cbbcsd_work.c
 lapacke_cbdsqr.c
@@ -78,11 +78,11 @@ lapacke_cgeqrf_work.c
 lapacke_cgeqrfp.c
 lapacke_cgeqrfp_work.c
 lapacke_cgeqrt.c
+lapacke_cgeqrt_work.c
 lapacke_cgeqrt2.c
 lapacke_cgeqrt2_work.c
 lapacke_cgeqrt3.c
 lapacke_cgeqrt3_work.c
-lapacke_cgeqrt_work.c
 lapacke_cgerfs.c
 lapacke_cgerfs_work.c
 lapacke_cgerqf.c
@@ -93,6 +93,8 @@ lapacke_cgesv.c
 lapacke_cgesv_work.c
 lapacke_cgesvd.c
 lapacke_cgesvd_work.c
+lapacke_cgesvdq.c
+lapacke_cgesvdq_work.c
 lapacke_cgesvdx.c
 lapacke_cgesvdx_work.c
 lapacke_cgesvj.c
@@ -129,10 +131,10 @@ lapacke_cggevx.c
 lapacke_cggevx_work.c
 lapacke_cggglm.c
 lapacke_cggglm_work.c
-lapacke_cgghrd.c
-lapacke_cgghrd_work.c
 lapacke_cgghd3.c
 lapacke_cgghd3_work.c
+lapacke_cgghrd.c
+lapacke_cgghrd_work.c
 lapacke_cgglse.c
 lapacke_cgglse_work.c
 lapacke_cggqrf.c
@@ -157,14 +159,14 @@ lapacke_cgttrs.c
 lapacke_cgttrs_work.c
 lapacke_chbev.c
 lapacke_chbev_work.c
-lapacke_chbevd.c
-lapacke_chbevd_work.c
-lapacke_chbevx.c
-lapacke_chbevx_work.c
 lapacke_chbev_2stage.c
 lapacke_chbev_2stage_work.c
+lapacke_chbevd.c
+lapacke_chbevd_work.c
 lapacke_chbevd_2stage.c
 lapacke_chbevd_2stage_work.c
+lapacke_chbevx.c
+lapacke_chbevx_work.c
 lapacke_chbevx_2stage.c
 lapacke_chbevx_2stage_work.c
 lapacke_chbgst.c
@@ -185,18 +187,18 @@ lapacke_cheequb.c
 lapacke_cheequb_work.c
 lapacke_cheev.c
 lapacke_cheev_work.c
-lapacke_cheevd.c
-lapacke_cheevd_work.c
-lapacke_cheevr.c
-lapacke_cheevr_work.c
-lapacke_cheevx.c
-lapacke_cheevx_work.c
 lapacke_cheev_2stage.c
 lapacke_cheev_2stage_work.c
+lapacke_cheevd.c
+lapacke_cheevd_work.c
 lapacke_cheevd_2stage.c
 lapacke_cheevd_2stage_work.c
+lapacke_cheevr.c
+lapacke_cheevr_work.c
 lapacke_cheevr_2stage.c
 lapacke_cheevr_2stage_work.c
+lapacke_cheevx.c
+lapacke_cheevx_work.c
 lapacke_cheevx_2stage.c
 lapacke_cheevx_2stage_work.c
 lapacke_chegst.c
@@ -214,8 +216,8 @@ lapacke_cherfs_work.c
 lapacke_chesv.c
 lapacke_chesv_work.c
 lapacke_chesv_aa.c
-lapacke_chesv_aa_2stage.c
 lapacke_chesv_aa_work.c
+lapacke_chesv_aa_2stage.c
 lapacke_chesv_aa_2stage_work.c
 lapacke_chesv_rk.c
 lapacke_chesv_rk_work.c
@@ -226,35 +228,35 @@ lapacke_cheswapr_work.c
 lapacke_chetrd.c
 lapacke_chetrd_work.c
 lapacke_chetrf.c
-lapacke_chetrf_rook.c
 lapacke_chetrf_work.c
-lapacke_chetrf_rook_work.c
 lapacke_chetrf_aa.c
-lapacke_chetrf_aa_2stage.c
 lapacke_chetrf_aa_work.c
+lapacke_chetrf_aa_2stage.c
 lapacke_chetrf_aa_2stage_work.c
 lapacke_chetrf_rk.c
 lapacke_chetrf_rk_work.c
+lapacke_chetrf_rook.c
+lapacke_chetrf_rook_work.c
 lapacke_chetri.c
+lapacke_chetri_work.c
 lapacke_chetri2.c
 lapacke_chetri2_work.c
-lapacke_chetri_3.c
-lapacke_chetri_3_work.c
 lapacke_chetri2x.c
 lapacke_chetri2x_work.c
-lapacke_chetri_work.c
+lapacke_chetri_3.c
+lapacke_chetri_3_work.c
 lapacke_chetrs.c
-lapacke_chetrs_rook.c
+lapacke_chetrs_work.c
 lapacke_chetrs2.c
 lapacke_chetrs2_work.c
-lapacke_chetrs_work.c
-lapacke_chetrs_rook_work.c
+lapacke_chetrs_3.c
+lapacke_chetrs_3_work.c
 lapacke_chetrs_aa.c
-lapacke_chetrs_aa_2stage.c
 lapacke_chetrs_aa_work.c
+lapacke_chetrs_aa_2stage.c
 lapacke_chetrs_aa_2stage_work.c
-lapacke_chetrs_3.c
-lapacke_chetrs_3_work.c
+lapacke_chetrs_rook.c
+lapacke_chetrs_rook_work.c
 lapacke_chfrk.c
 lapacke_chfrk_work.c
 lapacke_chgeqz.c
@@ -445,52 +447,54 @@ lapacke_csyconv.c
 lapacke_csyconv_work.c
 lapacke_csyequb.c
 lapacke_csyequb_work.c
+lapacke_csyr.c
+lapacke_csyr_work.c
 lapacke_csyrfs.c
 lapacke_csyrfs_work.c
 lapacke_csysv.c
-lapacke_csysv_rook.c
-lapacke_csysv_rook_work.c
 lapacke_csysv_work.c
 lapacke_csysv_aa.c
-lapacke_csysv_aa_2stage.c
 lapacke_csysv_aa_work.c
+lapacke_csysv_aa_2stage.c
 lapacke_csysv_aa_2stage_work.c
 lapacke_csysv_rk.c
 lapacke_csysv_rk_work.c
+lapacke_csysv_rook.c
+lapacke_csysv_rook_work.c
 lapacke_csysvx.c
 lapacke_csysvx_work.c
 lapacke_csyswapr.c
 lapacke_csyswapr_work.c
 lapacke_csytrf.c
 lapacke_csytrf_work.c
-lapacke_csytrf_rook.c
-lapacke_csytrf_rook_work.c
 lapacke_csytrf_aa.c
-lapacke_csytrf_aa_2stage.c
 lapacke_csytrf_aa_work.c
+lapacke_csytrf_aa_2stage.c
 lapacke_csytrf_aa_2stage_work.c
 lapacke_csytrf_rk.c
 lapacke_csytrf_rk_work.c
+lapacke_csytrf_rook.c
+lapacke_csytrf_rook_work.c
 lapacke_csytri.c
+lapacke_csytri_work.c
 lapacke_csytri2.c
 lapacke_csytri2_work.c
-lapacke_csytri_3.c
-lapacke_csytri_3_work.c
 lapacke_csytri2x.c
 lapacke_csytri2x_work.c
-lapacke_csytri_work.c
+lapacke_csytri_3.c
+lapacke_csytri_3_work.c
 lapacke_csytrs.c
-lapacke_csytrs_rook.c
+lapacke_csytrs_work.c
 lapacke_csytrs2.c
 lapacke_csytrs2_work.c
-lapacke_csytrs_work.c
-lapacke_csytrs_rook_work.c
+lapacke_csytrs_3.c
+lapacke_csytrs_3_work.c
 lapacke_csytrs_aa.c
-lapacke_csytrs_aa_2stage.c
 lapacke_csytrs_aa_work.c
+lapacke_csytrs_aa_2stage.c
 lapacke_csytrs_aa_2stage_work.c
-lapacke_csytrs_3.c
-lapacke_csytrs_3_work.c
+lapacke_csytrs_rook.c
+lapacke_csytrs_rook_work.c
 lapacke_ctbcon.c
 lapacke_ctbcon_work.c
 lapacke_ctbrfs.c
@@ -522,9 +526,9 @@ lapacke_ctpcon_work.c
 lapacke_ctpmqrt.c
 lapacke_ctpmqrt_work.c
 lapacke_ctpqrt.c
+lapacke_ctpqrt_work.c
 lapacke_ctpqrt2.c
 lapacke_ctpqrt2_work.c
-lapacke_ctpqrt_work.c
 lapacke_ctprfb.c
 lapacke_ctprfb_work.c
 lapacke_ctprfs.c
@@ -601,14 +605,16 @@ lapacke_cupgtr.c
 lapacke_cupgtr_work.c
 lapacke_cupmtr.c
 lapacke_cupmtr_work.c
+)
+set(SOURCES_DOUBLE
 lapacke_dbbcsd.c
 lapacke_dbbcsd_work.c
 lapacke_dbdsdc.c
 lapacke_dbdsdc_work.c
-lapacke_dbdsvdx.c
-lapacke_dbdsvdx_work.c
 lapacke_dbdsqr.c
 lapacke_dbdsqr_work.c
+lapacke_dbdsvdx.c
+lapacke_dbdsvdx_work.c
 lapacke_ddisna.c
 lapacke_ddisna_work.c
 lapacke_dgbbrd.c
@@ -686,11 +692,11 @@ lapacke_dgeqrf_work.c
 lapacke_dgeqrfp.c
 lapacke_dgeqrfp_work.c
 lapacke_dgeqrt.c
+lapacke_dgeqrt_work.c
 lapacke_dgeqrt2.c
 lapacke_dgeqrt2_work.c
 lapacke_dgeqrt3.c
 lapacke_dgeqrt3_work.c
-lapacke_dgeqrt_work.c
 lapacke_dgerfs.c
 lapacke_dgerfs_work.c
 lapacke_dgerqf.c
@@ -701,6 +707,8 @@ lapacke_dgesv.c
 lapacke_dgesv_work.c
 lapacke_dgesvd.c
 lapacke_dgesvd_work.c
+lapacke_dgesvdq.c
+lapacke_dgesvdq_work.c
 lapacke_dgesvdx.c
 lapacke_dgesvdx_work.c
 lapacke_dgesvj.c
@@ -737,10 +745,10 @@ lapacke_dggevx.c
 lapacke_dggevx_work.c
 lapacke_dggglm.c
 lapacke_dggglm_work.c
-lapacke_dgghrd.c
-lapacke_dgghrd_work.c
 lapacke_dgghd3.c
 lapacke_dgghd3_work.c
+lapacke_dgghrd.c
+lapacke_dgghrd_work.c
 lapacke_dgglse.c
 lapacke_dgglse_work.c
 lapacke_dggqrf.c
@@ -823,10 +831,10 @@ lapacke_dopmtr.c
 lapacke_dopmtr_work.c
 lapacke_dorbdb.c
 lapacke_dorbdb_work.c
-lapacke_dorcsd2by1.c
-lapacke_dorcsd2by1_work.c
 lapacke_dorcsd.c
 lapacke_dorcsd_work.c
+lapacke_dorcsd2by1.c
+lapacke_dorcsd2by1_work.c
 lapacke_dorgbr.c
 lapacke_dorgbr_work.c
 lapacke_dorghr.c
@@ -933,14 +941,14 @@ lapacke_dpttrs.c
 lapacke_dpttrs_work.c
 lapacke_dsbev.c
 lapacke_dsbev_work.c
-lapacke_dsbevd.c
-lapacke_dsbevd_work.c
-lapacke_dsbevx.c
-lapacke_dsbevx_work.c
 lapacke_dsbev_2stage.c
 lapacke_dsbev_2stage_work.c
+lapacke_dsbevd.c
+lapacke_dsbevd_work.c
 lapacke_dsbevd_2stage.c
 lapacke_dsbevd_2stage_work.c
+lapacke_dsbevx.c
+lapacke_dsbevx_work.c
 lapacke_dsbevx_2stage.c
 lapacke_dsbevx_2stage_work.c
 lapacke_dsbgst.c
@@ -1021,18 +1029,18 @@ lapacke_dsyequb.c
 lapacke_dsyequb_work.c
 lapacke_dsyev.c
 lapacke_dsyev_work.c
-lapacke_dsyevd.c
-lapacke_dsyevd_work.c
-lapacke_dsyevr.c
-lapacke_dsyevr_work.c
-lapacke_dsyevx.c
-lapacke_dsyevx_work.c
 lapacke_dsyev_2stage.c
 lapacke_dsyev_2stage_work.c
+lapacke_dsyevd.c
+lapacke_dsyevd_work.c
 lapacke_dsyevd_2stage.c
 lapacke_dsyevd_2stage_work.c
+lapacke_dsyevr.c
+lapacke_dsyevr_work.c
 lapacke_dsyevr_2stage.c
 lapacke_dsyevr_2stage_work.c
+lapacke_dsyevx.c
+lapacke_dsyevx_work.c
 lapacke_dsyevx_2stage.c
 lapacke_dsyevx_2stage_work.c
 lapacke_dsygst.c
@@ -1048,15 +1056,15 @@ lapacke_dsygvx_work.c
 lapacke_dsyrfs.c
 lapacke_dsyrfs_work.c
 lapacke_dsysv.c
-lapacke_dsysv_rook.c
-lapacke_dsysv_rook_work.c
 lapacke_dsysv_work.c
 lapacke_dsysv_aa.c
-lapacke_dsysv_aa_2stage.c
 lapacke_dsysv_aa_work.c
+lapacke_dsysv_aa_2stage.c
 lapacke_dsysv_aa_2stage_work.c
 lapacke_dsysv_rk.c
 lapacke_dsysv_rk_work.c
+lapacke_dsysv_rook.c
+lapacke_dsysv_rook_work.c
 lapacke_dsysvx.c
 lapacke_dsysvx_work.c
 lapacke_dsyswapr.c
@@ -1065,33 +1073,33 @@ lapacke_dsytrd.c
 lapacke_dsytrd_work.c
 lapacke_dsytrf.c
 lapacke_dsytrf_work.c
-lapacke_dsytrf_rook.c
-lapacke_dsytrf_rook_work.c
 lapacke_dsytrf_aa.c
-lapacke_dsytrf_aa_2stage.c
 lapacke_dsytrf_aa_work.c
+lapacke_dsytrf_aa_2stage.c
 lapacke_dsytrf_aa_2stage_work.c
 lapacke_dsytrf_rk.c
 lapacke_dsytrf_rk_work.c
+lapacke_dsytrf_rook.c
+lapacke_dsytrf_rook_work.c
 lapacke_dsytri.c
+lapacke_dsytri_work.c
 lapacke_dsytri2.c
 lapacke_dsytri2_work.c
-lapacke_dsytri_3.c
-lapacke_dsytri_3_work.c
 lapacke_dsytri2x.c
 lapacke_dsytri2x_work.c
-lapacke_dsytri_work.c
+lapacke_dsytri_3.c
+lapacke_dsytri_3_work.c
 lapacke_dsytrs.c
-lapacke_dsytrs_rook.c
+lapacke_dsytrs_work.c
 lapacke_dsytrs2.c
 lapacke_dsytrs2_work.c
+lapacke_dsytrs_3.c
+lapacke_dsytrs_3_work.c
 lapacke_dsytrs_aa.c
-lapacke_dsytrs_aa_2stage.c
 lapacke_dsytrs_aa_work.c
+lapacke_dsytrs_aa_2stage.c
 lapacke_dsytrs_aa_2stage_work.c
-lapacke_dsytrs_3.c
-lapacke_dsytrs_3_work.c
-lapacke_dsytrs_work.c
+lapacke_dsytrs_rook.c
 lapacke_dsytrs_rook_work.c
 lapacke_dtbcon.c
 lapacke_dtbcon_work.c
@@ -1124,9 +1132,9 @@ lapacke_dtpcon_work.c
 lapacke_dtpmqrt.c
 lapacke_dtpmqrt_work.c
 lapacke_dtpqrt.c
+lapacke_dtpqrt_work.c
 lapacke_dtpqrt2.c
 lapacke_dtpqrt2_work.c
-lapacke_dtpqrt_work.c
 lapacke_dtprfb.c
 lapacke_dtprfb_work.c
 lapacke_dtprfs.c
@@ -1163,15 +1171,21 @@ lapacke_dtrttp.c
 lapacke_dtrttp_work.c
 lapacke_dtzrzf.c
 lapacke_dtzrzf_work.c
+)
+
+set(SOURCES
 lapacke_nancheck.c
+lapacke_ilaver.c
+)
+set(SOURCES_SINGLE
 lapacke_sbbcsd.c
 lapacke_sbbcsd_work.c
 lapacke_sbdsdc.c
 lapacke_sbdsdc_work.c
-lapacke_sbdsvdx.c
-lapacke_sbdsvdx_work.c
 lapacke_sbdsqr.c
 lapacke_sbdsqr_work.c
+lapacke_sbdsvdx.c
+lapacke_sbdsvdx_work.c
 lapacke_sdisna.c
 lapacke_sdisna_work.c
 lapacke_sgbbrd.c
@@ -1249,11 +1263,11 @@ lapacke_sgeqrf_work.c
 lapacke_sgeqrfp.c
 lapacke_sgeqrfp_work.c
 lapacke_sgeqrt.c
+lapacke_sgeqrt_work.c
 lapacke_sgeqrt2.c
 lapacke_sgeqrt2_work.c
 lapacke_sgeqrt3.c
 lapacke_sgeqrt3_work.c
-lapacke_sgeqrt_work.c
 lapacke_sgerfs.c
 lapacke_sgerfs_work.c
 lapacke_sgerqf.c
@@ -1264,6 +1278,8 @@ lapacke_sgesv.c
 lapacke_sgesv_work.c
 lapacke_sgesvd.c
 lapacke_sgesvd_work.c
+lapacke_sgesvdq.c
+lapacke_sgesvdq_work.c
 lapacke_sgesvdx.c
 lapacke_sgesvdx_work.c
 lapacke_sgesvj.c
@@ -1300,10 +1316,10 @@ lapacke_sggevx.c
 lapacke_sggevx_work.c
 lapacke_sggglm.c
 lapacke_sggglm_work.c
-lapacke_sgghrd.c
-lapacke_sgghrd_work.c
 lapacke_sgghd3.c
 lapacke_sgghd3_work.c
+lapacke_sgghrd.c
+lapacke_sgghrd_work.c
 lapacke_sgglse.c
 lapacke_sgglse_work.c
 lapacke_sggqrf.c
@@ -1496,14 +1512,14 @@ lapacke_spttrs.c
 lapacke_spttrs_work.c
 lapacke_ssbev.c
 lapacke_ssbev_work.c
-lapacke_ssbevd.c
-lapacke_ssbevd_work.c
-lapacke_ssbevx.c
-lapacke_ssbevx_work.c
 lapacke_ssbev_2stage.c
 lapacke_ssbev_2stage_work.c
+lapacke_ssbevd.c
+lapacke_ssbevd_work.c
 lapacke_ssbevd_2stage.c
 lapacke_ssbevd_2stage_work.c
+lapacke_ssbevx.c
+lapacke_ssbevx_work.c
 lapacke_ssbevx_2stage.c
 lapacke_ssbevx_2stage_work.c
 lapacke_ssbgst.c
@@ -1580,18 +1596,18 @@ lapacke_ssyequb.c
 lapacke_ssyequb_work.c
 lapacke_ssyev.c
 lapacke_ssyev_work.c
-lapacke_ssyevd.c
-lapacke_ssyevd_work.c
-lapacke_ssyevr.c
-lapacke_ssyevr_work.c
-lapacke_ssyevx.c
-lapacke_ssyevx_work.c
 lapacke_ssyev_2stage.c
 lapacke_ssyev_2stage_work.c
+lapacke_ssyevd.c
+lapacke_ssyevd_work.c
 lapacke_ssyevd_2stage.c
 lapacke_ssyevd_2stage_work.c
+lapacke_ssyevr.c
+lapacke_ssyevr_work.c
 lapacke_ssyevr_2stage.c
 lapacke_ssyevr_2stage_work.c
+lapacke_ssyevx.c
+lapacke_ssyevx_work.c
 lapacke_ssyevx_2stage.c
 lapacke_ssyevx_2stage_work.c
 lapacke_ssygst.c
@@ -1607,8 +1623,6 @@ lapacke_ssygvx_work.c
 lapacke_ssyrfs.c
 lapacke_ssyrfs_work.c
 lapacke_ssysv.c
-lapacke_ssysv_rook.c
-lapacke_ssysv_rook_work.c
 lapacke_ssysv_work.c
 lapacke_ssysv_aa.c
 lapacke_ssysv_aa_work.c
@@ -1616,6 +1630,8 @@ lapacke_ssysv_aa_2stage.c
 lapacke_ssysv_aa_2stage_work.c
 lapacke_ssysv_rk.c
 lapacke_ssysv_rk_work.c
+lapacke_ssysv_rook.c
+lapacke_ssysv_rook_work.c
 lapacke_ssysvx.c
 lapacke_ssysvx_work.c
 lapacke_ssyswapr.c
@@ -1624,33 +1640,33 @@ lapacke_ssytrd.c
 lapacke_ssytrd_work.c
 lapacke_ssytrf.c
 lapacke_ssytrf_work.c
-lapacke_ssytrf_rook.c
-lapacke_ssytrf_rook_work.c
 lapacke_ssytrf_aa.c
-lapacke_ssytrf_aa_2stage.c
 lapacke_ssytrf_aa_work.c
+lapacke_ssytrf_aa_2stage.c
 lapacke_ssytrf_aa_2stage_work.c
 lapacke_ssytrf_rk.c
 lapacke_ssytrf_rk_work.c
+lapacke_ssytrf_rook.c
+lapacke_ssytrf_rook_work.c
 lapacke_ssytri.c
+lapacke_ssytri_work.c
 lapacke_ssytri2.c
 lapacke_ssytri2_work.c
-lapacke_ssytri_3.c
-lapacke_ssytri_3_work.c
 lapacke_ssytri2x.c
 lapacke_ssytri2x_work.c
-lapacke_ssytri_work.c
+lapacke_ssytri_3.c
+lapacke_ssytri_3_work.c
 lapacke_ssytrs.c
-lapacke_ssytrs_rook.c
+lapacke_ssytrs_work.c
 lapacke_ssytrs2.c
 lapacke_ssytrs2_work.c
+lapacke_ssytrs_3.c
+lapacke_ssytrs_3_work.c
 lapacke_ssytrs_aa.c
-lapacke_ssytrs_aa_2stage.c
 lapacke_ssytrs_aa_work.c
+lapacke_ssytrs_aa_2stage.c
 lapacke_ssytrs_aa_2stage_work.c
-lapacke_ssytrs_3.c
-lapacke_ssytrs_3_work.c
-lapacke_ssytrs_work.c
+lapacke_ssytrs_rook.c
 lapacke_ssytrs_rook_work.c
 lapacke_stbcon.c
 lapacke_stbcon_work.c
@@ -1722,6 +1738,8 @@ lapacke_strttp.c
 lapacke_strttp_work.c
 lapacke_stzrzf.c
 lapacke_stzrzf_work.c
+)
+set(SOURCES_COMPLEX16
 lapacke_zbbcsd.c
 lapacke_zbbcsd_work.c
 lapacke_zbdsqr.c
@@ -1805,11 +1823,11 @@ lapacke_zgeqrf_work.c
 lapacke_zgeqrfp.c
 lapacke_zgeqrfp_work.c
 lapacke_zgeqrt.c
+lapacke_zgeqrt_work.c
 lapacke_zgeqrt2.c
 lapacke_zgeqrt2_work.c
 lapacke_zgeqrt3.c
 lapacke_zgeqrt3_work.c
-lapacke_zgeqrt_work.c
 lapacke_zgerfs.c
 lapacke_zgerfs_work.c
 lapacke_zgerqf.c
@@ -1820,6 +1838,8 @@ lapacke_zgesv.c
 lapacke_zgesv_work.c
 lapacke_zgesvd.c
 lapacke_zgesvd_work.c
+lapacke_zgesvdq.c
+lapacke_zgesvdq_work.c
 lapacke_zgesvdx.c
 lapacke_zgesvdx_work.c
 lapacke_zgesvj.c
@@ -1856,10 +1876,10 @@ lapacke_zggevx.c
 lapacke_zggevx_work.c
 lapacke_zggglm.c
 lapacke_zggglm_work.c
-lapacke_zgghrd.c
-lapacke_zgghrd_work.c
 lapacke_zgghd3.c
 lapacke_zgghd3_work.c
+lapacke_zgghrd.c
+lapacke_zgghrd_work.c
 lapacke_zgglse.c
 lapacke_zgglse_work.c
 lapacke_zggqrf.c
@@ -1884,14 +1904,14 @@ lapacke_zgttrs.c
 lapacke_zgttrs_work.c
 lapacke_zhbev.c
 lapacke_zhbev_work.c
-lapacke_zhbevd.c
-lapacke_zhbevd_work.c
-lapacke_zhbevx.c
-lapacke_zhbevx_work.c
 lapacke_zhbev_2stage.c
 lapacke_zhbev_2stage_work.c
+lapacke_zhbevd.c
+lapacke_zhbevd_work.c
 lapacke_zhbevd_2stage.c
 lapacke_zhbevd_2stage_work.c
+lapacke_zhbevx.c
+lapacke_zhbevx_work.c
 lapacke_zhbevx_2stage.c
 lapacke_zhbevx_2stage_work.c
 lapacke_zhbgst.c
@@ -1912,18 +1932,18 @@ lapacke_zheequb.c
 lapacke_zheequb_work.c
 lapacke_zheev.c
 lapacke_zheev_work.c
-lapacke_zheevd.c
-lapacke_zheevd_work.c
-lapacke_zheevr.c
-lapacke_zheevr_work.c
-lapacke_zheevx.c
-lapacke_zheevx_work.c
 lapacke_zheev_2stage.c
 lapacke_zheev_2stage_work.c
+lapacke_zheevd.c
+lapacke_zheevd_work.c
 lapacke_zheevd_2stage.c
 lapacke_zheevd_2stage_work.c
+lapacke_zheevr.c
+lapacke_zheevr_work.c
 lapacke_zheevr_2stage.c
 lapacke_zheevr_2stage_work.c
+lapacke_zheevx.c
+lapacke_zheevx_work.c
 lapacke_zheevx_2stage.c
 lapacke_zheevx_2stage_work.c
 lapacke_zhegst.c
@@ -1941,8 +1961,8 @@ lapacke_zherfs_work.c
 lapacke_zhesv.c
 lapacke_zhesv_work.c
 lapacke_zhesv_aa.c
-lapacke_zhesv_aa_2stage.c
 lapacke_zhesv_aa_work.c
+lapacke_zhesv_aa_2stage.c
 lapacke_zhesv_aa_2stage_work.c
 lapacke_zhesv_rk.c
 lapacke_zhesv_rk_work.c
@@ -1953,34 +1973,34 @@ lapacke_zheswapr_work.c
 lapacke_zhetrd.c
 lapacke_zhetrd_work.c
 lapacke_zhetrf.c
-lapacke_zhetrf_rook.c
 lapacke_zhetrf_work.c
-lapacke_zhetrf_rook_work.c
 lapacke_zhetrf_aa.c
-lapacke_zhetrf_aa_2stage.c
 lapacke_zhetrf_aa_work.c
+lapacke_zhetrf_aa_2stage.c
 lapacke_zhetrf_aa_2stage_work.c
 lapacke_zhetrf_rk.c
 lapacke_zhetrf_rk_work.c
+lapacke_zhetrf_rook.c
+lapacke_zhetrf_rook_work.c
 lapacke_zhetri.c
+lapacke_zhetri_work.c
 lapacke_zhetri2.c
 lapacke_zhetri2_work.c
-lapacke_zhetri_3.c
-lapacke_zhetri_3_work.c
 lapacke_zhetri2x.c
 lapacke_zhetri2x_work.c
-lapacke_zhetri_work.c
+lapacke_zhetri_3.c
+lapacke_zhetri_3_work.c
 lapacke_zhetrs.c
-lapacke_zhetrs_rook.c
+lapacke_zhetrs_work.c
 lapacke_zhetrs2.c
 lapacke_zhetrs2_work.c
-lapacke_zhetrs_work.c
+lapacke_zhetrs_3.c
+lapacke_zhetrs_3_work.c
 lapacke_zhetrs_aa.c
-lapacke_zhetrs_aa_2stage.c
 lapacke_zhetrs_aa_work.c
+lapacke_zhetrs_aa_2stage.c
 lapacke_zhetrs_aa_2stage_work.c
-lapacke_zhetrs_3.c
-lapacke_zhetrs_3_work.c
+lapacke_zhetrs_rook.c
 lapacke_zhetrs_rook_work.c
 lapacke_zhfrk.c
 lapacke_zhfrk_work.c
@@ -2172,52 +2192,54 @@ lapacke_zsyconv.c
 lapacke_zsyconv_work.c
 lapacke_zsyequb.c
 lapacke_zsyequb_work.c
+lapacke_zsyr.c
+lapacke_zsyr_work.c
 lapacke_zsyrfs.c
 lapacke_zsyrfs_work.c
 lapacke_zsysv.c
-lapacke_zsysv_rook.c
-lapacke_zsysv_rook_work.c
 lapacke_zsysv_work.c
 lapacke_zsysv_aa.c
-lapacke_zsysv_aa_2stage.c
 lapacke_zsysv_aa_work.c
+lapacke_zsysv_aa_2stage.c
 lapacke_zsysv_aa_2stage_work.c
 lapacke_zsysv_rk.c
 lapacke_zsysv_rk_work.c
+lapacke_zsysv_rook.c
+lapacke_zsysv_rook_work.c
 lapacke_zsysvx.c
 lapacke_zsysvx_work.c
 lapacke_zsyswapr.c
 lapacke_zsyswapr_work.c
 lapacke_zsytrf.c
 lapacke_zsytrf_work.c
-lapacke_zsytrf_rook.c
-lapacke_zsytrf_rook_work.c
 lapacke_zsytrf_aa.c
-lapacke_zsytrf_aa_2stage.c
 lapacke_zsytrf_aa_work.c
+lapacke_zsytrf_aa_2stage.c
 lapacke_zsytrf_aa_2stage_work.c
 lapacke_zsytrf_rk.c
 lapacke_zsytrf_rk_work.c
+lapacke_zsytrf_rook.c
+lapacke_zsytrf_rook_work.c
 lapacke_zsytri.c
+lapacke_zsytri_work.c
 lapacke_zsytri2.c
 lapacke_zsytri2_work.c
-lapacke_zsytri_3.c
-lapacke_zsytri_3_work.c
 lapacke_zsytri2x.c
 lapacke_zsytri2x_work.c
-lapacke_zsytri_work.c
+lapacke_zsytri_3.c
+lapacke_zsytri_3_work.c
 lapacke_zsytrs.c
-lapacke_zsytrs_rook.c
+lapacke_zsytrs_work.c
 lapacke_zsytrs2.c
 lapacke_zsytrs2_work.c
-lapacke_zsytrs_work.c
-lapacke_zsytrs_rook_work.c
+lapacke_zsytrs_3.c
+lapacke_zsytrs_3_work.c
 lapacke_zsytrs_aa.c
-lapacke_zsytrs_aa_2stage.c
 lapacke_zsytrs_aa_work.c
+lapacke_zsytrs_aa_2stage.c
 lapacke_zsytrs_aa_2stage_work.c
-lapacke_zsytrs_3.c
-lapacke_zsytrs_3_work.c
+lapacke_zsytrs_rook.c
+lapacke_zsytrs_rook_work.c
 lapacke_ztbcon.c
 lapacke_ztbcon_work.c
 lapacke_ztbrfs.c
@@ -2249,9 +2271,9 @@ lapacke_ztpcon_work.c
 lapacke_ztpmqrt.c
 lapacke_ztpmqrt_work.c
 lapacke_ztpqrt.c
+lapacke_ztpqrt_work.c
 lapacke_ztpqrt2.c
 lapacke_ztpqrt2_work.c
-lapacke_ztpqrt_work.c
 lapacke_ztprfb.c
 lapacke_ztprfb_work.c
 lapacke_ztprfs.c
@@ -2328,11 +2350,6 @@ lapacke_zupgtr.c
 lapacke_zupgtr_work.c
 lapacke_zupmtr.c
 lapacke_zupmtr_work.c
-lapacke_zsyr.c
-lapacke_csyr.c
-lapacke_zsyr_work.c
-lapacke_csyr_work.c
-lapacke_ilaver.c
 )
 
 set(DEPRECATED
diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile
index 7672f9f73..8060151ae 100644
--- a/lapack-netlib/LAPACKE/src/Makefile
+++ b/lapack-netlib/LAPACKE/src/Makefile
@@ -32,12 +32,21 @@
 ##############################################################################
 # makefile for LAPACKE, used to build lapacke binary.
 #
-# Note: we use multiple OBJ_A, OBJ_B, etc, instead of a single OBJ
+# Note: we use multiple OBJ_S, OBJ_C, etc, instead of a single OBJ
 # to allow build with mingw (argument list too long for the msys ar)
 #
-include ../../make.inc
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc
 
-OBJ_A = \
+.SUFFIXES: .c .o
+.c.o:
+	$(CC) $(CFLAGS) -I../include -c -o $@ $<
+
+OBJ = \
+lapacke_ilaver.o \
+lapacke_nancheck.o
+
+OBJ_C = \
 lapacke_cbbcsd.o \
 lapacke_cbbcsd_work.o \
 lapacke_cbdsqr.o \
@@ -82,12 +91,12 @@ lapacke_cgeevx.o \
 lapacke_cgeevx_work.o \
 lapacke_cgehrd.o \
 lapacke_cgehrd_work.o \
+lapacke_cgejsv.o \
+lapacke_cgejsv_work.o \
 lapacke_cgelq.o \
 lapacke_cgelq_work.o \
 lapacke_cgelq2.o \
 lapacke_cgelq2_work.o \
-lapacke_cgejsv.o \
-lapacke_cgejsv_work.o \
 lapacke_cgelqf.o \
 lapacke_cgelqf_work.o \
 lapacke_cgels.o \
@@ -117,11 +126,11 @@ lapacke_cgeqrf_work.o \
 lapacke_cgeqrfp.o \
 lapacke_cgeqrfp_work.o \
 lapacke_cgeqrt.o \
+lapacke_cgeqrt_work.o \
 lapacke_cgeqrt2.o \
 lapacke_cgeqrt2_work.o \
 lapacke_cgeqrt3.o \
 lapacke_cgeqrt3_work.o \
-lapacke_cgeqrt_work.o \
 lapacke_cgerfs.o \
 lapacke_cgerfs_work.o \
 lapacke_cgerqf.o \
@@ -132,6 +141,8 @@ lapacke_cgesv.o \
 lapacke_cgesv_work.o \
 lapacke_cgesvd.o \
 lapacke_cgesvd_work.o \
+lapacke_cgesvdq.o \
+lapacke_cgesvdq_work.o \
 lapacke_cgesvdx.o \
 lapacke_cgesvdx_work.o \
 lapacke_cgesvj.o \
@@ -168,10 +179,10 @@ lapacke_cggevx.o \
 lapacke_cggevx_work.o \
 lapacke_cggglm.o \
 lapacke_cggglm_work.o \
-lapacke_cgghrd.o \
-lapacke_cgghrd_work.o \
 lapacke_cgghd3.o \
 lapacke_cgghd3_work.o \
+lapacke_cgghrd.o \
+lapacke_cgghrd_work.o \
 lapacke_cgglse.o \
 lapacke_cgglse_work.o \
 lapacke_cggqrf.o \
@@ -196,14 +207,14 @@ lapacke_cgttrs.o \
 lapacke_cgttrs_work.o \
 lapacke_chbev.o \
 lapacke_chbev_work.o \
-lapacke_chbevd.o \
-lapacke_chbevd_work.o \
-lapacke_chbevx.o \
-lapacke_chbevx_work.o \
 lapacke_chbev_2stage.o \
 lapacke_chbev_2stage_work.o \
+lapacke_chbevd.o \
+lapacke_chbevd_work.o \
 lapacke_chbevd_2stage.o \
 lapacke_chbevd_2stage_work.o \
+lapacke_chbevx.o \
+lapacke_chbevx_work.o \
 lapacke_chbevx_2stage.o \
 lapacke_chbevx_2stage_work.o \
 lapacke_chbgst.o \
@@ -224,18 +235,18 @@ lapacke_cheequb.o \
 lapacke_cheequb_work.o \
 lapacke_cheev.o \
 lapacke_cheev_work.o \
-lapacke_cheevd.o \
-lapacke_cheevd_work.o \
-lapacke_cheevr.o \
-lapacke_cheevr_work.o \
-lapacke_cheevx.o \
-lapacke_cheevx_work.o \
 lapacke_cheev_2stage.o \
 lapacke_cheev_2stage_work.o \
+lapacke_cheevd.o \
+lapacke_cheevd_work.o \
 lapacke_cheevd_2stage.o \
 lapacke_cheevd_2stage_work.o \
+lapacke_cheevr.o \
+lapacke_cheevr_work.o \
 lapacke_cheevr_2stage.o \
 lapacke_cheevr_2stage_work.o \
+lapacke_cheevx.o \
+lapacke_cheevx_work.o \
 lapacke_cheevx_2stage.o \
 lapacke_cheevx_2stage_work.o \
 lapacke_chegst.o \
@@ -265,35 +276,35 @@ lapacke_cheswapr_work.o \
 lapacke_chetrd.o \
 lapacke_chetrd_work.o \
 lapacke_chetrf.o \
-lapacke_chetrf_rook.o \
 lapacke_chetrf_work.o \
-lapacke_chetrf_rook_work.o \
 lapacke_chetrf_aa.o \
-lapacke_chetrf_aa_2stage.o \
 lapacke_chetrf_aa_work.o \
+lapacke_chetrf_aa_2stage.o \
 lapacke_chetrf_aa_2stage_work.o \
 lapacke_chetrf_rk.o \
 lapacke_chetrf_rk_work.o \
+lapacke_chetrf_rook.o \
+lapacke_chetrf_rook_work.o \
 lapacke_chetri.o \
+lapacke_chetri_work.o \
 lapacke_chetri2.o \
 lapacke_chetri2_work.o \
-lapacke_chetri_3.o \
-lapacke_chetri_3_work.o \
 lapacke_chetri2x.o \
 lapacke_chetri2x_work.o \
-lapacke_chetri_work.o \
+lapacke_chetri_3.o \
+lapacke_chetri_3_work.o \
 lapacke_chetrs.o \
-lapacke_chetrs_rook.o \
+lapacke_chetrs_work.o \
 lapacke_chetrs2.o \
 lapacke_chetrs2_work.o \
-lapacke_chetrs_work.o \
-lapacke_chetrs_rook_work.o \
+lapacke_chetrs_3.o \
+lapacke_chetrs_3_work.o \
 lapacke_chetrs_aa.o \
-lapacke_chetrs_aa_2stage.o \
 lapacke_chetrs_aa_work.o \
+lapacke_chetrs_aa_2stage.o \
 lapacke_chetrs_aa_2stage_work.o \
-lapacke_chetrs_3.o \
-lapacke_chetrs_3_work.o \
+lapacke_chetrs_rook.o \
+lapacke_chetrs_rook_work.o \
 lapacke_chfrk.o \
 lapacke_chfrk_work.o \
 lapacke_chgeqz.o \
@@ -484,11 +495,11 @@ lapacke_csyconv.o \
 lapacke_csyconv_work.o \
 lapacke_csyequb.o \
 lapacke_csyequb_work.o \
+lapacke_csyr.o \
+lapacke_csyr_work.o \
 lapacke_csyrfs.o \
 lapacke_csyrfs_work.o \
 lapacke_csysv.o \
-lapacke_csysv_rook.o \
-lapacke_csysv_rook_work.o \
 lapacke_csysv_work.o \
 lapacke_csysv_aa.o \
 lapacke_csysv_aa_work.o \
@@ -496,40 +507,42 @@ lapacke_csysv_aa_2stage.o \
 lapacke_csysv_aa_2stage_work.o \
 lapacke_csysv_rk.o \
 lapacke_csysv_rk_work.o \
+lapacke_csysv_rook.o \
+lapacke_csysv_rook_work.o \
 lapacke_csysvx.o \
 lapacke_csysvx_work.o \
 lapacke_csyswapr.o \
 lapacke_csyswapr_work.o \
 lapacke_csytrf.o \
 lapacke_csytrf_work.o \
-lapacke_csytrf_rook.o \
-lapacke_csytrf_rook_work.o \
 lapacke_csytrf_aa.o \
-lapacke_csytrf_aa_2stage.o \
 lapacke_csytrf_aa_work.o \
+lapacke_csytrf_aa_2stage.o \
 lapacke_csytrf_aa_2stage_work.o \
 lapacke_csytrf_rk.o \
 lapacke_csytrf_rk_work.o \
+lapacke_csytrf_rook.o \
+lapacke_csytrf_rook_work.o \
 lapacke_csytri.o \
+lapacke_csytri_work.o \
 lapacke_csytri2.o \
 lapacke_csytri2_work.o \
-lapacke_csytri_3.o \
-lapacke_csytri_3_work.o \
 lapacke_csytri2x.o \
 lapacke_csytri2x_work.o \
-lapacke_csytri_work.o \
+lapacke_csytri_3.o \
+lapacke_csytri_3_work.o \
 lapacke_csytrs.o \
-lapacke_csytrs_rook.o \
+lapacke_csytrs_work.o \
 lapacke_csytrs2.o \
 lapacke_csytrs2_work.o \
-lapacke_csytrs_work.o \
-lapacke_csytrs_rook_work.o \
+lapacke_csytrs_3.o \
+lapacke_csytrs_3_work.o \
 lapacke_csytrs_aa.o \
-lapacke_csytrs_aa_2stage.o \
 lapacke_csytrs_aa_work.o \
+lapacke_csytrs_aa_2stage.o \
 lapacke_csytrs_aa_2stage_work.o \
-lapacke_csytrs_3.o \
-lapacke_csytrs_3_work.o \
+lapacke_csytrs_rook.o \
+lapacke_csytrs_rook_work.o \
 lapacke_ctbcon.o \
 lapacke_ctbcon_work.o \
 lapacke_ctbrfs.o \
@@ -561,9 +574,9 @@ lapacke_ctpcon_work.o \
 lapacke_ctpmqrt.o \
 lapacke_ctpmqrt_work.o \
 lapacke_ctpqrt.o \
+lapacke_ctpqrt_work.o \
 lapacke_ctpqrt2.o \
 lapacke_ctpqrt2_work.o \
-lapacke_ctpqrt_work.o \
 lapacke_ctprfb.o \
 lapacke_ctprfb_work.o \
 lapacke_ctprfs.o \
@@ -639,15 +652,17 @@ lapacke_cunmtr_work.o \
 lapacke_cupgtr.o \
 lapacke_cupgtr_work.o \
 lapacke_cupmtr.o \
-lapacke_cupmtr_work.o \
+lapacke_cupmtr_work.o
+
+OBJ_D = \
 lapacke_dbbcsd.o \
 lapacke_dbbcsd_work.o \
 lapacke_dbdsdc.o \
 lapacke_dbdsdc_work.o \
-lapacke_dbdsvdx.o \
-lapacke_dbdsvdx_work.o \
 lapacke_dbdsqr.o \
 lapacke_dbdsqr_work.o \
+lapacke_dbdsvdx.o \
+lapacke_dbdsvdx_work.o \
 lapacke_ddisna.o \
 lapacke_ddisna_work.o \
 lapacke_dgbbrd.o \
@@ -725,11 +740,11 @@ lapacke_dgeqrf_work.o \
 lapacke_dgeqrfp.o \
 lapacke_dgeqrfp_work.o \
 lapacke_dgeqrt.o \
+lapacke_dgeqrt_work.o \
 lapacke_dgeqrt2.o \
 lapacke_dgeqrt2_work.o \
 lapacke_dgeqrt3.o \
 lapacke_dgeqrt3_work.o \
-lapacke_dgeqrt_work.o \
 lapacke_dgerfs.o \
 lapacke_dgerfs_work.o \
 lapacke_dgerqf.o \
@@ -740,6 +755,8 @@ lapacke_dgesv.o \
 lapacke_dgesv_work.o \
 lapacke_dgesvd.o \
 lapacke_dgesvd_work.o \
+lapacke_dgesvdq.o \
+lapacke_dgesvdq_work.o \
 lapacke_dgesvdx.o \
 lapacke_dgesvdx_work.o \
 lapacke_dgesvj.o \
@@ -776,10 +793,10 @@ lapacke_dggevx.o \
 lapacke_dggevx_work.o \
 lapacke_dggglm.o \
 lapacke_dggglm_work.o \
-lapacke_dgghrd.o \
-lapacke_dgghrd_work.o \
 lapacke_dgghd3.o \
 lapacke_dgghd3_work.o \
+lapacke_dgghrd.o \
+lapacke_dgghrd_work.o \
 lapacke_dgglse.o \
 lapacke_dgglse_work.o \
 lapacke_dggqrf.o \
@@ -972,14 +989,14 @@ lapacke_dpttrs.o \
 lapacke_dpttrs_work.o \
 lapacke_dsbev.o \
 lapacke_dsbev_work.o \
-lapacke_dsbevd.o \
-lapacke_dsbevd_work.o \
-lapacke_dsbevx.o \
-lapacke_dsbevx_work.o \
 lapacke_dsbev_2stage.o \
 lapacke_dsbev_2stage_work.o \
+lapacke_dsbevd.o \
+lapacke_dsbevd_work.o \
 lapacke_dsbevd_2stage.o \
 lapacke_dsbevd_2stage_work.o \
+lapacke_dsbevx.o \
+lapacke_dsbevx_work.o \
 lapacke_dsbevx_2stage.o \
 lapacke_dsbevx_2stage_work.o \
 lapacke_dsbgst.o \
@@ -1060,18 +1077,18 @@ lapacke_dsyequb.o \
 lapacke_dsyequb_work.o \
 lapacke_dsyev.o \
 lapacke_dsyev_work.o \
-lapacke_dsyevd.o \
-lapacke_dsyevd_work.o \
-lapacke_dsyevr.o \
-lapacke_dsyevr_work.o \
-lapacke_dsyevx.o \
-lapacke_dsyevx_work.o \
 lapacke_dsyev_2stage.o \
 lapacke_dsyev_2stage_work.o \
+lapacke_dsyevd.o \
+lapacke_dsyevd_work.o \
 lapacke_dsyevd_2stage.o \
 lapacke_dsyevd_2stage_work.o \
+lapacke_dsyevr.o \
+lapacke_dsyevr_work.o \
 lapacke_dsyevr_2stage.o \
 lapacke_dsyevr_2stage_work.o \
+lapacke_dsyevx.o \
+lapacke_dsyevx_work.o \
 lapacke_dsyevx_2stage.o \
 lapacke_dsyevx_2stage_work.o \
 lapacke_dsygst.o \
@@ -1087,8 +1104,6 @@ lapacke_dsygvx_work.o \
 lapacke_dsyrfs.o \
 lapacke_dsyrfs_work.o \
 lapacke_dsysv.o \
-lapacke_dsysv_rook.o \
-lapacke_dsysv_rook_work.o \
 lapacke_dsysv_work.o \
 lapacke_dsysv_aa.o \
 lapacke_dsysv_aa_work.o \
@@ -1096,6 +1111,8 @@ lapacke_dsysv_aa_2stage.o \
 lapacke_dsysv_aa_2stage_work.o \
 lapacke_dsysv_rk.o \
 lapacke_dsysv_rk_work.o \
+lapacke_dsysv_rook.o \
+lapacke_dsysv_rook_work.o \
 lapacke_dsysvx.o \
 lapacke_dsysvx_work.o \
 lapacke_dsyswapr.o \
@@ -1104,36 +1121,34 @@ lapacke_dsytrd.o \
 lapacke_dsytrd_work.o \
 lapacke_dsytrf.o \
 lapacke_dsytrf_work.o \
-lapacke_dsytrf_rook.o \
-lapacke_dsytrf_rook_work.o \
 lapacke_dsytrf_aa.o \
 lapacke_dsytrf_aa_work.o \
 lapacke_dsytrf_aa_2stage.o \
 lapacke_dsytrf_aa_2stage_work.o \
 lapacke_dsytrf_rk.o \
 lapacke_dsytrf_rk_work.o \
+lapacke_dsytrf_rook.o \
+lapacke_dsytrf_rook_work.o \
 lapacke_dsytri.o \
+lapacke_dsytri_work.o \
 lapacke_dsytri2.o \
 lapacke_dsytri2_work.o \
-lapacke_dsytri_3.o \
-lapacke_dsytri_3_work.o \
 lapacke_dsytri2x.o \
 lapacke_dsytri2x_work.o \
-lapacke_dsytri_work.o
-
-OBJ_B = \
+lapacke_dsytri_3.o \
+lapacke_dsytri_3_work.o \
 lapacke_dsytrs.o \
-lapacke_dsytrs_rook.o \
+lapacke_dsytrs_work.o \
 lapacke_dsytrs2.o \
 lapacke_dsytrs2_work.o \
-lapacke_dsytrs_work.o \
-lapacke_dsytrs_rook_work.o \
+lapacke_dsytrs_3.o \
+lapacke_dsytrs_3_work.o \
 lapacke_dsytrs_aa.o \
-lapacke_dsytrs_aa_2stage.o \
 lapacke_dsytrs_aa_work.o \
+lapacke_dsytrs_aa_2stage.o \
 lapacke_dsytrs_aa_2stage_work.o \
-lapacke_dsytrs_3.o \
-lapacke_dsytrs_3_work.o \
+lapacke_dsytrs_rook.o \
+lapacke_dsytrs_rook_work.o \
 lapacke_dtbcon.o \
 lapacke_dtbcon_work.o \
 lapacke_dtbrfs.o \
@@ -1165,9 +1180,9 @@ lapacke_dtpcon_work.o \
 lapacke_dtpmqrt.o \
 lapacke_dtpmqrt_work.o \
 lapacke_dtpqrt.o \
+lapacke_dtpqrt_work.o \
 lapacke_dtpqrt2.o \
 lapacke_dtpqrt2_work.o \
-lapacke_dtpqrt_work.o \
 lapacke_dtprfb.o \
 lapacke_dtprfb_work.o \
 lapacke_dtprfs.o \
@@ -1203,16 +1218,17 @@ lapacke_dtrttf_work.o \
 lapacke_dtrttp.o \
 lapacke_dtrttp_work.o \
 lapacke_dtzrzf.o \
-lapacke_dtzrzf_work.o \
-lapacke_nancheck.o  \
+lapacke_dtzrzf_work.o
+
+OBJ_S = \
 lapacke_sbbcsd.o \
 lapacke_sbbcsd_work.o \
 lapacke_sbdsdc.o \
 lapacke_sbdsdc_work.o \
-lapacke_sbdsvdx.o \
-lapacke_sbdsvdx_work.o \
 lapacke_sbdsqr.o \
 lapacke_sbdsqr_work.o \
+lapacke_sbdsvdx.o \
+lapacke_sbdsvdx_work.o \
 lapacke_sdisna.o \
 lapacke_sdisna_work.o \
 lapacke_sgbbrd.o \
@@ -1290,11 +1306,11 @@ lapacke_sgeqrf_work.o \
 lapacke_sgeqrfp.o \
 lapacke_sgeqrfp_work.o \
 lapacke_sgeqrt.o \
+lapacke_sgeqrt_work.o \
 lapacke_sgeqrt2.o \
 lapacke_sgeqrt2_work.o \
 lapacke_sgeqrt3.o \
 lapacke_sgeqrt3_work.o \
-lapacke_sgeqrt_work.o \
 lapacke_sgerfs.o \
 lapacke_sgerfs_work.o \
 lapacke_sgerqf.o \
@@ -1305,6 +1321,8 @@ lapacke_sgesv.o \
 lapacke_sgesv_work.o \
 lapacke_sgesvd.o \
 lapacke_sgesvd_work.o \
+lapacke_sgesvdq.o \
+lapacke_sgesvdq_work.o \
 lapacke_sgesvdx.o \
 lapacke_sgesvdx_work.o \
 lapacke_sgesvj.o \
@@ -1341,10 +1359,10 @@ lapacke_sggevx.o \
 lapacke_sggevx_work.o \
 lapacke_sggglm.o \
 lapacke_sggglm_work.o \
-lapacke_sgghrd.o \
-lapacke_sgghrd_work.o \
 lapacke_sgghd3.o \
 lapacke_sgghd3_work.o \
+lapacke_sgghrd.o \
+lapacke_sgghrd_work.o \
 lapacke_sgglse.o \
 lapacke_sgglse_work.o \
 lapacke_sggqrf.o \
@@ -1537,14 +1555,14 @@ lapacke_spttrs.o \
 lapacke_spttrs_work.o \
 lapacke_ssbev.o \
 lapacke_ssbev_work.o \
-lapacke_ssbevd.o \
-lapacke_ssbevd_work.o \
-lapacke_ssbevx.o \
-lapacke_ssbevx_work.o \
 lapacke_ssbev_2stage.o \
 lapacke_ssbev_2stage_work.o \
+lapacke_ssbevd.o \
+lapacke_ssbevd_work.o \
 lapacke_ssbevd_2stage.o \
 lapacke_ssbevd_2stage_work.o \
+lapacke_ssbevx.o \
+lapacke_ssbevx_work.o \
 lapacke_ssbevx_2stage.o \
 lapacke_ssbevx_2stage_work.o \
 lapacke_ssbgst.o \
@@ -1621,18 +1639,18 @@ lapacke_ssyequb.o \
 lapacke_ssyequb_work.o \
 lapacke_ssyev.o \
 lapacke_ssyev_work.o \
-lapacke_ssyevd.o \
-lapacke_ssyevd_work.o \
-lapacke_ssyevr.o \
-lapacke_ssyevr_work.o \
-lapacke_ssyevx.o \
-lapacke_ssyevx_work.o \
 lapacke_ssyev_2stage.o \
 lapacke_ssyev_2stage_work.o \
+lapacke_ssyevd.o \
+lapacke_ssyevd_work.o \
 lapacke_ssyevd_2stage.o \
 lapacke_ssyevd_2stage_work.o \
+lapacke_ssyevr.o \
+lapacke_ssyevr_work.o \
 lapacke_ssyevr_2stage.o \
 lapacke_ssyevr_2stage_work.o \
+lapacke_ssyevx.o \
+lapacke_ssyevx_work.o \
 lapacke_ssyevx_2stage.o \
 lapacke_ssyevx_2stage_work.o \
 lapacke_ssygst.o \
@@ -1648,8 +1666,6 @@ lapacke_ssygvx_work.o \
 lapacke_ssyrfs.o \
 lapacke_ssyrfs_work.o \
 lapacke_ssysv.o \
-lapacke_ssysv_rook.o \
-lapacke_ssysv_rook_work.o \
 lapacke_ssysv_work.o \
 lapacke_ssysv_aa.o \
 lapacke_ssysv_aa_work.o \
@@ -1657,6 +1673,8 @@ lapacke_ssysv_aa_2stage.o \
 lapacke_ssysv_aa_2stage_work.o \
 lapacke_ssysv_rk.o \
 lapacke_ssysv_rk_work.o \
+lapacke_ssysv_rook.o \
+lapacke_ssysv_rook_work.o \
 lapacke_ssysvx.o \
 lapacke_ssysvx_work.o \
 lapacke_ssyswapr.o \
@@ -1665,34 +1683,34 @@ lapacke_ssytrd.o \
 lapacke_ssytrd_work.o \
 lapacke_ssytrf.o \
 lapacke_ssytrf_work.o \
-lapacke_ssytrf_rook.o \
-lapacke_ssytrf_rook_work.o \
 lapacke_ssytrf_aa.o \
 lapacke_ssytrf_aa_work.o \
 lapacke_ssytrf_aa_2stage.o \
 lapacke_ssytrf_aa_2stage_work.o \
 lapacke_ssytrf_rk.o \
 lapacke_ssytrf_rk_work.o \
+lapacke_ssytrf_rook.o \
+lapacke_ssytrf_rook_work.o \
 lapacke_ssytri.o \
+lapacke_ssytri_work.o \
 lapacke_ssytri2.o \
 lapacke_ssytri2_work.o \
-lapacke_ssytri_3.o \
-lapacke_ssytri_3_work.o \
 lapacke_ssytri2x.o \
 lapacke_ssytri2x_work.o \
-lapacke_ssytri_work.o \
+lapacke_ssytri_3.o \
+lapacke_ssytri_3_work.o \
 lapacke_ssytrs.o \
-lapacke_ssytrs_rook.o \
+lapacke_ssytrs_work.o \
 lapacke_ssytrs2.o \
 lapacke_ssytrs2_work.o \
-lapacke_ssytrs_work.o \
-lapacke_ssytrs_rook_work.o \
+lapacke_ssytrs_3.o \
+lapacke_ssytrs_3_work.o \
 lapacke_ssytrs_aa.o \
-lapacke_ssytrs_aa_2stage.o \
 lapacke_ssytrs_aa_work.o \
+lapacke_ssytrs_aa_2stage.o \
 lapacke_ssytrs_aa_2stage_work.o \
-lapacke_ssytrs_3.o \
-lapacke_ssytrs_3_work.o \
+lapacke_ssytrs_rook.o \
+lapacke_ssytrs_rook_work.o \
 lapacke_stbcon.o \
 lapacke_stbcon_work.o \
 lapacke_stbrfs.o \
@@ -1762,7 +1780,9 @@ lapacke_strttf_work.o \
 lapacke_strttp.o \
 lapacke_strttp_work.o \
 lapacke_stzrzf.o \
-lapacke_stzrzf_work.o \
+lapacke_stzrzf_work.o
+
+OBJ_Z = \
 lapacke_zbbcsd.o \
 lapacke_zbbcsd_work.o \
 lapacke_zbdsqr.o \
@@ -1846,11 +1866,11 @@ lapacke_zgeqrf_work.o \
 lapacke_zgeqrfp.o \
 lapacke_zgeqrfp_work.o \
 lapacke_zgeqrt.o \
+lapacke_zgeqrt_work.o \
 lapacke_zgeqrt2.o \
 lapacke_zgeqrt2_work.o \
 lapacke_zgeqrt3.o \
 lapacke_zgeqrt3_work.o \
-lapacke_zgeqrt_work.o \
 lapacke_zgerfs.o \
 lapacke_zgerfs_work.o \
 lapacke_zgerqf.o \
@@ -1861,6 +1881,8 @@ lapacke_zgesv.o \
 lapacke_zgesv_work.o \
 lapacke_zgesvd.o \
 lapacke_zgesvd_work.o \
+lapacke_zgesvdq.o \
+lapacke_zgesvdq_work.o \
 lapacke_zgesvdx.o \
 lapacke_zgesvdx_work.o \
 lapacke_zgesvj.o \
@@ -1897,10 +1919,10 @@ lapacke_zggevx.o \
 lapacke_zggevx_work.o \
 lapacke_zggglm.o \
 lapacke_zggglm_work.o \
-lapacke_zgghrd.o \
-lapacke_zgghrd_work.o \
 lapacke_zgghd3.o \
 lapacke_zgghd3_work.o \
+lapacke_zgghrd.o \
+lapacke_zgghrd_work.o \
 lapacke_zgglse.o \
 lapacke_zgglse_work.o \
 lapacke_zggqrf.o \
@@ -1925,14 +1947,14 @@ lapacke_zgttrs.o \
 lapacke_zgttrs_work.o \
 lapacke_zhbev.o \
 lapacke_zhbev_work.o \
-lapacke_zhbevd.o \
-lapacke_zhbevd_work.o \
-lapacke_zhbevx.o \
-lapacke_zhbevx_work.o \
 lapacke_zhbev_2stage.o \
 lapacke_zhbev_2stage_work.o \
+lapacke_zhbevd.o \
+lapacke_zhbevd_work.o \
 lapacke_zhbevd_2stage.o \
 lapacke_zhbevd_2stage_work.o \
+lapacke_zhbevx.o \
+lapacke_zhbevx_work.o \
 lapacke_zhbevx_2stage.o \
 lapacke_zhbevx_2stage_work.o \
 lapacke_zhbgst.o \
@@ -1953,18 +1975,18 @@ lapacke_zheequb.o \
 lapacke_zheequb_work.o \
 lapacke_zheev.o \
 lapacke_zheev_work.o \
-lapacke_zheevd.o \
-lapacke_zheevd_work.o \
-lapacke_zheevr.o \
-lapacke_zheevr_work.o \
-lapacke_zheevx.o \
-lapacke_zheevx_work.o \
 lapacke_zheev_2stage.o \
 lapacke_zheev_2stage_work.o \
+lapacke_zheevd.o \
+lapacke_zheevd_work.o \
 lapacke_zheevd_2stage.o \
 lapacke_zheevd_2stage_work.o \
+lapacke_zheevr.o \
+lapacke_zheevr_work.o \
 lapacke_zheevr_2stage.o \
 lapacke_zheevr_2stage_work.o \
+lapacke_zheevx.o \
+lapacke_zheevx_work.o \
 lapacke_zheevx_2stage.o \
 lapacke_zheevx_2stage_work.o \
 lapacke_zhegst.o \
@@ -1994,35 +2016,35 @@ lapacke_zheswapr_work.o \
 lapacke_zhetrd.o \
 lapacke_zhetrd_work.o \
 lapacke_zhetrf.o \
-lapacke_zhetrf_rook.o \
 lapacke_zhetrf_work.o \
-lapacke_zhetrf_rook_work.o \
 lapacke_zhetrf_aa.o \
-lapacke_zhetrf_aa_2stage.o \
 lapacke_zhetrf_aa_work.o \
+lapacke_zhetrf_aa_2stage.o \
 lapacke_zhetrf_aa_2stage_work.o \
 lapacke_zhetrf_rk.o \
 lapacke_zhetrf_rk_work.o \
+lapacke_zhetrf_rook.o \
+lapacke_zhetrf_rook_work.o \
 lapacke_zhetri.o \
+lapacke_zhetri_work.o \
 lapacke_zhetri2.o \
 lapacke_zhetri2_work.o \
-lapacke_zhetri_3.o \
-lapacke_zhetri_3_work.o \
 lapacke_zhetri2x.o \
 lapacke_zhetri2x_work.o \
-lapacke_zhetri_work.o \
+lapacke_zhetri_3.o \
+lapacke_zhetri_3_work.o \
 lapacke_zhetrs.o \
-lapacke_zhetrs_rook.o \
+lapacke_zhetrs_work.o \
 lapacke_zhetrs2.o \
 lapacke_zhetrs2_work.o \
-lapacke_zhetrs_work.o \
-lapacke_zhetrs_rook_work.o \
+lapacke_zhetrs_3.o \
+lapacke_zhetrs_3_work.o \
 lapacke_zhetrs_aa.o \
-lapacke_zhetrs_aa_2stage.o \
 lapacke_zhetrs_aa_work.o \
+lapacke_zhetrs_aa_2stage.o \
 lapacke_zhetrs_aa_2stage_work.o \
-lapacke_zhetrs_3.o \
-lapacke_zhetrs_3_work.o \
+lapacke_zhetrs_rook.o \
+lapacke_zhetrs_rook_work.o \
 lapacke_zhfrk.o \
 lapacke_zhfrk_work.o \
 lapacke_zhgeqz.o \
@@ -2213,11 +2235,11 @@ lapacke_zsyconv.o \
 lapacke_zsyconv_work.o \
 lapacke_zsyequb.o \
 lapacke_zsyequb_work.o \
+lapacke_zsyr.o \
+lapacke_zsyr_work.o \
 lapacke_zsyrfs.o \
 lapacke_zsyrfs_work.o \
 lapacke_zsysv.o \
-lapacke_zsysv_rook.o \
-lapacke_zsysv_rook_work.o \
 lapacke_zsysv_work.o \
 lapacke_zsysv_aa.o \
 lapacke_zsysv_aa_work.o \
@@ -2225,40 +2247,42 @@ lapacke_zsysv_aa_2stage.o \
 lapacke_zsysv_aa_2stage_work.o \
 lapacke_zsysv_rk.o \
 lapacke_zsysv_rk_work.o \
+lapacke_zsysv_rook.o \
+lapacke_zsysv_rook_work.o \
 lapacke_zsysvx.o \
 lapacke_zsysvx_work.o \
 lapacke_zsyswapr.o \
 lapacke_zsyswapr_work.o \
 lapacke_zsytrf.o \
 lapacke_zsytrf_work.o \
-lapacke_zsytrf_rook.o \
-lapacke_zsytrf_rook_work.o \
 lapacke_zsytrf_aa.o \
-lapacke_zsytrf_aa_2stage.o \
 lapacke_zsytrf_aa_work.o \
+lapacke_zsytrf_aa_2stage.o \
 lapacke_zsytrf_aa_2stage_work.o \
 lapacke_zsytrf_rk.o \
 lapacke_zsytrf_rk_work.o \
+lapacke_zsytrf_rook.o \
+lapacke_zsytrf_rook_work.o \
 lapacke_zsytri.o \
+lapacke_zsytri_work.o \
 lapacke_zsytri2.o \
 lapacke_zsytri2_work.o \
-lapacke_zsytri_3.o \
-lapacke_zsytri_3_work.o \
 lapacke_zsytri2x.o \
 lapacke_zsytri2x_work.o \
-lapacke_zsytri_work.o \
+lapacke_zsytri_3.o \
+lapacke_zsytri_3_work.o \
 lapacke_zsytrs.o \
-lapacke_zsytrs_rook.o \
+lapacke_zsytrs_work.o \
 lapacke_zsytrs2.o \
 lapacke_zsytrs2_work.o \
-lapacke_zsytrs_work.o \
-lapacke_zsytrs_rook_work.o \
+lapacke_zsytrs_3.o \
+lapacke_zsytrs_3_work.o \
 lapacke_zsytrs_aa.o \
-lapacke_zsytrs_aa_2stage.o \
 lapacke_zsytrs_aa_work.o \
+lapacke_zsytrs_aa_2stage.o \
 lapacke_zsytrs_aa_2stage_work.o \
-lapacke_zsytrs_3.o \
-lapacke_zsytrs_3_work.o \
+lapacke_zsytrs_rook.o \
+lapacke_zsytrs_rook_work.o \
 lapacke_ztbcon.o \
 lapacke_ztbcon_work.o \
 lapacke_ztbrfs.o \
@@ -2290,9 +2314,9 @@ lapacke_ztpcon_work.o \
 lapacke_ztpmqrt.o \
 lapacke_ztpmqrt_work.o \
 lapacke_ztpqrt.o \
+lapacke_ztpqrt_work.o \
 lapacke_ztpqrt2.o \
 lapacke_ztpqrt2_work.o \
-lapacke_ztpqrt_work.o \
 lapacke_ztprfb.o \
 lapacke_ztprfb_work.o \
 lapacke_ztprfs.o \
@@ -2368,12 +2392,7 @@ lapacke_zunmtr_work.o \
 lapacke_zupgtr.o \
 lapacke_zupgtr_work.o \
 lapacke_zupmtr.o \
-lapacke_zupmtr_work.o \
-lapacke_zsyr.o \
-lapacke_csyr.o \
-lapacke_zsyr_work.o \
-lapacke_csyr_work.o \
-lapacke_ilaver.o
+lapacke_zupmtr_work.o
 
 ifdef BUILD_DEPRECATED
 DEPRECATED = \
@@ -2452,27 +2471,29 @@ lapacke_zlagsy.o \
 lapacke_zlagsy_work.o
 endif
 
-all: ../../$(LAPACKELIB)
+.PHONY: all
+all: $(LAPACKELIB)
 
-.PHONY: ../../$(LAPACKELIB)
-
-../../$(LAPACKELIB): $(OBJ_A) $(OBJ_B) $(DEPRECATED) $(EXTENDED) $(MATGEN)
-	$(ARCH) $(ARCHFLAGS) $@ $(OBJ_A)
-	$(ARCH) $(ARCHFLAGS) $@ $(OBJ_B)
+$(LAPACKELIB): $(OBJ) $(OBJ_S) $(OBJ_C) $(OBJ_D) $(OBJ_Z) $(DEPRECATED) $(EXTENDED) $(MATGEN)
+	$(AR) $(ARFLAGS) $@ $(OBJ)
+	$(AR) $(ARFLAGS) $@ $(OBJ_S)
+	$(AR) $(ARFLAGS) $@ $(OBJ_C)
+	$(AR) $(ARFLAGS) $@ $(OBJ_D)
+	$(AR) $(ARFLAGS) $@ $(OBJ_Z)
 ifdef BUILD_DEPRECATED
-	$(ARCH) $(ARCHFLAGS) $@ $(DEPRECATED)
+	$(AR) $(ARFLAGS) $@ $(DEPRECATED)
 endif
 ifdef (USEXBLAS)
-	$(ARCH) $(ARCHFLAGS) $@ $(EXTENDED)
+	$(AR) $(ARFLAGS) $@ $(EXTENDED)
 endif
 ifdef LAPACKE_WITH_TMG
-	$(ARCH) $(ARCHFLAGS) $@ $(MATGEN)
+	$(AR) $(ARFLAGS) $@ $(MATGEN)
 endif
 	$(RANLIB) $@
 
-clean: cleanobj
+.PHONY: clean cleanobj cleanlib
+clean: cleanobj cleanlib
 cleanobj:
 	rm -f *.o
-
-.c.o:
-	$(CC) $(CFLAGS) -I../include -c -o $@ $<
+cleanlib:
+	rm -f $(LAPACKELIB)
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgejsv.c b/lapack-netlib/LAPACKE/src/lapacke_cgejsv.c
index 7d371f660..41278428b 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cgejsv.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgejsv.c
@@ -124,7 +124,6 @@ lapack_int LAPACKE_cgejsv( int matrix_layout, char joba, char jobu, char jobv,
     float* rwork = NULL;
     lapack_complex_float* cwork = NULL;
     lapack_int i;
-    lapack_int nu, nv;
     if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
         LAPACKE_xerbla( "LAPACKE_cgejsv", -1 );
         return -1;
@@ -132,8 +131,6 @@ lapack_int LAPACKE_cgejsv( int matrix_layout, char joba, char jobu, char jobv,
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
-        nu = LAPACKE_lsame( jobu, 'n' ) ? 1 : m;
-        nv = LAPACKE_lsame( jobv, 'n' ) ? 1 : n;
         if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) {
             return -10;
         }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgelsd.c b/lapack-netlib/LAPACKE/src/lapacke_cgelsd.c
index 2ee891977..9d022dae6 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cgelsd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgelsd.c
@@ -75,7 +75,7 @@ lapack_int LAPACKE_cgelsd( int matrix_layout, lapack_int m, lapack_int n,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_C2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c
new file mode 100644
index 000000000..91458136c
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq.c
@@ -0,0 +1,106 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function cgesvdq
+* Author: Intel Corporation
+* Generated November 2018
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_cgesvdq( int matrix_layout, char joba, char jobp,
+                           char jobr, char jobu, char jobv,
+                           lapack_int m, lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, float* s, lapack_complex_float* u, lapack_int ldu,
+                           lapack_complex_float* v, lapack_int ldv, lapack_int* numrank)
+{
+    lapack_int info = 0;
+    lapack_int liwork = -1;
+    lapack_int* iwork = NULL;
+    lapack_int iwork_query;
+    lapack_int lcwork = -1;
+    lapack_complex_float* cwork = NULL;
+    lapack_complex_float cwork_query;
+    lapack_int lrwork = -1;
+    double* rwork = NULL;
+    double rwork_query;
+    lapack_int i;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_cgesvdq", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) {
+            return -6;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_cgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv,
+                                 m, n, a, lda, s, u, ldu, v, ldv, numrank,
+                                 &iwork_query, liwork, &cwork_query, lcwork,
+                                 &rwork_query, lrwork );
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    liwork = iwork_query;
+    lcwork = LAPACK_C2INT(cwork_query);
+    lrwork = (lapack_int)rwork_query;
+    /* Allocate memory for work arrays */
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    cwork = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lcwork );
+    if( cwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    rwork = (double*)LAPACKE_malloc( sizeof(double) * lrwork );
+    if( rwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_cgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv,
+                                 m, n, a, lda, s, u, ldu, v, ldv, numrank,
+                                 iwork, liwork, cwork, lcwork, rwork, lrwork );
+
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+    LAPACKE_free( cwork );
+    LAPACKE_free( rwork );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_cgesvdq", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq_work.c
new file mode 100644
index 000000000..e86f76e4b
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvdq_work.c
@@ -0,0 +1,149 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function cgesvdq
+* Author: Intel Corporation
+* Generated November 2015
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_cgesvdq_work( int matrix_layout, char joba, char jobp,
+                           char jobr, char jobu, char jobv,
+                           lapack_int m, lapack_int n, lapack_complex_float* a,
+                           lapack_int lda, float* s, lapack_complex_float* u, lapack_int ldu,
+                           lapack_complex_float* v, lapack_int ldv, lapack_int* numrank,
+                           lapack_int* iwork, lapack_int liwork,
+                           lapack_complex_float* cwork, lapack_int lcwork,
+                           float* rwork, lapack_int lrwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_cgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda, s, u, &ldu, v, &ldv,
+                       numrank, iwork, &liwork, cwork, &lcwork, rwork, &lrwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int nrows_u = ( LAPACKE_lsame( jobu, 'a' ) ||
+                             LAPACKE_lsame( jobu, 's' ) ) ? m : 1;
+        lapack_int ncols_u = LAPACKE_lsame( jobu, 'a' ) ? m :
+                             (LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1);
+        lapack_int nrows_v = LAPACKE_lsame( jobv, 'a' ) ? n :
+                              ( LAPACKE_lsame( jobv, 's' ) ? MIN(m,n) : 1);
+        lapack_int lda_t = MAX(1,m);
+        lapack_int ldu_t = MAX(1,nrows_u);
+        lapack_int ldv_t = MAX(1,nrows_v);
+        lapack_complex_float* a_t = NULL;
+        lapack_complex_float* u_t = NULL;
+        lapack_complex_float* v_t = NULL;
+        /* Check leading dimension(s) */
+        if( lda < n ) {
+            info = -9;
+            LAPACKE_xerbla( "LAPACKE_cgesvdq_work", info );
+            return info;
+        }
+        if( ldu < ncols_u ) {
+            info = -12;
+            LAPACKE_xerbla( "LAPACKE_cgesvdq_work", info );
+            return info;
+        }
+        if( ldv < n ) {
+            info = -14;
+            LAPACKE_xerbla( "LAPACKE_cgesvdq_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lcwork == -1 ) {
+            LAPACK_cgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda_t,
+                             s, u, &ldu_t, v, &ldv_t, numrank, iwork, &liwork,
+                             cwork, &lcwork, rwork, &lrwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (lapack_complex_float*)LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            u_t = (lapack_complex_float*)
+                LAPACKE_malloc( sizeof(lapack_complex_float) * ldu_t * MAX(1,ncols_u) );
+            if( u_t == NULL ) {
+                info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+                goto exit_level_1;
+            }
+        }
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            v_t = (lapack_complex_float*)
+                LAPACKE_malloc( sizeof(lapack_complex_float) * ldv_t * MAX(1,n) );
+            if( v_t == NULL ) {
+                info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+                goto exit_level_2;
+            }
+        }
+        /* Transpose input matrices */
+        LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info */
+            LAPACK_cgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda_t,
+                             s, u, &ldu_t, v, &ldv_t, numrank, iwork, &liwork,
+                             cwork, &lcwork, rwork, &lrwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            LAPACKE_cge_trans( LAPACK_COL_MAJOR, nrows_u, ncols_u, u_t, ldu_t,
+                               u, ldu );
+        }
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            LAPACKE_cge_trans( LAPACK_COL_MAJOR, nrows_v, n, v_t, ldv_t, v,
+                               ldv );
+        }
+        /* Release memory and exit */
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            LAPACKE_free( v_t );
+        }
+exit_level_2:
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            LAPACKE_free( u_t );
+        }
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_cgesvdq_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_cgesvdq_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cggesx.c b/lapack-netlib/LAPACKE/src/lapacke_cggesx.c
index fc939a314..9581691c6 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cggesx.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cggesx.c
@@ -91,7 +91,7 @@ lapack_int LAPACKE_cggesx( int matrix_layout, char jobvsl, char jobvsr,
     if( info != 0 ) {
         goto exit_level_2;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = LAPACK_C2INT( work_query );
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_chbevd.c b/lapack-netlib/LAPACKE/src/lapacke_chbevd.c
index 024cf2585..b4af255a9 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_chbevd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chbevd.c
@@ -67,7 +67,7 @@ lapack_int LAPACKE_chbevd( int matrix_layout, char jobz, char uplo, lapack_int n
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_C2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_chbevd_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_chbevd_2stage.c
index 63f7d8ccb..e8e9a6830 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_chbevd_2stage.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chbevd_2stage.c
@@ -67,7 +67,7 @@ lapack_int LAPACKE_chbevd_2stage( int matrix_layout, char jobz, char uplo, lapac
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_C2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_chbgvd.c b/lapack-netlib/LAPACKE/src/lapacke_chbgvd.c
index d44f6c622..5a7331d87 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_chbgvd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chbgvd.c
@@ -71,7 +71,7 @@ lapack_int LAPACKE_chbgvd( int matrix_layout, char jobz, char uplo, lapack_int n
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_C2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c
index 40224607c..f505dfab0 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c
@@ -70,7 +70,7 @@ lapack_int LAPACKE_cheev_work( int matrix_layout, char jobz, char uplo,
             goto exit_level_0;
         }
         /* Transpose input matrices */
-        LAPACKE_cge_trans( matrix_layout, n, n, a, lda, a_t, lda_t );
+        LAPACKE_che_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
         LAPACK_cheev( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, rwork,
                       &info );
@@ -78,7 +78,7 @@ lapack_int LAPACKE_cheev_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd.c
index d0dea375b..75fa47915 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cheevd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd.c
@@ -53,7 +53,7 @@ lapack_int LAPACKE_cheevd( int matrix_layout, char jobz, char uplo, lapack_int n
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
-        if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) {
             return -5;
         }
     }
@@ -65,7 +65,7 @@ lapack_int LAPACKE_cheevd( int matrix_layout, char jobz, char uplo, lapack_int n
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_C2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage.c
index d87481abf..cb4d34a09 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage.c
@@ -53,7 +53,7 @@ lapack_int LAPACKE_cheevd_2stage( int matrix_layout, char jobz, char uplo, lapac
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
-        if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) {
             return -5;
         }
     }
@@ -65,7 +65,7 @@ lapack_int LAPACKE_cheevd_2stage( int matrix_layout, char jobz, char uplo, lapac
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_C2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c
index cb51f9ee4..e9e6a5d1d 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c
@@ -71,7 +71,7 @@ lapack_int LAPACKE_cheevd_2stage_work( int matrix_layout, char jobz, char uplo,
             goto exit_level_0;
         }
         /* Transpose input matrices */
-        LAPACKE_cge_trans( matrix_layout, n, n, a, lda, a_t, lda_t );
+        LAPACKE_che_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
         LAPACK_cheevd_2stage( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, rwork,
                        &lrwork, iwork, &liwork, &info );
@@ -79,7 +79,7 @@ lapack_int LAPACKE_cheevd_2stage_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c
index 81869c564..4c5f352a8 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c
@@ -71,7 +71,7 @@ lapack_int LAPACKE_cheevd_work( int matrix_layout, char jobz, char uplo,
             goto exit_level_0;
         }
         /* Transpose input matrices */
-        LAPACKE_cge_trans( matrix_layout, n, n, a, lda, a_t, lda_t );
+        LAPACKE_che_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
         LAPACK_cheevd( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, rwork,
                        &lrwork, iwork, &liwork, &info );
@@ -79,7 +79,8 @@ lapack_int LAPACKE_cheevd_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevr.c b/lapack-netlib/LAPACKE/src/lapacke_cheevr.c
index 6fe261624..f277e7f70 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cheevr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheevr.c
@@ -83,7 +83,7 @@ lapack_int LAPACKE_cheevr( int matrix_layout, char jobz, char range, char uplo,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_C2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevr_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_cheevr_2stage.c
index 5b3f5c77a..a09eac1bd 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cheevr_2stage.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheevr_2stage.c
@@ -83,7 +83,7 @@ lapack_int LAPACKE_cheevr_2stage( int matrix_layout, char jobz, char range, char
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_C2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegst.c b/lapack-netlib/LAPACKE/src/lapacke_chegst.c
index c628017c2..ff7dd3532 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_chegst.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chegst.c
@@ -35,7 +35,7 @@
 
 lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo,
                            lapack_int n, lapack_complex_float* a,
-                           lapack_int lda, const lapack_complex_float* b,
+                           lapack_int lda, lapack_complex_float* b,
                            lapack_int ldb )
 {
     if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c b/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c
index 001863819..a29e01961 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c
@@ -35,7 +35,7 @@
 
 lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo,
                                 lapack_int n, lapack_complex_float* a,
-                                lapack_int lda, const lapack_complex_float* b,
+                                lapack_int lda, lapack_complex_float* b,
                                 lapack_int ldb )
 {
     lapack_int info = 0;
diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegvd.c b/lapack-netlib/LAPACKE/src/lapacke_chegvd.c
index 2959cb0dc..98c901982 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_chegvd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chegvd.c
@@ -70,7 +70,7 @@ lapack_int LAPACKE_chegvd( int matrix_layout, lapack_int itype, char jobz,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_C2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_chpevd.c b/lapack-netlib/LAPACKE/src/lapacke_chpevd.c
index 47c7bbe23..fbdb73802 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_chpevd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chpevd.c
@@ -66,7 +66,7 @@ lapack_int LAPACKE_chpevd( int matrix_layout, char jobz, char uplo, lapack_int n
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_C2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_chpgvd.c b/lapack-netlib/LAPACKE/src/lapacke_chpgvd.c
index 568882ec9..587d1509a 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_chpgvd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_chpgvd.c
@@ -70,7 +70,7 @@ lapack_int LAPACKE_chpgvd( int matrix_layout, lapack_int itype, char jobz,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_C2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c
index 7f74a9789..8c4c21935 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c
@@ -43,12 +43,10 @@ float LAPACKE_clantr_work( int matrix_layout, char norm, char uplo,
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
         res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a, &lda, work );
-        if( info < 0 ) {
-            info = info - 1;
-        }
     } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
         lapack_int lda_t = MAX(1,m);
         lapack_complex_float* a_t = NULL;
+        float* work_lapack = NULL;
         /* Check leading dimension(s) */
         if( lda < n ) {
             info = -8;
@@ -62,12 +60,23 @@ float LAPACKE_clantr_work( int matrix_layout, char norm, char uplo,
             info = LAPACK_TRANSPOSE_MEMORY_ERROR;
             goto exit_level_0;
         }
+        /* Allocate memory for work array(s) */
+        if( LAPACKE_lsame( norm, 'i' ) ) {
+            work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) );
+            if( work_lapack == NULL ) {
+                info = LAPACK_WORK_MEMORY_ERROR;
+                goto exit_level_1;
+            }
+        }
         /* Transpose input matrices */
         LAPACKE_ctr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
-        res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work );
-        info = 0;  /* LAPACK call is ok! */
+        res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack );
         /* Release memory and exit */
+        if( work_lapack ) {
+            LAPACKE_free( work_lapack );
+        }
+exit_level_1:
         LAPACKE_free( a_t );
 exit_level_0:
         if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cstedc.c b/lapack-netlib/LAPACKE/src/lapacke_cstedc.c
index 5be3cec70..3c0be27d5 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cstedc.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cstedc.c
@@ -73,7 +73,7 @@ lapack_int LAPACKE_cstedc( int matrix_layout, char compz, lapack_int n, float* d
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_C2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cstegr.c b/lapack-netlib/LAPACKE/src/lapacke_cstegr.c
index 986702e62..86a0cd72d 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cstegr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cstegr.c
@@ -81,7 +81,7 @@ lapack_int LAPACKE_cstegr( int matrix_layout, char jobz, char range,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cstemr.c b/lapack-netlib/LAPACKE/src/lapacke_cstemr.c
index 9b9b84e49..51e63c675 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cstemr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cstemr.c
@@ -75,7 +75,7 @@ lapack_int LAPACKE_cstemr( int matrix_layout, char jobz, char range,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c
index f4a0a4334..44405c993 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c
@@ -34,7 +34,7 @@
 #include "lapacke_utils.h"
 
 lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, const lapack_complex_float* a,
+                            lapack_int nrhs, lapack_complex_float* a,
                             lapack_int lda, const lapack_int* ipiv,
                             lapack_complex_float* b, lapack_int ldb )
 {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c
index d914c1d69..8567a07d5 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c
@@ -34,7 +34,7 @@
 #include "lapacke_utils.h"
 
 lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, const lapack_complex_float* a,
+                                 lapack_int nrhs, lapack_complex_float* a,
                                  lapack_int lda, const lapack_int* ipiv,
                                  lapack_complex_float* b, lapack_int ldb,
                                  lapack_complex_float* work )
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctgsen.c b/lapack-netlib/LAPACKE/src/lapacke_ctgsen.c
index e2f38c87b..6bfcdc996 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ctgsen.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ctgsen.c
@@ -84,7 +84,7 @@ lapack_int LAPACKE_ctgsen( int matrix_layout, lapack_int ijob,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = LAPACK_C2INT( work_query );
     /* Allocate memory for work arrays */
     if( ijob != 0 ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctprfb.c b/lapack-netlib/LAPACKE/src/lapacke_ctprfb.c
index 9d2684e4c..fd49d6a7f 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ctprfb.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ctprfb.c
@@ -41,7 +41,7 @@ lapack_int LAPACKE_ctprfb( int matrix_layout, char side, char trans, char direct
                            lapack_complex_float* a, lapack_int lda,
                            lapack_complex_float* b, lapack_int ldb )
 {
-    lapack_int ncols_v, nrows_v;
+    lapack_int ncols_v, nrows_v, ncols_a, nrows_a;
     lapack_int info = 0;
     lapack_int ldwork;
     lapack_int work_size;
@@ -52,20 +52,33 @@ lapack_int LAPACKE_ctprfb( int matrix_layout, char side, char trans, char direct
     }
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
-        /* Optionally check input matrices for NaNs */
+        /* Optionally check input matrices for NaNs
+         * V is m-by-k (left,  columnwise)
+         *   or n-by-k (right, columnwise)
+         *   or k-by-m (left,  rowwise)
+         *   or k-by-n (right, rowwise)
+         * T is k-by-k
+         * A is k-by-n (left)
+         *   or m-by-k (right)
+         * B is m-by-n
+         */
         if( LAPACKE_lsame( storev, 'C' ) ) {
             ncols_v = k;
             nrows_v = LAPACKE_lsame( side, 'L' ) ? m :
-                                 ( LAPACKE_lsame( side, 'R' ) ? n : 0 );
+                      LAPACKE_lsame( side, 'R' ) ? n : 0;
         } else if( LAPACKE_lsame( storev, 'R' ) ) {
             ncols_v = LAPACKE_lsame( side, 'L' ) ? m :
-                                 ( LAPACKE_lsame( side, 'R' ) ? n : 0 );
+                      LAPACKE_lsame( side, 'R' ) ? n : 0;
             nrows_v = k;
         } else {
             ncols_v = 0;
             nrows_v = 0;
         }
-        if( LAPACKE_cge_nancheck( matrix_layout, k, m, a, lda ) ) {
+        nrows_a = LAPACKE_lsame( side, 'L' ) ? k :
+                  LAPACKE_lsame( side, 'R' ) ? m : 0;
+        ncols_a = LAPACKE_lsame( side, 'L' ) ? n :
+                  LAPACKE_lsame( side, 'R' ) ? k : 0;
+        if( LAPACKE_cge_nancheck( matrix_layout, ncols_a, nrows_a, a, lda ) ) {
             return -14;
         }
         if( LAPACKE_cge_nancheck( matrix_layout, m, n, b, ldb ) ) {
@@ -80,13 +93,13 @@ lapack_int LAPACKE_ctprfb( int matrix_layout, char side, char trans, char direct
     }
 #endif
     if (side=='l' ||  side=='L') {
-       ldwork = k;
-       work_size = MAX(1,ldwork) * MAX(1,n);
-       }
+        ldwork = k;
+        work_size = MAX(1,ldwork) * MAX(1,n);
+    }
     else {
-       ldwork = m;
-       work_size = MAX(1,ldwork) * MAX(1,k);
-       }
+        ldwork = m;
+        work_size = MAX(1,ldwork) * MAX(1,k);
+    }
     /* Allocate memory for working array(s) */
     work = (lapack_complex_float*)
         LAPACKE_malloc( sizeof(lapack_complex_float) * work_size );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunmhr.c b/lapack-netlib/LAPACKE/src/lapacke_cunmhr.c
index 592c6de45..127dd8c57 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cunmhr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cunmhr.c
@@ -58,7 +58,7 @@ lapack_int LAPACKE_cunmhr( int matrix_layout, char side, char trans,
         if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
             return -11;
         }
-        if( LAPACKE_c_nancheck( m-1, tau, 1 ) ) {
+        if( LAPACKE_c_nancheck( r-1, tau, 1 ) ) {
             return -10;
         }
     }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c b/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c
index 27647954b..193d65737 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgeesx.c
@@ -76,7 +76,7 @@ lapack_int LAPACKE_dgeesx( int matrix_layout, char jobvs, char sort,
     if( info != 0 ) {
         goto exit_level_1;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgejsv.c b/lapack-netlib/LAPACKE/src/lapacke_dgejsv.c
index 444a07b35..d9709bf89 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dgejsv.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgejsv.c
@@ -74,7 +74,6 @@ lapack_int LAPACKE_dgejsv( int matrix_layout, char joba, char jobu, char jobv,
     lapack_int* iwork = NULL;
     double* work = NULL;
     lapack_int i;
-    lapack_int nu, nv;
     if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
         LAPACKE_xerbla( "LAPACKE_dgejsv", -1 );
         return -1;
@@ -82,8 +81,6 @@ lapack_int LAPACKE_dgejsv( int matrix_layout, char joba, char jobu, char jobv,
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
-        nu = LAPACKE_lsame( jobu, 'n' ) ? 1 : m;
-        nv = LAPACKE_lsame( jobv, 'n' ) ? 1 : n;
         if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) {
             return -10;
         }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgelsd.c b/lapack-netlib/LAPACKE/src/lapacke_dgelsd.c
index 6750597bb..790119596 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dgelsd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgelsd.c
@@ -70,7 +70,7 @@ lapack_int LAPACKE_dgelsd( int matrix_layout, lapack_int m, lapack_int n,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c
new file mode 100644
index 000000000..7bf831f8b
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq.c
@@ -0,0 +1,106 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function dgesvdq
+* Author: Intel Corporation
+* Generated November 2018
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dgesvdq( int matrix_layout, char joba, char jobp,
+                           char jobr, char jobu, char jobv,
+                           lapack_int m, lapack_int n, double* a,
+                           lapack_int lda, double* s, double* u, lapack_int ldu,
+                           double* v, lapack_int ldv, lapack_int* numrank)
+{
+    lapack_int info = 0;
+    lapack_int liwork = -1;
+    lapack_int* iwork = NULL;
+    lapack_int iwork_query;
+    lapack_int lwork = -1;
+    double* work = NULL;
+    double work_query;
+    lapack_int lrwork = -1;
+    double* rwork = NULL;
+    double rwork_query;
+    lapack_int i;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_dgesvdq", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) {
+            return -6;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_dgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv,
+                                 m, n, a, lda, s, u, ldu, v, ldv, numrank,
+                                 &iwork_query, liwork, &work_query, lwork,
+                                 &rwork_query, lrwork );
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    liwork = iwork_query;
+    lwork = (lapack_int)work_query;
+    lrwork = (lapack_int)rwork_query;
+    /* Allocate memory for work arrays */
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    work = (double*)LAPACKE_malloc( sizeof(double) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    rwork = (double*)LAPACKE_malloc( sizeof(double) * lrwork );
+    if( rwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_dgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv,
+                                 m, n, a, lda, s, u, ldu, v, ldv, numrank,
+                                 iwork, liwork, work, lwork, rwork, lrwork );
+
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+    LAPACKE_free( work );
+    LAPACKE_free( rwork );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_dgesvdq", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq_work.c
new file mode 100644
index 000000000..0de92a254
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvdq_work.c
@@ -0,0 +1,149 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function dgesvdq
+* Author: Intel Corporation
+* Generated November 2015
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_dgesvdq_work( int matrix_layout, char joba, char jobp,
+                           char jobr, char jobu, char jobv,
+                           lapack_int m, lapack_int n, double* a,
+                           lapack_int lda, double* s, double* u, lapack_int ldu,
+                           double* v, lapack_int ldv, lapack_int* numrank,
+                           lapack_int* iwork, lapack_int liwork,
+                           double* work, lapack_int lwork,
+                           double* rwork, lapack_int lrwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_dgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda, s, u, &ldu, v, &ldv,
+                       numrank, iwork, &liwork, work, &lwork, rwork, &lrwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int nrows_u = ( LAPACKE_lsame( jobu, 'a' ) ||
+                             LAPACKE_lsame( jobu, 's' ) ) ? m : 1;
+        lapack_int ncols_u = LAPACKE_lsame( jobu, 'a' ) ? m :
+                             (LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1);
+        lapack_int nrows_v = LAPACKE_lsame( jobv, 'a' ) ? n :
+                              ( LAPACKE_lsame( jobv, 's' ) ? MIN(m,n) : 1);
+        lapack_int lda_t = MAX(1,m);
+        lapack_int ldu_t = MAX(1,nrows_u);
+        lapack_int ldv_t = MAX(1,nrows_v);
+        double* a_t = NULL;
+        double* u_t = NULL;
+        double* v_t = NULL;
+        /* Check leading dimension(s) */
+        if( lda < n ) {
+            info = -9;
+            LAPACKE_xerbla( "LAPACKE_dgesvdq_work", info );
+            return info;
+        }
+        if( ldu < ncols_u ) {
+            info = -12;
+            LAPACKE_xerbla( "LAPACKE_dgesvdq_work", info );
+            return info;
+        }
+        if( ldv < n ) {
+            info = -14;
+            LAPACKE_xerbla( "LAPACKE_dgesvdq_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lwork == -1 ) {
+            LAPACK_dgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda_t,
+                             s, u, &ldu_t, v, &ldv_t, numrank, iwork, &liwork,
+                             work, &lwork, rwork, &lrwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            u_t = (double*)
+                LAPACKE_malloc( sizeof(double) * ldu_t * MAX(1,ncols_u) );
+            if( u_t == NULL ) {
+                info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+                goto exit_level_1;
+            }
+        }
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            v_t = (double*)
+                LAPACKE_malloc( sizeof(double) * ldv_t * MAX(1,n) );
+            if( v_t == NULL ) {
+                info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+                goto exit_level_2;
+            }
+        }
+        /* Transpose input matrices */
+        LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info */
+            LAPACK_dgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda_t,
+                             s, u, &ldu_t, v, &ldv_t, numrank, iwork, &liwork,
+                             work, &lwork, rwork, &lrwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            LAPACKE_dge_trans( LAPACK_COL_MAJOR, nrows_u, ncols_u, u_t, ldu_t,
+                               u, ldu );
+        }
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            LAPACKE_dge_trans( LAPACK_COL_MAJOR, nrows_v, n, v_t, ldv_t, v,
+                               ldv );
+        }
+        /* Release memory and exit */
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            LAPACKE_free( v_t );
+        }
+exit_level_2:
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            LAPACKE_free( u_t );
+        }
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_dgesvdq_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_dgesvdq_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dggesx.c b/lapack-netlib/LAPACKE/src/lapacke_dggesx.c
index 36addda74..91eb7bf8c 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dggesx.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dggesx.c
@@ -82,7 +82,7 @@ lapack_int LAPACKE_dggesx( int matrix_layout, char jobvsl, char jobvsr,
     if( info != 0 ) {
         goto exit_level_1;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c
index 2d570ce42..5b2a6c535 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c
@@ -42,12 +42,10 @@ double LAPACKE_dlantr_work( int matrix_layout, char norm, char uplo,
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
         res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work );
-        if( info < 0 ) {
-            info = info - 1;
-        }
     } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
         lapack_int lda_t = MAX(1,m);
         double* a_t = NULL;
+        double* work_lapack = NULL;
         /* Check leading dimension(s) */
         if( lda < n ) {
             info = -8;
@@ -60,12 +58,23 @@ double LAPACKE_dlantr_work( int matrix_layout, char norm, char uplo,
             info = LAPACK_TRANSPOSE_MEMORY_ERROR;
             goto exit_level_0;
         }
+        /* Allocate memory for work array(s) */
+        if( LAPACKE_lsame( norm, 'i' ) ) {
+            work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) );
+            if( work_lapack == NULL ) {
+                info = LAPACK_WORK_MEMORY_ERROR;
+                goto exit_level_1;
+            }
+        }
         /* Transpose input matrices */
         LAPACKE_dtr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
-        res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work );
-        info = 0;  /* LAPACK call is ok! */
+        res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack );
         /* Release memory and exit */
+        if( work_lapack ) {
+            LAPACKE_free( work_lapack );
+        }
+exit_level_1:
         LAPACKE_free( a_t );
 exit_level_0:
         if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dormhr.c b/lapack-netlib/LAPACKE/src/lapacke_dormhr.c
index de4355a74..4b9526f14 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dormhr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dormhr.c
@@ -57,7 +57,7 @@ lapack_int LAPACKE_dormhr( int matrix_layout, char side, char trans,
         if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
             return -11;
         }
-        if( LAPACKE_d_nancheck( m-1, tau, 1 ) ) {
+        if( LAPACKE_d_nancheck( r-1, tau, 1 ) ) {
             return -10;
         }
     }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsbevd.c b/lapack-netlib/LAPACKE/src/lapacke_dsbevd.c
index 4ecd1b522..3a9abbbe1 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsbevd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsbevd.c
@@ -62,7 +62,7 @@ lapack_int LAPACKE_dsbevd( int matrix_layout, char jobz, char uplo, lapack_int n
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsbevd_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_dsbevd_2stage.c
index b0ccc0b1e..4d42b6208 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsbevd_2stage.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsbevd_2stage.c
@@ -62,7 +62,7 @@ lapack_int LAPACKE_dsbevd_2stage( int matrix_layout, char jobz, char uplo, lapac
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsbgvd.c b/lapack-netlib/LAPACKE/src/lapacke_dsbgvd.c
index 36f912ee5..cab2a64bb 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsbgvd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsbgvd.c
@@ -67,7 +67,7 @@ lapack_int LAPACKE_dsbgvd( int matrix_layout, char jobz, char uplo, lapack_int n
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dspevd.c b/lapack-netlib/LAPACKE/src/lapacke_dspevd.c
index 3b6b25d5e..c7d93b6b3 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dspevd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dspevd.c
@@ -61,7 +61,7 @@ lapack_int LAPACKE_dspevd( int matrix_layout, char jobz, char uplo, lapack_int n
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dspgvd.c b/lapack-netlib/LAPACKE/src/lapacke_dspgvd.c
index 8ca478ed1..b49ce95ec 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dspgvd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dspgvd.c
@@ -66,7 +66,7 @@ lapack_int LAPACKE_dspgvd( int matrix_layout, lapack_int itype, char jobz,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dstedc.c b/lapack-netlib/LAPACKE/src/lapacke_dstedc.c
index 4f88a04c4..16e308450 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dstedc.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dstedc.c
@@ -69,7 +69,7 @@ lapack_int LAPACKE_dstedc( int matrix_layout, char compz, lapack_int n,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dstegr.c b/lapack-netlib/LAPACKE/src/lapacke_dstegr.c
index 9191f0a9f..7e4f9d694 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dstegr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dstegr.c
@@ -81,7 +81,7 @@ lapack_int LAPACKE_dstegr( int matrix_layout, char jobz, char range,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dstemr.c b/lapack-netlib/LAPACKE/src/lapacke_dstemr.c
index 8dc2bd237..1a3b0ac7b 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dstemr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dstemr.c
@@ -75,7 +75,7 @@ lapack_int LAPACKE_dstemr( int matrix_layout, char jobz, char range,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dstevd.c b/lapack-netlib/LAPACKE/src/lapacke_dstevd.c
index e824a164b..251a2ae2e 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dstevd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dstevd.c
@@ -64,7 +64,7 @@ lapack_int LAPACKE_dstevd( int matrix_layout, char jobz, lapack_int n, double* d
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dstevr.c b/lapack-netlib/LAPACKE/src/lapacke_dstevr.c
index fd53e0ac0..d49e0ff1c 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dstevr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dstevr.c
@@ -81,7 +81,7 @@ lapack_int LAPACKE_dstevr( int matrix_layout, char jobz, char range,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c
index 9dc67f022..5a416ff45 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c
@@ -65,14 +65,14 @@ lapack_int LAPACKE_dsyev_work( int matrix_layout, char jobz, char uplo,
             goto exit_level_0;
         }
         /* Transpose input matrices */
-        LAPACKE_dge_trans( matrix_layout, n, n, a, lda, a_t, lda_t );
+        LAPACKE_dsy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
         LAPACK_dsyev( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd.c
index 870148b31..d6772ea01 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd.c
@@ -50,7 +50,7 @@ lapack_int LAPACKE_dsyevd( int matrix_layout, char jobz, char uplo, lapack_int n
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
-        if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) {
             return -5;
         }
     }
@@ -61,7 +61,7 @@ lapack_int LAPACKE_dsyevd( int matrix_layout, char jobz, char uplo, lapack_int n
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage.c
index a5507394c..e866451a5 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage.c
@@ -50,7 +50,7 @@ lapack_int LAPACKE_dsyevd_2stage( int matrix_layout, char jobz, char uplo, lapac
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
-        if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) {
             return -5;
         }
     }
@@ -61,7 +61,7 @@ lapack_int LAPACKE_dsyevd_2stage( int matrix_layout, char jobz, char uplo, lapac
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c
index 1d06250d1..90d8ce8dc 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c
@@ -68,7 +68,7 @@ lapack_int LAPACKE_dsyevd_2stage_work( int matrix_layout, char jobz, char uplo,
             goto exit_level_0;
         }
         /* Transpose input matrices */
-        LAPACKE_dge_trans( matrix_layout, n, n, a, lda, a_t, lda_t );
+        LAPACKE_dsy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
         LAPACK_dsyevd_2stage( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, iwork,
                        &liwork, &info );
@@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_2stage_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c
index 925912619..fff476445 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c
@@ -68,7 +68,7 @@ lapack_int LAPACKE_dsyevd_work( int matrix_layout, char jobz, char uplo,
             goto exit_level_0;
         }
         /* Transpose input matrices */
-        LAPACKE_dge_trans( matrix_layout, n, n, a, lda, a_t, lda_t );
+        LAPACKE_dsy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
         LAPACK_dsyevd( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, iwork,
                        &liwork, &info );
@@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevr.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevr.c
index bae72f6c3..290ae0bd4 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyevr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevr.c
@@ -78,7 +78,7 @@ lapack_int LAPACKE_dsyevr( int matrix_layout, char jobz, char range, char uplo,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevr_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevr_2stage.c
index dad20209e..7ee7dbc0b 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyevr_2stage.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevr_2stage.c
@@ -78,7 +78,7 @@ lapack_int LAPACKE_dsyevr_2stage( int matrix_layout, char jobz, char range, char
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c b/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c
index 907ad50bd..51f333359 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c
@@ -66,7 +66,7 @@ lapack_int LAPACKE_dsygvd( int matrix_layout, lapack_int itype, char jobz,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c
index 46c90190f..4d73ef3c1 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c
@@ -34,7 +34,7 @@
 #include "lapacke_utils.h"
 
 lapack_int LAPACKE_dsytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, const double* a, lapack_int lda,
+                            lapack_int nrhs, double* a, lapack_int lda,
                             const lapack_int* ipiv, double* b, lapack_int ldb )
 {
     lapack_int info = 0;
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c
index c937c39c5..caffa5b4b 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c
@@ -34,7 +34,7 @@
 #include "lapacke_utils.h"
 
 lapack_int LAPACKE_dsytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, const double* a,
+                                 lapack_int nrhs, double* a,
                                  lapack_int lda, const lapack_int* ipiv,
                                  double* b, lapack_int ldb, double* work )
 {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtgsen.c b/lapack-netlib/LAPACKE/src/lapacke_dtgsen.c
index 2cb7fce4b..baa63abe7 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dtgsen.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dtgsen.c
@@ -81,7 +81,7 @@ lapack_int LAPACKE_dtgsen( int matrix_layout, lapack_int ijob,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     if( ijob != 0 ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtprfb.c b/lapack-netlib/LAPACKE/src/lapacke_dtprfb.c
index 5191f79bb..11031b9bb 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dtprfb.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dtprfb.c
@@ -39,7 +39,7 @@ lapack_int LAPACKE_dtprfb( int matrix_layout, char side, char trans, char direct
                            lapack_int ldv, const double* t, lapack_int ldt,
                            double* a, lapack_int lda, double* b, lapack_int ldb )
 {
-    lapack_int ncols_v, nrows_v;
+    lapack_int ncols_v, nrows_v, ncols_a, nrows_a;
     lapack_int info = 0;
     lapack_int ldwork;
     lapack_int work_size;
@@ -50,20 +50,33 @@ lapack_int LAPACKE_dtprfb( int matrix_layout, char side, char trans, char direct
     }
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
-        /* Optionally check input matrices for NaNs */
+        /* Optionally check input matrices for NaNs
+         * V is m-by-k (left,  columnwise)
+         *   or n-by-k (right, columnwise)
+         *   or k-by-m (left,  rowwise)
+         *   or k-by-n (right, rowwise)
+         * T is k-by-k
+         * A is k-by-n (left)
+         *   or m-by-k (right)
+         * B is m-by-n
+         */
         if( LAPACKE_lsame( storev, 'C' ) ) {
             ncols_v = k;
             nrows_v = LAPACKE_lsame( side, 'L' ) ? m :
-                                 ( LAPACKE_lsame( side, 'R' ) ? n : 0 );
+                      LAPACKE_lsame( side, 'R' ) ? n : 0;
         } else if( LAPACKE_lsame( storev, 'R' ) ) {
             ncols_v = LAPACKE_lsame( side, 'L' ) ? m :
-                                 ( LAPACKE_lsame( side, 'R' ) ? n : 0 );
+                      LAPACKE_lsame( side, 'R' ) ? n : 0;
             nrows_v = k;
         } else {
             ncols_v = 0;
             nrows_v = 0;
         }
-        if( LAPACKE_dge_nancheck( matrix_layout, k, m, a, lda ) ) {
+        nrows_a = LAPACKE_lsame( side, 'L' ) ? k :
+                  LAPACKE_lsame( side, 'R' ) ? m : 0;
+        ncols_a = LAPACKE_lsame( side, 'L' ) ? n :
+                  LAPACKE_lsame( side, 'R' ) ? k : 0;
+        if( LAPACKE_dge_nancheck( matrix_layout, ncols_a, nrows_a, a, lda ) ) {
             return -14;
         }
         if( LAPACKE_dge_nancheck( matrix_layout, m, n, b, ldb ) ) {
@@ -78,16 +91,16 @@ lapack_int LAPACKE_dtprfb( int matrix_layout, char side, char trans, char direct
     }
 #endif
     if (side=='l' ||  side=='L') {
-       ldwork = k;
-       work_size = MAX(1,ldwork) * MAX(1,n);
-       }
+        ldwork = k;
+        work_size = MAX(1,ldwork) * MAX(1,n);
+    }
     else {
-       ldwork = m;
-       work_size = MAX(1,ldwork) * MAX(1,k);
-       }
+        ldwork = m;
+        work_size = MAX(1,ldwork) * MAX(1,k);
+    }
     /* Allocate memory for working array(s) */
     work = (double*)
-    LAPACKE_malloc( sizeof(double) * work_size );
+        LAPACKE_malloc( sizeof(double) * work_size );
     if( work == NULL ) {
         info = LAPACK_WORK_MEMORY_ERROR;
         goto exit_level_0;
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrsen.c b/lapack-netlib/LAPACKE/src/lapacke_dtrsen.c
index 521bc2701..67932fd98 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dtrsen.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dtrsen.c
@@ -70,7 +70,7 @@ lapack_int LAPACKE_dtrsen( int matrix_layout, char job, char compq,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     if( LAPACKE_lsame( job, 'b' ) || LAPACKE_lsame( job, 'v' ) ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c b/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c
index 91cfc4fa5..0bc14b33e 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgeesx.c
@@ -76,7 +76,7 @@ lapack_int LAPACKE_sgeesx( int matrix_layout, char jobvs, char sort,
     if( info != 0 ) {
         goto exit_level_1;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     if( LAPACKE_lsame( sense, 'b' ) || LAPACKE_lsame( sense, 'v' ) ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgejsv.c b/lapack-netlib/LAPACKE/src/lapacke_sgejsv.c
index aa0eeb746..0703e902f 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sgejsv.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgejsv.c
@@ -74,7 +74,6 @@ lapack_int LAPACKE_sgejsv( int matrix_layout, char joba, char jobu, char jobv,
     lapack_int* iwork = NULL;
     float* work = NULL;
     lapack_int i;
-    lapack_int nu, nv;
     if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
         LAPACKE_xerbla( "LAPACKE_sgejsv", -1 );
         return -1;
@@ -82,8 +81,6 @@ lapack_int LAPACKE_sgejsv( int matrix_layout, char joba, char jobu, char jobv,
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
-        nu = LAPACKE_lsame( jobu, 'n' ) ? 1 : m;
-        nv = LAPACKE_lsame( jobv, 'n' ) ? 1 : n;
         if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) {
             return -10;
         }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgelsd.c b/lapack-netlib/LAPACKE/src/lapacke_sgelsd.c
index fc42b1eec..9d00ded10 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sgelsd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgelsd.c
@@ -70,7 +70,7 @@ lapack_int LAPACKE_sgelsd( int matrix_layout, lapack_int m, lapack_int n,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c
new file mode 100644
index 000000000..5ff543d10
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq.c
@@ -0,0 +1,106 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function sgesvdq
+* Author: Intel Corporation
+* Generated November 2018
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_sgesvdq( int matrix_layout, char joba, char jobp,
+                           char jobr, char jobu, char jobv,
+                           lapack_int m, lapack_int n, float* a,
+                           lapack_int lda, float* s, float* u, lapack_int ldu,
+                           float* v, lapack_int ldv, lapack_int* numrank)
+{
+    lapack_int info = 0;
+    lapack_int liwork = -1;
+    lapack_int* iwork = NULL;
+    lapack_int iwork_query;
+    lapack_int lwork = -1;
+    float* work = NULL;
+    float work_query;
+    lapack_int lrwork = -1;
+    float* rwork = NULL;
+    float rwork_query;
+    lapack_int i;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_sgesvdq", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) {
+            return -6;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_sgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv,
+                                 m, n, a, lda, s, u, ldu, v, ldv, numrank,
+                                 &iwork_query, liwork, &work_query, lwork,
+                                 &rwork_query, lrwork );
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    liwork = iwork_query;
+    lwork = (lapack_int)work_query;
+    lrwork = (lapack_int)rwork_query;
+    /* Allocate memory for work arrays */
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    work = (float*)LAPACKE_malloc( sizeof(float) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    rwork = (float*)LAPACKE_malloc( sizeof(float) * lrwork );
+    if( rwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_sgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv,
+                                 m, n, a, lda, s, u, ldu, v, ldv, numrank,
+                                 iwork, liwork, work, lwork, rwork, lrwork );
+
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+    LAPACKE_free( work );
+    LAPACKE_free( rwork );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_sgesvdq", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq_work.c
new file mode 100644
index 000000000..9eab982c2
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvdq_work.c
@@ -0,0 +1,148 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function sgesvdq
+* Author: Intel Corporation
+* Generated November 2015
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_sgesvdq_work( int matrix_layout, char joba, char jobp,
+                           char jobr, char jobu, char jobv,
+                           lapack_int m, lapack_int n, float* a,
+                           lapack_int lda, float* s, float* u, lapack_int ldu,
+                           float* v, lapack_int ldv, lapack_int* numrank,
+                           lapack_int* iwork, lapack_int liwork,
+                           float* work, lapack_int lwork,
+                           float* rwork, lapack_int lrwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_sgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda, s, u, &ldu, v, &ldv,
+                       numrank, iwork, &liwork, work, &lwork, rwork, &lrwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int nrows_u = ( LAPACKE_lsame( jobu, 'a' ) ||
+                             LAPACKE_lsame( jobu, 's' ) ) ? m : 1;
+        lapack_int ncols_u = LAPACKE_lsame( jobu, 'a' ) ? m :
+                             (LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1);
+        lapack_int nrows_v = LAPACKE_lsame( jobv, 'a' ) ? n : 1;
+        lapack_int lda_t = MAX(1,m);
+        lapack_int ldu_t = MAX(1,nrows_u);
+        lapack_int ldv_t = MAX(1,nrows_v);
+        float* a_t = NULL;
+        float* u_t = NULL;
+        float* v_t = NULL;
+        /* Check leading dimension(s) */
+        if( lda < n ) {
+            info = -9;
+            LAPACKE_xerbla( "LAPACKE_sgesvdq_work", info );
+            return info;
+        }
+        if( ldu < ncols_u ) {
+            info = -12;
+            LAPACKE_xerbla( "LAPACKE_sgesvdq_work", info );
+            return info;
+        }
+        if( ldv < n ) {
+            info = -14;
+            LAPACKE_xerbla( "LAPACKE_sgesvdq_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lwork == -1 ) {
+            LAPACK_sgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda_t,
+                             s, u, &ldu_t, v, &ldv_t, numrank, iwork, &liwork,
+                             work, &lwork, rwork, &lrwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            u_t = (float*)
+                LAPACKE_malloc( sizeof(float) * ldu_t * MAX(1,ncols_u) );
+            if( u_t == NULL ) {
+                info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+                goto exit_level_1;
+            }
+        }
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            v_t = (float*)
+                LAPACKE_malloc( sizeof(float) * ldv_t * MAX(1,n) );
+            if( v_t == NULL ) {
+                info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+                goto exit_level_2;
+            }
+        }
+        /* Transpose input matrices */
+        LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info */
+            LAPACK_sgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda_t,
+                             s, u, &ldu_t, v, &ldv_t, numrank, iwork, &liwork,
+                             work, &lwork, rwork, &lrwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            LAPACKE_sge_trans( LAPACK_COL_MAJOR, nrows_u, ncols_u, u_t, ldu_t,
+                               u, ldu );
+        }
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            LAPACKE_sge_trans( LAPACK_COL_MAJOR, nrows_v, n, v_t, ldv_t, v,
+                               ldv );
+        }
+        /* Release memory and exit */
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            LAPACKE_free( v_t );
+        }
+exit_level_2:
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            LAPACKE_free( u_t );
+        }
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_sgesvdq_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_sgesvdq_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sggesx.c b/lapack-netlib/LAPACKE/src/lapacke_sggesx.c
index f0acb70a4..d552a2010 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sggesx.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sggesx.c
@@ -82,7 +82,7 @@ lapack_int LAPACKE_sggesx( int matrix_layout, char jobvsl, char jobvsr,
     if( info != 0 ) {
         goto exit_level_1;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c
index e9f84b55c..e1d4c270d 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c
@@ -42,12 +42,10 @@ float LAPACKE_slantr_work( int matrix_layout, char norm, char uplo,
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
         res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a, &lda, work );
-        if( info < 0 ) {
-            info = info - 1;
-        }
     } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
         lapack_int lda_t = MAX(1,m);
         float* a_t = NULL;
+        float* work_lapack = NULL;
         /* Check leading dimension(s) */
         if( lda < n ) {
             info = -8;
@@ -60,12 +58,23 @@ float LAPACKE_slantr_work( int matrix_layout, char norm, char uplo,
             info = LAPACK_TRANSPOSE_MEMORY_ERROR;
             goto exit_level_0;
         }
+        /* Allocate memory for work array(s) */
+        if( LAPACKE_lsame( norm, 'i' ) ) {
+            work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) );
+            if( work_lapack == NULL ) {
+                info = LAPACK_WORK_MEMORY_ERROR;
+                goto exit_level_1;
+            }
+        }
         /* Transpose input matrices */
         LAPACKE_str_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
-        res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work );
-        info = 0;  /* LAPACK call is ok! */
+        res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack );
         /* Release memory and exit */
+        if( work_lapack ) {
+            LAPACKE_free( work_lapack );
+        }
+exit_level_1:
         LAPACKE_free( a_t );
 exit_level_0:
         if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sormhr.c b/lapack-netlib/LAPACKE/src/lapacke_sormhr.c
index a5cca2c45..fba215a19 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sormhr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sormhr.c
@@ -57,7 +57,7 @@ lapack_int LAPACKE_sormhr( int matrix_layout, char side, char trans,
         if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) {
             return -11;
         }
-        if( LAPACKE_s_nancheck( m-1, tau, 1 ) ) {
+        if( LAPACKE_s_nancheck( r-1, tau, 1 ) ) {
             return -10;
         }
     }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssbevd.c b/lapack-netlib/LAPACKE/src/lapacke_ssbevd.c
index 3acdeb95d..b41e5b156 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ssbevd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ssbevd.c
@@ -62,7 +62,7 @@ lapack_int LAPACKE_ssbevd( int matrix_layout, char jobz, char uplo, lapack_int n
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssbevd_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_ssbevd_2stage.c
index 2eda9cde9..a76d92c71 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ssbevd_2stage.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ssbevd_2stage.c
@@ -62,7 +62,7 @@ lapack_int LAPACKE_ssbevd_2stage( int matrix_layout, char jobz, char uplo, lapac
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssbgvd.c b/lapack-netlib/LAPACKE/src/lapacke_ssbgvd.c
index a6c036846..b40ccb9e5 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ssbgvd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ssbgvd.c
@@ -67,7 +67,7 @@ lapack_int LAPACKE_ssbgvd( int matrix_layout, char jobz, char uplo, lapack_int n
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sspevd.c b/lapack-netlib/LAPACKE/src/lapacke_sspevd.c
index bd06a8ba6..9b518751b 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sspevd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sspevd.c
@@ -61,7 +61,7 @@ lapack_int LAPACKE_sspevd( int matrix_layout, char jobz, char uplo, lapack_int n
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sspgvd.c b/lapack-netlib/LAPACKE/src/lapacke_sspgvd.c
index 749abb0b1..e80e24647 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sspgvd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sspgvd.c
@@ -66,7 +66,7 @@ lapack_int LAPACKE_sspgvd( int matrix_layout, lapack_int itype, char jobz,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sstedc.c b/lapack-netlib/LAPACKE/src/lapacke_sstedc.c
index 157874668..f902e8c30 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sstedc.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sstedc.c
@@ -69,7 +69,7 @@ lapack_int LAPACKE_sstedc( int matrix_layout, char compz, lapack_int n, float* d
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sstegr.c b/lapack-netlib/LAPACKE/src/lapacke_sstegr.c
index c6a73b2b4..c02372ba2 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sstegr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sstegr.c
@@ -81,7 +81,7 @@ lapack_int LAPACKE_sstegr( int matrix_layout, char jobz, char range,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sstemr.c b/lapack-netlib/LAPACKE/src/lapacke_sstemr.c
index 4229819ab..65dcc9170 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sstemr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sstemr.c
@@ -74,7 +74,7 @@ lapack_int LAPACKE_sstemr( int matrix_layout, char jobz, char range,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sstevd.c b/lapack-netlib/LAPACKE/src/lapacke_sstevd.c
index 9f9e2e79e..c5db5d79d 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sstevd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sstevd.c
@@ -64,7 +64,7 @@ lapack_int LAPACKE_sstevd( int matrix_layout, char jobz, lapack_int n, float* d,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_sstevr.c b/lapack-netlib/LAPACKE/src/lapacke_sstevr.c
index f45c49087..4043e3090 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_sstevr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_sstevr.c
@@ -81,7 +81,7 @@ lapack_int LAPACKE_sstevr( int matrix_layout, char jobz, char range,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c
index fb8c8971b..6a2f8fce3 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c
@@ -65,14 +65,14 @@ lapack_int LAPACKE_ssyev_work( int matrix_layout, char jobz, char uplo,
             goto exit_level_0;
         }
         /* Transpose input matrices */
-        LAPACKE_sge_trans( matrix_layout, n, n, a, lda, a_t, lda_t );
+        LAPACKE_ssy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
         LAPACK_ssyev( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, &info );
         if( info < 0 ) {
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd.c
index 1995e7950..f5924bd94 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd.c
@@ -50,7 +50,7 @@ lapack_int LAPACKE_ssyevd( int matrix_layout, char jobz, char uplo, lapack_int n
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
-        if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) {
             return -5;
         }
     }
@@ -61,7 +61,7 @@ lapack_int LAPACKE_ssyevd( int matrix_layout, char jobz, char uplo, lapack_int n
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage.c
index 6d6785acc..40ef1bcc2 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage.c
@@ -50,7 +50,7 @@ lapack_int LAPACKE_ssyevd_2stage( int matrix_layout, char jobz, char uplo, lapac
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
-        if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) {
             return -5;
         }
     }
@@ -61,7 +61,7 @@ lapack_int LAPACKE_ssyevd_2stage( int matrix_layout, char jobz, char uplo, lapac
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c
index 5942a9abb..9394f822f 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c
@@ -68,7 +68,7 @@ lapack_int LAPACKE_ssyevd_2stage_work( int matrix_layout, char jobz, char uplo,
             goto exit_level_0;
         }
         /* Transpose input matrices */
-        LAPACKE_sge_trans( matrix_layout, n, n, a, lda, a_t, lda_t );
+        LAPACKE_ssy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
         LAPACK_ssyevd_2stage( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, iwork,
                        &liwork, &info );
@@ -76,7 +76,7 @@ lapack_int LAPACKE_ssyevd_2stage_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c
index 7b2e19adc..12d9e84e6 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c
@@ -68,7 +68,7 @@ lapack_int LAPACKE_ssyevd_work( int matrix_layout, char jobz, char uplo,
             goto exit_level_0;
         }
         /* Transpose input matrices */
-        LAPACKE_sge_trans( matrix_layout, n, n, a, lda, a_t, lda_t );
+        LAPACKE_ssy_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
         LAPACK_ssyevd( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, iwork,
                        &liwork, &info );
@@ -76,7 +76,7 @@ lapack_int LAPACKE_ssyevd_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevr.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevr.c
index d7e050143..3274f6bab 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ssyevr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevr.c
@@ -78,7 +78,7 @@ lapack_int LAPACKE_ssyevr( int matrix_layout, char jobz, char range, char uplo,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevr_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevr_2stage.c
index cbc3014e9..8958be31d 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ssyevr_2stage.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevr_2stage.c
@@ -78,7 +78,7 @@ lapack_int LAPACKE_ssyevr_2stage( int matrix_layout, char jobz, char range, char
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c b/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c
index 2a1c62aef..5afe8d2de 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c
@@ -66,7 +66,7 @@ lapack_int LAPACKE_ssygvd( int matrix_layout, lapack_int itype, char jobz,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c
index a95a71469..19f447cd8 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c
@@ -34,7 +34,7 @@
 #include "lapacke_utils.h"
 
 lapack_int LAPACKE_ssytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, const float* a, lapack_int lda,
+                            lapack_int nrhs, float* a, lapack_int lda,
                             const lapack_int* ipiv, float* b, lapack_int ldb )
 {
     lapack_int info = 0;
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c
index cf98f443d..7d348b382 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c
@@ -34,7 +34,7 @@
 #include "lapacke_utils.h"
 
 lapack_int LAPACKE_ssytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, const float* a,
+                                 lapack_int nrhs, float* a,
                                  lapack_int lda, const lapack_int* ipiv,
                                  float* b, lapack_int ldb, float* work )
 {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_stgsen.c b/lapack-netlib/LAPACKE/src/lapacke_stgsen.c
index 5464fd22b..d0250eb63 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_stgsen.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_stgsen.c
@@ -81,7 +81,7 @@ lapack_int LAPACKE_stgsen( int matrix_layout, lapack_int ijob,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     if( ijob != 0 ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_stprfb.c b/lapack-netlib/LAPACKE/src/lapacke_stprfb.c
index 846d4ccb3..2ea20f08d 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_stprfb.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_stprfb.c
@@ -39,7 +39,7 @@ lapack_int LAPACKE_stprfb( int matrix_layout, char side, char trans, char direct
                            lapack_int ldv, const float* t, lapack_int ldt,
                            float* a, lapack_int lda, float* b, lapack_int ldb)
 {
-    lapack_int ncols_v, nrows_v;
+    lapack_int ncols_v, nrows_v, ncols_a, nrows_a;
     lapack_int info = 0;
     lapack_int ldwork;
     lapack_int work_size;
@@ -50,20 +50,33 @@ lapack_int LAPACKE_stprfb( int matrix_layout, char side, char trans, char direct
     }
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
-        /* Optionally check input matrices for NaNs */
+        /* Optionally check input matrices for NaNs
+         * V is m-by-k (left,  columnwise)
+         *   or n-by-k (right, columnwise)
+         *   or k-by-m (left,  rowwise)
+         *   or k-by-n (right, rowwise)
+         * T is k-by-k
+         * A is k-by-n (left)
+         *   or m-by-k (right)
+         * B is m-by-n
+         */
         if( LAPACKE_lsame( storev, 'C' ) ) {
             ncols_v = k;
             nrows_v = LAPACKE_lsame( side, 'L' ) ? m :
-                                 ( LAPACKE_lsame( side, 'R' ) ? n : 0 );
+                      LAPACKE_lsame( side, 'R' ) ? n : 0;
         } else if( LAPACKE_lsame( storev, 'R' ) ) {
             ncols_v = LAPACKE_lsame( side, 'L' ) ? m :
-                                 ( LAPACKE_lsame( side, 'R' ) ? n : 0 );
+                      LAPACKE_lsame( side, 'R' ) ? n : 0;
             nrows_v = k;
         } else {
             ncols_v = 0;
             nrows_v = 0;
         }
-        if( LAPACKE_sge_nancheck( matrix_layout, k, m, a, lda ) ) {
+        nrows_a = LAPACKE_lsame( side, 'L' ) ? k :
+                  LAPACKE_lsame( side, 'R' ) ? m : 0;
+        ncols_a = LAPACKE_lsame( side, 'L' ) ? n :
+                  LAPACKE_lsame( side, 'R' ) ? k : 0;
+        if( LAPACKE_sge_nancheck( matrix_layout, ncols_a, nrows_a, a, lda ) ) {
             return -14;
         }
         if( LAPACKE_sge_nancheck( matrix_layout, m, n, b, ldb ) ) {
@@ -78,14 +91,14 @@ lapack_int LAPACKE_stprfb( int matrix_layout, char side, char trans, char direct
     }
 #endif
     if (side=='l' ||  side=='L') {
-       ldwork = k;
-       work_size = MAX(1,ldwork) * MAX(1,n);
-       }
+        ldwork = k;
+        work_size = MAX(1,ldwork) * MAX(1,n);
+    }
     else {
-       ldwork = m;
-       work_size = MAX(1,ldwork) * MAX(1,k);
-       }
-        /* Allocate memory for working array(s) */
+        ldwork = m;
+        work_size = MAX(1,ldwork) * MAX(1,k);
+    }
+    /* Allocate memory for working array(s) */
     work = (float*)
         LAPACKE_malloc( sizeof(float) * work_size );
     if( work == NULL ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_strsen.c b/lapack-netlib/LAPACKE/src/lapacke_strsen.c
index efba91af8..0ec3ee907 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_strsen.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_strsen.c
@@ -69,7 +69,7 @@ lapack_int LAPACKE_strsen( int matrix_layout, char job, char compq,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     if( LAPACKE_lsame( job, 'b' ) || LAPACKE_lsame( job, 'v' ) ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgejsv.c b/lapack-netlib/LAPACKE/src/lapacke_zgejsv.c
index f3b5110a7..153efb371 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zgejsv.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgejsv.c
@@ -124,7 +124,6 @@ lapack_int LAPACKE_zgejsv( int matrix_layout, char joba, char jobu, char jobv,
     double* rwork = NULL;
     lapack_complex_double* cwork = NULL;
     lapack_int i;
-    lapack_int nu, nv;
     if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
         LAPACKE_xerbla( "LAPACKE_zgejsv", -1 );
         return -1;
@@ -132,8 +131,6 @@ lapack_int LAPACKE_zgejsv( int matrix_layout, char joba, char jobu, char jobv,
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
-        nu = LAPACKE_lsame( jobu, 'n' ) ? 1 : m;
-        nv = LAPACKE_lsame( jobv, 'n' ) ? 1 : n;
         if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) {
             return -10;
         }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgelsd.c b/lapack-netlib/LAPACKE/src/lapacke_zgelsd.c
index 6d111c69f..eca145090 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zgelsd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgelsd.c
@@ -75,7 +75,7 @@ lapack_int LAPACKE_zgelsd( int matrix_layout, lapack_int m, lapack_int n,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_Z2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c
new file mode 100644
index 000000000..f58a5c4e9
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq.c
@@ -0,0 +1,106 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function zgesvdq
+* Author: Intel Corporation
+* Generated November 2018
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_zgesvdq( int matrix_layout, char joba, char jobp,
+                           char jobr, char jobu, char jobv,
+                           lapack_int m, lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, double* s, lapack_complex_double* u, lapack_int ldu,
+                           lapack_complex_double* v, lapack_int ldv, lapack_int* numrank)
+{
+    lapack_int info = 0;
+    lapack_int liwork = -1;
+    lapack_int* iwork = NULL;
+    lapack_int iwork_query;
+    lapack_int lcwork = -1;
+    lapack_complex_double* cwork = NULL;
+    lapack_complex_double cwork_query;
+    lapack_int lrwork = -1;
+    double* rwork = NULL;
+    double rwork_query;
+    lapack_int i;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_zgesvdq", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) {
+            return -6;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_zgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv,
+                                 m, n, a, lda, s, u, ldu, v, ldv, numrank,
+                                 &iwork_query, liwork, &cwork_query, lcwork,
+                                 &rwork_query, lrwork );
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    liwork = iwork_query;
+    lcwork = LAPACK_C2INT(cwork_query);
+    lrwork = (lapack_int)rwork_query;
+    /* Allocate memory for work arrays */
+    iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
+    if( iwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    cwork = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lcwork );
+    if( cwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    rwork = (double*)LAPACKE_malloc( sizeof(double) * lrwork );
+    if( rwork == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_zgesvdq_work( matrix_layout, joba, jobp, jobr, jobu, jobv,
+                                 m, n, a, lda, s, u, ldu, v, ldv, numrank,
+                                 iwork, liwork, cwork, lcwork, rwork, lrwork );
+
+    /* Release memory and exit */
+    LAPACKE_free( iwork );
+    LAPACKE_free( cwork );
+    LAPACKE_free( rwork );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_zgesvdq", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvdq_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq_work.c
new file mode 100644
index 000000000..5824de4e0
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvdq_work.c
@@ -0,0 +1,149 @@
+/*****************************************************************************
+  Copyright (c) 2014, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native middle-level C interface to LAPACK function zgesvdq
+* Author: Intel Corporation
+* Generated November 2015
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_zgesvdq_work( int matrix_layout, char joba, char jobp,
+                           char jobr, char jobu, char jobv,
+                           lapack_int m, lapack_int n, lapack_complex_double* a,
+                           lapack_int lda, double* s, lapack_complex_double* u, lapack_int ldu,
+                           lapack_complex_double* v, lapack_int ldv, lapack_int* numrank,
+                           lapack_int* iwork, lapack_int liwork,
+                           lapack_complex_double* cwork, lapack_int lcwork,
+                           double* rwork, lapack_int lrwork )
+{
+    lapack_int info = 0;
+    if( matrix_layout == LAPACK_COL_MAJOR ) {
+        /* Call LAPACK function and adjust info */
+        LAPACK_zgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda, s, u, &ldu, v, &ldv,
+                       numrank, iwork, &liwork, cwork, &lcwork, rwork, &lrwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+    } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
+        lapack_int nrows_u = ( LAPACKE_lsame( jobu, 'a' ) ||
+                             LAPACKE_lsame( jobu, 's' ) ) ? m : 1;
+        lapack_int ncols_u = LAPACKE_lsame( jobu, 'a' ) ? m :
+                             (LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1);
+        lapack_int nrows_v = LAPACKE_lsame( jobv, 'a' ) ? n :
+                              ( LAPACKE_lsame( jobv, 's' ) ? MIN(m,n) : 1);
+        lapack_int lda_t = MAX(1,m);
+        lapack_int ldu_t = MAX(1,nrows_u);
+        lapack_int ldv_t = MAX(1,nrows_v);
+        lapack_complex_double* a_t = NULL;
+        lapack_complex_double* u_t = NULL;
+        lapack_complex_double* v_t = NULL;
+        /* Check leading dimension(s) */
+        if( lda < n ) {
+            info = -9;
+            LAPACKE_xerbla( "LAPACKE_zgesvdq_work", info );
+            return info;
+        }
+        if( ldu < ncols_u ) {
+            info = -12;
+            LAPACKE_xerbla( "LAPACKE_zgesvdq_work", info );
+            return info;
+        }
+        if( ldv < n ) {
+            info = -14;
+            LAPACKE_xerbla( "LAPACKE_zgesvdq_work", info );
+            return info;
+        }
+        /* Query optimal working array(s) size if requested */
+        if( lcwork == -1 ) {
+            LAPACK_zgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda_t,
+                             s, u, &ldu_t, v, &ldv_t, numrank, iwork, &liwork,
+                             cwork, &lcwork, rwork, &lrwork, &info );
+            return (info < 0) ? (info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (lapack_complex_double*)LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            u_t = (lapack_complex_double*)
+                LAPACKE_malloc( sizeof(lapack_complex_double) * ldu_t * MAX(1,ncols_u) );
+            if( u_t == NULL ) {
+                info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+                goto exit_level_1;
+            }
+        }
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            v_t = (lapack_complex_double*)
+                LAPACKE_malloc( sizeof(lapack_complex_double) * ldv_t * MAX(1,n) );
+            if( v_t == NULL ) {
+                info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+                goto exit_level_2;
+            }
+        }
+        /* Transpose input matrices */
+        LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        /* Call LAPACK function and adjust info */
+            LAPACK_zgesvdq( &joba, &jobp, &jobr, &jobu, &jobv, &m, &n, a, &lda_t,
+                             s, u, &ldu_t, v, &ldv_t, numrank, iwork, &liwork,
+                             cwork, &lcwork, rwork, &lrwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            LAPACKE_zge_trans( LAPACK_COL_MAJOR, nrows_u, ncols_u, u_t, ldu_t,
+                               u, ldu );
+        }
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            LAPACKE_zge_trans( LAPACK_COL_MAJOR, nrows_v, n, v_t, ldv_t, v,
+                               ldv );
+        }
+        /* Release memory and exit */
+        if( LAPACKE_lsame( jobv, 'a' ) || LAPACKE_lsame( jobv, 's' ) ) {
+            LAPACKE_free( v_t );
+        }
+exit_level_2:
+        if( LAPACKE_lsame( jobu, 'a' ) || LAPACKE_lsame( jobu, 's' ) ) {
+            LAPACKE_free( u_t );
+        }
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_zgesvdq_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_zgesvdq_work", info );
+    }
+    return info;
+}
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zggesx.c b/lapack-netlib/LAPACKE/src/lapacke_zggesx.c
index 6b4d27045..53e086753 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zggesx.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zggesx.c
@@ -91,7 +91,7 @@ lapack_int LAPACKE_zggesx( int matrix_layout, char jobvsl, char jobvsr,
     if( info != 0 ) {
         goto exit_level_2;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = LAPACK_Z2INT( work_query );
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhbevd.c b/lapack-netlib/LAPACKE/src/lapacke_zhbevd.c
index 95c6d3a54..ac9467496 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zhbevd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zhbevd.c
@@ -67,7 +67,7 @@ lapack_int LAPACKE_zhbevd( int matrix_layout, char jobz, char uplo, lapack_int n
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_Z2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhbevd_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_zhbevd_2stage.c
index eca867b28..9b6005b2d 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zhbevd_2stage.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zhbevd_2stage.c
@@ -67,7 +67,7 @@ lapack_int LAPACKE_zhbevd_2stage( int matrix_layout, char jobz, char uplo, lapac
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_Z2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhbgvd.c b/lapack-netlib/LAPACKE/src/lapacke_zhbgvd.c
index 91bfc0a73..76c3bac3a 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zhbgvd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zhbgvd.c
@@ -71,7 +71,7 @@ lapack_int LAPACKE_zhbgvd( int matrix_layout, char jobz, char uplo, lapack_int n
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_Z2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c
index 32b4a76f0..ce278b272 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c
@@ -70,7 +70,7 @@ lapack_int LAPACKE_zheev_work( int matrix_layout, char jobz, char uplo,
             goto exit_level_0;
         }
         /* Transpose input matrices */
-        LAPACKE_zge_trans( matrix_layout, n, n, a, lda, a_t, lda_t );
+        LAPACKE_zhe_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
         LAPACK_zheev( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, rwork,
                       &info );
@@ -78,7 +78,7 @@ lapack_int LAPACKE_zheev_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd.c
index 4b1afb95c..1305ebfb3 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zheevd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd.c
@@ -53,7 +53,7 @@ lapack_int LAPACKE_zheevd( int matrix_layout, char jobz, char uplo, lapack_int n
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
-        if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) {
             return -5;
         }
     }
@@ -65,7 +65,7 @@ lapack_int LAPACKE_zheevd( int matrix_layout, char jobz, char uplo, lapack_int n
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_Z2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage.c
index 9016da54c..63f139435 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage.c
@@ -53,7 +53,7 @@ lapack_int LAPACKE_zheevd_2stage( int matrix_layout, char jobz, char uplo, lapac
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
-        if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) {
             return -5;
         }
     }
@@ -65,7 +65,7 @@ lapack_int LAPACKE_zheevd_2stage( int matrix_layout, char jobz, char uplo, lapac
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_Z2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c
index d4b648ee1..bf2e2c828 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c
@@ -71,7 +71,7 @@ lapack_int LAPACKE_zheevd_2stage_work( int matrix_layout, char jobz, char uplo,
             goto exit_level_0;
         }
         /* Transpose input matrices */
-        LAPACKE_zge_trans( matrix_layout, n, n, a, lda, a_t, lda_t );
+        LAPACKE_zhe_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
         LAPACK_zheevd_2stage( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, rwork,
                        &lrwork, iwork, &liwork, &info );
@@ -79,7 +79,7 @@ lapack_int LAPACKE_zheevd_2stage_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c
index 9672e6a22..f09cfe49d 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c
@@ -71,7 +71,7 @@ lapack_int LAPACKE_zheevd_work( int matrix_layout, char jobz, char uplo,
             goto exit_level_0;
         }
         /* Transpose input matrices */
-        LAPACKE_zge_trans( matrix_layout, n, n, a, lda, a_t, lda_t );
+        LAPACKE_zhe_trans( matrix_layout, uplo, n, a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
         LAPACK_zheevd( &jobz, &uplo, &n, a_t, &lda_t, w, work, &lwork, rwork,
                        &lrwork, iwork, &liwork, &info );
@@ -79,7 +79,7 @@ lapack_int LAPACKE_zheevd_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevr.c b/lapack-netlib/LAPACKE/src/lapacke_zheevr.c
index 52e7a5bee..0d26dc2f9 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zheevr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zheevr.c
@@ -83,7 +83,7 @@ lapack_int LAPACKE_zheevr( int matrix_layout, char jobz, char range, char uplo,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_Z2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevr_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_zheevr_2stage.c
index faf949aef..6fa69c44b 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zheevr_2stage.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zheevr_2stage.c
@@ -83,7 +83,7 @@ lapack_int LAPACKE_zheevr_2stage( int matrix_layout, char jobz, char range, char
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_Z2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegst.c b/lapack-netlib/LAPACKE/src/lapacke_zhegst.c
index aa2d84d84..8c4a5c374 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zhegst.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zhegst.c
@@ -35,7 +35,7 @@
 
 lapack_int LAPACKE_zhegst( int matrix_layout, lapack_int itype, char uplo,
                            lapack_int n, lapack_complex_double* a,
-                           lapack_int lda, const lapack_complex_double* b,
+                           lapack_int lda, lapack_complex_double* b,
                            lapack_int ldb )
 {
     if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c b/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c
index f77894204..62fce1f27 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c
@@ -35,7 +35,7 @@
 
 lapack_int LAPACKE_zhegst_work( int matrix_layout, lapack_int itype, char uplo,
                                 lapack_int n, lapack_complex_double* a,
-                                lapack_int lda, const lapack_complex_double* b,
+                                lapack_int lda, lapack_complex_double* b,
                                 lapack_int ldb )
 {
     lapack_int info = 0;
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c b/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c
index 81c3d29b4..1242a0eda 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c
@@ -70,7 +70,7 @@ lapack_int LAPACKE_zhegvd( int matrix_layout, lapack_int itype, char jobz,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_Z2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhpevd.c b/lapack-netlib/LAPACKE/src/lapacke_zhpevd.c
index 948bb9c10..a470ca3bb 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zhpevd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zhpevd.c
@@ -66,7 +66,7 @@ lapack_int LAPACKE_zhpevd( int matrix_layout, char jobz, char uplo, lapack_int n
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_Z2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhpgvd.c b/lapack-netlib/LAPACKE/src/lapacke_zhpgvd.c
index be18d3313..91fa26443 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zhpgvd.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zhpgvd.c
@@ -70,7 +70,7 @@ lapack_int LAPACKE_zhpgvd( int matrix_layout, lapack_int itype, char jobz,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_Z2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c
index 0d8bcf550..e62f8a4e3 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c
@@ -43,12 +43,10 @@ double LAPACKE_zlantr_work( int matrix_layout, char norm, char uplo,
     if( matrix_layout == LAPACK_COL_MAJOR ) {
         /* Call LAPACK function and adjust info */
         res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work );
-        if( info < 0 ) {
-            info = info - 1;
-        }
     } else if( matrix_layout == LAPACK_ROW_MAJOR ) {
         lapack_int lda_t = MAX(1,m);
         lapack_complex_double* a_t = NULL;
+        double* work_lapack = NULL;
         /* Check leading dimension(s) */
         if( lda < n ) {
             info = -8;
@@ -62,12 +60,23 @@ double LAPACKE_zlantr_work( int matrix_layout, char norm, char uplo,
             info = LAPACK_TRANSPOSE_MEMORY_ERROR;
             goto exit_level_0;
         }
+        /* Allocate memory for work array(s) */
+        if( LAPACKE_lsame( norm, 'i' ) ) {
+            work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) );
+            if( work_lapack == NULL ) {
+                info = LAPACK_WORK_MEMORY_ERROR;
+                goto exit_level_1;
+            }
+        }
         /* Transpose input matrices */
         LAPACKE_ztr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t );
         /* Call LAPACK function and adjust info */
-        res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work );
-        info = 0;  /* LAPACK call is ok! */
+        res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack );
         /* Release memory and exit */
+        if( work_lapack ) {
+            LAPACKE_free( work_lapack );
+        }
+exit_level_1:
         LAPACKE_free( a_t );
 exit_level_0:
         if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zstedc.c b/lapack-netlib/LAPACKE/src/lapacke_zstedc.c
index 1bd7274c1..665c4414f 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zstedc.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zstedc.c
@@ -74,7 +74,7 @@ lapack_int LAPACKE_zstedc( int matrix_layout, char compz, lapack_int n,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lrwork = (lapack_int)rwork_query;
     lwork = LAPACK_Z2INT( work_query );
     /* Allocate memory for work arrays */
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zstegr.c b/lapack-netlib/LAPACKE/src/lapacke_zstegr.c
index 2a65dcc4d..07b5ce81d 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zstegr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zstegr.c
@@ -82,7 +82,7 @@ lapack_int LAPACKE_zstegr( int matrix_layout, char jobz, char range,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zstemr.c b/lapack-netlib/LAPACKE/src/lapacke_zstemr.c
index c1144488e..d1d1d5692 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zstemr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zstemr.c
@@ -75,7 +75,7 @@ lapack_int LAPACKE_zstemr( int matrix_layout, char jobz, char range,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = (lapack_int)work_query;
     /* Allocate memory for work arrays */
     iwork = (lapack_int*)LAPACKE_malloc( sizeof(lapack_int) * liwork );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c
index 3c85f9796..7442702aa 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c
@@ -34,7 +34,7 @@
 #include "lapacke_utils.h"
 
 lapack_int LAPACKE_zsytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, const lapack_complex_double* a,
+                            lapack_int nrhs, lapack_complex_double* a,
                             lapack_int lda, const lapack_int* ipiv,
                             lapack_complex_double* b, lapack_int ldb )
 {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c
index cdc97fa02..ec05ce6d5 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c
@@ -35,7 +35,7 @@
 
 lapack_int LAPACKE_zsytrs2_work( int matrix_layout, char uplo, lapack_int n,
                                  lapack_int nrhs,
-                                 const lapack_complex_double* a, lapack_int lda,
+                                 lapack_complex_double* a, lapack_int lda,
                                  const lapack_int* ipiv,
                                  lapack_complex_double* b, lapack_int ldb,
                                  lapack_complex_double* work )
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztgsen.c b/lapack-netlib/LAPACKE/src/lapacke_ztgsen.c
index 60f48ba8f..f6f58becd 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ztgsen.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ztgsen.c
@@ -84,7 +84,7 @@ lapack_int LAPACKE_ztgsen( int matrix_layout, lapack_int ijob,
     if( info != 0 ) {
         goto exit_level_0;
     }
-    liwork = (lapack_int)iwork_query;
+    liwork = iwork_query;
     lwork = LAPACK_Z2INT( work_query );
     /* Allocate memory for work arrays */
     if( ijob != 0 ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztprfb.c b/lapack-netlib/LAPACKE/src/lapacke_ztprfb.c
index fce801762..7a791c0d4 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ztprfb.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ztprfb.c
@@ -41,7 +41,7 @@ lapack_int LAPACKE_ztprfb( int matrix_layout, char side, char trans, char direct
                            lapack_complex_double* a, lapack_int lda,
                            lapack_complex_double* b, lapack_int ldb)
 {
-    lapack_int ncols_v, nrows_v;
+    lapack_int ncols_v, nrows_v, ncols_a, nrows_a;
     lapack_int info = 0;
     lapack_int ldwork;
     lapack_int work_size;
@@ -52,20 +52,33 @@ lapack_int LAPACKE_ztprfb( int matrix_layout, char side, char trans, char direct
     }
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
-        /* Optionally check input matrices for NaNs */
+        /* Optionally check input matrices for NaNs
+         * V is m-by-k (left,  columnwise)
+         *   or n-by-k (right, columnwise)
+         *   or k-by-m (left,  rowwise)
+         *   or k-by-n (right, rowwise)
+         * T is k-by-k
+         * A is k-by-n (left)
+         *   or m-by-k (right)
+         * B is m-by-n
+         */
         if( LAPACKE_lsame( storev, 'C' ) ) {
             ncols_v = k;
             nrows_v = LAPACKE_lsame( side, 'L' ) ? m :
-                                 ( LAPACKE_lsame( side, 'R' ) ? n : 0 );
+                      LAPACKE_lsame( side, 'R' ) ? n : 0;
         } else if( LAPACKE_lsame( storev, 'R' ) ) {
             ncols_v = LAPACKE_lsame( side, 'L' ) ? m :
-                                 ( LAPACKE_lsame( side, 'R' ) ? n : 0 );
+                      LAPACKE_lsame( side, 'R' ) ? n : 0;
             nrows_v = k;
         } else {
             ncols_v = 0;
             nrows_v = 0;
         }
-        if( LAPACKE_zge_nancheck( matrix_layout, k, m, a, lda ) ) {
+        nrows_a = LAPACKE_lsame( side, 'L' ) ? k :
+                  LAPACKE_lsame( side, 'R' ) ? m : 0;
+        ncols_a = LAPACKE_lsame( side, 'L' ) ? n :
+                  LAPACKE_lsame( side, 'R' ) ? k : 0;
+        if( LAPACKE_zge_nancheck( matrix_layout, ncols_a, nrows_a, a, lda ) ) {
             return -14;
         }
         if( LAPACKE_zge_nancheck( matrix_layout, m, n, b, ldb ) ) {
@@ -80,17 +93,16 @@ lapack_int LAPACKE_ztprfb( int matrix_layout, char side, char trans, char direct
     }
 #endif
     if (side=='l' ||  side=='L') {
-       ldwork = k;
-       work_size = MAX(1,ldwork) * MAX(1,n);
-       }
+        ldwork = k;
+        work_size = MAX(1,ldwork) * MAX(1,n);
+    }
     else {
-       ldwork = m;
-       work_size = MAX(1,ldwork) * MAX(1,k);
-       }
-
+        ldwork = m;
+        work_size = MAX(1,ldwork) * MAX(1,k);
+    }
     /* Allocate memory for working array(s) */
     work = (lapack_complex_double*)
-    LAPACKE_malloc( sizeof(lapack_complex_double) * work_size );
+        LAPACKE_malloc( sizeof(lapack_complex_double) * work_size );
     if( work == NULL ) {
         info = LAPACK_WORK_MEMORY_ERROR;
         goto exit_level_0;
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zunmhr.c b/lapack-netlib/LAPACKE/src/lapacke_zunmhr.c
index 357d71184..61ed6f6f2 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zunmhr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zunmhr.c
@@ -58,7 +58,7 @@ lapack_int LAPACKE_zunmhr( int matrix_layout, char side, char trans,
         if( LAPACKE_zge_nancheck( matrix_layout, m, n, c, ldc ) ) {
             return -11;
         }
-        if( LAPACKE_z_nancheck( m-1, tau, 1 ) ) {
+        if( LAPACKE_z_nancheck( r-1, tau, 1 ) ) {
             return -10;
         }
     }
diff --git a/lapack-netlib/LAPACKE/utils/Makefile b/lapack-netlib/LAPACKE/utils/Makefile
index 1f639c6ea..648a8c141 100644
--- a/lapack-netlib/LAPACKE/utils/Makefile
+++ b/lapack-netlib/LAPACKE/utils/Makefile
@@ -32,7 +32,12 @@
 ##############################################################################
 # makefile for LAPACKE, used to build lapacke binary.
 #
-include ../../make.inc
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc
+
+.SUFFIXES: .c .o
+.c.o:
+	$(CC) $(CFLAGS) -I../include -c -o $@ $<
 
 OBJ = lapacke_cgb_nancheck.o \
       lapacke_cgb_trans.o \
@@ -183,15 +188,15 @@ OBJ = lapacke_cgb_nancheck.o \
       lapacke_make_complex_float.o \
       lapacke_make_complex_double.o
 
+.PHONY: all
 all: lib
 
+.PHONY: lib
 lib: $(OBJ)
-	$(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $^
-	$(RANLIB) ../../$(LAPACKELIB)
+	$(AR) $(ARFLAGS) $(LAPACKELIB) $^
+	$(RANLIB) $(LAPACKELIB)
 
+.PHONY: clean cleanobj
 clean: cleanobj
 cleanobj:
 	rm -f *.o
-
-.c.o:
-	$(CC) $(CFLAGS) -I../include -c -o $@ $<
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_chp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_chp_nancheck.c
index 5e51e237c..0a7e6a2e2 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_chp_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_chp_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout.
  */
 
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_cpf_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_cpf_nancheck.c
index a1f14fd69..5e058418e 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_cpf_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_cpf_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo, transr or
  * matrix_layout.
  */
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_cpp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_cpp_nancheck.c
index fc00ce2df..23174d68b 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_cpp_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_cpp_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout.
  */
 
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_csp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_csp_nancheck.c
index 56d53c74b..d1a8aa290 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_csp_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_csp_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout.
  */
 
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ctp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_ctp_nancheck.c
index 97d1ab083..35c48a409 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_ctp_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_ctp_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout.
  */
 
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dpf_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_dpf_nancheck.c
index 69c4cfdb4..df95f1318 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_dpf_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_dpf_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo, transr or
  * matrix_layout.
  */
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dpp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_dpp_nancheck.c
index 214496710..0ba66f96c 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_dpp_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_dpp_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout.
  */
 
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dsp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_dsp_nancheck.c
index 2eada7c99..69d24611c 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_dsp_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_dsp_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout.
  */
 
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_dtp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_dtp_nancheck.c
index 29666e273..43f33bdd2 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_dtp_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_dtp_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout.
  */
 
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_spf_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_spf_nancheck.c
index 0e5b4659f..20666c4d6 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_spf_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_spf_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo, transr or
  * matrix_layout.
  */
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_spp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_spp_nancheck.c
index eae73fa5c..c1098de70 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_spp_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_spp_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout.
  */
 
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ssp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_ssp_nancheck.c
index 447724b01..35ffe6522 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_ssp_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_ssp_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout.
  */
 
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_stp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_stp_nancheck.c
index 2932d4040..4dfef0200 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_stp_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_stp_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout.
  */
 
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_zhp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_zhp_nancheck.c
index 694e1310e..bcf331fe1 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_zhp_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_zhp_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout.
  */
 
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_zpf_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_zpf_nancheck.c
index a0682290b..c510b1d1a 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_zpf_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_zpf_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo, transr or
  * matrix_layout.
  */
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_zpp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_zpp_nancheck.c
index 141a796aa..450878bcf 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_zpp_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_zpp_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout.
  */
 
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_zsp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_zsp_nancheck.c
index d1a88641c..2d7795166 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_zsp_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_zsp_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout.
  */
 
diff --git a/lapack-netlib/LAPACKE/utils/lapacke_ztp_nancheck.c b/lapack-netlib/LAPACKE/utils/lapacke_ztp_nancheck.c
index 8e1eec971..d3a06c381 100644
--- a/lapack-netlib/LAPACKE/utils/lapacke_ztp_nancheck.c
+++ b/lapack-netlib/LAPACKE/utils/lapacke_ztp_nancheck.c
@@ -33,7 +33,7 @@
 #include "lapacke_utils.h"
 
 /* Check a matrix for NaN entries.
- * Since matrix in packed format stored continiously it just required to
+ * Since matrix in packed format stored continuously it just required to
  * check 1d array for NaNs. It doesn't depend upon uplo or matrix_layout.
  */
 
diff --git a/lapack-netlib/Makefile b/lapack-netlib/Makefile
index 1d7e82c34..d5e75b69e 100644
--- a/lapack-netlib/Makefile
+++ b/lapack-netlib/Makefile
@@ -4,89 +4,120 @@
 #  April 2012
 #
 
-include make.inc
+TOPSRCDIR = .
+include $(TOPSRCDIR)/make.inc
 
+.PHONY: all
 all: lapack_install lib blas_testing lapack_testing
 
+.PHONY: lib
 lib: lapacklib tmglib
 #lib: blaslib variants lapacklib tmglib
 
+.PHONY: blaslib
 blaslib:
 	$(MAKE) -C BLAS
 
+.PHONY: cblaslib
 cblaslib:
 	$(MAKE) -C CBLAS
 
+.PHONY: lapacklib
 lapacklib:
 	$(MAKE) -C SRC
 
+.PHONY: lapackelib
 lapackelib:
 	$(MAKE) -C LAPACKE
 
+.PHONY: blaspplib
+blaspplib:
+	@echo "Thank you for your interest in BLAS++, a newly developed C++ API for BLAS library"
+	@echo "The objective of BLAS++ is to provide a convenient, performance oriented API for development in the C++ language, that, for the most part, preserves established conventions, while, at the same time, takes advantages of modern C++ features, such as: namespaces, templates, exceptions, etc."
+	@echo "We are still working on integrating BLAS++ in our library. For the moment, you can download directly blas++ from https://bitbucket.org/icl/blaspp"
+	@echo "For support BLAS++ related question, please email: slate-user@icl.utk.edu"
+
+.PHONY: lapackpplib
+lapackpplib:
+	@echo "Thank you for your interest in LAPACK++, a newly developed C++ API for LAPACK library"
+	@echo "The objective of LAPACK++ is to provide a convenient, performance oriented API for development in the C++ language, that, for the most part, preserves established conventions, while, at the same time, takes advantages of modern C++ features, such as: namespaces, templates, exceptions, etc."
+	@echo "We are still working on integrating LAPACK++ in our library. For the moment, you can download directly lapack++ from https://bitbucket.org/icl/lapackpp"
+	@echo "For support LAPACK++ related question, please email: slate-user@icl.utk.edu"
+
+.PHONY: tmglib
 tmglib:
 	$(MAKE) -C TESTING/MATGEN
 
+.PHONY: variants
 variants:
 	$(MAKE) -C SRC/VARIANTS
 
+.PHONY: lapack_install
 lapack_install:
 	$(MAKE) -C INSTALL run
 
+.PHONY: blas_testing
 blas_testing: blaslib
 	$(MAKE) -C BLAS blas_testing
 
+.PHONY: cblas_testing
 cblas_testing: cblaslib blaslib
 	$(MAKE) -C CBLAS cblas_testing
 
+.PHONY: lapack_testing
 lapack_testing: tmglib lapacklib blaslib
 	$(MAKE) -C TESTING/LIN cleanexe
 	$(MAKE) -C TESTING
 	./lapack_testing.py
 
+.PHONY: variants_testing
 variants_testing: tmglib variants lapacklib blaslib
 	$(MAKE) -C TESTING/LIN cleanexe
-	$(MAKE) -C TESTING/LIN VARLIB='SRC/VARIANTS/cholrl.a'
+	$(MAKE) -C TESTING/LIN VARLIB='../../SRC/VARIANTS/cholrl.a'
 	$(MAKE) -C TESTING stest.out && mv TESTING/stest.out TESTING/stest_cholrl.out
 	$(MAKE) -C TESTING dtest.out && mv TESTING/dtest.out TESTING/dtest_cholrl.out
 	$(MAKE) -C TESTING ctest.out && mv TESTING/ctest.out TESTING/ctest_cholrl.out
 	$(MAKE) -C TESTING ztest.out && mv TESTING/ztest.out TESTING/ztest_cholrl.out
 	$(MAKE) -C TESTING/LIN cleanexe
-	$(MAKE) -C TESTING/LIN VARLIB='SRC/VARIANTS/choltop.a'
+	$(MAKE) -C TESTING/LIN VARLIB='../../SRC/VARIANTS/choltop.a'
 	$(MAKE) -C TESTING stest.out && mv TESTING/stest.out TESTING/stest_choltop.out
 	$(MAKE) -C TESTING dtest.out && mv TESTING/dtest.out TESTING/dtest_choltop.out
 	$(MAKE) -C TESTING ctest.out && mv TESTING/ctest.out TESTING/ctest_choltop.out
 	$(MAKE) -C TESTING ztest.out && mv TESTING/ztest.out TESTING/ztest_choltop.out
 	$(MAKE) -C TESTING/LIN cleanexe
-	$(MAKE) -C TESTING/LIN VARLIB='SRC/VARIANTS/lucr.a'
+	$(MAKE) -C TESTING/LIN VARLIB='../../SRC/VARIANTS/lucr.a'
 	$(MAKE) -C TESTING stest.out && mv TESTING/stest.out TESTING/stest_lucr.out
 	$(MAKE) -C TESTING dtest.out && mv TESTING/dtest.out TESTING/dtest_lucr.out
 	$(MAKE) -C TESTING ctest.out && mv TESTING/ctest.out TESTING/ctest_lucr.out
 	$(MAKE) -C TESTING ztest.out && mv TESTING/ztest.out TESTING/ztest_lucr.out
 	$(MAKE) -C TESTING/LIN cleanexe
-	$(MAKE) -C TESTING/LIN VARLIB='SRC/VARIANTS/lull.a'
+	$(MAKE) -C TESTING/LIN VARLIB='../../SRC/VARIANTS/lull.a'
 	$(MAKE) -C TESTING stest.out && mv TESTING/stest.out TESTING/stest_lull.out
 	$(MAKE) -C TESTING dtest.out && mv TESTING/dtest.out TESTING/dtest_lull.out
 	$(MAKE) -C TESTING ctest.out && mv TESTING/ctest.out TESTING/ctest_lull.out
 	$(MAKE) -C TESTING ztest.out && mv TESTING/ztest.out TESTING/ztest_lull.out
 	$(MAKE) -C TESTING/LIN cleanexe
-	$(MAKE) -C TESTING/LIN VARLIB='SRC/VARIANTS/lurec.a'
+	$(MAKE) -C TESTING/LIN VARLIB='../../SRC/VARIANTS/lurec.a'
 	$(MAKE) -C TESTING stest.out && mv TESTING/stest.out TESTING/stest_lurec.out
 	$(MAKE) -C TESTING dtest.out && mv TESTING/dtest.out TESTING/dtest_lurec.out
 	$(MAKE) -C TESTING ctest.out && mv TESTING/ctest.out TESTING/ctest_lurec.out
 	$(MAKE) -C TESTING ztest.out && mv TESTING/ztest.out TESTING/ztest_lurec.out
 	$(MAKE) -C TESTING/LIN cleanexe
-	$(MAKE) -C TESTING/LIN VARLIB='SRC/VARIANTS/qrll.a'
+	$(MAKE) -C TESTING/LIN VARLIB='../../SRC/VARIANTS/qrll.a'
 	$(MAKE) -C TESTING stest.out && mv TESTING/stest.out TESTING/stest_qrll.out
 	$(MAKE) -C TESTING dtest.out && mv TESTING/dtest.out TESTING/dtest_qrll.out
 	$(MAKE) -C TESTING ctest.out && mv TESTING/ctest.out TESTING/ctest_qrll.out
 	$(MAKE) -C TESTING ztest.out && mv TESTING/ztest.out TESTING/ztest_qrll.out
 
+.PHONY: cblas_example
 cblas_example: cblaslib blaslib
 	$(MAKE) -C CBLAS cblas_example
 
+.PHONY: lapacke_example
 lapacke_example: lapackelib lapacklib blaslib
 	$(MAKE) -C LAPACKE lapacke_example
 
+.PHONY: html
 html:
 	@echo "LAPACK HTML PAGES GENERATION with Doxygen"
 	doxygen DOCS/Doxyfile
@@ -96,6 +127,7 @@ html:
 	@echo "Online version available at http://www.netlib.org/lapack/explore-html/"
 	@echo "=================="
 
+.PHONY: man
 man:
 	@echo "LAPACK MAN PAGES GENERATION with Doxygen"
 	doxygen DOCS/Doxyfile_man
@@ -105,6 +137,7 @@ man:
 	@echo "Usage: man dgetrf.f"
 	@echo "=================="
 
+.PHONY: clean cleanobj cleanlib cleanexe cleantest
 clean:
 	$(MAKE) -C INSTALL clean
 	$(MAKE) -C BLAS clean
@@ -146,4 +179,4 @@ cleantest:
 	$(MAKE) -C INSTALL cleantest
 	$(MAKE) -C BLAS cleantest
 	$(MAKE) -C CBLAS cleantest
-	$(MAKE) -C TESTING cleantest
+	$(MAKE) -C TESTING cleantest
\ No newline at end of file
diff --git a/lapack-netlib/README.md b/lapack-netlib/README.md
index e5ac2d9c8..f0aed6c18 100644
--- a/lapack-netlib/README.md
+++ b/lapack-netlib/README.md
@@ -3,6 +3,7 @@
 [![Build Status](https://travis-ci.org/Reference-LAPACK/lapack.svg?branch=master)](https://travis-ci.org/Reference-LAPACK/lapack)
 [![Appveyor](https://ci.appveyor.com/api/projects/status/bh38iin398msrbtr?svg=true)](https://ci.appveyor.com/project/langou/lapack/)
 [![codecov](https://codecov.io/gh/Reference-LAPACK/lapack/branch/master/graph/badge.svg)](https://codecov.io/gh/Reference-LAPACK/lapack)
+[![Packaging status](https://repology.org/badge/tiny-repos/lapack.svg)](https://repology.org/metapackage/lapack/versions)
 
 
 * VERSION 1.0   :  February 29, 1992
@@ -29,6 +30,7 @@
 * VERSION 3.7.0 : December 2016
 * VERSION 3.7.1 : June 2017
 * VERSION 3.8.0 : November 2017
+* VERSION 3.9.0 : November 2019
 
 LAPACK is a library of Fortran subroutines for solving the most commonly
 occurring problems in numerical linear algebra.
@@ -70,6 +72,14 @@ CBLAS, a C interface to the BLAS, and (5) LAPACKE, a C interface to LAPACK.
  - LAPACK includes also the CMake build. You will need to have CMake installed
    on your machine (CMake is available at http://www.cmake.org/). CMake will
    allow an easy installation on a Windows Machine.
+   An example CMake build is:
+   ```sh
+   mkdir build
+   cd build
+   cmake -DCMAKE_INSTALL_LIBDIR=$HOME/.local/lapack ..
+   cmake --build -j . --target install
+   ```
+   That installs the LAPACK library under $HOME/.local/lapack/
  - Specific information to run LAPACK under Windows is available at
    http://icl.cs.utk.edu/lapack-for-windows/lapack/.
 
@@ -99,7 +109,7 @@ You can also contact directly the LAPACK team at lapack@icl.utk.edu.
 ## Testing
 
 LAPACK includes a thorough test suite. We recommend that, after compilation,
-you run the test suite. 
+you run the test suite.
 
 For complete information on the LAPACK Testing please consult LAPACK Working
 Note 41 "Installation Guide for LAPACK".
@@ -115,4 +125,3 @@ LAPACK now includes the LAPACKE package.  LAPACKE is a Standard C language API
 for LAPACK This was born from a collaboration of the LAPACK and INTEL Math
 Kernel Library teams. See:
 http://www.netlib.org/lapack/#_standard_c_language_apis_for_lapack.
-
diff --git a/lapack-netlib/SRC/CMakeLists.txt b/lapack-netlib/SRC/CMakeLists.txt
index 944401beb..f19bdd302 100644
--- a/lapack-netlib/SRC/CMakeLists.txt
+++ b/lapack-netlib/SRC/CMakeLists.txt
@@ -106,7 +106,7 @@ set(SLASRC
    slatbs.f slatdf.f slatps.f slatrd.f slatrs.f slatrz.f
    slauu2.f slauum.f sopgtr.f sopmtr.f sorg2l.f sorg2r.f
    sorgbr.f sorghr.f sorgl2.f sorglq.f sorgql.f sorgqr.f sorgr2.f
-   sorgrq.f sorgtr.f sorm2l.f sorm2r.f sorm22.f
+   sorgrq.f sorgtr.f sorgtsqr.f sorm2l.f sorm2r.f sorm22.f
    sormbr.f sormhr.f sorml2.f sormlq.f sormql.f sormqr.f sormr2.f
    sormr3.f sormrq.f sormrz.f sormtr.f spbcon.f spbequ.f spbrfs.f
    spbstf.f spbsv.f  spbsvx.f
@@ -148,9 +148,11 @@ set(SLASRC
    sgetsls.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f
    sgelq.f slaswlq.f slamswlq.f sgemlq.f
    stplqt.f stplqt2.f stpmlqt.f
+   sorhr_col.f slaorhr_col_getrfnp.f slaorhr_col_getrfnp2.f
    ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f
    ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f
-   ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f)
+   ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f
+   sgesvdq.f scombssq.f)
 
 set(DSLASRC spotrs.f sgetrs.f spotrf.f sgetrf.f)
 
@@ -233,7 +235,7 @@ set(CLASRC
    ctptrs.f ctrcon.f ctrevc.f ctrevc3.f ctrexc.f ctrrfs.f ctrsen.f ctrsna.f
    ctrsyl.f ctrti2.f ctrtri.f ctrtrs.f ctzrzf.f cung2l.f cung2r.f
    cungbr.f cunghr.f cungl2.f cunglq.f cungql.f cungqr.f cungr2.f
-   cungrq.f cungtr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f cunm22.f
+   cungrq.f cungtr.f cungtsqr.f cunm2l.f cunm2r.f cunmbr.f cunmhr.f cunml2.f cunm22.f
    cunmlq.f cunmql.f cunmqr.f cunmr2.f cunmr3.f cunmrq.f cunmrz.f
    cunmtr.f cupgtr.f cupmtr.f icmax1.f scsum1.f cstemr.f
    chfrk.f ctfttp.f clanhf.f cpftrf.f cpftri.f cpftrs.f ctfsm.f ctftri.f
@@ -247,9 +249,11 @@ set(CLASRC
    cgetsls.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f
    cgelq.f claswlq.f clamswlq.f cgemlq.f
    ctplqt.f ctplqt2.f ctpmlqt.f
+   cunhr_col.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f
    chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f
    cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f
-   chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f)
+   chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f
+   cgesvdq.f)
 
 set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f
    cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f
@@ -295,7 +299,7 @@ set(DLASRC
    dlatbs.f dlatdf.f dlatps.f dlatrd.f dlatrs.f dlatrz.f dlauu2.f
    dlauum.f dopgtr.f dopmtr.f dorg2l.f dorg2r.f
    dorgbr.f dorghr.f dorgl2.f dorglq.f dorgql.f dorgqr.f dorgr2.f
-   dorgrq.f dorgtr.f dorm2l.f dorm2r.f dorm22.f
+   dorgrq.f dorgtr.f dorgtsqr.f dorm2l.f dorm2r.f dorm22.f
    dormbr.f dormhr.f dorml2.f dormlq.f dormql.f dormqr.f dormr2.f
    dormr3.f dormrq.f dormrz.f dormtr.f dpbcon.f dpbequ.f dpbrfs.f
    dpbstf.f dpbsv.f  dpbsvx.f
@@ -339,9 +343,11 @@ set(DLASRC
    dgetsls.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f
    dgelq.f dlaswlq.f dlamswlq.f dgemlq.f
    dtplqt.f dtplqt2.f dtpmlqt.f
+   dorhr_col.f dlaorhr_col_getrfnp.f dlaorhr_col_getrfnp2.f
    dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f
    dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f
-   dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f)
+   dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f
+   dgesvdq.f dcombssq.f)
 
 set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f
    dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f
@@ -424,7 +430,7 @@ set(ZLASRC
    ztptrs.f ztrcon.f ztrevc.f ztrevc3.f ztrexc.f ztrrfs.f ztrsen.f ztrsna.f
    ztrsyl.f ztrti2.f ztrtri.f ztrtrs.f ztzrzf.f zung2l.f
    zung2r.f zungbr.f zunghr.f zungl2.f zunglq.f zungql.f zungqr.f zungr2.f
-   zungrq.f zungtr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f zunm22.f
+   zungrq.f zungtr.f zungtsqr.f zunm2l.f zunm2r.f zunmbr.f zunmhr.f zunml2.f zunm22.f
    zunmlq.f zunmql.f zunmqr.f zunmr2.f zunmr3.f zunmrq.f zunmrz.f
    zunmtr.f zupgtr.f
    zupmtr.f izmax1.f dzsum1.f zstemr.f
@@ -440,9 +446,11 @@ set(ZLASRC
    zgelqt.f zgelqt3.f zgemlqt.f
    zgetsls.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f
    zgelq.f zlaswlq.f zlamswlq.f zgemlq.f
+   zunhr_col.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f
    zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f
    zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f
-   zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f)
+   zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f
+   zgesvdq.f)
 
 set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f
    zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f
@@ -504,7 +512,7 @@ if(USE_XBLAS)
 endif()
 target_link_libraries(lapack PRIVATE ${BLAS_LIBRARIES})
 
-if (${CMAKE_BUILD_TYPE_UPPER} STREQUAL "COVERAGE")
+if(_is_coverage_build)
   target_link_libraries(lapack PRIVATE gcov)
   add_coverage(lapack)
 endif()
diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile
index 1c276aff6..9f79e20e9 100644
--- a/lapack-netlib/SRC/Makefile
+++ b/lapack-netlib/SRC/Makefile
@@ -1,5 +1,3 @@
-include ../make.inc
-
 #######################################################################
 #  This is the makefile to create a library for LAPACK.
 #  The files are organized as follows:
@@ -44,7 +42,7 @@ include ../make.inc
 #  and is created at the next higher directory level.
 #
 #  To remove the object files after the library is created, enter
-#       make clean
+#       make cleanobj
 #  On some systems, you can force the source files to be recompiled by
 #  entering (for example)
 #       make single FRC=FRC
@@ -56,6 +54,13 @@ include ../make.inc
 #
 #######################################################################
 
+TOPSRCDIR = ..
+include $(TOPSRCDIR)/make.inc
+
+.SUFFIXES: .F .o
+.F.o:
+	$(FC) $(FFLAGS) -c -o $@ $<
+
 ALLAUX_O = ilaenv.o ilaenv2stage.o ieeeck.o lsamen.o xerbla.o xerbla_array.o \
    iparmq.o iparam2stage.o \
    ilaprec.o ilatrans.o ilauplo.o iladiag.o chla_transtype.o \
@@ -128,7 +133,7 @@ SLASRC_O = \
    slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o \
    slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \
    sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \
-   sorgrq.o sorgtr.o sorm2l.o sorm2r.o sorm22.o \
+   sorgrq.o sorgtr.o sorgtsqr.o sorm2l.o sorm2r.o sorm22.o \
    sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \
    sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \
    spbstf.o spbsv.o  spbsvx.o \
@@ -171,9 +176,11 @@ SLASRC_O = \
    sgetsls.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \
    sgelq.o slaswlq.o slamswlq.o sgemlq.o \
    stplqt.o stplqt2.o stpmlqt.o \
+   sorhr_col.o slaorhr_col_getrfnp.o slaorhr_col_getrfnp2.o \
    ssytrd_2stage.o ssytrd_sy2sb.o ssytrd_sb2st.o ssb2st_kernels.o \
    ssyevd_2stage.o ssyev_2stage.o ssyevx_2stage.o ssyevr_2stage.o \
-   ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o
+   ssbev_2stage.o ssbevx_2stage.o ssbevd_2stage.o ssygv_2stage.o \
+   sgesvdq.o scombssq.o
 
 DSLASRC_O = spotrs.o sgetrs.o spotrf.o sgetrf.o
 
@@ -258,7 +265,7 @@ CLASRC_O = \
    ctptrs.o ctrcon.o ctrevc.o ctrevc3.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \
    ctrsyl.o ctrti2.o ctrtri.o ctrtrs.o ctzrzf.o cung2l.o cung2r.o \
    cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \
-   cungrq.o cungtr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o cunm22.o \
+   cungrq.o cungtr.o cungtsqr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o cunm22.o \
    cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \
    cunmtr.o cupgtr.o cupmtr.o icmax1.o scsum1.o cstemr.o \
    chfrk.o ctfttp.o clanhf.o cpftrf.o cpftri.o cpftrs.o ctfsm.o ctftri.o \
@@ -272,9 +279,11 @@ CLASRC_O = \
    cgetsls.o cgeqr.o clatsqr.o clamtsqr.o cgemqr.o \
    cgelq.o claswlq.o clamswlq.o cgemlq.o \
    ctplqt.o ctplqt2.o ctpmlqt.o \
+   cunhr_col.o claunhr_col_getrfnp.o claunhr_col_getrfnp2.o \
    chetrd_2stage.o chetrd_he2hb.o chetrd_hb2st.o chb2st_kernels.o \
    cheevd_2stage.o cheev_2stage.o cheevx_2stage.o cheevr_2stage.o \
-   chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o
+   chbev_2stage.o chbevx_2stage.o chbevd_2stage.o chegv_2stage.o \
+   cgesvdq.o
 
 ifdef USEXBLAS
 CXLASRC = cgesvxx.o cgerfsx.o cla_gerfsx_extended.o cla_geamv.o \
@@ -324,7 +333,7 @@ DLASRC_O = \
    dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlauu2.o \
    dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \
    dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \
-   dorgrq.o dorgtr.o dorm2l.o dorm2r.o dorm22.o \
+   dorgrq.o dorgtr.o dorgtsqr.o dorm2l.o dorm2r.o dorm22.o \
    dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \
    dormr3.o dormrq.o dormrz.o dormtr.o dpbcon.o dpbequ.o dpbrfs.o \
    dpbstf.o dpbsv.o  dpbsvx.o \
@@ -368,9 +377,11 @@ DLASRC_O = \
    dgetsls.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \
    dgelq.o dlaswlq.o dlamswlq.o dgemlq.o \
    dtplqt.o dtplqt2.o dtpmlqt.o \
+   dorhr_col.o dlaorhr_col_getrfnp.o dlaorhr_col_getrfnp2.o \
    dsytrd_2stage.o dsytrd_sy2sb.o dsytrd_sb2st.o dsb2st_kernels.o \
    dsyevd_2stage.o dsyev_2stage.o dsyevx_2stage.o dsyevr_2stage.o \
-   dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o
+   dsbev_2stage.o dsbevx_2stage.o dsbevd_2stage.o dsygv_2stage.o \
+   dgesvdq.o dcombssq.o
 
 ifdef USEXBLAS
 DXLASRC = dgesvxx.o dgerfsx.o dla_gerfsx_extended.o dla_geamv.o \
@@ -456,7 +467,7 @@ ZLASRC_O = \
    ztptrs.o ztrcon.o ztrevc.o ztrevc3.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \
    ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrzf.o zung2l.o \
    zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \
-   zungrq.o zungtr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \
+   zungrq.o zungtr.o zungtsqr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \
    zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \
    zunmtr.o zupgtr.o \
    zupmtr.o izmax1.o dzsum1.o zstemr.o \
@@ -472,9 +483,11 @@ ZLASRC_O = \
    zgelqt.o zgelqt3.o zgemlqt.o \
    zgetsls.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \
    zgelq.o zlaswlq.o zlamswlq.o zgemlq.o \
+   zunhr_col.o zlaunhr_col_getrfnp.o zlaunhr_col_getrfnp2.o \
    zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \
    zheevd_2stage.o zheev_2stage.o zheevx_2stage.o zheevr_2stage.o \
-   zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o
+   zhbev_2stage.o zhbevx_2stage.o zhbevd_2stage.o zhegv_2stage.o \
+   zgesvdq.o
 
 ifdef USEXBLAS
 ZXLASRC = zgesvxx.o zgerfsx.o zla_gerfsx_extended.o zla_geamv.o \
@@ -550,33 +563,29 @@ ifdef BUILD_DEPRECATED
 DEPRECATED = $(DEPRECSRC)
 endif
 
-all: ../$(LAPACKLIB)
+.PHONY: all
+all: $(LAPACKLIB)
 
-.PHONY: ../$(LAPACKLIB)
-
-../$(LAPACKLIB): $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED)
-	$(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED)
+$(LAPACKLIB): $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED)
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@
 
+.PHONY: single complex double complex16
 single: $(SLASRC) $(DSLASRC) $(SXLASRC) $(SCLAUX) $(ALLAUX)
-	$(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \
-	$(SXLASRC) $(SCLAUX) $(ALLAUX)
-	$(RANLIB) ../$(LAPACKLIB)
+	$(AR) $(ARFLAGS) $(LAPACKLIB) $^
+	$(RANLIB) $(LAPACKLIB)
 
 complex: $(CLASRC) $(ZCLASRC) $(CXLASRC) $(SCLAUX) $(ALLAUX)
-	$(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(CLASRC) $(ZCLASRC) \
-	$(CXLASRC) $(SCLAUX) $(ALLAUX)
-	$(RANLIB) ../$(LAPACKLIB)
+	$(AR) $(ARFLAGS) $(LAPACKLIB) $^
+	$(RANLIB) $(LAPACKLIB)
 
 double: $(DLASRC) $(DSLASRC) $(DXLASRC) $(DZLAUX) $(ALLAUX)
-	$(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(DLASRC) $(DSLASRC) \
-	$(DXLASRC) $(DZLAUX) $(ALLAUX)
-	$(RANLIB) ../$(LAPACKLIB)
+	$(AR) $(ARFLAGS) $(LAPACKLIB) $^
+	$(RANLIB) $(LAPACKLIB)
 
 complex16: $(ZLASRC) $(ZCLASRC) $(ZXLASRC) $(DZLAUX) $(ALLAUX)
-	$(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(ZLASRC) $(ZCLASRC) \
-	$(ZXLASRC) $(DZLAUX) $(ALLAUX)
-	$(RANLIB) ../$(LAPACKLIB)
+	$(AR) $(ARFLAGS) $(LAPACKLIB) $^
+	$(RANLIB) $(LAPACKLIB)
 
 $(ALLAUX): $(FRC)
 $(SCLAUX): $(FRC)
@@ -597,18 +606,16 @@ endif
 FRC:
 	@FRC=$(FRC)
 
-clean:
+.PHONY: clean cleanobj cleanlib
+clean: cleanobj cleanlib
+cleanobj:
 	rm -f *.o DEPRECATED/*.o
-
-.f.o:
-	$(FORTRAN) $(OPTS) -c -o $@ $<
-
-.F.o:
-	$(FORTRAN) $(OPTS) -c $< -o $@
-	
-slaruv.o: slaruv.f ; $(FORTRAN) $(NOOPT) -c -o $@ $<
-dlaruv.o: dlaruv.f ; $(FORTRAN) $(NOOPT) -c -o $@ $<
-sla_wwaddw.o: sla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c -o $@ $<
-dla_wwaddw.o: dla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c -o $@ $<
-cla_wwaddw.o: cla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c -o $@ $<
-zla_wwaddw.o: zla_wwaddw.f ; $(FORTRAN) $(NOOPT) -c -o $@ $<
+cleanlib:
+	rm -f $(LAPACKLIB)
+
+slaruv.o: slaruv.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $<
+dlaruv.o: dlaruv.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $<
+sla_wwaddw.o: sla_wwaddw.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $<
+dla_wwaddw.o: dla_wwaddw.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $<
+cla_wwaddw.o: cla_wwaddw.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $<
+zla_wwaddw.o: zla_wwaddw.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $<
diff --git a/lapack-netlib/SRC/VARIANTS/Makefile b/lapack-netlib/SRC/VARIANTS/Makefile
index 9f1410755..25d8ee175 100644
--- a/lapack-netlib/SRC/VARIANTS/Makefile
+++ b/lapack-netlib/SRC/VARIANTS/Makefile
@@ -1,5 +1,3 @@
-include ../../make.inc
-
 #######################################################################
 #  This is the makefile to create a the variants libraries for LAPACK.
 #  The files are organized as follows:
@@ -17,6 +15,9 @@ include ../../make.inc
 #  1065-1081. http://dx.doi.org/10.1137/S0895479896297744
 #######################################################################
 
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc
+
 CHOLRL = cholesky/RL/cpotrf.o cholesky/RL/dpotrf.o cholesky/RL/spotrf.o cholesky/RL/zpotrf.o
 
 CHOLTOP = cholesky/TOP/cpotrf.o cholesky/TOP/dpotrf.o cholesky/TOP/spotrf.o cholesky/TOP/zpotrf.o
@@ -30,37 +31,36 @@ LUREC = lu/REC/cgetrf.o lu/REC/dgetrf.o lu/REC/sgetrf.o lu/REC/zgetrf.o
 QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o qr/LL/sceil.o
 
 
+.PHONY: all
 all: cholrl.a choltop.a lucr.a lull.a lurec.a qrll.a
 
 cholrl.a: $(CHOLRL)
-	$(ARCH) $(ARCHFLAGS) $@ $^
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@
 
 choltop.a: $(CHOLTOP)
-	$(ARCH) $(ARCHFLAGS) $@ $^
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@
 
 lucr.a: $(LUCR)
-	$(ARCH) $(ARCHFLAGS) $@ $^
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@
 
 lull.a: $(LULL)
-	$(ARCH) $(ARCHFLAGS) $@ $^
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@
 
 lurec.a: $(LUREC)
-	$(ARCH) $(ARCHFLAGS) $@ $^
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@
 
 qrll.a: $(QRLL)
-	$(ARCH) $(ARCHFLAGS) $@ $^
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@
 
+.PHONY: clean cleanobj cleanlib
 clean: cleanobj cleanlib
 cleanobj:
 	rm -f $(CHOLRL) $(CHOLTOP) $(LUCR) $(LULL) $(LUREC) $(QRLL)
 cleanlib:
 	rm -f *.a
-
-.f.o:
-	$(FORTRAN) $(OPTS) -c -o $@ $<
diff --git a/lapack-netlib/SRC/VARIANTS/README b/lapack-netlib/SRC/VARIANTS/README
index 4d301cc6e..ef7626deb 100644
--- a/lapack-netlib/SRC/VARIANTS/README
+++ b/lapack-netlib/SRC/VARIANTS/README
@@ -34,7 +34,7 @@ References:For a more detailed description please refer to
 =========
 
 These variants are compiled by default in the build process but they are not tested by default.
-The build process creates one new library per variants in the four arithmetics (single real/double real/single complex/double complex).
+The build process creates one new library per variants in the four arithmetic (single real/double real/single complex/double complex).
 The libraries are in the SRC/VARIANTS directory.
 
 Corresponding libraries created in SRC/VARIANTS:
@@ -64,16 +64,16 @@ You should then see the following files in the TESTING directory:
 = LINKING YOUR PROGRAM =
 ========================
 
-You just need to add the variants methods library in your linking sequence before your lapack libary.
+You just need to add the variants methods library in your linking sequence before your lapack library.
 Here is a quick example for LU
 
 Default using LU Right Looking version:
- $(FORTRAN) -c myprog.f
- $(FORTRAN) -o myexe myprog.o $(LAPACKLIB) $(BLASLIB)
+ $(FC) $(FFLAGS) -c myprog.f
+ $(FC) $(FFLAGS) $(LDFLAGS) -o myexe myprog.o $(LAPACKLIB) $(BLASLIB)
 
 Using LU Left Looking version:
- $(FORTRAN) -c myprog.f
- $(FORTRAN) -o myexe myprog.o $(PATH TO LAPACK/SRC/VARIANTS)/lull.a $(LAPACKLIB) $(BLASLIB)
+ $(FC) $(FFLAGS) -c myprog.f
+ $(FC) $(FFLAGS) $(LDFLAGS) -o myexe myprog.o $(PATH TO LAPACK/SRC/VARIANTS)/lull.a $(LAPACKLIB) $(BLASLIB)
 
 ===========
 = SUPPORT =
diff --git a/lapack-netlib/SRC/cgbrfsx.f b/lapack-netlib/SRC/cgbrfsx.f
index 041b6a1b6..c23608afb 100644
--- a/lapack-netlib/SRC/cgbrfsx.f
+++ b/lapack-netlib/SRC/cgbrfsx.f
@@ -75,7 +75,7 @@
 *>     Specifies the form of the system of equations:
 *>       = 'N':  A * X = B     (No transpose)
 *>       = 'T':  A**T * X = B  (Transpose)
-*>       = 'C':  A**H * X = B  (Conjugate transpose = Transpose)
+*>       = 'C':  A**H * X = B  (Conjugate transpose)
 *> \endverbatim
 *>
 *> \param[in] EQUED
@@ -308,7 +308,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -344,14 +344,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -359,9 +359,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/cgbsvxx.f b/lapack-netlib/SRC/cgbsvxx.f
index 2e113f99c..9f2bbbc1c 100644
--- a/lapack-netlib/SRC/cgbsvxx.f
+++ b/lapack-netlib/SRC/cgbsvxx.f
@@ -431,7 +431,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -467,14 +467,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -482,9 +482,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/cgebak.f b/lapack-netlib/SRC/cgebak.f
index 63c73bfa7..9b6402622 100644
--- a/lapack-netlib/SRC/cgebak.f
+++ b/lapack-netlib/SRC/cgebak.f
@@ -48,10 +48,10 @@
 *> \verbatim
 *>          JOB is CHARACTER*1
 *>          Specifies the type of backward transformation required:
-*>          = 'N', do nothing, return immediately;
-*>          = 'P', do backward transformation for permutation only;
-*>          = 'S', do backward transformation for scaling only;
-*>          = 'B', do backward transformations for both permutation and
+*>          = 'N': do nothing, return immediately;
+*>          = 'P': do backward transformation for permutation only;
+*>          = 'S': do backward transformation for scaling only;
+*>          = 'B': do backward transformations for both permutation and
 *>                 scaling.
 *>          JOB must be the same as the argument JOB supplied to CGEBAL.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/cgeev.f b/lapack-netlib/SRC/cgeev.f
index bdd75e4f1..f07d9b755 100644
--- a/lapack-netlib/SRC/cgeev.f
+++ b/lapack-netlib/SRC/cgeev.f
@@ -157,7 +157,7 @@
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value.
 *>          > 0:  if INFO = i, the QR algorithm failed to compute all the
 *>                eigenvalues, and no eigenvectors have been computed;
-*>                elements and i+1:N of W contain eigenvalues which have
+*>                elements i+1:N of W contain eigenvalues which have
 *>                converged.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/cgejsv.f b/lapack-netlib/SRC/cgejsv.f
index a7b1c451c..350da4c40 100644
--- a/lapack-netlib/SRC/cgejsv.f
+++ b/lapack-netlib/SRC/cgejsv.f
@@ -80,13 +80,13 @@
 *>              desirable, then this option is advisable. The input matrix A
 *>              is preprocessed with QR factorization with FULL (row and
 *>              column) pivoting.
-*>       = 'G'  Computation as with 'F' with an additional estimate of the
+*>       = 'G': Computation as with 'F' with an additional estimate of the
 *>              condition number of B, where A=B*D. If A has heavily weighted
 *>              rows, then using this condition number gives too pessimistic
 *>              error bound.
 *>       = 'A': Small singular values are not well determined by the data 
 *>              and are considered as noisy; the matrix is treated as
-*>              numerically rank defficient. The error in the computed
+*>              numerically rank deficient. The error in the computed
 *>              singular values is bounded by f(m,n)*epsilon*||A||.
 *>              The computed SVD A = U * S * V^* restores A up to
 *>              f(m,n)*epsilon*||A||.
@@ -117,7 +117,7 @@
 *>       = 'V': N columns of V are returned in the array V; Jacobi rotations
 *>              are not explicitly accumulated.
 *>       = 'J': N columns of V are returned in the array V, but they are
-*>              computed as the product of Jacobi rotations, if JOBT .EQ. 'N'.
+*>              computed as the product of Jacobi rotations, if JOBT = 'N'.
 *>       = 'W': V may be used as workspace of length N*N. See the description
 *>              of V.
 *>       = 'N': V is not computed.
@@ -131,7 +131,7 @@
 *>         specified range. If A .NE. 0 is scaled so that the largest singular
 *>         value of c*A is around SQRT(BIG), BIG=SLAMCH('O'), then JOBR issues
 *>         the licence to kill columns of A whose norm in c*A is less than
-*>         SQRT(SFMIN) (for JOBR.EQ.'R'), or less than SMALL=SFMIN/EPSLN,
+*>         SQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN,
 *>         where SFMIN=SLAMCH('S'), EPSLN=SLAMCH('E').
 *>       = 'N': Do not kill small columns of c*A. This option assumes that
 *>              BLAS and QR factorizations and triangular solvers are
@@ -229,7 +229,7 @@
 *>          If JOBU = 'F', then U contains on exit the M-by-M matrix of
 *>                         the left singular vectors, including an ONB
 *>                         of the orthogonal complement of the Range(A).
-*>          If JOBU = 'W'  .AND. (JOBV.EQ.'V' .AND. JOBT.EQ.'T' .AND. M.EQ.N),
+*>          If JOBU = 'W'  .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N),
 *>                         then U is used as workspace if the procedure
 *>                         replaces A with A^*. In that case, [V] is computed
 *>                         in U as left singular vectors of A^* and then
@@ -251,7 +251,7 @@
 *>          V is COMPLEX array, dimension ( LDV, N )
 *>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
 *>                         the right singular vectors;
-*>          If JOBV = 'W', AND (JOBU.EQ.'U' AND JOBT.EQ.'T' AND M.EQ.N),
+*>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
 *>                         then V is used as workspace if the pprocedure
 *>                         replaces A with A^*. In that case, [U] is computed
 *>                         in V as right singular vectors of A^* and then
@@ -282,7 +282,7 @@
 *>          Length of CWORK to confirm proper allocation of workspace.
 *>          LWORK depends on the job:
 *>
-*>          1. If only SIGMA is needed ( JOBU.EQ.'N', JOBV.EQ.'N' ) and
+*>          1. If only SIGMA is needed ( JOBU = 'N', JOBV = 'N' ) and
 *>            1.1 .. no scaled condition estimate required (JOBA.NE.'E'.AND.JOBA.NE.'G'):
 *>               LWORK >= 2*N+1. This is the minimal requirement.
 *>               ->> For optimal performance (blocked code) the optimal value
@@ -298,9 +298,9 @@
 *>               In general, the optimal length LWORK is computed as
 *>               LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CGEQRF), LWORK(CGESVJ),
 *>                            N*N+LWORK(CPOCON)).
-*>          2. If SIGMA and the right singular vectors are needed (JOBV.EQ.'V'),
-*>             (JOBU.EQ.'N')
-*>            2.1   .. no scaled condition estimate requested (JOBE.EQ.'N'):    
+*>          2. If SIGMA and the right singular vectors are needed (JOBV = 'V'),
+*>             (JOBU = 'N')
+*>            2.1   .. no scaled condition estimate requested (JOBE = 'N'):    
 *>            -> the minimal requirement is LWORK >= 3*N.
 *>            -> For optimal performance, 
 *>               LWORK >= max(N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
@@ -318,10 +318,10 @@
 *>               LWORK >= max(N+LWORK(CGEQP3), LWORK(CPOCON), N+LWORK(CGESVJ),
 *>                       N+LWORK(CGELQF), 2*N+LWORK(CGEQRF), N+LWORK(CUNMLQ)).   
 *>          3. If SIGMA and the left singular vectors are needed
-*>            3.1  .. no scaled condition estimate requested (JOBE.EQ.'N'):
+*>            3.1  .. no scaled condition estimate requested (JOBE = 'N'):
 *>            -> the minimal requirement is LWORK >= 3*N.
 *>            -> For optimal performance:
-*>               if JOBU.EQ.'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
+*>               if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
 *>               where NB is the optimal block size for CGEQP3, CGEQRF, CUNMQR.
 *>               In general, the optimal length LWORK is computed as
 *>               LWORK >= max(N+LWORK(CGEQP3), 2*N+LWORK(CGEQRF), N+LWORK(CUNMQR)). 
@@ -329,16 +329,16 @@
 *>               required (JOBA='E', or 'G').
 *>            -> the minimal requirement is LWORK >= 3*N.
 *>            -> For optimal performance:
-*>               if JOBU.EQ.'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
+*>               if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
 *>               where NB is the optimal block size for CGEQP3, CGEQRF, CUNMQR.
 *>               In general, the optimal length LWORK is computed as
 *>               LWORK >= max(N+LWORK(CGEQP3),N+LWORK(CPOCON),
 *>                        2*N+LWORK(CGEQRF), N+LWORK(CUNMQR)).
 *>
-*>          4. If the full SVD is needed: (JOBU.EQ.'U' or JOBU.EQ.'F') and
-*>            4.1. if JOBV.EQ.'V'
+*>          4. If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and
+*>            4.1. if JOBV = 'V'
 *>               the minimal requirement is LWORK >= 5*N+2*N*N.
-*>            4.2. if JOBV.EQ.'J' the minimal requirement is
+*>            4.2. if JOBV = 'J' the minimal requirement is
 *>               LWORK >= 4*N+N*N.
 *>            In both cases, the allocated CWORK can accommodate blocked runs
 *>            of CGEQP3, CGEQRF, CGELQF, CUNMQR, CUNMLQ.
@@ -357,7 +357,7 @@
 *>                    of A. (See the description of SVA().)
 *>          RWORK(2) = See the description of RWORK(1).
 *>          RWORK(3) = SCONDA is an estimate for the condition number of
-*>                    column equilibrated A. (If JOBA .EQ. 'E' or 'G')
+*>                    column equilibrated A. (If JOBA = 'E' or 'G')
 *>                    SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1).
 *>                    It is computed using SPOCON. It holds
 *>                    N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
@@ -376,7 +376,7 @@
 *>                    triangular factor in the first QR factorization.
 *>          RWORK(5) = an estimate of the scaled condition number of the
 *>                    triangular factor in the second QR factorization.
-*>          The following two parameters are computed if JOBT .EQ. 'T'.
+*>          The following two parameters are computed if JOBT = 'T'.
 *>          They are provided for a developer/implementer who is familiar
 *>          with the details of the method.
 *>          RWORK(6) = the entropy of A^* * A :: this is the Shannon entropy
@@ -457,23 +457,23 @@
 *>                     of JOBA and JOBR.
 *>          IWORK(2) = the number of the computed nonzero singular values
 *>          IWORK(3) = if nonzero, a warning message:
-*>                     If IWORK(3).EQ.1 then some of the column norms of A
+*>                     If IWORK(3) = 1 then some of the column norms of A
 *>                     were denormalized floats. The requested high accuracy
 *>                     is not warranted by the data.
-*>          IWORK(4) = 1 or -1. If IWORK(4) .EQ. 1, then the procedure used A^* to
+*>          IWORK(4) = 1 or -1. If IWORK(4) = 1, then the procedure used A^* to
 *>                     do the job as specified by the JOB parameters.
-*>          If the call to CGEJSV is a workspace query (indicated by LWORK .EQ. -1 and 
-*>          LRWORK .EQ. -1), then on exit IWORK(1) contains the required length of 
+*>          If the call to CGEJSV is a workspace query (indicated by LWORK = -1 and 
+*>          LRWORK = -1), then on exit IWORK(1) contains the required length of 
 *>          IWORK for the job parameters used in the call.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>           < 0  : if INFO = -i, then the i-th argument had an illegal value.
-*>           = 0 :  successful exit;
-*>           > 0 :  CGEJSV  did not converge in the maximal allowed number
-*>                  of sweeps. The computed values may be inaccurate.
+*>           < 0:  if INFO = -i, then the i-th argument had an illegal value.
+*>           = 0:  successful exit;
+*>           > 0:  CGEJSV  did not converge in the maximal allowed number
+*>                 of sweeps. The computed values may be inaccurate.
 *> \endverbatim
 *
 *  Authors:
@@ -1336,7 +1336,7 @@
       IF ( L2ABER ) THEN
 *        Standard absolute error bound suffices. All sigma_i with
 *        sigma_i < N*EPSLN*||A|| are flushed to zero. This is an
-*        agressive enforcement of lower numerical rank by introducing a
+*        aggressive enforcement of lower numerical rank by introducing a
 *        backward error of the order of N*EPSLN*||A||.
          TEMP1 = SQRT(REAL(N))*EPSLN
          DO 3001 p = 2, N
@@ -1348,9 +1348,9 @@
  3001    CONTINUE
  3002    CONTINUE
       ELSE IF ( L2RANK ) THEN
-*        .. similarly as above, only slightly more gentle (less agressive).
+*        .. similarly as above, only slightly more gentle (less aggressive).
 *        Sudden drop on the diagonal of R1 is used as the criterion for
-*        close-to-rank-defficient.
+*        close-to-rank-deficient.
          TEMP1 = SQRT(SFMIN)
          DO 3401 p = 2, N
             IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR.
@@ -1718,7 +1718,7 @@
             CALL CPOCON('L',NR,CWORK(2*N+1),NR,ONE,TEMP1,
      $                   CWORK(2*N+NR*NR+1),RWORK,IERR)
             CONDR1 = ONE / SQRT(TEMP1)
-*           .. here need a second oppinion on the condition number
+*           .. here need a second opinion on the condition number
 *           .. then assume worst case scenario
 *           R1 is OK for inverse <=> CONDR1 .LT. REAL(N)
 *           more conservative    <=> CONDR1 .LT. SQRT(REAL(N))
@@ -1763,7 +1763,7 @@
             ELSE
 *
 *              .. ill-conditioned case: second QRF with pivoting
-*              Note that windowed pivoting would be equaly good
+*              Note that windowed pivoting would be equally good
 *              numerically, and more run-time efficient. So, in
 *              an optimal implementation, the next call to CGEQP3
 *              should be replaced with eg. CALL CGEQPX (ACM TOMS #782)
@@ -1821,7 +1821,7 @@
 *
                IF ( CONDR2 .GE. COND_OK ) THEN
 *                 .. save the Householder vectors used for Q3
-*                 (this overwrittes the copy of R2, as it will not be
+*                 (this overwrites the copy of R2, as it will not be
 *                 needed in this branch, but it does not overwritte the
 *                 Huseholder vectors of Q2.).
                   CALL CLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N )
@@ -2077,7 +2077,7 @@
 *
 *        This branch deploys a preconditioned Jacobi SVD with explicitly
 *        accumulated rotations. It is included as optional, mainly for
-*        experimental purposes. It does perfom well, and can also be used.
+*        experimental purposes. It does perform well, and can also be used.
 *        In this implementation, this branch will be automatically activated
 *        if the  condition number sigma_max(A) / sigma_min(A) is predicted
 *        to be greater than the overflow threshold. This is because the
diff --git a/lapack-netlib/SRC/cgelq.f b/lapack-netlib/SRC/cgelq.f
index 909162ebc..c3b2238bf 100644
--- a/lapack-netlib/SRC/cgelq.f
+++ b/lapack-netlib/SRC/cgelq.f
@@ -1,3 +1,4 @@
+*> \brief \b CGELQ
 *
 *  Definition:
 *  ===========
@@ -17,7 +18,17 @@
 *  =============
 *>
 *> \verbatim
-*> CGELQ computes a LQ factorization of an M-by-N matrix A.
+*>
+*> CGELQ computes an LQ factorization of a complex M-by-N matrix A:
+*>
+*>    A = ( L 0 ) *  Q
+*>
+*> where:
+*>
+*>    Q is a N-by-N orthogonal matrix;
+*>    L is an lower-triangular M-by-M matrix;
+*>    0 is a M-by-(N-M) zero matrix, if M < N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -138,7 +149,7 @@
 *> \verbatim
 *>
 *> These details are particular for this LAPACK implementation. Users should not 
-*> take them for granted. These details may change in the future, and are unlikely not
+*> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
 *>
@@ -159,10 +170,10 @@
       SUBROUTINE CGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N, TSIZE, LWORK
diff --git a/lapack-netlib/SRC/cgelq2.f b/lapack-netlib/SRC/cgelq2.f
index 9742d359b..3fab2c396 100644
--- a/lapack-netlib/SRC/cgelq2.f
+++ b/lapack-netlib/SRC/cgelq2.f
@@ -33,8 +33,16 @@
 *>
 *> \verbatim
 *>
-*> CGELQ2 computes an LQ factorization of a complex m by n matrix A:
-*> A = L * Q.
+*> CGELQ2 computes an LQ factorization of a complex m-by-n matrix A:
+*>
+*>    A = ( L 0 ) *  Q
+*>
+*> where:
+*>
+*>    Q is a n-by-n orthogonal matrix;
+*>    L is an lower-triangular m-by-m matrix;
+*>    0 is a m-by-(n-m) zero matrix, if m < n.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -96,7 +104,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup complexGEcomputational
 *
@@ -121,10 +129,10 @@
 *  =====================================================================
       SUBROUTINE CGELQ2( M, N, A, LDA, TAU, WORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N
diff --git a/lapack-netlib/SRC/cgelqf.f b/lapack-netlib/SRC/cgelqf.f
index 216630e88..030ac0b4d 100644
--- a/lapack-netlib/SRC/cgelqf.f
+++ b/lapack-netlib/SRC/cgelqf.f
@@ -34,7 +34,15 @@
 *> \verbatim
 *>
 *> CGELQF computes an LQ factorization of a complex M-by-N matrix A:
-*> A = L * Q.
+*>
+*>    A = ( L 0 ) *  Q
+*>
+*> where:
+*>
+*>    Q is a N-by-N orthogonal matrix;
+*>    L is an lower-triangular M-by-M matrix;
+*>    0 is a M-by-(N-M) zero matrix, if M < N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -110,7 +118,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup complexGEcomputational
 *
@@ -135,10 +143,10 @@
 *  =====================================================================
       SUBROUTINE CGELQF( M, N, A, LDA, TAU, WORK, LWORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, LWORK, M, N
diff --git a/lapack-netlib/SRC/cgelqt.f b/lapack-netlib/SRC/cgelqt.f
index e151f10fe..f40db0b02 100644
--- a/lapack-netlib/SRC/cgelqt.f
+++ b/lapack-netlib/SRC/cgelqt.f
@@ -1,3 +1,4 @@
+*> \brief \b CGELQT
 *
 *  Definition:
 *  ===========
diff --git a/lapack-netlib/SRC/cgelqt3.f b/lapack-netlib/SRC/cgelqt3.f
index f64379722..80a9a9fc7 100644
--- a/lapack-netlib/SRC/cgelqt3.f
+++ b/lapack-netlib/SRC/cgelqt3.f
@@ -1,3 +1,5 @@
+*> \brief \b CGELQT3
+*
 *  Definition:
 *  ===========
 *
diff --git a/lapack-netlib/SRC/cgemlq.f b/lapack-netlib/SRC/cgemlq.f
index 2f44e7cfb..4e374077e 100644
--- a/lapack-netlib/SRC/cgemlq.f
+++ b/lapack-netlib/SRC/cgemlq.f
@@ -1,3 +1,4 @@
+*> \brief \b CGEMLQ
 *
 *  Definition:
 *  ===========
@@ -143,7 +144,7 @@
 *> \verbatim
 *>
 *> These details are particular for this LAPACK implementation. Users should not 
-*> take them for granted. These details may change in the future, and are unlikely not
+*> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
 *>
diff --git a/lapack-netlib/SRC/cgemlqt.f b/lapack-netlib/SRC/cgemlqt.f
index e35e421b1..66b186bff 100644
--- a/lapack-netlib/SRC/cgemlqt.f
+++ b/lapack-netlib/SRC/cgemlqt.f
@@ -1,3 +1,5 @@
+*> \brief \b CGEMLQT
+*
 *  Definition:
 *  ===========
 *
diff --git a/lapack-netlib/SRC/cgemqr.f b/lapack-netlib/SRC/cgemqr.f
index a43d7be5b..54ab7aa74 100644
--- a/lapack-netlib/SRC/cgemqr.f
+++ b/lapack-netlib/SRC/cgemqr.f
@@ -1,3 +1,4 @@
+*> \brief \b CGEMQR
 *
 *  Definition:
 *  ===========
@@ -144,7 +145,7 @@
 *> \verbatim
 *>
 *> These details are particular for this LAPACK implementation. Users should not 
-*> take them for granted. These details may change in the future, and are unlikely not
+*> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
 *>
diff --git a/lapack-netlib/SRC/cgeqr.f b/lapack-netlib/SRC/cgeqr.f
index a00ef45c0..e0aea88b1 100644
--- a/lapack-netlib/SRC/cgeqr.f
+++ b/lapack-netlib/SRC/cgeqr.f
@@ -1,3 +1,4 @@
+*> \brief \b CGEQR
 *
 *  Definition:
 *  ===========
@@ -17,7 +18,18 @@
 *  =============
 *>
 *> \verbatim
-*> CGEQR computes a QR factorization of an M-by-N matrix A.
+*>
+*> CGEQR computes a QR factorization of a complex M-by-N matrix A:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a M-by-M orthogonal matrix;
+*>    R is an upper-triangular N-by-N matrix;
+*>    0 is a (M-N)-by-N zero matrix, if M > N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -138,7 +150,7 @@
 *> \verbatim
 *>
 *> These details are particular for this LAPACK implementation. Users should not 
-*> take them for granted. These details may change in the future, and are unlikely not
+*> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
 *>
@@ -160,10 +172,10 @@
       SUBROUTINE CGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N, TSIZE, LWORK
diff --git a/lapack-netlib/SRC/cgeqr2.f b/lapack-netlib/SRC/cgeqr2.f
index 1b2030b47..8cb2fa119 100644
--- a/lapack-netlib/SRC/cgeqr2.f
+++ b/lapack-netlib/SRC/cgeqr2.f
@@ -33,8 +33,17 @@
 *>
 *> \verbatim
 *>
-*> CGEQR2 computes a QR factorization of a complex m by n matrix A:
-*> A = Q * R.
+*> CGEQR2 computes a QR factorization of a complex m-by-n matrix A:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a m-by-m orthogonal matrix;
+*>    R is an upper-triangular n-by-n matrix;
+*>    0 is a (m-n)-by-n zero matrix, if m > n.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -96,7 +105,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup complexGEcomputational
 *
@@ -121,10 +130,10 @@
 *  =====================================================================
       SUBROUTINE CGEQR2( M, N, A, LDA, TAU, WORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N
diff --git a/lapack-netlib/SRC/cgeqr2p.f b/lapack-netlib/SRC/cgeqr2p.f
index 3c64255d9..1e7b980df 100644
--- a/lapack-netlib/SRC/cgeqr2p.f
+++ b/lapack-netlib/SRC/cgeqr2p.f
@@ -33,8 +33,18 @@
 *>
 *> \verbatim
 *>
-*> CGEQR2P computes a QR factorization of a complex m by n matrix A:
-*> A = Q * R. The diagonal entries of R are real and nonnegative.
+*> CGEQR2P computes a QR factorization of a complex m-by-n matrix A:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a m-by-m orthogonal matrix;
+*>    R is an upper-triangular n-by-n matrix with nonnegative diagonal
+*>    entries;
+*>    0 is a (m-n)-by-n zero matrix, if m > n.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -97,7 +107,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup complexGEcomputational
 *
@@ -124,10 +134,10 @@
 *  =====================================================================
       SUBROUTINE CGEQR2P( M, N, A, LDA, TAU, WORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N
diff --git a/lapack-netlib/SRC/cgeqrf.f b/lapack-netlib/SRC/cgeqrf.f
index 833384707..ff0c53f2f 100644
--- a/lapack-netlib/SRC/cgeqrf.f
+++ b/lapack-netlib/SRC/cgeqrf.f
@@ -34,7 +34,16 @@
 *> \verbatim
 *>
 *> CGEQRF computes a QR factorization of a complex M-by-N matrix A:
-*> A = Q * R.
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a M-by-M orthogonal matrix;
+*>    R is an upper-triangular N-by-N matrix;
+*>    0 is a (M-N)-by-N zero matrix, if M > N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -111,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup complexGEcomputational
 *
@@ -136,10 +145,10 @@
 *  =====================================================================
       SUBROUTINE CGEQRF( M, N, A, LDA, TAU, WORK, LWORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, LWORK, M, N
diff --git a/lapack-netlib/SRC/cgeqrfp.f b/lapack-netlib/SRC/cgeqrfp.f
index a56508b4e..9c29ac90b 100644
--- a/lapack-netlib/SRC/cgeqrfp.f
+++ b/lapack-netlib/SRC/cgeqrfp.f
@@ -33,8 +33,18 @@
 *>
 *> \verbatim
 *>
-*> CGEQRFP computes a QR factorization of a complex M-by-N matrix A:
-*> A = Q * R. The diagonal entries of R are real and nonnegative.
+*> CGEQR2P computes a QR factorization of a complex M-by-N matrix A:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a M-by-M orthogonal matrix;
+*>    R is an upper-triangular N-by-N matrix with nonnegative diagonal
+*>    entries;
+*>    0 is a (M-N)-by-N zero matrix, if M > N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -112,7 +122,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup complexGEcomputational
 *
@@ -139,10 +149,10 @@
 *  =====================================================================
       SUBROUTINE CGEQRFP( M, N, A, LDA, TAU, WORK, LWORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, LWORK, M, N
diff --git a/lapack-netlib/SRC/cgerfsx.f b/lapack-netlib/SRC/cgerfsx.f
index 7b72f9c9a..a6e24ae4f 100644
--- a/lapack-netlib/SRC/cgerfsx.f
+++ b/lapack-netlib/SRC/cgerfsx.f
@@ -74,7 +74,7 @@
 *>     Specifies the form of the system of equations:
 *>       = 'N':  A * X = B     (No transpose)
 *>       = 'T':  A**T * X = B  (Transpose)
-*>       = 'C':  A**H * X = B  (Conjugate transpose = Transpose)
+*>       = 'C':  A**H * X = B  (Conjugate transpose)
 *> \endverbatim
 *>
 *> \param[in] EQUED
@@ -283,7 +283,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -319,14 +319,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -334,9 +334,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/cgesc2.f b/lapack-netlib/SRC/cgesc2.f
index c0b91107e..6f45a09a6 100644
--- a/lapack-netlib/SRC/cgesc2.f
+++ b/lapack-netlib/SRC/cgesc2.f
@@ -91,7 +91,7 @@
 *> \verbatim
 *>          SCALE is REAL
 *>           On exit, SCALE contains the scale factor. SCALE is chosen
-*>           0 <= SCALE <= 1 to prevent owerflow in the solution.
+*>           0 <= SCALE <= 1 to prevent overflow in the solution.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/cgesvdq.f b/lapack-netlib/SRC/cgesvdq.f
new file mode 100644
index 000000000..77c883dde
--- /dev/null
+++ b/lapack-netlib/SRC/cgesvdq.f
@@ -0,0 +1,1391 @@
+*> \brief <b> CGESVDQ computes the singular value decomposition (SVD) with a QR-Preconditioned QR SVD Method for GE matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CGESVDQ + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cgesvdq.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cgesvdq.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cgesvdq.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE CGESVDQ( JOBA, JOBP, JOBR, JOBU, JOBV, M, N, A, LDA,
+*                          S, U, LDU, V, LDV, NUMRANK, IWORK, LIWORK,
+*                          CWORK, LCWORK, RWORK, LRWORK, INFO )
+*
+*     .. Scalar Arguments ..
+*      IMPLICIT    NONE
+*      CHARACTER   JOBA, JOBP, JOBR, JOBU, JOBV
+*      INTEGER     M, N, LDA, LDU, LDV, NUMRANK, LIWORK, LCWORK, LRWORK,
+*                  INFO
+*     ..
+*     .. Array Arguments ..
+*      COMPLEX     A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( * )
+*      REAL        S( * ), RWORK( * )
+*      INTEGER     IWORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CGESVDQ computes the singular value decomposition (SVD) of a complex
+*> M-by-N matrix A, where M >= N. The SVD of A is written as
+*>                                    [++]   [xx]   [x0]   [xx]
+*>              A = U * SIGMA * V^*,  [++] = [xx] * [ox] * [xx]
+*>                                    [++]   [xx]
+*> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+*> matrix, and V is an N-by-N unitary matrix. The diagonal elements
+*> of SIGMA are the singular values of A. The columns of U and V are the
+*> left and the right singular vectors of A, respectively.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBA
+*> \verbatim
+*>  JOBA is CHARACTER*1
+*>  Specifies the level of accuracy in the computed SVD
+*>  = 'A' The requested accuracy corresponds to having the backward
+*>        error bounded by || delta A ||_F <= f(m,n) * EPS * || A ||_F,
+*>        where EPS = SLAMCH('Epsilon'). This authorises CGESVDQ to
+*>        truncate the computed triangular factor in a rank revealing
+*>        QR factorization whenever the truncated part is below the
+*>        threshold of the order of EPS * ||A||_F. This is aggressive
+*>        truncation level.
+*>  = 'M' Similarly as with 'A', but the truncation is more gentle: it
+*>        is allowed only when there is a drop on the diagonal of the
+*>        triangular factor in the QR factorization. This is medium
+*>        truncation level.
+*>  = 'H' High accuracy requested. No numerical rank determination based
+*>        on the rank revealing QR factorization is attempted.
+*>  = 'E' Same as 'H', and in addition the condition number of column
+*>        scaled A is estimated and returned in  RWORK(1).
+*>        N^(-1/4)*RWORK(1) <= ||pinv(A_scaled)||_2 <= N^(1/4)*RWORK(1)
+*> \endverbatim
+*>
+*> \param[in] JOBP
+*> \verbatim
+*>  JOBP is CHARACTER*1
+*>  = 'P' The rows of A are ordered in decreasing order with respect to
+*>        ||A(i,:)||_\infty. This enhances numerical accuracy at the cost
+*>        of extra data movement. Recommended for numerical robustness.
+*>  = 'N' No row pivoting.
+*> \endverbatim
+*>
+*> \param[in] JOBR
+*> \verbatim
+*>          JOBR is CHARACTER*1
+*>          = 'T' After the initial pivoted QR factorization, CGESVD is applied to
+*>          the adjoint R**H of the computed triangular factor R. This involves
+*>          some extra data movement (matrix transpositions). Useful for
+*>          experiments, research and development.
+*>          = 'N' The triangular factor R is given as input to CGESVD. This may be
+*>          preferred as it involves less data movement.
+*> \endverbatim
+*>
+*> \param[in] JOBU
+*> \verbatim
+*>          JOBU is CHARACTER*1
+*>          = 'A' All M left singular vectors are computed and returned in the
+*>          matrix U. See the description of U.
+*>          = 'S' or 'U' N = min(M,N) left singular vectors are computed and returned
+*>          in the matrix U. See the description of U.
+*>          = 'R' Numerical rank NUMRANK is determined and only NUMRANK left singular
+*>          vectors are computed and returned in the matrix U.
+*>          = 'F' The N left singular vectors are returned in factored form as the
+*>          product of the Q factor from the initial QR factorization and the
+*>          N left singular vectors of (R**H , 0)**H. If row pivoting is used,
+*>          then the necessary information on the row pivoting is stored in
+*>          IWORK(N+1:N+M-1).
+*>          = 'N' The left singular vectors are not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBV
+*> \verbatim
+*>          JOBV is CHARACTER*1
+*>          = 'A', 'V' All N right singular vectors are computed and returned in
+*>          the matrix V.
+*>          = 'R' Numerical rank NUMRANK is determined and only NUMRANK right singular
+*>          vectors are computed and returned in the matrix V. This option is
+*>          allowed only if JOBU = 'R' or JOBU = 'N'; otherwise it is illegal.
+*>          = 'N' The right singular vectors are not computed.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the input matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the input matrix A.  M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array of dimensions LDA x N
+*>          On entry, the input matrix A.
+*>          On exit, if JOBU .NE. 'N' or JOBV .NE. 'N', the lower triangle of A contains
+*>          the Householder vectors as stored by CGEQP3. If JOBU = 'F', these Householder
+*>          vectors together with CWORK(1:N) can be used to restore the Q factors from
+*>          the initial pivoted QR factorization of A. See the description of U.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER.
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] S
+*> \verbatim
+*>          S is REAL array of dimension N.
+*>          The singular values of A, ordered so that S(i) >= S(i+1).
+*> \endverbatim
+*>
+*> \param[out] U
+*> \verbatim
+*>          U is COMPLEX array, dimension
+*>          LDU x M if JOBU = 'A'; see the description of LDU. In this case,
+*>          on exit, U contains the M left singular vectors.
+*>          LDU x N if JOBU = 'S', 'U', 'R' ; see the description of LDU. In this
+*>          case, U contains the leading N or the leading NUMRANK left singular vectors.
+*>          LDU x N if JOBU = 'F' ; see the description of LDU. In this case U
+*>          contains N x N unitary matrix that can be used to form the left
+*>          singular vectors.
+*>          If JOBU = 'N', U is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDU
+*> \verbatim
+*>          LDU is INTEGER.
+*>          The leading dimension of the array U.
+*>          If JOBU = 'A', 'S', 'U', 'R',  LDU >= max(1,M).
+*>          If JOBU = 'F',                 LDU >= max(1,N).
+*>          Otherwise,                     LDU >= 1.
+*> \endverbatim
+*>
+*> \param[out] V
+*> \verbatim
+*>          V is COMPLEX array, dimension
+*>          LDV x N if JOBV = 'A', 'V', 'R' or if JOBA = 'E' .
+*>          If JOBV = 'A', or 'V',  V contains the N-by-N unitary matrix  V**H;
+*>          If JOBV = 'R', V contains the first NUMRANK rows of V**H (the right
+*>          singular vectors, stored rowwise, of the NUMRANK largest singular values).
+*>          If JOBV = 'N' and JOBA = 'E', V is used as a workspace.
+*>          If JOBV = 'N', and JOBA.NE.'E', V is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If JOBV = 'A', 'V', 'R',  or JOBA = 'E', LDV >= max(1,N).
+*>          Otherwise,                               LDV >= 1.
+*> \endverbatim
+*>
+*> \param[out] NUMRANK
+*> \verbatim
+*>          NUMRANK is INTEGER
+*>          NUMRANK is the numerical rank first determined after the rank
+*>          revealing QR factorization, following the strategy specified by the
+*>          value of JOBA. If JOBV = 'R' and JOBU = 'R', only NUMRANK
+*>          leading singular values and vectors are then requested in the call
+*>          of CGESVD. The final value of NUMRANK might be further reduced if
+*>          some singular values are computed as zeros.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (max(1, LIWORK)).
+*>          On exit, IWORK(1:N) contains column pivoting permutation of the
+*>          rank revealing QR factorization.
+*>          If JOBP = 'P', IWORK(N+1:N+M-1) contains the indices of the sequence
+*>          of row swaps used in row pivoting. These can be used to restore the
+*>          left singular vectors in the case JOBU = 'F'.
+*>
+*>          If LIWORK, LCWORK, or LRWORK = -1, then on exit, if INFO = 0,
+*>          LIWORK(1) returns the minimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*>          LIWORK is INTEGER
+*>          The dimension of the array IWORK.
+*>          LIWORK >= N + M - 1,  if JOBP = 'P';
+*>          LIWORK >= N           if JOBP = 'N'.
+*>
+*>          If LIWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates and returns the optimal and minimal sizes
+*>          for the CWORK, IWORK, and RWORK arrays, and no error
+*>          message related to LCWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] CWORK
+*> \verbatim
+*>          CWORK is COMPLEX array, dimension (max(2, LCWORK)), used as a workspace.
+*>          On exit, if, on entry, LCWORK.NE.-1, CWORK(1:N) contains parameters
+*>          needed to recover the Q factor from the QR factorization computed by
+*>          CGEQP3.
+*>
+*>          If LIWORK, LCWORK, or LRWORK = -1, then on exit, if INFO = 0,
+*>          CWORK(1) returns the optimal LCWORK, and
+*>          CWORK(2) returns the minimal LCWORK.
+*> \endverbatim
+*>
+*> \param[in,out] LCWORK
+*> \verbatim
+*>          LCWORK is INTEGER
+*>          The dimension of the array CWORK. It is determined as follows:
+*>          Let  LWQP3 = N+1,  LWCON = 2*N, and let
+*>          LWUNQ = { MAX( N, 1 ),  if JOBU = 'R', 'S', or 'U'
+*>                  { MAX( M, 1 ),  if JOBU = 'A'
+*>          LWSVD = MAX( 3*N, 1 )
+*>          LWLQF = MAX( N/2, 1 ), LWSVD2 = MAX( 3*(N/2), 1 ), LWUNLQ = MAX( N, 1 ),
+*>          LWQRF = MAX( N/2, 1 ), LWUNQ2 = MAX( N, 1 )
+*>          Then the minimal value of LCWORK is:
+*>          = MAX( N + LWQP3, LWSVD )        if only the singular values are needed;
+*>          = MAX( N + LWQP3, LWCON, LWSVD ) if only the singular values are needed,
+*>                                   and a scaled condition estimate requested;
+*>
+*>          = N + MAX( LWQP3, LWSVD, LWUNQ ) if the singular values and the left
+*>                                   singular vectors are requested;
+*>          = N + MAX( LWQP3, LWCON, LWSVD, LWUNQ ) if the singular values and the left
+*>                                   singular vectors are requested, and also
+*>                                   a scaled condition estimate requested;
+*>
+*>          = N + MAX( LWQP3, LWSVD )        if the singular values and the right
+*>                                   singular vectors are requested;
+*>          = N + MAX( LWQP3, LWCON, LWSVD ) if the singular values and the right
+*>                                   singular vectors are requested, and also
+*>                                   a scaled condition etimate requested;
+*>
+*>          = N + MAX( LWQP3, LWSVD, LWUNQ ) if the full SVD is requested with JOBV = 'R';
+*>                                   independent of JOBR;
+*>          = N + MAX( LWQP3, LWCON, LWSVD, LWUNQ ) if the full SVD is requested,
+*>                                   JOBV = 'R' and, also a scaled condition
+*>                                   estimate requested; independent of JOBR;
+*>          = MAX( N + MAX( LWQP3, LWSVD, LWUNQ ),
+*>         N + MAX( LWQP3, N/2+LWLQF, N/2+LWSVD2, N/2+LWUNLQ, LWUNQ) ) if the
+*>                         full SVD is requested with JOBV = 'A' or 'V', and
+*>                         JOBR ='N'
+*>          = MAX( N + MAX( LWQP3, LWCON, LWSVD, LWUNQ ),
+*>         N + MAX( LWQP3, LWCON, N/2+LWLQF, N/2+LWSVD2, N/2+LWUNLQ, LWUNQ ) )
+*>                         if the full SVD is requested with JOBV = 'A' or 'V', and
+*>                         JOBR ='N', and also a scaled condition number estimate
+*>                         requested.
+*>          = MAX( N + MAX( LWQP3, LWSVD, LWUNQ ),
+*>         N + MAX( LWQP3, N/2+LWQRF, N/2+LWSVD2, N/2+LWUNQ2, LWUNQ ) ) if the
+*>                         full SVD is requested with JOBV = 'A', 'V', and JOBR ='T'
+*>          = MAX( N + MAX( LWQP3, LWCON, LWSVD, LWUNQ ),
+*>         N + MAX( LWQP3, LWCON, N/2+LWQRF, N/2+LWSVD2, N/2+LWUNQ2, LWUNQ ) )
+*>                         if the full SVD is requested with JOBV = 'A', 'V' and
+*>                         JOBR ='T', and also a scaled condition number estimate
+*>                         requested.
+*>          Finally, LCWORK must be at least two: LCWORK = MAX( 2, LCWORK ).
+*>
+*>          If LCWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates and returns the optimal and minimal sizes
+*>          for the CWORK, IWORK, and RWORK arrays, and no error
+*>          message related to LCWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (max(1, LRWORK)).
+*>          On exit,
+*>          1. If JOBA = 'E', RWORK(1) contains an estimate of the condition
+*>          number of column scaled A. If A = C * D where D is diagonal and C
+*>          has unit columns in the Euclidean norm, then, assuming full column rank,
+*>          N^(-1/4) * RWORK(1) <= ||pinv(C)||_2 <= N^(1/4) * RWORK(1).
+*>          Otherwise, RWORK(1) = -1.
+*>          2. RWORK(2) contains the number of singular values computed as
+*>          exact zeros in CGESVD applied to the upper triangular or trapeziodal
+*>          R (from the initial QR factorization). In case of early exit (no call to
+*>          CGESVD, such as in the case of zero matrix) RWORK(2) = -1.
+*>
+*>          If LIWORK, LCWORK, or LRWORK = -1, then on exit, if INFO = 0,
+*>          RWORK(1) returns the minimal LRWORK.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*>          LRWORK is INTEGER.
+*>          The dimension of the array RWORK.
+*>          If JOBP ='P', then LRWORK >= MAX(2, M, 5*N);
+*>          Otherwise, LRWORK >= MAX(2, 5*N).
+*>
+*>          If LRWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates and returns the optimal and minimal sizes
+*>          for the CWORK, IWORK, and RWORK arrays, and no error
+*>          message related to LCWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>          > 0:  if CBDSQR did not converge, INFO specifies how many superdiagonals
+*>          of an intermediate bidiagonal form B (computed in CGESVD) did not
+*>          converge to zero.
+*> \endverbatim
+*
+*> \par Further Details:
+*  ========================
+*>
+*> \verbatim
+*>
+*>   1. The data movement (matrix transpose) is coded using simple nested
+*>   DO-loops because BLAS and LAPACK do not provide corresponding subroutines.
+*>   Those DO-loops are easily identified in this source code - by the CONTINUE
+*>   statements labeled with 11**. In an optimized version of this code, the
+*>   nested DO loops should be replaced with calls to an optimized subroutine.
+*>   2. This code scales A by 1/SQRT(M) if the largest ABS(A(i,j)) could cause
+*>   column norm overflow. This is the minial precaution and it is left to the
+*>   SVD routine (CGESVD) to do its own preemptive scaling if potential over-
+*>   or underflows are detected. To avoid repeated scanning of the array A,
+*>   an optimal implementation would do all necessary scaling before calling
+*>   CGESVD and the scaling in CGESVD can be switched off.
+*>   3. Other comments related to code optimization are given in comments in the
+*>   code, enlosed in [[double brackets]].
+*> \endverbatim
+*
+*> \par Bugs, examples and comments
+*  ===========================
+*
+*> \verbatim
+*>  Please report all bugs and send interesting examples and/or comments to
+*>  drmac@math.hr. Thank you.
+*> \endverbatim
+*
+*> \par References
+*  ===============
+*
+*> \verbatim
+*>  [1] Zlatko Drmac, Algorithm 977: A QR-Preconditioned QR SVD Method for
+*>      Computing the SVD with High Accuracy. ACM Trans. Math. Softw.
+*>      44(1): 11:1-11:30 (2017)
+*>
+*>  SIGMA library, xGESVDQ section updated February 2016.
+*>  Developed and coded by Zlatko Drmac, Department of Mathematics
+*>  University of Zagreb, Croatia, drmac@math.hr
+*> \endverbatim
+*
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*> Developed and coded by Zlatko Drmac, Department of Mathematics
+*>  University of Zagreb, Croatia, drmac@math.hr
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2018
+*
+*> \ingroup complexGEsing
+*
+*  =====================================================================
+      SUBROUTINE CGESVDQ( JOBA, JOBP, JOBR, JOBU, JOBV, M, N, A, LDA,
+     $                    S, U, LDU, V, LDV, NUMRANK, IWORK, LIWORK,
+     $                    CWORK, LCWORK, RWORK, LRWORK, INFO )
+*     .. Scalar Arguments ..
+      IMPLICIT    NONE
+      CHARACTER   JOBA, JOBP, JOBR, JOBU, JOBV
+      INTEGER     M, N, LDA, LDU, LDV, NUMRANK, LIWORK, LCWORK, LRWORK,
+     $            INFO
+*     ..
+*     .. Array Arguments ..
+      COMPLEX     A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( * )
+      REAL        S( * ), RWORK( * )
+      INTEGER     IWORK( * )
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL        ZERO,         ONE
+      PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 )
+      COMPLEX     CZERO,                    CONE
+      PARAMETER ( CZERO = ( 0.0E0, 0.0E0 ), CONE = ( 1.0E0, 0.0E0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER     IERR, NR, N1, OPTRATIO, p, q
+      INTEGER     LWCON, LWQP3, LWRK_CGELQF, LWRK_CGESVD, LWRK_CGESVD2,
+     $            LWRK_CGEQP3, LWRK_CGEQRF, LWRK_CUNMLQ, LWRK_CUNMQR,
+     $            LWRK_CUNMQR2, LWLQF, LWQRF, LWSVD, LWSVD2, LWUNQ,
+     $            LWUNQ2, LWUNLQ, MINWRK, MINWRK2, OPTWRK, OPTWRK2,
+     $            IMINWRK, RMINWRK
+      LOGICAL     ACCLA,  ACCLM, ACCLH, ASCALED, CONDA, DNTWU,  DNTWV,
+     $            LQUERY, LSVC0, LSVEC, ROWPRM,  RSVEC, RTRANS, WNTUA,
+     $            WNTUF,  WNTUR, WNTUS, WNTVA,   WNTVR
+      REAL        BIG, EPSLN, RTMP, SCONDA, SFMIN
+      COMPLEX     CTMP
+*     ..
+*     .. Local Arrays
+      COMPLEX     CDUMMY(1)
+      REAL        RDUMMY(1)
+*     ..
+*     .. External Subroutines (BLAS, LAPACK)
+      EXTERNAL    CGELQF, CGEQP3, CGEQRF, CGESVD, CLACPY, CLAPMT,
+     $            CLASCL, CLASET, CLASWP, CSSCAL, SLASET, SLASCL,
+     $            CPOCON, CUNMLQ, CUNMQR, XERBLA
+*     ..
+*     .. External Functions (BLAS, LAPACK)
+      LOGICAL    LSAME
+      INTEGER    ISAMAX
+      REAL       CLANGE, SCNRM2, SLAMCH
+      EXTERNAL   CLANGE, LSAME, ISAMAX, SCNRM2, SLAMCH
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC   ABS, CONJG, MAX, MIN, REAL, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input arguments
+*
+      WNTUS  = LSAME( JOBU, 'S' ) .OR. LSAME( JOBU, 'U' )
+      WNTUR  = LSAME( JOBU, 'R' )
+      WNTUA  = LSAME( JOBU, 'A' )
+      WNTUF  = LSAME( JOBU, 'F' )
+      LSVC0  = WNTUS .OR. WNTUR .OR. WNTUA
+      LSVEC  = LSVC0 .OR. WNTUF
+      DNTWU  = LSAME( JOBU, 'N' )
+*
+      WNTVR  = LSAME( JOBV, 'R' )
+      WNTVA  = LSAME( JOBV, 'A' ) .OR. LSAME( JOBV, 'V' )
+      RSVEC  = WNTVR .OR. WNTVA
+      DNTWV  = LSAME( JOBV, 'N' )
+*
+      ACCLA  = LSAME( JOBA, 'A' )
+      ACCLM  = LSAME( JOBA, 'M' )
+      CONDA  = LSAME( JOBA, 'E' )
+      ACCLH  = LSAME( JOBA, 'H' ) .OR. CONDA
+*
+      ROWPRM = LSAME( JOBP, 'P' )
+      RTRANS = LSAME( JOBR, 'T' )
+*
+      IF ( ROWPRM ) THEN
+         IMINWRK = MAX( 1, N + M - 1 )
+         RMINWRK = MAX( 2, M, 5*N )
+      ELSE
+         IMINWRK = MAX( 1, N )
+         RMINWRK = MAX( 2, 5*N )
+      END IF
+      LQUERY = (LIWORK .EQ. -1 .OR. LCWORK .EQ. -1 .OR. LRWORK .EQ. -1)
+      INFO  = 0
+      IF ( .NOT. ( ACCLA .OR. ACCLM .OR. ACCLH ) ) THEN
+         INFO = -1
+      ELSE IF ( .NOT.( ROWPRM .OR. LSAME( JOBP, 'N' ) ) ) THEN
+          INFO = -2
+      ELSE IF ( .NOT.( RTRANS .OR. LSAME( JOBR, 'N' ) ) ) THEN
+          INFO = -3
+      ELSE IF ( .NOT.( LSVEC .OR. DNTWU ) ) THEN
+         INFO = -4
+      ELSE IF ( WNTUR .AND. WNTVA ) THEN
+         INFO = -5
+      ELSE IF ( .NOT.( RSVEC .OR. DNTWV )) THEN
+         INFO = -5
+      ELSE IF ( M.LT.0 ) THEN
+         INFO = -6
+      ELSE IF ( ( N.LT.0 ) .OR. ( N.GT.M ) ) THEN
+         INFO = -7
+      ELSE IF ( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -9
+      ELSE IF ( LDU.LT.1 .OR. ( LSVC0 .AND. LDU.LT.M ) .OR.
+     $       ( WNTUF .AND. LDU.LT.N ) ) THEN
+         INFO = -12
+      ELSE IF ( LDV.LT.1 .OR. ( RSVEC .AND. LDV.LT.N ) .OR.
+     $          ( CONDA .AND. LDV.LT.N ) ) THEN
+         INFO = -14
+      ELSE IF ( LIWORK .LT. IMINWRK .AND. .NOT. LQUERY ) THEN
+         INFO = -17
+      END IF
+*
+*
+      IF ( INFO .EQ. 0 ) THEN
+*
+*     Compute workspace
+*        .. compute the minimal and the optimal workspace lengths
+*        [[The expressions for computing the minimal and the optimal
+*        values of LCWORK are written with a lot of redundancy and
+*        can be simplified. However, this detailed form is easier for
+*        maintenance and modifications of the code.]]
+*
+*        .. minimal workspace length for CGEQP3 of an M x N matrix
+         LWQP3 = N+1
+*        .. minimal workspace length for CUNMQR to build left singular vectors
+         IF ( WNTUS .OR. WNTUR ) THEN
+             LWUNQ  = MAX( N  , 1 )
+         ELSE IF ( WNTUA ) THEN
+             LWUNQ = MAX( M , 1 )
+         END IF
+*        .. minimal workspace length for CPOCON of an N x N matrix
+         LWCON = 2 * N
+*        .. CGESVD of an N x N matrix
+         LWSVD = MAX( 3 * N, 1 )
+         IF ( LQUERY ) THEN
+             CALL CGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1,
+     $            RDUMMY, IERR )
+             LWRK_CGEQP3 = INT( CDUMMY(1) )
+             IF ( WNTUS .OR. WNTUR ) THEN
+                 CALL CUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U,
+     $                LDU, CDUMMY, -1, IERR )
+                 LWRK_CUNMQR = INT( CDUMMY(1) )
+             ELSE IF ( WNTUA ) THEN
+                 CALL CUNMQR( 'L', 'N', M, M, N, A, LDA, CDUMMY, U,
+     $                LDU, CDUMMY, -1, IERR )
+                 LWRK_CUNMQR = INT( CDUMMY(1) )
+             ELSE
+                 LWRK_CUNMQR = 0
+             END IF
+         END IF
+         MINWRK = 2
+         OPTWRK = 2
+         IF ( .NOT. (LSVEC .OR. RSVEC )) THEN
+*            .. minimal and optimal sizes of the complex workspace if
+*            only the singular values are requested
+             IF ( CONDA ) THEN
+                MINWRK = MAX( N+LWQP3, LWCON, LWSVD )
+             ELSE
+                MINWRK = MAX( N+LWQP3, LWSVD )
+             END IF
+             IF ( LQUERY ) THEN
+                 CALL CGESVD( 'N', 'N', N, N, A, LDA, S, U, LDU,
+     $                V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                 LWRK_CGESVD = INT( CDUMMY(1) )
+                 IF ( CONDA ) THEN
+                    OPTWRK = MAX( N+LWRK_CGEQP3, N+LWCON, LWRK_CGESVD )
+                 ELSE
+                    OPTWRK = MAX( N+LWRK_CGEQP3, LWRK_CGESVD )
+                 END IF
+             END IF
+         ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN
+*            .. minimal and optimal sizes of the complex workspace if the
+*            singular values and the left singular vectors are requested
+             IF ( CONDA ) THEN
+                 MINWRK = N + MAX( LWQP3, LWCON, LWSVD, LWUNQ )
+             ELSE
+                 MINWRK = N + MAX( LWQP3, LWSVD, LWUNQ )
+             END IF
+             IF ( LQUERY ) THEN
+                IF ( RTRANS ) THEN
+                   CALL CGESVD( 'N', 'O', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                ELSE
+                   CALL CGESVD( 'O', 'N', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                END IF
+                LWRK_CGESVD = INT( CDUMMY(1) )
+                IF ( CONDA ) THEN
+                    OPTWRK = N + MAX( LWRK_CGEQP3, LWCON, LWRK_CGESVD,
+     $                               LWRK_CUNMQR )
+                ELSE
+                    OPTWRK = N + MAX( LWRK_CGEQP3, LWRK_CGESVD,
+     $                               LWRK_CUNMQR )
+                END IF
+             END IF
+         ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN
+*            .. minimal and optimal sizes of the complex workspace if the
+*            singular values and the right singular vectors are requested
+             IF ( CONDA ) THEN
+                 MINWRK = N + MAX( LWQP3, LWCON, LWSVD )
+             ELSE
+                 MINWRK = N + MAX( LWQP3, LWSVD )
+             END IF
+             IF ( LQUERY ) THEN
+                 IF ( RTRANS ) THEN
+                     CALL CGESVD( 'O', 'N', N, N, A, LDA, S, U, LDU,
+     $                    V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                 ELSE
+                     CALL CGESVD( 'N', 'O', N, N, A, LDA, S, U, LDU,
+     $                    V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                 END IF
+                 LWRK_CGESVD = INT( CDUMMY(1) )
+                 IF ( CONDA ) THEN
+                     OPTWRK = N + MAX( LWRK_CGEQP3, LWCON, LWRK_CGESVD )
+                 ELSE
+                     OPTWRK = N + MAX( LWRK_CGEQP3, LWRK_CGESVD )
+                 END IF
+             END IF
+         ELSE
+*            .. minimal and optimal sizes of the complex workspace if the
+*            full SVD is requested
+             IF ( RTRANS ) THEN
+                 MINWRK = MAX( LWQP3, LWSVD, LWUNQ )
+                 IF ( CONDA ) MINWRK = MAX( MINWRK, LWCON )
+                 MINWRK = MINWRK + N
+                 IF ( WNTVA ) THEN
+*                   .. minimal workspace length for N x N/2 CGEQRF
+                    LWQRF  = MAX( N/2, 1 )
+*                   .. minimal workspace lengt for N/2 x N/2 CGESVD
+                    LWSVD2 = MAX( 3 * (N/2), 1 )
+                    LWUNQ2 = MAX( N, 1 )
+                    MINWRK2 = MAX( LWQP3, N/2+LWQRF, N/2+LWSVD2,
+     $                        N/2+LWUNQ2, LWUNQ )
+                    IF ( CONDA ) MINWRK2 = MAX( MINWRK2, LWCON )
+                    MINWRK2 = N + MINWRK2
+                    MINWRK = MAX( MINWRK, MINWRK2 )
+                 END IF
+             ELSE
+                 MINWRK = MAX( LWQP3, LWSVD, LWUNQ )
+                 IF ( CONDA ) MINWRK = MAX( MINWRK, LWCON )
+                 MINWRK = MINWRK + N
+                 IF ( WNTVA ) THEN
+*                   .. minimal workspace length for N/2 x N CGELQF
+                    LWLQF  = MAX( N/2, 1 )
+                    LWSVD2 = MAX( 3 * (N/2), 1 )
+                    LWUNLQ = MAX( N , 1 )
+                    MINWRK2 = MAX( LWQP3, N/2+LWLQF, N/2+LWSVD2,
+     $                        N/2+LWUNLQ, LWUNQ )
+                    IF ( CONDA ) MINWRK2 = MAX( MINWRK2, LWCON )
+                    MINWRK2 = N + MINWRK2
+                    MINWRK = MAX( MINWRK, MINWRK2 )
+                 END IF
+             END IF
+             IF ( LQUERY ) THEN
+                IF ( RTRANS ) THEN
+                   CALL CGESVD( 'O', 'A', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                   LWRK_CGESVD = INT( CDUMMY(1) )
+                   OPTWRK = MAX(LWRK_CGEQP3,LWRK_CGESVD,LWRK_CUNMQR)
+                   IF ( CONDA ) OPTWRK = MAX( OPTWRK, LWCON )
+                   OPTWRK = N + OPTWRK
+                   IF ( WNTVA ) THEN
+                       CALL CGEQRF(N,N/2,U,LDU,CDUMMY,CDUMMY,-1,IERR)
+                       LWRK_CGEQRF = INT( CDUMMY(1) )
+                       CALL CGESVD( 'S', 'O', N/2,N/2, V,LDV, S, U,LDU,
+     $                      V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                       LWRK_CGESVD2 = INT( CDUMMY(1) )
+                       CALL CUNMQR( 'R', 'C', N, N, N/2, U, LDU, CDUMMY,
+     $                      V, LDV, CDUMMY, -1, IERR )
+                       LWRK_CUNMQR2 = INT( CDUMMY(1) )
+                       OPTWRK2 = MAX( LWRK_CGEQP3, N/2+LWRK_CGEQRF,
+     $                           N/2+LWRK_CGESVD2, N/2+LWRK_CUNMQR2 )
+                       IF ( CONDA ) OPTWRK2 = MAX( OPTWRK2, LWCON )
+                       OPTWRK2 = N + OPTWRK2
+                       OPTWRK = MAX( OPTWRK, OPTWRK2 )
+                   END IF
+                ELSE
+                   CALL CGESVD( 'S', 'O', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                   LWRK_CGESVD = INT( CDUMMY(1) )
+                   OPTWRK = MAX(LWRK_CGEQP3,LWRK_CGESVD,LWRK_CUNMQR)
+                   IF ( CONDA ) OPTWRK = MAX( OPTWRK, LWCON )
+                   OPTWRK = N + OPTWRK
+                   IF ( WNTVA ) THEN
+                      CALL CGELQF(N/2,N,U,LDU,CDUMMY,CDUMMY,-1,IERR)
+                      LWRK_CGELQF = INT( CDUMMY(1) )
+                      CALL CGESVD( 'S','O', N/2,N/2, V, LDV, S, U, LDU,
+     $                     V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                      LWRK_CGESVD2 = INT( CDUMMY(1) )
+                      CALL CUNMLQ( 'R', 'N', N, N, N/2, U, LDU, CDUMMY,
+     $                     V, LDV, CDUMMY,-1,IERR )
+                      LWRK_CUNMLQ = INT( CDUMMY(1) )
+                      OPTWRK2 = MAX( LWRK_CGEQP3, N/2+LWRK_CGELQF,
+     $                           N/2+LWRK_CGESVD2, N/2+LWRK_CUNMLQ )
+                       IF ( CONDA ) OPTWRK2 = MAX( OPTWRK2, LWCON )
+                       OPTWRK2 = N + OPTWRK2
+                       OPTWRK = MAX( OPTWRK, OPTWRK2 )
+                   END IF
+                END IF
+             END IF
+         END IF
+*
+         MINWRK = MAX( 2, MINWRK )
+         OPTWRK = MAX( 2, OPTWRK )
+         IF ( LCWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = -19
+*
+      END IF
+*
+      IF (INFO .EQ. 0 .AND. LRWORK .LT. RMINWRK .AND. .NOT. LQUERY) THEN
+         INFO = -21
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CGESVDQ', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+*
+*     Return optimal workspace
+*
+          IWORK(1) = IMINWRK
+          CWORK(1) = OPTWRK
+          CWORK(2) = MINWRK
+          RWORK(1) = RMINWRK
+          RETURN
+      END IF
+*
+*     Quick return if the matrix is void.
+*
+      IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) ) THEN
+*     .. all output is void.
+         RETURN
+      END IF
+*
+      BIG = SLAMCH('O')
+      ASCALED = .FALSE.
+      IF ( ROWPRM ) THEN
+*           .. reordering the rows in decreasing sequence in the
+*           ell-infinity norm - this enhances numerical robustness in
+*           the case of differently scaled rows.
+            DO 1904 p = 1, M
+*               RWORK(p) = ABS( A(p,ICAMAX(N,A(p,1),LDA)) )
+*               [[CLANGE will return NaN if an entry of the p-th row is Nan]]
+                RWORK(p) = CLANGE( 'M', 1, N, A(p,1), LDA, RDUMMY )
+*               .. check for NaN's and Inf's
+                IF ( ( RWORK(p) .NE. RWORK(p) ) .OR.
+     $               ( (RWORK(p)*ZERO) .NE. ZERO ) ) THEN
+                    INFO = - 8
+                    CALL XERBLA( 'CGESVDQ', -INFO )
+                    RETURN
+                END IF
+ 1904       CONTINUE
+            DO 1952 p = 1, M - 1
+            q = ISAMAX( M-p+1, RWORK(p), 1 ) + p - 1
+            IWORK(N+p) = q
+            IF ( p .NE. q ) THEN
+               RTMP     = RWORK(p)
+               RWORK(p) = RWORK(q)
+               RWORK(q) = RTMP
+            END IF
+ 1952       CONTINUE
+*
+            IF ( RWORK(1) .EQ. ZERO ) THEN
+*              Quick return: A is the M x N zero matrix.
+               NUMRANK = 0
+               CALL SLASET( 'G', N, 1, ZERO, ZERO, S, N )
+               IF ( WNTUS ) CALL CLASET('G', M, N, CZERO, CONE, U, LDU)
+               IF ( WNTUA ) CALL CLASET('G', M, M, CZERO, CONE, U, LDU)
+               IF ( WNTVA ) CALL CLASET('G', N, N, CZERO, CONE, V, LDV)
+               IF ( WNTUF ) THEN
+                   CALL CLASET( 'G', N, 1, CZERO, CZERO, CWORK, N )
+                   CALL CLASET( 'G', M, N, CZERO, CONE, U, LDU )
+               END IF
+               DO 5001 p = 1, N
+                   IWORK(p) = p
+ 5001          CONTINUE
+               IF ( ROWPRM ) THEN
+                   DO 5002 p = N + 1, N + M - 1
+                       IWORK(p) = p - N
+ 5002              CONTINUE
+               END IF
+               IF ( CONDA ) RWORK(1) = -1
+               RWORK(2) = -1
+               RETURN
+            END IF
+*
+            IF ( RWORK(1) .GT. BIG / SQRT(REAL(M)) ) THEN
+*               .. to prevent overflow in the QR factorization, scale the
+*               matrix by 1/sqrt(M) if too large entry detected
+                CALL CLASCL('G',0,0,SQRT(REAL(M)),ONE, M,N, A,LDA, IERR)
+                ASCALED = .TRUE.
+            END IF
+            CALL CLASWP( N, A, LDA, 1, M-1, IWORK(N+1), 1 )
+      END IF
+*
+*    .. At this stage, preemptive scaling is done only to avoid column
+*    norms overflows during the QR factorization. The SVD procedure should
+*    have its own scaling to save the singular values from overflows and
+*    underflows. That depends on the SVD procedure.
+*
+      IF ( .NOT.ROWPRM ) THEN
+          RTMP = CLANGE( 'M', M, N, A, LDA, RWORK )
+          IF ( ( RTMP .NE. RTMP ) .OR.
+     $         ( (RTMP*ZERO) .NE. ZERO ) ) THEN
+               INFO = - 8
+               CALL XERBLA( 'CGESVDQ', -INFO )
+               RETURN
+          END IF
+          IF ( RTMP .GT. BIG / SQRT(REAL(M)) ) THEN
+*             .. to prevent overflow in the QR factorization, scale the
+*             matrix by 1/sqrt(M) if too large entry detected
+              CALL CLASCL('G',0,0, SQRT(REAL(M)),ONE, M,N, A,LDA, IERR)
+              ASCALED = .TRUE.
+          END IF
+      END IF
+*
+*     .. QR factorization with column pivoting
+*
+*     A * P = Q * [ R ]
+*                 [ 0 ]
+*
+      DO 1963 p = 1, N
+*        .. all columns are free columns
+         IWORK(p) = 0
+ 1963 CONTINUE
+      CALL CGEQP3( M, N, A, LDA, IWORK, CWORK, CWORK(N+1), LCWORK-N,
+     $     RWORK, IERR )
+*
+*    If the user requested accuracy level allows truncation in the
+*    computed upper triangular factor, the matrix R is examined and,
+*    if possible, replaced with its leading upper trapezoidal part.
+*
+      EPSLN = SLAMCH('E')
+      SFMIN = SLAMCH('S')
+*     SMALL = SFMIN / EPSLN
+      NR = N
+*
+      IF ( ACCLA ) THEN
+*
+*        Standard absolute error bound suffices. All sigma_i with
+*        sigma_i < N*EPS*||A||_F are flushed to zero. This is an
+*        aggressive enforcement of lower numerical rank by introducing a
+*        backward error of the order of N*EPS*||A||_F.
+         NR = 1
+         RTMP = SQRT(REAL(N))*EPSLN
+         DO 3001 p = 2, N
+            IF ( ABS(A(p,p)) .LT. (RTMP*ABS(A(1,1))) ) GO TO 3002
+               NR = NR + 1
+ 3001    CONTINUE
+ 3002    CONTINUE
+*
+      ELSEIF ( ACCLM ) THEN
+*        .. similarly as above, only slightly more gentle (less aggressive).
+*        Sudden drop on the diagonal of R is used as the criterion for being
+*        close-to-rank-deficient. The threshold is set to EPSLN=SLAMCH('E').
+*        [[This can be made more flexible by replacing this hard-coded value
+*        with a user specified threshold.]] Also, the values that underflow
+*        will be truncated.
+         NR = 1
+         DO 3401 p = 2, N
+            IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR.
+     $           ( ABS(A(p,p)) .LT. SFMIN ) ) GO TO 3402
+            NR = NR + 1
+ 3401    CONTINUE
+ 3402    CONTINUE
+*
+      ELSE
+*        .. RRQR not authorized to determine numerical rank except in the
+*        obvious case of zero pivots.
+*        .. inspect R for exact zeros on the diagonal;
+*        R(i,i)=0 => R(i:N,i:N)=0.
+         NR = 1
+         DO 3501 p = 2, N
+            IF ( ABS(A(p,p)) .EQ. ZERO ) GO TO 3502
+            NR = NR + 1
+ 3501    CONTINUE
+ 3502    CONTINUE
+*
+         IF ( CONDA ) THEN
+*           Estimate the scaled condition number of A. Use the fact that it is
+*           the same as the scaled condition number of R.
+*              .. V is used as workspace
+               CALL CLACPY( 'U', N, N, A, LDA, V, LDV )
+*              Only the leading NR x NR submatrix of the triangular factor
+*              is considered. Only if NR=N will this give a reliable error
+*              bound. However, even for NR < N, this can be used on an
+*              expert level and obtain useful information in the sense of
+*              perturbation theory.
+               DO 3053 p = 1, NR
+                  RTMP = SCNRM2( p, V(1,p), 1 )
+                  CALL CSSCAL( p, ONE/RTMP, V(1,p), 1 )
+ 3053          CONTINUE
+               IF ( .NOT. ( LSVEC .OR. RSVEC ) ) THEN
+                   CALL CPOCON( 'U', NR, V, LDV, ONE, RTMP,
+     $                  CWORK, RWORK, IERR )
+               ELSE
+                   CALL CPOCON( 'U', NR, V, LDV, ONE, RTMP,
+     $                  CWORK(N+1), RWORK, IERR )
+               END IF
+               SCONDA = ONE / SQRT(RTMP)
+*           For NR=N, SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1),
+*           N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+*           See the reference [1] for more details.
+         END IF
+*
+      ENDIF
+*
+      IF ( WNTUR ) THEN
+          N1 = NR
+      ELSE IF ( WNTUS .OR. WNTUF) THEN
+          N1 = N
+      ELSE IF ( WNTUA ) THEN
+          N1 = M
+      END IF
+*
+      IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN
+*.......................................................................
+*        .. only the singular values are requested
+*.......................................................................
+         IF ( RTRANS ) THEN
+*
+*         .. compute the singular values of R**H = [A](1:NR,1:N)**H
+*           .. set the lower triangle of [A] to [A](1:NR,1:N)**H and
+*           the upper triangle of [A] to zero.
+            DO 1146 p = 1, MIN( N, NR )
+               A(p,p) = CONJG(A(p,p))
+               DO 1147 q = p + 1, N
+                  A(q,p) = CONJG(A(p,q))
+                  IF ( q .LE. NR ) A(p,q) = CZERO
+ 1147          CONTINUE
+ 1146       CONTINUE
+*
+            CALL CGESVD( 'N', 'N', N, NR, A, LDA, S, U, LDU,
+     $           V, LDV, CWORK, LCWORK, RWORK, INFO )
+*
+         ELSE
+*
+*           .. compute the singular values of R = [A](1:NR,1:N)
+*
+            IF ( NR .GT. 1 )
+     $          CALL CLASET( 'L', NR-1,NR-1, CZERO,CZERO, A(2,1), LDA )
+            CALL CGESVD( 'N', 'N', NR, N, A, LDA, S, U, LDU,
+     $           V, LDV, CWORK, LCWORK, RWORK, INFO )
+*
+         END IF
+*
+      ELSE IF ( LSVEC .AND. ( .NOT. RSVEC) ) THEN
+*.......................................................................
+*       .. the singular values and the left singular vectors requested
+*.......................................................................""""""""
+         IF ( RTRANS ) THEN
+*            .. apply CGESVD to R**H
+*            .. copy R**H into [U] and overwrite [U] with the right singular
+*            vectors of R
+            DO 1192 p = 1, NR
+               DO 1193 q = p, N
+                  U(q,p) = CONJG(A(p,q))
+ 1193          CONTINUE
+ 1192       CONTINUE
+            IF ( NR .GT. 1 )
+     $          CALL CLASET( 'U', NR-1,NR-1, CZERO,CZERO, U(1,2), LDU )
+*           .. the left singular vectors not computed, the NR right singular
+*           vectors overwrite [U](1:NR,1:NR) as conjugate transposed. These
+*           will be pre-multiplied by Q to build the left singular vectors of A.
+               CALL CGESVD( 'N', 'O', N, NR, U, LDU, S, U, LDU,
+     $              U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO )
+*
+               DO 1119 p = 1, NR
+                   U(p,p) = CONJG(U(p,p))
+                   DO 1120 q = p + 1, NR
+                      CTMP   = CONJG(U(q,p))
+                      U(q,p) = CONJG(U(p,q))
+                      U(p,q) = CTMP
+ 1120              CONTINUE
+ 1119          CONTINUE
+*
+         ELSE
+*            .. apply CGESVD to R
+*            .. copy R into [U] and overwrite [U] with the left singular vectors
+             CALL CLACPY( 'U', NR, N, A, LDA, U, LDU )
+             IF ( NR .GT. 1 )
+     $         CALL CLASET( 'L', NR-1, NR-1, CZERO, CZERO, U(2,1), LDU )
+*            .. the right singular vectors not computed, the NR left singular
+*            vectors overwrite [U](1:NR,1:NR)
+                CALL CGESVD( 'O', 'N', NR, N, U, LDU, S, U, LDU,
+     $               V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO )
+*               .. now [U](1:NR,1:NR) contains the NR left singular vectors of
+*               R. These will be pre-multiplied by Q to build the left singular
+*               vectors of A.
+         END IF
+*
+*           .. assemble the left singular vector matrix U of dimensions
+*              (M x NR) or (M x N) or (M x M).
+         IF ( ( NR .LT. M ) .AND. ( .NOT.WNTUF ) ) THEN
+             CALL CLASET('A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU)
+             IF ( NR .LT. N1 ) THEN
+                CALL CLASET( 'A',NR,N1-NR,CZERO,CZERO,U(1,NR+1), LDU )
+                CALL CLASET( 'A',M-NR,N1-NR,CZERO,CONE,
+     $               U(NR+1,NR+1), LDU )
+             END IF
+         END IF
+*
+*           The Q matrix from the first QRF is built into the left singular
+*           vectors matrix U.
+*
+         IF ( .NOT.WNTUF )
+     $       CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $            LDU, CWORK(N+1), LCWORK-N, IERR )
+         IF ( ROWPRM .AND. .NOT.WNTUF )
+     $          CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(N+1), -1 )
+*
+      ELSE IF ( RSVEC .AND. ( .NOT. LSVEC ) ) THEN
+*.......................................................................
+*       .. the singular values and the right singular vectors requested
+*.......................................................................
+          IF ( RTRANS ) THEN
+*            .. apply CGESVD to R**H
+*            .. copy R**H into V and overwrite V with the left singular vectors
+            DO 1165 p = 1, NR
+               DO 1166 q = p, N
+                  V(q,p) = CONJG(A(p,q))
+ 1166          CONTINUE
+ 1165       CONTINUE
+            IF ( NR .GT. 1 )
+     $          CALL CLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV )
+*           .. the left singular vectors of R**H overwrite V, the right singular
+*           vectors not computed
+            IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+               CALL CGESVD( 'O', 'N', N, NR, V, LDV, S, U, LDU,
+     $              U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO )
+*
+               DO 1121 p = 1, NR
+                   V(p,p) = CONJG(V(p,p))
+                   DO 1122 q = p + 1, NR
+                      CTMP   = CONJG(V(q,p))
+                      V(q,p) = CONJG(V(p,q))
+                      V(p,q) = CTMP
+ 1122              CONTINUE
+ 1121          CONTINUE
+*
+               IF ( NR .LT. N ) THEN
+                   DO 1103 p = 1, NR
+                      DO 1104 q = NR + 1, N
+                          V(p,q) = CONJG(V(q,p))
+ 1104                 CONTINUE
+ 1103              CONTINUE
+               END IF
+               CALL CLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+            ELSE
+*               .. need all N right singular vectors and NR < N
+*               [!] This is simple implementation that augments [V](1:N,1:NR)
+*               by padding a zero block. In the case NR << N, a more efficient
+*               way is to first use the QR factorization. For more details
+*               how to implement this, see the " FULL SVD " branch.
+                CALL CLASET('G', N, N-NR, CZERO, CZERO, V(1,NR+1), LDV)
+                CALL CGESVD( 'O', 'N', N, N, V, LDV, S, U, LDU,
+     $               U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO )
+*
+                DO 1123 p = 1, N
+                   V(p,p) = CONJG(V(p,p))
+                   DO 1124 q = p + 1, N
+                      CTMP   = CONJG(V(q,p))
+                      V(q,p) = CONJG(V(p,q))
+                      V(p,q) = CTMP
+ 1124              CONTINUE
+ 1123           CONTINUE
+                CALL CLAPMT( .FALSE., N, N, V, LDV, IWORK )
+            END IF
+*
+          ELSE
+*            .. aply CGESVD to R
+*            .. copy R into V and overwrite V with the right singular vectors
+             CALL CLACPY( 'U', NR, N, A, LDA, V, LDV )
+             IF ( NR .GT. 1 )
+     $         CALL CLASET( 'L', NR-1, NR-1, CZERO, CZERO, V(2,1), LDV )
+*            .. the right singular vectors overwrite V, the NR left singular
+*            vectors stored in U(1:NR,1:NR)
+             IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+                CALL CGESVD( 'N', 'O', NR, N, V, LDV, S, U, LDU,
+     $               V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO )
+                CALL CLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+*               .. now [V](1:NR,1:N) contains V(1:N,1:NR)**H
+             ELSE
+*               .. need all N right singular vectors and NR < N
+*               [!] This is simple implementation that augments [V](1:NR,1:N)
+*               by padding a zero block. In the case NR << N, a more efficient
+*               way is to first use the LQ factorization. For more details
+*               how to implement this, see the " FULL SVD " branch.
+                 CALL CLASET('G', N-NR, N, CZERO,CZERO, V(NR+1,1), LDV)
+                 CALL CGESVD( 'N', 'O', N, N, V, LDV, S, U, LDU,
+     $                V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO )
+                 CALL CLAPMT( .FALSE., N, N, V, LDV, IWORK )
+             END IF
+*            .. now [V] contains the adjoint of the matrix of the right singular
+*            vectors of A.
+          END IF
+*
+      ELSE
+*.......................................................................
+*       .. FULL SVD requested
+*.......................................................................
+         IF ( RTRANS ) THEN
+*
+*            .. apply CGESVD to R**H [[this option is left for R&D&T]]
+*
+            IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+*            .. copy R**H into [V] and overwrite [V] with the left singular
+*            vectors of R**H
+            DO 1168 p = 1, NR
+               DO 1169 q = p, N
+                  V(q,p) = CONJG(A(p,q))
+ 1169          CONTINUE
+ 1168       CONTINUE
+            IF ( NR .GT. 1 )
+     $          CALL CLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV )
+*
+*           .. the left singular vectors of R**H overwrite [V], the NR right
+*           singular vectors of R**H stored in [U](1:NR,1:NR) as conjugate
+*           transposed
+               CALL CGESVD( 'O', 'A', N, NR, V, LDV, S, V, LDV,
+     $              U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO )
+*              .. assemble V
+               DO 1115 p = 1, NR
+                  V(p,p) = CONJG(V(p,p))
+                  DO 1116 q = p + 1, NR
+                     CTMP   = CONJG(V(q,p))
+                     V(q,p) = CONJG(V(p,q))
+                     V(p,q) = CTMP
+ 1116             CONTINUE
+ 1115          CONTINUE
+               IF ( NR .LT. N ) THEN
+                   DO 1101 p = 1, NR
+                      DO 1102 q = NR+1, N
+                         V(p,q) = CONJG(V(q,p))
+ 1102                 CONTINUE
+ 1101              CONTINUE
+               END IF
+               CALL CLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+*
+                DO 1117 p = 1, NR
+                   U(p,p) = CONJG(U(p,p))
+                   DO 1118 q = p + 1, NR
+                      CTMP   = CONJG(U(q,p))
+                      U(q,p) = CONJG(U(p,q))
+                      U(p,q) = CTMP
+ 1118              CONTINUE
+ 1117           CONTINUE
+*
+                IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                  CALL CLASET('A', M-NR,NR, CZERO,CZERO, U(NR+1,1), LDU)
+                  IF ( NR .LT. N1 ) THEN
+                     CALL CLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU)
+                     CALL CLASET( 'A',M-NR,N1-NR,CZERO,CONE,
+     $                    U(NR+1,NR+1), LDU )
+                  END IF
+               END IF
+*
+            ELSE
+*               .. need all N right singular vectors and NR < N
+*            .. copy R**H into [V] and overwrite [V] with the left singular
+*            vectors of R**H
+*               [[The optimal ratio N/NR for using QRF instead of padding
+*                 with zeros. Here hard coded to 2; it must be at least
+*                 two due to work space constraints.]]
+*               OPTRATIO = ILAENV(6, 'CGESVD', 'S' // 'O', NR,N,0,0)
+*               OPTRATIO = MAX( OPTRATIO, 2 )
+                OPTRATIO = 2
+                IF ( OPTRATIO*NR .GT. N ) THEN
+                   DO 1198 p = 1, NR
+                      DO 1199 q = p, N
+                         V(q,p) = CONJG(A(p,q))
+ 1199                 CONTINUE
+ 1198              CONTINUE
+                   IF ( NR .GT. 1 )
+     $             CALL CLASET('U',NR-1,NR-1, CZERO,CZERO, V(1,2),LDV)
+*
+                   CALL CLASET('A',N,N-NR,CZERO,CZERO,V(1,NR+1),LDV)
+                   CALL CGESVD( 'O', 'A', N, N, V, LDV, S, V, LDV,
+     $                  U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO )
+*
+                   DO 1113 p = 1, N
+                      V(p,p) = CONJG(V(p,p))
+                      DO 1114 q = p + 1, N
+                         CTMP   = CONJG(V(q,p))
+                         V(q,p) = CONJG(V(p,q))
+                         V(p,q) = CTMP
+ 1114                 CONTINUE
+ 1113              CONTINUE
+                   CALL CLAPMT( .FALSE., N, N, V, LDV, IWORK )
+*              .. assemble the left singular vector matrix U of dimensions
+*              (M x N1), i.e. (M x N) or (M x M).
+*
+                   DO 1111 p = 1, N
+                      U(p,p) = CONJG(U(p,p))
+                      DO 1112 q = p + 1, N
+                         CTMP   = CONJG(U(q,p))
+                         U(q,p) = CONJG(U(p,q))
+                         U(p,q) = CTMP
+ 1112                 CONTINUE
+ 1111              CONTINUE
+*
+                   IF ( ( N .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                      CALL CLASET('A',M-N,N,CZERO,CZERO,U(N+1,1),LDU)
+                      IF ( N .LT. N1 ) THEN
+                        CALL CLASET('A',N,N1-N,CZERO,CZERO,U(1,N+1),LDU)
+                        CALL CLASET('A',M-N,N1-N,CZERO,CONE,
+     $                       U(N+1,N+1), LDU )
+                      END IF
+                   END IF
+                ELSE
+*                  .. copy R**H into [U] and overwrite [U] with the right
+*                  singular vectors of R
+                   DO 1196 p = 1, NR
+                      DO 1197 q = p, N
+                         U(q,NR+p) = CONJG(A(p,q))
+ 1197                 CONTINUE
+ 1196              CONTINUE
+                   IF ( NR .GT. 1 )
+     $             CALL CLASET('U',NR-1,NR-1,CZERO,CZERO,U(1,NR+2),LDU)
+                   CALL CGEQRF( N, NR, U(1,NR+1), LDU, CWORK(N+1),
+     $                  CWORK(N+NR+1), LCWORK-N-NR, IERR )
+                   DO 1143 p = 1, NR
+                       DO 1144 q = 1, N
+                           V(q,p) = CONJG(U(p,NR+q))
+ 1144                  CONTINUE
+ 1143              CONTINUE
+                  CALL CLASET('U',NR-1,NR-1,CZERO,CZERO,V(1,2),LDV)
+                  CALL CGESVD( 'S', 'O', NR, NR, V, LDV, S, U, LDU,
+     $                 V,LDV, CWORK(N+NR+1),LCWORK-N-NR,RWORK, INFO )
+                  CALL CLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV)
+                  CALL CLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV)
+                  CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+                  CALL CUNMQR('R','C', N, N, NR, U(1,NR+1), LDU,
+     $                 CWORK(N+1),V,LDV,CWORK(N+NR+1),LCWORK-N-NR,IERR)
+                  CALL CLAPMT( .FALSE., N, N, V, LDV, IWORK )
+*                 .. assemble the left singular vector matrix U of dimensions
+*                 (M x NR) or (M x N) or (M x M).
+                  IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                     CALL CLASET('A',M-NR,NR,CZERO,CZERO,U(NR+1,1),LDU)
+                     IF ( NR .LT. N1 ) THEN
+                     CALL CLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU)
+                     CALL CLASET( 'A',M-NR,N1-NR,CZERO,CONE,
+     $                    U(NR+1,NR+1),LDU)
+                     END IF
+                  END IF
+                END IF
+            END IF
+*
+         ELSE
+*
+*            .. apply CGESVD to R [[this is the recommended option]]
+*
+             IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+*                .. copy R into [V] and overwrite V with the right singular vectors
+                 CALL CLACPY( 'U', NR, N, A, LDA, V, LDV )
+                IF ( NR .GT. 1 )
+     $          CALL CLASET( 'L', NR-1,NR-1, CZERO,CZERO, V(2,1), LDV )
+*               .. the right singular vectors of R overwrite [V], the NR left
+*               singular vectors of R stored in [U](1:NR,1:NR)
+                CALL CGESVD( 'S', 'O', NR, N, V, LDV, S, U, LDU,
+     $               V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO )
+                CALL CLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+*               .. now [V](1:NR,1:N) contains V(1:N,1:NR)**H
+*               .. assemble the left singular vector matrix U of dimensions
+*              (M x NR) or (M x N) or (M x M).
+               IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                  CALL CLASET('A', M-NR,NR, CZERO,CZERO, U(NR+1,1), LDU)
+                  IF ( NR .LT. N1 ) THEN
+                     CALL CLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU)
+                     CALL CLASET( 'A',M-NR,N1-NR,CZERO,CONE,
+     $                    U(NR+1,NR+1), LDU )
+                  END IF
+               END IF
+*
+             ELSE
+*              .. need all N right singular vectors and NR < N
+*              .. the requested number of the left singular vectors
+*               is then N1 (N or M)
+*               [[The optimal ratio N/NR for using LQ instead of padding
+*                 with zeros. Here hard coded to 2; it must be at least
+*                 two due to work space constraints.]]
+*               OPTRATIO = ILAENV(6, 'CGESVD', 'S' // 'O', NR,N,0,0)
+*               OPTRATIO = MAX( OPTRATIO, 2 )
+               OPTRATIO = 2
+               IF ( OPTRATIO * NR .GT. N ) THEN
+                  CALL CLACPY( 'U', NR, N, A, LDA, V, LDV )
+                  IF ( NR .GT. 1 )
+     $            CALL CLASET('L', NR-1,NR-1, CZERO,CZERO, V(2,1),LDV)
+*              .. the right singular vectors of R overwrite [V], the NR left
+*                 singular vectors of R stored in [U](1:NR,1:NR)
+                  CALL CLASET('A', N-NR,N, CZERO,CZERO, V(NR+1,1),LDV)
+                  CALL CGESVD( 'S', 'O', N, N, V, LDV, S, U, LDU,
+     $                 V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO )
+                  CALL CLAPMT( .FALSE., N, N, V, LDV, IWORK )
+*                 .. now [V] contains the adjoint of the matrix of the right
+*                 singular vectors of A. The leading N left singular vectors
+*                 are in [U](1:N,1:N)
+*                 .. assemble the left singular vector matrix U of dimensions
+*                 (M x N1), i.e. (M x N) or (M x M).
+                  IF ( ( N .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                      CALL CLASET('A',M-N,N,CZERO,CZERO,U(N+1,1),LDU)
+                      IF ( N .LT. N1 ) THEN
+                        CALL CLASET('A',N,N1-N,CZERO,CZERO,U(1,N+1),LDU)
+                        CALL CLASET( 'A',M-N,N1-N,CZERO,CONE,
+     $                       U(N+1,N+1), LDU )
+                      END IF
+                  END IF
+               ELSE
+                  CALL CLACPY( 'U', NR, N, A, LDA, U(NR+1,1), LDU )
+                  IF ( NR .GT. 1 )
+     $            CALL CLASET('L',NR-1,NR-1,CZERO,CZERO,U(NR+2,1),LDU)
+                  CALL CGELQF( NR, N, U(NR+1,1), LDU, CWORK(N+1),
+     $                 CWORK(N+NR+1), LCWORK-N-NR, IERR )
+                  CALL CLACPY('L',NR,NR,U(NR+1,1),LDU,V,LDV)
+                  IF ( NR .GT. 1 )
+     $            CALL CLASET('U',NR-1,NR-1,CZERO,CZERO,V(1,2),LDV)
+                  CALL CGESVD( 'S', 'O', NR, NR, V, LDV, S, U, LDU,
+     $                 V, LDV, CWORK(N+NR+1), LCWORK-N-NR, RWORK, INFO )
+                  CALL CLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV)
+                  CALL CLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV)
+                  CALL CLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+                  CALL CUNMLQ('R','N',N,N,NR,U(NR+1,1),LDU,CWORK(N+1),
+     $                 V, LDV, CWORK(N+NR+1),LCWORK-N-NR,IERR)
+                  CALL CLAPMT( .FALSE., N, N, V, LDV, IWORK )
+*               .. assemble the left singular vector matrix U of dimensions
+*              (M x NR) or (M x N) or (M x M).
+                  IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                     CALL CLASET('A',M-NR,NR,CZERO,CZERO,U(NR+1,1),LDU)
+                     IF ( NR .LT. N1 ) THEN
+                     CALL CLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU)
+                     CALL CLASET( 'A',M-NR,N1-NR,CZERO,CONE,
+     $                    U(NR+1,NR+1), LDU )
+                     END IF
+                  END IF
+               END IF
+             END IF
+*        .. end of the "R**H or R" branch
+         END IF
+*
+*           The Q matrix from the first QRF is built into the left singular
+*           vectors matrix U.
+*
+         IF ( .NOT. WNTUF )
+     $       CALL CUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $            LDU, CWORK(N+1), LCWORK-N, IERR )
+         IF ( ROWPRM .AND. .NOT.WNTUF )
+     $          CALL CLASWP( N1, U, LDU, 1, M-1, IWORK(N+1), -1 )
+*
+*     ... end of the "full SVD" branch
+      END IF
+*
+*     Check whether some singular values are returned as zeros, e.g.
+*     due to underflow, and update the numerical rank.
+      p = NR
+      DO 4001 q = p, 1, -1
+          IF ( S(q) .GT. ZERO ) GO TO 4002
+          NR = NR - 1
+ 4001 CONTINUE
+ 4002 CONTINUE
+*
+*     .. if numerical rank deficiency is detected, the truncated
+*     singular values are set to zero.
+      IF ( NR .LT. N ) CALL SLASET( 'G', N-NR,1, ZERO,ZERO, S(NR+1), N )
+*     .. undo scaling; this may cause overflow in the largest singular
+*     values.
+      IF ( ASCALED )
+     $   CALL SLASCL( 'G',0,0, ONE,SQRT(REAL(M)), NR,1, S, N, IERR )
+      IF ( CONDA ) RWORK(1) = SCONDA
+      RWORK(2) = p - NR
+*     .. p-NR is the number of singular values that are computed as
+*     exact zeros in CGESVD() applied to the (possibly truncated)
+*     full row rank triangular (trapezoidal) factor of A.
+      NUMRANK = NR
+*
+      RETURN
+*
+*     End of CGESVDQ
+*
+      END
diff --git a/lapack-netlib/SRC/cgesvj.f b/lapack-netlib/SRC/cgesvj.f
index 2a5ced225..81e40efef 100644
--- a/lapack-netlib/SRC/cgesvj.f
+++ b/lapack-netlib/SRC/cgesvj.f
@@ -89,12 +89,12 @@
 *>          Specifies whether to compute the right singular vectors, that
 *>          is, the matrix V:
 *>          = 'V' or 'J': the matrix V is computed and returned in the array V
-*>          = 'A' : the Jacobi rotations are applied to the MV-by-N
+*>          = 'A':  the Jacobi rotations are applied to the MV-by-N
 *>                  array V. In other words, the right singular vector
 *>                  matrix V is not computed explicitly; instead it is
 *>                  applied to an MV-by-N matrix initially stored in the
 *>                  first MV rows of V.
-*>          = 'N' : the matrix V is not computed and the array V is not
+*>          = 'N':  the matrix V is not computed and the array V is not
 *>                  referenced
 *> \endverbatim
 *>
@@ -116,8 +116,8 @@
 *>          A is COMPLEX array, dimension (LDA,N)
 *>          On entry, the M-by-N matrix A.
 *>          On exit,
-*>          If JOBU .EQ. 'U' .OR. JOBU .EQ. 'C':
-*>                 If INFO .EQ. 0 :
+*>          If JOBU = 'U' .OR. JOBU = 'C':
+*>                 If INFO = 0 :
 *>                 RANKA orthonormal columns of U are returned in the
 *>                 leading RANKA columns of the array A. Here RANKA <= N
 *>                 is the number of computed singular values of A that are
@@ -127,9 +127,9 @@
 *>                 in the array RWORK as RANKA=NINT(RWORK(2)). Also see the
 *>                 descriptions of SVA and RWORK. The computed columns of U
 *>                 are mutually numerically orthogonal up to approximately
-*>                 TOL=SQRT(M)*EPS (default); or TOL=CTOL*EPS (JOBU.EQ.'C'),
+*>                 TOL=SQRT(M)*EPS (default); or TOL=CTOL*EPS (JOBU = 'C'),
 *>                 see the description of JOBU.
-*>                 If INFO .GT. 0,
+*>                 If INFO > 0,
 *>                 the procedure CGESVJ did not converge in the given number
 *>                 of iterations (sweeps). In that case, the computed
 *>                 columns of U may not be orthogonal up to TOL. The output
@@ -137,8 +137,8 @@
 *>                 values in SVA(1:N)) and V is still a decomposition of the
 *>                 input matrix A in the sense that the residual
 *>                 || A - SCALE * U * SIGMA * V^* ||_2 / ||A||_2 is small.
-*>          If JOBU .EQ. 'N':
-*>                 If INFO .EQ. 0 :
+*>          If JOBU = 'N':
+*>                 If INFO = 0 :
 *>                 Note that the left singular vectors are 'for free' in the
 *>                 one-sided Jacobi SVD algorithm. However, if only the
 *>                 singular values are needed, the level of numerical
@@ -147,7 +147,7 @@
 *>                 numerically orthogonal up to approximately M*EPS. Thus,
 *>                 on exit, A contains the columns of U scaled with the
 *>                 corresponding singular values.
-*>                 If INFO .GT. 0 :
+*>                 If INFO > 0 :
 *>                 the procedure CGESVJ did not converge in the given number
 *>                 of iterations (sweeps).
 *> \endverbatim
@@ -162,9 +162,9 @@
 *> \verbatim
 *>          SVA is REAL array, dimension (N)
 *>          On exit,
-*>          If INFO .EQ. 0 :
+*>          If INFO = 0 :
 *>          depending on the value SCALE = RWORK(1), we have:
-*>                 If SCALE .EQ. ONE:
+*>                 If SCALE = ONE:
 *>                 SVA(1:N) contains the computed singular values of A.
 *>                 During the computation SVA contains the Euclidean column
 *>                 norms of the iterated matrices in the array A.
@@ -173,7 +173,7 @@
 *>                 factored representation is due to the fact that some of the
 *>                 singular values of A might underflow or overflow.
 *>
-*>          If INFO .GT. 0 :
+*>          If INFO > 0 :
 *>          the procedure CGESVJ did not converge in the given number of
 *>          iterations (sweeps) and SCALE*SVA(1:N) may not be accurate.
 *> \endverbatim
@@ -181,7 +181,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV .EQ. 'A', then the product of Jacobi rotations in CGESVJ
+*>          If JOBV = 'A', then the product of Jacobi rotations in CGESVJ
 *>          is applied to the first MV rows of V. See the description of JOBV.
 *> \endverbatim
 *>
@@ -199,16 +199,16 @@
 *> \param[in] LDV
 *> \verbatim
 *>          LDV is INTEGER
-*>          The leading dimension of the array V, LDV .GE. 1.
-*>          If JOBV .EQ. 'V', then LDV .GE. max(1,N).
-*>          If JOBV .EQ. 'A', then LDV .GE. max(1,MV) .
+*>          The leading dimension of the array V, LDV >= 1.
+*>          If JOBV = 'V', then LDV >= max(1,N).
+*>          If JOBV = 'A', then LDV >= max(1,MV) .
 *> \endverbatim
 *>
 *> \param[in,out] CWORK
 *> \verbatim
 *>          CWORK is COMPLEX array, dimension (max(1,LWORK))
 *>          Used as workspace.
-*>          If on entry LWORK .EQ. -1, then a workspace query is assumed and
+*>          If on entry LWORK = -1, then a workspace query is assumed and
 *>          no computation is done; CWORK(1) is set to the minial (and optimal)
 *>          length of CWORK.
 *> \endverbatim
@@ -223,7 +223,7 @@
 *> \verbatim
 *>          RWORK is REAL array, dimension (max(6,LRWORK))
 *>          On entry,
-*>          If JOBU .EQ. 'C' :
+*>          If JOBU = 'C' :
 *>          RWORK(1) = CTOL, where CTOL defines the threshold for convergence.
 *>                    The process stops if all columns of A are mutually
 *>                    orthogonal up to CTOL*EPS, EPS=SLAMCH('E').
@@ -243,11 +243,11 @@
 *>          RWORK(5) = max_{i.NE.j} |COS(A(:,i),A(:,j))| in the last sweep.
 *>                    This is useful information in cases when CGESVJ did
 *>                    not converge, as it can be used to estimate whether
-*>                    the output is stil useful and for post festum analysis.
+*>                    the output is still useful and for post festum analysis.
 *>          RWORK(6) = the largest absolute value over all sines of the
 *>                    Jacobi rotation angles in the last sweep. It can be
 *>                    useful for a post festum analysis.
-*>         If on entry LRWORK .EQ. -1, then a workspace query is assumed and
+*>         If on entry LRWORK = -1, then a workspace query is assumed and
 *>         no computation is done; RWORK(1) is set to the minial (and optimal)
 *>         length of RWORK.
 *> \endverbatim
@@ -261,9 +261,9 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0 : successful exit.
-*>          < 0 : if INFO = -i, then the i-th argument had an illegal value
-*>          > 0 : CGESVJ did not converge in the maximal allowed number
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, then the i-th argument had an illegal value
+*>          > 0:  CGESVJ did not converge in the maximal allowed number
 *>                (NSWEEP=30) of sweeps. The output may still be useful.
 *>                See the description of RWORK.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/cgesvxx.f b/lapack-netlib/SRC/cgesvxx.f
index 30d1beb33..383e4d011 100644
--- a/lapack-netlib/SRC/cgesvxx.f
+++ b/lapack-netlib/SRC/cgesvxx.f
@@ -411,7 +411,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -447,14 +447,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -462,9 +462,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/cgetsls.f b/lapack-netlib/SRC/cgetsls.f
index e7c5d8120..3d783be66 100644
--- a/lapack-netlib/SRC/cgetsls.f
+++ b/lapack-netlib/SRC/cgetsls.f
@@ -1,3 +1,5 @@
+*> \brief \b CGETSLS
+*
 *  Definition:
 *  ===========
 *
diff --git a/lapack-netlib/SRC/cggesx.f b/lapack-netlib/SRC/cggesx.f
index 74169ff80..acc4eda36 100644
--- a/lapack-netlib/SRC/cggesx.f
+++ b/lapack-netlib/SRC/cggesx.f
@@ -120,10 +120,10 @@
 *> \verbatim
 *>          SENSE is CHARACTER*1
 *>          Determines which reciprocal condition numbers are computed.
-*>          = 'N' : None are computed;
-*>          = 'E' : Computed for average of selected eigenvalues only;
-*>          = 'V' : Computed for selected deflating subspaces only;
-*>          = 'B' : Computed for both.
+*>          = 'N':  None are computed;
+*>          = 'E':  Computed for average of selected eigenvalues only;
+*>          = 'V':  Computed for selected deflating subspaces only;
+*>          = 'B':  Computed for both.
 *>          If SENSE = 'E', 'V', or 'B', SORT must equal 'S'.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/cgsvj0.f b/lapack-netlib/SRC/cgsvj0.f
index 80e67a06e..810df3367 100644
--- a/lapack-netlib/SRC/cgsvj0.f
+++ b/lapack-netlib/SRC/cgsvj0.f
@@ -117,7 +117,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV .EQ. 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -125,9 +125,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is COMPLEX array, dimension (LDV,N)
-*>          If JOBV .EQ. 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV .EQ. 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
@@ -136,8 +136,8 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of the array V,  LDV >= 1.
-*>          If JOBV = 'V', LDV .GE. N.
-*>          If JOBV = 'A', LDV .GE. MV.
+*>          If JOBV = 'V', LDV >= N.
+*>          If JOBV = 'A', LDV >= MV.
 *> \endverbatim
 *>
 *> \param[in] EPS
@@ -157,7 +157,7 @@
 *>          TOL is REAL
 *>          TOL is the threshold for Jacobi rotations. For a pair
 *>          A(:,p), A(:,q) of pivot columns, the Jacobi rotation is
-*>          applied only if ABS(COS(angle(A(:,p),A(:,q)))) .GT. TOL.
+*>          applied only if ABS(COS(angle(A(:,p),A(:,q)))) > TOL.
 *> \endverbatim
 *>
 *> \param[in] NSWEEP
@@ -175,14 +175,14 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          LWORK is the dimension of WORK. LWORK .GE. M.
+*>          LWORK is the dimension of WORK. LWORK >= M.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0 : successful exit.
-*>          < 0 : if INFO = -i, then the i-th argument had an illegal value
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, then the i-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/cgsvj1.f b/lapack-netlib/SRC/cgsvj1.f
index bebcd5c45..06b417cf2 100644
--- a/lapack-netlib/SRC/cgsvj1.f
+++ b/lapack-netlib/SRC/cgsvj1.f
@@ -61,7 +61,7 @@
 *> In terms of the columns of A, the first N1 columns are rotated 'against'
 *> the remaining N-N1 columns, trying to increase the angle between the
 *> corresponding subspaces. The off-diagonal block is N1-by(N-N1) and it is
-*> tiled using quadratic tiles of side KBL. Here, KBL is a tunning parmeter.
+*> tiled using quadratic tiles of side KBL. Here, KBL is a tunning parameter.
 *> The number of sweeps is given in NSWEEP and the orthogonality threshold
 *> is given in TOL.
 *> \endverbatim
@@ -147,7 +147,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV .EQ. 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -155,9 +155,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is COMPLEX array, dimension (LDV,N)
-*>          If JOBV .EQ. 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV .EQ. 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
@@ -166,8 +166,8 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of the array V,  LDV >= 1.
-*>          If JOBV = 'V', LDV .GE. N.
-*>          If JOBV = 'A', LDV .GE. MV.
+*>          If JOBV = 'V', LDV >= N.
+*>          If JOBV = 'A', LDV >= MV.
 *> \endverbatim
 *>
 *> \param[in] EPS
@@ -187,7 +187,7 @@
 *>          TOL is REAL
 *>          TOL is the threshold for Jacobi rotations. For a pair
 *>          A(:,p), A(:,q) of pivot columns, the Jacobi rotation is
-*>          applied only if ABS(COS(angle(A(:,p),A(:,q)))) .GT. TOL.
+*>          applied only if ABS(COS(angle(A(:,p),A(:,q)))) > TOL.
 *> \endverbatim
 *>
 *> \param[in] NSWEEP
@@ -205,14 +205,14 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          LWORK is the dimension of WORK. LWORK .GE. M.
+*>          LWORK is the dimension of WORK. LWORK >= M.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0 : successful exit.
-*>          < 0 : if INFO = -i, then the i-th argument had an illegal value
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, then the i-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/chb2st_kernels.f b/lapack-netlib/SRC/chb2st_kernels.f
index 25c9ab717..01ea217bb 100644
--- a/lapack-netlib/SRC/chb2st_kernels.f
+++ b/lapack-netlib/SRC/chb2st_kernels.f
@@ -1,26 +1,26 @@
 *> \brief \b CHB2ST_KERNELS
 *
 *  @generated from zhb2st_kernels.f, fortran z -> c, Wed Dec  7 08:22:40 2016
-*      
+*
 *  =========== DOCUMENTATION ===========
 *
-* Online html documentation available at 
-*            http://www.netlib.org/lapack/explore-html/ 
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
-*> Download CHB2ST_KERNELS + dependencies 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chb2st_kernels.f"> 
-*> [TGZ]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chb2st_kernels.f"> 
-*> [ZIP]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chb2st_kernels.f"> 
+*> Download CHB2ST_KERNELS + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/chb2st_kernels.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/chb2st_kernels.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/chb2st_kernels.f">
 *> [TXT]</a>
-*> \endhtmlonly 
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE  CHB2ST_KERNELS( UPLO, WANTZ, TTYPE, 
+*       SUBROUTINE  CHB2ST_KERNELS( UPLO, WANTZ, TTYPE,
 *                                   ST, ED, SWEEP, N, NB, IB,
 *                                   A, LDA, V, TAU, LDVT, WORK)
 *
@@ -32,9 +32,9 @@
 *       INTEGER            TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
 *       ..
 *       .. Array Arguments ..
-*       COMPLEX            A( LDA, * ), V( * ), 
+*       COMPLEX            A( LDA, * ), V( * ),
 *                          TAU( * ), WORK( * )
-*  
+*
 *> \par Purpose:
 *  =============
 *>
@@ -124,7 +124,7 @@
 *>          LDVT is INTEGER.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX array. Workspace of size nb.
 *> \endverbatim
@@ -147,7 +147,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -155,16 +155,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE  CHB2ST_KERNELS( UPLO, WANTZ, TTYPE, 
+      SUBROUTINE  CHB2ST_KERNELS( UPLO, WANTZ, TTYPE,
      $                            ST, ED, SWEEP, N, NB, IB,
      $                            A, LDA, V, TAU, LDVT, WORK)
 *
@@ -181,7 +181,7 @@
       INTEGER            TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
 *     ..
 *     .. Array Arguments ..
-      COMPLEX            A( LDA, * ), V( * ), 
+      COMPLEX            A( LDA, * ), V( * ),
      $                   TAU( * ), WORK( * )
 *     ..
 *
@@ -195,8 +195,8 @@
 *     .. Local Scalars ..
       LOGICAL            UPPER
       INTEGER            I, J1, J2, LM, LN, VPOS, TAUPOS,
-     $                   DPOS, OFDPOS, AJETER 
-      COMPLEX            CTMP 
+     $                   DPOS, OFDPOS, AJETER
+      COMPLEX            CTMP
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLARFG, CLARFX, CLARFY
@@ -209,7 +209,7 @@
 *     ..
 *     ..
 *     .. Executable Statements ..
-*      
+*
       AJETER = IB + LDVT
       UPPER = LSAME( UPLO, 'U' )
 
@@ -240,10 +240,10 @@
               V( VPOS ) = ONE
               DO 10 I = 1, LM-1
                   V( VPOS+I )         = CONJG( A( OFDPOS-I, ST+I ) )
-                  A( OFDPOS-I, ST+I ) = ZERO  
+                  A( OFDPOS-I, ST+I ) = ZERO
    10         CONTINUE
               CTMP = CONJG( A( OFDPOS, ST ) )
-              CALL CLARFG( LM, CTMP, V( VPOS+1 ), 1, 
+              CALL CLARFG( LM, CTMP, V( VPOS+1 ), 1,
      $                                       TAU( TAUPOS ) )
               A( OFDPOS, ST ) = CTMP
 *
@@ -281,14 +281,14 @@
 *
                   V( VPOS ) = ONE
                   DO 30 I = 1, LM-1
-                      V( VPOS+I )          = 
+                      V( VPOS+I )          =
      $                                    CONJG( A( DPOS-NB-I, J1+I ) )
                       A( DPOS-NB-I, J1+I ) = ZERO
    30             CONTINUE
                   CTMP = CONJG( A( DPOS-NB, J1 ) )
                   CALL CLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) )
                   A( DPOS-NB, J1 ) = CTMP
-*                 
+*
                   CALL CLARFX( 'Right', LN-1, LM, V( VPOS ),
      $                         TAU( TAUPOS ),
      $                         A( DPOS-NB+1, J1 ), LDA-1, WORK)
@@ -296,9 +296,9 @@
           ENDIF
 *
 *     Lower case
-*  
+*
       ELSE
-*      
+*
           IF( WANTZ ) THEN
               VPOS   = MOD( SWEEP-1, 2 ) * N + ST
               TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
@@ -313,9 +313,9 @@
               V( VPOS ) = ONE
               DO 20 I = 1, LM-1
                   V( VPOS+I )         = A( OFDPOS+I, ST-1 )
-                  A( OFDPOS+I, ST-1 ) = ZERO  
+                  A( OFDPOS+I, ST-1 ) = ZERO
    20         CONTINUE
-              CALL CLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, 
+              CALL CLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1,
      $                                       TAU( TAUPOS ) )
 *
               LM = ED - ST + 1
@@ -342,7 +342,7 @@
               LM = J2-J1+1
 *
               IF( LM.GT.0) THEN
-                  CALL CLARFX( 'Right', LM, LN, V( VPOS ), 
+                  CALL CLARFX( 'Right', LM, LN, V( VPOS ),
      $                         TAU( TAUPOS ), A( DPOS+NB, ST ),
      $                         LDA-1, WORK)
 *
@@ -359,13 +359,13 @@
                       V( VPOS+I )        = A( DPOS+NB+I, ST )
                       A( DPOS+NB+I, ST ) = ZERO
    40             CONTINUE
-                  CALL CLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, 
+                  CALL CLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1,
      $                                        TAU( TAUPOS ) )
 *
-                  CALL CLARFX( 'Left', LM, LN-1, V( VPOS ), 
+                  CALL CLARFX( 'Left', LM, LN-1, V( VPOS ),
      $                         CONJG( TAU( TAUPOS ) ),
      $                         A( DPOS+NB-1, ST+1 ), LDA-1, WORK)
-             
+
               ENDIF
           ENDIF
       ENDIF
@@ -374,4 +374,4 @@
 *
 *     END OF CHB2ST_KERNELS
 *
-      END      
+      END
diff --git a/lapack-netlib/SRC/checon_3.f b/lapack-netlib/SRC/checon_3.f
index 6427dd594..5d9ed97e9 100644
--- a/lapack-netlib/SRC/checon_3.f
+++ b/lapack-netlib/SRC/checon_3.f
@@ -19,7 +19,7 @@
 *  ===========
 *
 *       SUBROUTINE CHECON_3( UPLO, N, A, LDA, E, IPIV, ANORM, RCOND,
-*                            WORK, IWORK, INFO )
+*                            WORK, INFO )
 *
 *       .. Scalar Arguments ..
 *       CHARACTER          UPLO
@@ -27,7 +27,7 @@
 *       REAL               ANORM, RCOND
 *       ..
 *       .. Array Arguments ..
-*       INTEGER            IPIV( * ), IWORK( * )
+*       INTEGER            IPIV( * )
 *       COMPLEX            A( LDA, * ), E ( * ), WORK( * )
 *       ..
 *
@@ -129,11 +129,6 @@
 *>          WORK is COMPLEX array, dimension (2*N)
 *> \endverbatim
 *>
-*> \param[out] IWORK
-*> \verbatim
-*>          IWORK is INTEGER array, dimension (N)
-*> \endverbatim
-*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
diff --git a/lapack-netlib/SRC/cheevr.f b/lapack-netlib/SRC/cheevr.f
index 0b055baf6..c5deb1166 100644
--- a/lapack-netlib/SRC/cheevr.f
+++ b/lapack-netlib/SRC/cheevr.f
@@ -210,7 +210,7 @@
 *>          eigenvalues are computed to high relative accuracy when
 *>          possible in future releases.  The current code does not
 *>          make any guarantees about high relative accuracy, but
-*>          furutre releases will. See J. Barlow and J. Demmel,
+*>          future releases will. See J. Barlow and J. Demmel,
 *>          "Computing Accurate Eigensystems of Scaled Diagonally
 *>          Dominant Matrices", LAPACK Working Note #7, for a discussion
 *>          of which matrices define their eigenvalues to high relative
diff --git a/lapack-netlib/SRC/cheevr_2stage.f b/lapack-netlib/SRC/cheevr_2stage.f
index 20a1cb3f3..1489a322e 100644
--- a/lapack-netlib/SRC/cheevr_2stage.f
+++ b/lapack-netlib/SRC/cheevr_2stage.f
@@ -217,7 +217,7 @@
 *>          eigenvalues are computed to high relative accuracy when
 *>          possible in future releases.  The current code does not
 *>          make any guarantees about high relative accuracy, but
-*>          furutre releases will. See J. Barlow and J. Demmel,
+*>          future releases will. See J. Barlow and J. Demmel,
 *>          "Computing Accurate Eigensystems of Scaled Diagonally
 *>          Dominant Matrices", LAPACK Working Note #7, for a discussion
 *>          of which matrices define their eigenvalues to high relative
diff --git a/lapack-netlib/SRC/chegs2.f b/lapack-netlib/SRC/chegs2.f
index 68d2f6625..55a895fc3 100644
--- a/lapack-netlib/SRC/chegs2.f
+++ b/lapack-netlib/SRC/chegs2.f
@@ -97,6 +97,7 @@
 *>          B is COMPLEX array, dimension (LDB,N)
 *>          The triangular factor from the Cholesky factorization of B,
 *>          as returned by CPOTRF.
+*>          B is modified by the routine but restored on exit.
 *> \endverbatim
 *>
 *> \param[in] LDB
diff --git a/lapack-netlib/SRC/chegst.f b/lapack-netlib/SRC/chegst.f
index 2f933729c..b3fdff2d5 100644
--- a/lapack-netlib/SRC/chegst.f
+++ b/lapack-netlib/SRC/chegst.f
@@ -97,6 +97,7 @@
 *>          B is COMPLEX array, dimension (LDB,N)
 *>          The triangular factor from the Cholesky factorization of B,
 *>          as returned by CPOTRF.
+*>          B is modified by the routine but restored on exit.
 *> \endverbatim
 *>
 *> \param[in] LDB
diff --git a/lapack-netlib/SRC/cherfsx.f b/lapack-netlib/SRC/cherfsx.f
index 4ed2c99f7..76cef7cd1 100644
--- a/lapack-netlib/SRC/cherfsx.f
+++ b/lapack-netlib/SRC/cherfsx.f
@@ -102,7 +102,7 @@
 *> \param[in] A
 *> \verbatim
 *>          A is COMPLEX array, dimension (LDA,N)
-*>     The symmetric matrix A.  If UPLO = 'U', the leading N-by-N
+*>     The Hermitian matrix A.  If UPLO = 'U', the leading N-by-N
 *>     upper triangular part of A contains the upper triangular
 *>     part of the matrix A, and the strictly lower triangular
 *>     part of A is not referenced.  If UPLO = 'L', the leading
@@ -270,7 +270,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -306,14 +306,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -321,9 +321,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/chesv_aa.f b/lapack-netlib/SRC/chesv_aa.f
index 470f910bc..b934e624b 100644
--- a/lapack-netlib/SRC/chesv_aa.f
+++ b/lapack-netlib/SRC/chesv_aa.f
@@ -42,7 +42,7 @@
 *> matrices.
 *>
 *> Aasen's algorithm is used to factor A as
-*>    A = U * T * U**H,  if UPLO = 'U', or
+*>    A = U**H * T * U,  if UPLO = 'U', or
 *>    A = L * T * L**H,  if UPLO = 'L',
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is Hermitian and tridiagonal. The factored form
@@ -86,7 +86,7 @@
 *>
 *>          On exit, if INFO = 0, the tridiagonal matrix T and the
 *>          multipliers used to obtain the factor U or L from the
-*>          factorization A = U*T*U**H or A = L*T*L**H as computed by
+*>          factorization A = U**H*T*U or A = L*T*L**H as computed by
 *>          CHETRF_AA.
 *> \endverbatim
 *>
@@ -230,7 +230,7 @@
          RETURN
       END IF
 *
-*     Compute the factorization A = U*T*U**H or A = L*T*L**H.
+*     Compute the factorization A = U**H*T*U or A = L*T*L**H.
 *
       CALL CHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
       IF( INFO.EQ.0 ) THEN
diff --git a/lapack-netlib/SRC/chesv_aa_2stage.f b/lapack-netlib/SRC/chesv_aa_2stage.f
index 05f6b7bb7..ab5786d57 100644
--- a/lapack-netlib/SRC/chesv_aa_2stage.f
+++ b/lapack-netlib/SRC/chesv_aa_2stage.f
@@ -43,7 +43,7 @@
 *> matrices.
 *>
 *> Aasen's 2-stage algorithm is used to factor A as
-*>    A = U * T * U**H,  if UPLO = 'U', or
+*>    A = U**H * T * U,  if UPLO = 'U', or
 *>    A = L * T * L**H,  if UPLO = 'L',
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is Hermitian and band. The matrix T is
@@ -257,7 +257,7 @@
       END IF
 *
 *
-*     Compute the factorization A = U*T*U**H or A = L*T*L**H.
+*     Compute the factorization A = U**H*T*U or A = L*T*L**H.
 *
       CALL CHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2,
      $                       WORK, LWORK, INFO )
diff --git a/lapack-netlib/SRC/chesvxx.f b/lapack-netlib/SRC/chesvxx.f
index 3f4466d41..c59e72bbf 100644
--- a/lapack-netlib/SRC/chesvxx.f
+++ b/lapack-netlib/SRC/chesvxx.f
@@ -46,7 +46,7 @@
 *>
 *>    CHESVXX uses the diagonal pivoting factorization to compute the
 *>    solution to a complex system of linear equations A * X = B, where
-*>    A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+*>    A is an N-by-N Hermitian matrix and X and B are N-by-NRHS
 *>    matrices.
 *>
 *>    If requested, both normwise and maximum componentwise error bounds
@@ -88,7 +88,7 @@
 *>       A = L * D * L**T,  if UPLO = 'L',
 *>
 *>    where U (or L) is a product of permutation and unit upper (lower)
-*>    triangular matrices, and D is symmetric and block diagonal with
+*>    triangular matrices, and D is Hermitian and block diagonal with
 *>    1-by-1 and 2-by-2 diagonal blocks.
 *>
 *>    3. If some D(i,i)=0, so that D is exactly singular, then the
@@ -161,7 +161,7 @@
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX array, dimension (LDA,N)
-*>     The symmetric matrix A.  If UPLO = 'U', the leading N-by-N
+*>     The Hermitian matrix A.  If UPLO = 'U', the leading N-by-N
 *>     upper triangular part of A contains the upper triangular
 *>     part of the matrix A, and the strictly lower triangular
 *>     part of A is not referenced.  If UPLO = 'L', the leading
@@ -378,7 +378,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -414,14 +414,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -429,9 +429,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/chetf2_rk.f b/lapack-netlib/SRC/chetf2_rk.f
index 38a0ce373..80e2f61b7 100644
--- a/lapack-netlib/SRC/chetf2_rk.f
+++ b/lapack-netlib/SRC/chetf2_rk.f
@@ -322,7 +322,7 @@
 *
 *        Factorize A as U*D*U**H using the upper triangle of A
 *
-*        Initilize the first entry of array E, where superdiagonal
+*        Initialize the first entry of array E, where superdiagonal
 *        elements of D are stored
 *
          E( 1 ) = CZERO
@@ -676,7 +676,7 @@
 *
 *        Factorize A as L*D*L**H using the lower triangle of A
 *
-*        Initilize the unused last entry of the subdiagonal array E.
+*        Initialize the unused last entry of the subdiagonal array E.
 *
          E( N ) = CZERO
 *
diff --git a/lapack-netlib/SRC/chetrd_2stage.f b/lapack-netlib/SRC/chetrd_2stage.f
index e7370a4dd..4575a5e90 100644
--- a/lapack-netlib/SRC/chetrd_2stage.f
+++ b/lapack-netlib/SRC/chetrd_2stage.f
@@ -123,23 +123,22 @@
 *>
 *> \param[out] HOUS2
 *> \verbatim
-*>          HOUS2 is COMPLEX array, dimension LHOUS2, that
-*>          store the Householder representation of the stage2
+*>          HOUS2 is COMPLEX array, dimension (LHOUS2)
+*>          Stores the Householder representation of the stage2
 *>          band to tridiagonal.
 *> \endverbatim
 *>
 *> \param[in] LHOUS2
 *> \verbatim
 *>          LHOUS2 is INTEGER
-*>          The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension)
+*>          The dimension of the array HOUS2.
 *>          If LWORK = -1, or LHOUS2=-1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS2 array, returns
 *>          this value as the first entry of the HOUS2 array, and no error
 *>          message related to LHOUS2 is issued by XERBLA.
-*>          LHOUS2 = MAX(1, dimension) where
-*>          dimension = 4*N if VECT='N'
-*>          not available now if VECT='H'
+*>          If VECT='N', LHOUS2 = max(1, 4*n);
+*>          if VECT='V', option not yet available.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -151,7 +150,7 @@
 *> \verbatim
 *>          LWORK is INTEGER
 *>          The dimension of the array WORK. LWORK = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS2=-1,
+*>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a workspace query is assumed; the routine
 *>          only calculates the optimal size of the WORK array, returns
 *>          this value as the first entry of the WORK array, and no error
diff --git a/lapack-netlib/SRC/chetrd_hb2st.F b/lapack-netlib/SRC/chetrd_hb2st.F
index 43da45640..a3d8259d3 100644
--- a/lapack-netlib/SRC/chetrd_hb2st.F
+++ b/lapack-netlib/SRC/chetrd_hb2st.F
@@ -50,9 +50,9 @@
 *  Arguments:
 *  ==========
 *
-*> \param[in] STAGE
+*> \param[in] STAGE1
 *> \verbatim
-*>          STAGE is CHARACTER*1
+*>          STAGE1 is CHARACTER*1
 *>          = 'N':  "No": to mention that the stage 1 of the reduction  
 *>                  from dense to band using the chetrd_he2hb routine
 *>                  was not called before this routine to reproduce AB. 
diff --git a/lapack-netlib/SRC/chetrd_he2hb.f b/lapack-netlib/SRC/chetrd_he2hb.f
index e334532fe..e85c1fd01 100644
--- a/lapack-netlib/SRC/chetrd_he2hb.f
+++ b/lapack-netlib/SRC/chetrd_he2hb.f
@@ -363,7 +363,7 @@
 *
 *
 *     Set the workspace of the triangular matrix T to zero once such a
-*     way everytime T is generated the upper/lower portion will be always zero  
+*     way every time T is generated the upper/lower portion will be always zero
 *   
       CALL CLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT )
 *
diff --git a/lapack-netlib/SRC/chetrf_aa.f b/lapack-netlib/SRC/chetrf_aa.f
index 2c5564893..c6f548d42 100644
--- a/lapack-netlib/SRC/chetrf_aa.f
+++ b/lapack-netlib/SRC/chetrf_aa.f
@@ -37,7 +37,7 @@
 *> CHETRF_AA computes the factorization of a complex hermitian matrix A
 *> using the Aasen's algorithm.  The form of the factorization is
 *>
-*>    A = U*T*U**H  or  A = L*T*L**H
+*>    A = U**H*T*U  or  A = L*T*L**H
 *>
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is a hermitian tridiagonal matrix.
@@ -223,7 +223,7 @@
       IF( UPPER ) THEN
 *
 *        .....................................................
-*        Factorize A as L*D*L**H using the upper triangle of A
+*        Factorize A as U**H*D*U using the upper triangle of A
 *        .....................................................
 *
 *        copy first row A(1, 1:N) into H(1:n) (stored in WORK(1:N))
@@ -256,7 +256,7 @@
      $                      A( MAX(1, J), J+1 ), LDA,
      $                      IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) )
 *
-*        Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot)
+*        Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot)
 *
          DO J2 = J+2, MIN(N, J+JB+1)
             IPIV( J2 ) = IPIV( J2 ) + J
@@ -376,7 +376,7 @@
      $                      A( J+1, MAX(1, J) ), LDA,
      $                      IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) )
 *
-*        Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot)
+*        Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot)
 *
          DO J2 = J+2, MIN(N, J+JB+1)
             IPIV( J2 ) = IPIV( J2 ) + J
diff --git a/lapack-netlib/SRC/chetrf_aa_2stage.f b/lapack-netlib/SRC/chetrf_aa_2stage.f
index ce34d73cc..d2e0e0023 100644
--- a/lapack-netlib/SRC/chetrf_aa_2stage.f
+++ b/lapack-netlib/SRC/chetrf_aa_2stage.f
@@ -38,7 +38,7 @@
 *> CHETRF_AA_2STAGE computes the factorization of a real hermitian matrix A
 *> using the Aasen's algorithm.  The form of the factorization is
 *>
-*>    A = U*T*U**T  or  A = L*T*L**T
+*>    A = U**T*T*U  or  A = L*T*L**T
 *>
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is a hermitian band matrix with the
@@ -277,7 +277,7 @@
       IF( UPPER ) THEN
 *
 *        .....................................................
-*        Factorize A as L*D*L**T using the upper triangle of A
+*        Factorize A as U**T*D*U using the upper triangle of A
 *        .....................................................
 *
          DO J = 0, NT-1
@@ -453,14 +453,17 @@ c               END IF
 *                    > Apply pivots to previous columns of L
                      CALL CSWAP( K-1, A( (J+1)*NB+1, I1 ), 1, 
      $                                A( (J+1)*NB+1, I2 ), 1 )
-*                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)               
-                     CALL CSWAP( I2-I1-1, A( I1, I1+1 ), LDA,
-     $                                    A( I1+1, I2 ), 1 )
+*                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)
+                     IF( I2.GT.(I1+1) ) THEN
+                        CALL CSWAP( I2-I1-1, A( I1, I1+1 ), LDA,
+     $                                       A( I1+1, I2 ), 1 )
+                        CALL CLACGV( I2-I1-1, A( I1+1, I2 ), 1 )
+                     END IF
                      CALL CLACGV( I2-I1, A( I1, I1+1 ), LDA )
-                     CALL CLACGV( I2-I1-1, A( I1+1, I2 ), 1 )
 *                    > Swap A(I2+1:M, I1) with A(I2+1:M, I2)
-                     CALL CSWAP( N-I2, A( I1, I2+1 ), LDA,
-     $                                 A( I2, I2+1 ), LDA ) 
+                     IF( I2.LT.N )
+     $                  CALL CSWAP( N-I2, A( I1, I2+1 ), LDA,
+     $                                    A( I2, I2+1 ), LDA ) 
 *                    > Swap A(I1, I1) with A(I2, I2)
                      PIV = A( I1, I1 )
                      A( I1, I1 ) = A( I2, I2 )
@@ -630,14 +633,17 @@ c               END IF
 *                    > Apply pivots to previous columns of L
                      CALL CSWAP( K-1, A( I1, (J+1)*NB+1 ), LDA, 
      $                                A( I2, (J+1)*NB+1 ), LDA )
-*                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)               
-                     CALL CSWAP( I2-I1-1, A( I1+1, I1 ), 1,
-     $                                    A( I2, I1+1 ), LDA )
+*                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)
+                     IF( I2.GT.(I1+1) ) THEN
+                        CALL CSWAP( I2-I1-1, A( I1+1, I1 ), 1,
+     $                                       A( I2, I1+1 ), LDA )
+                        CALL CLACGV( I2-I1-1, A( I2, I1+1 ), LDA )
+                     END IF
                      CALL CLACGV( I2-I1, A( I1+1, I1 ), 1 )
-                     CALL CLACGV( I2-I1-1, A( I2, I1+1 ), LDA )
 *                    > Swap A(I2+1:M, I1) with A(I2+1:M, I2)
-                     CALL CSWAP( N-I2, A( I2+1, I1 ), 1,
-     $                                 A( I2+1, I2 ), 1 ) 
+                     IF( I2.LT.N )
+     $                  CALL CSWAP( N-I2, A( I2+1, I1 ), 1,
+     $                                    A( I2+1, I2 ), 1 ) 
 *                    > Swap A(I1, I1) with A(I2, I2)
                      PIV = A( I1, I1 )
                      A( I1, I1 ) = A( I2, I2 )
diff --git a/lapack-netlib/SRC/chetri2.f b/lapack-netlib/SRC/chetri2.f
index 722d13008..1e18202cf 100644
--- a/lapack-netlib/SRC/chetri2.f
+++ b/lapack-netlib/SRC/chetri2.f
@@ -62,7 +62,7 @@
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX array, dimension (LDA,N)
-*>          On entry, the NB diagonal matrix D and the multipliers
+*>          On entry, the block diagonal matrix D and the multipliers
 *>          used to obtain the factor U or L as computed by CHETRF.
 *>
 *>          On exit, if INFO = 0, the (symmetric) inverse of the original
@@ -82,7 +82,7 @@
 *> \param[in] IPIV
 *> \verbatim
 *>          IPIV is INTEGER array, dimension (N)
-*>          Details of the interchanges and the NB structure of D
+*>          Details of the interchanges and the block structure of D
 *>          as determined by CHETRF.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/chetrs_aa.f b/lapack-netlib/SRC/chetrs_aa.f
index 50e5692db..877517031 100644
--- a/lapack-netlib/SRC/chetrs_aa.f
+++ b/lapack-netlib/SRC/chetrs_aa.f
@@ -37,7 +37,7 @@
 *> \verbatim
 *>
 *> CHETRS_AA solves a system of linear equations A*X = B with a complex
-*> hermitian matrix A using the factorization A = U*T*U**H or
+*> hermitian matrix A using the factorization A = U**H*T*U or
 *> A = L*T*L**H computed by CHETRF_AA.
 *> \endverbatim
 *
@@ -49,7 +49,7 @@
 *>          UPLO is CHARACTER*1
 *>          Specifies whether the details of the factorization are stored
 *>          as an upper or lower triangular matrix.
-*>          = 'U':  Upper triangular, form is A = U*T*U**H;
+*>          = 'U':  Upper triangular, form is A = U**H*T*U;
 *>          = 'L':  Lower triangular, form is A = L*T*L**H.
 *> \endverbatim
 *>
@@ -97,14 +97,16 @@
 *>          The leading dimension of the array B.  LDB >= max(1,N).
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE array, dimension (MAX(1,LWORK))
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
-*>          LWORK is INTEGER, LWORK >= MAX(1,3*N-2).
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK. LWORK >= max(1,3*N-2).
+*> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
@@ -198,24 +200,31 @@
 *
       IF( UPPER ) THEN
 *
-*        Solve A*X = B, where A = U*T*U**T.
+*        Solve A*X = B, where A = U**H*T*U.
+*
+*        1) Forward substitution with U**H
+*
+         IF( N.GT.1 ) THEN
+*
+*           Pivot, P**T * B -> B
 *
-*        P**T * B
+            K = 1
+            DO WHILE ( K.LE.N )
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+               K = K + 1
+            END DO
 *
-         K = 1
-         DO WHILE ( K.LE.N )
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $          CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-            K = K + 1
-         END DO
+*           Compute U**H \ B -> B    [ (U**H \P**T * B) ]
 *
-*        Compute (U \P**T * B) -> B    [ (U \P**T * B) ]
+            CALL CTRSM( 'L', 'U', 'C', 'U', N-1, NRHS, ONE, A( 1, 2 ),
+     $                  LDA, B( 2, 1 ), LDB)
+         END IF
 *
-         CALL CTRSM('L', 'U', 'C', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA,
-     $               B( 2, 1 ), LDB)
+*        2) Solve with triangular matrix T
 *
-*        Compute T \ B -> B   [ T \ (U \P**T * B) ]
+*        Compute T \ B -> B   [ T \ (U**H \P**T * B) ]
 *
          CALL CLACPY( 'F', 1, N, A(1, 1), LDA+1, WORK(N), 1)
          IF( N.GT.1 ) THEN
@@ -226,65 +235,82 @@
          CALL CGTSV(N, NRHS, WORK(1), WORK(N), WORK(2*N), B, LDB,
      $              INFO)
 *
-*        Compute (U**T \ B) -> B   [ U**T \ (T \ (U \P**T * B) ) ]
+*        3) Backward substitution with U
+*
+         IF( N.GT.1 ) THEN
 *
-         CALL CTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA,
-     $               B(2, 1), LDB)
+*           Compute U \ B -> B   [ U \ (T \ (U**H \P**T * B) ) ]
 *
-*        Pivot, P * B  [ P * (U**T \ (T \ (U \P**T * B) )) ]
+            CALL CTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ),
+     $                  LDA, B(2, 1), LDB)
 *
-         K = N
-         DO WHILE ( K.GE.1 )
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-            K = K - 1
-         END DO
+*           Pivot, P * B  -> B [ P * (U \ (T \ (U**H \P**T * B) )) ]
+*
+            K = N
+            DO WHILE ( K.GE.1 )
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+               K = K - 1
+            END DO
+         END IF
 *
       ELSE
 *
-*        Solve A*X = B, where A = L*T*L**T.
+*        Solve A*X = B, where A = L*T*L**H.
 *
-*        Pivot, P**T * B
+*        1) Forward substitution with L
 *
-         K = 1
-         DO WHILE ( K.LE.N )
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-            K = K + 1
-         END DO
+         IF( N.GT.1 ) THEN
+*
+*           Pivot, P**T * B -> B
+*
+            K = 1
+            DO WHILE ( K.LE.N )
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+               K = K + 1
+            END DO
 *
-*        Compute (L \P**T * B) -> B    [ (L \P**T * B) ]
+*           Compute L \ B -> B    [ (L \P**T * B) ]
+*
+            CALL CTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1),
+     $                  LDA, B(2, 1), LDB )
+         END IF
 *
-         CALL CTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1), LDA,
-     $               B(2, 1), LDB)
+*        2) Solve with triangular matrix T
 *
 *        Compute T \ B -> B   [ T \ (L \P**T * B) ]
 *
          CALL CLACPY( 'F', 1, N, A(1, 1), LDA+1, WORK(N), 1)
          IF( N.GT.1 ) THEN
-             CALL CLACPY( 'F', 1, N-1, A( 2, 1 ), LDA+1, WORK( 1 ), 1)
+             CALL CLACPY( 'F', 1, N-1, A( 2, 1 ), LDA+1, WORK( 1 ), 1 )
              CALL CLACPY( 'F', 1, N-1, A( 2, 1 ), LDA+1, WORK( 2*N ), 1)
              CALL CLACGV( N-1, WORK( 2*N ), 1 )
          END IF
          CALL CGTSV(N, NRHS, WORK(1), WORK(N), WORK(2*N), B, LDB,
      $              INFO)
 *
-*        Compute (L**T \ B) -> B   [ L**T \ (T \ (L \P**T * B) ) ]
+*        3) Backward substitution with L**H
 *
-         CALL CTRSM( 'L', 'L', 'C', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA,
-     $              B( 2, 1 ), LDB)
+         IF( N.GT.1 ) THEN
+*
+*           Compute (L**H \ B) -> B   [ L**H \ (T \ (L \P**T * B) ) ]
 *
-*        Pivot, P * B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
+            CALL CTRSM( 'L', 'L', 'C', 'U', N-1, NRHS, ONE, A( 2, 1 ),
+     $                  LDA, B( 2, 1 ), LDB )
 *
-         K = N
-         DO WHILE ( K.GE.1 )
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-            K = K - 1
-         END DO
+*           Pivot, P * B -> B  [ P * (L**H \ (T \ (L \P**T * B) )) ]
+*
+            K = N
+            DO WHILE ( K.GE.1 )
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+               K = K - 1
+            END DO
+         END IF
 *
       END IF
 *
diff --git a/lapack-netlib/SRC/chetrs_aa_2stage.f b/lapack-netlib/SRC/chetrs_aa_2stage.f
index 05d09275b..979d80a7c 100644
--- a/lapack-netlib/SRC/chetrs_aa_2stage.f
+++ b/lapack-netlib/SRC/chetrs_aa_2stage.f
@@ -38,7 +38,7 @@
 *> \verbatim
 *>
 *> CHETRS_AA_2STAGE solves a system of linear equations A*X = B with a real
-*> hermitian matrix A using the factorization A = U*T*U**T or
+*> hermitian matrix A using the factorization A = U**T*T*U or
 *> A = L*T*L**T computed by CHETRF_AA_2STAGE.
 *> \endverbatim
 *
@@ -50,7 +50,7 @@
 *>          UPLO is CHARACTER*1
 *>          Specifies whether the details of the factorization are stored
 *>          as an upper or lower triangular matrix.
-*>          = 'U':  Upper triangular, form is A = U*T*U**T;
+*>          = 'U':  Upper triangular, form is A = U**T*T*U;
 *>          = 'L':  Lower triangular, form is A = L*T*L**T.
 *> \endverbatim
 *>
@@ -210,15 +210,15 @@
 *
       IF( UPPER ) THEN
 *
-*        Solve A*X = B, where A = U*T*U**T.
+*        Solve A*X = B, where A = U**T*T*U.
 *
          IF( N.GT.NB ) THEN
 *
-*           Pivot, P**T * B
+*           Pivot, P**T * B -> B
 *
             CALL CLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 )
 *
-*           Compute (U**T \P**T * B) -> B    [ (U**T \P**T * B) ]
+*           Compute (U**T \ B) -> B    [ (U**T \P**T * B) ]
 *
             CALL CTRSM( 'L', 'U', 'C', 'U', N-NB, NRHS, ONE, A(1, NB+1),
      $                 LDA, B(NB+1, 1), LDB)
diff --git a/lapack-netlib/SRC/chseqr.f b/lapack-netlib/SRC/chseqr.f
index 34bf49249..cfcf725b2 100644
--- a/lapack-netlib/SRC/chseqr.f
+++ b/lapack-netlib/SRC/chseqr.f
@@ -69,7 +69,7 @@
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
-*>           The order of the matrix H.  N .GE. 0.
+*>           The order of the matrix H.  N >= 0.
 *> \endverbatim
 *>
 *> \param[in] ILO
@@ -86,7 +86,7 @@
 *>           set by a previous call to CGEBAL, and then passed to ZGEHRD
 *>           when the matrix output by CGEBAL is reduced to Hessenberg
 *>           form. Otherwise ILO and IHI should be set to 1 and N
-*>           respectively.  If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+*>           respectively.  If N > 0, then 1 <= ILO <= IHI <= N.
 *>           If N = 0, then ILO = 1 and IHI = 0.
 *> \endverbatim
 *>
@@ -98,17 +98,17 @@
 *>           triangular matrix T from the Schur decomposition (the
 *>           Schur form). If INFO = 0 and JOB = 'E', the contents of
 *>           H are unspecified on exit.  (The output value of H when
-*>           INFO.GT.0 is given under the description of INFO below.)
+*>           INFO > 0 is given under the description of INFO below.)
 *>
 *>           Unlike earlier versions of CHSEQR, this subroutine may
-*>           explicitly H(i,j) = 0 for i.GT.j and j = 1, 2, ... ILO-1
+*>           explicitly H(i,j) = 0 for i > j and j = 1, 2, ... ILO-1
 *>           or j = IHI+1, IHI+2, ... N.
 *> \endverbatim
 *>
 *> \param[in] LDH
 *> \verbatim
 *>          LDH is INTEGER
-*>           The leading dimension of the array H. LDH .GE. max(1,N).
+*>           The leading dimension of the array H. LDH >= max(1,N).
 *> \endverbatim
 *>
 *> \param[out] W
@@ -131,7 +131,7 @@
 *>           if INFO = 0, Z contains Q*Z.
 *>           Normally Q is the unitary matrix generated by CUNGHR
 *>           after the call to CGEHRD which formed the Hessenberg matrix
-*>           H. (The output value of Z when INFO.GT.0 is given under
+*>           H. (The output value of Z when INFO > 0 is given under
 *>           the description of INFO below.)
 *> \endverbatim
 *>
@@ -139,7 +139,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>           The leading dimension of the array Z.  if COMPZ = 'I' or
-*>           COMPZ = 'V', then LDZ.GE.MAX(1,N).  Otherwize, LDZ.GE.1.
+*>           COMPZ = 'V', then LDZ >= MAX(1,N).  Otherwise, LDZ >= 1.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -152,7 +152,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>           The dimension of the array WORK.  LWORK .GE. max(1,N)
+*>           The dimension of the array WORK.  LWORK >= max(1,N)
 *>           is sufficient and delivers very good and sometimes
 *>           optimal performance.  However, LWORK as large as 11*N
 *>           may be required for optimal performance.  A workspace
@@ -170,21 +170,21 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>             =  0:  successful exit
-*>           .LT. 0:  if INFO = -i, the i-th argument had an illegal
+*>             = 0:  successful exit
+*>             < 0:  if INFO = -i, the i-th argument had an illegal
 *>                    value
-*>           .GT. 0:  if INFO = i, CHSEQR failed to compute all of
-*>                the eigenvalues.  Elements 1:ilo-1 and i+1:n of WR
-*>                and WI contain those eigenvalues which have been
+*>             > 0:  if INFO = i, CHSEQR failed to compute all of
+*>                the eigenvalues.  Elements 1:ilo-1 and i+1:n of W
+*>                contain those eigenvalues which have been
 *>                successfully computed.  (Failures are rare.)
 *>
-*>                If INFO .GT. 0 and JOB = 'E', then on exit, the
+*>                If INFO > 0 and JOB = 'E', then on exit, the
 *>                remaining unconverged eigenvalues are the eigen-
 *>                values of the upper Hessenberg matrix rows and
 *>                columns ILO through INFO of the final, output
 *>                value of H.
 *>
-*>                If INFO .GT. 0 and JOB   = 'S', then on exit
+*>                If INFO > 0 and JOB   = 'S', then on exit
 *>
 *>           (*)  (initial value of H)*U  = U*(final value of H)
 *>
@@ -192,19 +192,19 @@
 *>                value of  H is upper Hessenberg and triangular in
 *>                rows and columns INFO+1 through IHI.
 *>
-*>                If INFO .GT. 0 and COMPZ = 'V', then on exit
+*>                If INFO > 0 and COMPZ = 'V', then on exit
 *>
 *>                  (final value of Z)  =  (initial value of Z)*U
 *>
 *>                where U is the unitary matrix in (*) (regard-
 *>                less of the value of JOB.)
 *>
-*>                If INFO .GT. 0 and COMPZ = 'I', then on exit
+*>                If INFO > 0 and COMPZ = 'I', then on exit
 *>                      (final value of Z)  = U
 *>                where U is the unitary matrix in (*) (regard-
 *>                less of the value of JOB.)
 *>
-*>                If INFO .GT. 0 and COMPZ = 'N', then Z is not
+*>                If INFO > 0 and COMPZ = 'N', then Z is not
 *>                accessed.
 *> \endverbatim
 *
@@ -244,8 +244,8 @@
 *>                      This depends on ILO, IHI and NS.  NS is the
 *>                      number of simultaneous shifts returned
 *>                      by ILAENV(ISPEC=15).  (See ISPEC=15 below.)
-*>                      The default for (IHI-ILO+1).LE.500 is NS.
-*>                      The default for (IHI-ILO+1).GT.500 is 3*NS/2.
+*>                      The default for (IHI-ILO+1) <= 500 is NS.
+*>                      The default for (IHI-ILO+1) >  500 is 3*NS/2.
 *>
 *>            ISPEC=14: Nibble crossover point. (See IPARMQ for
 *>                      details.)  Default: 14% of deflation window
@@ -323,8 +323,8 @@
       PARAMETER          ( NTINY = 11 )
 *
 *     ==== NL allocates some local workspace to help small matrices
-*     .    through a rare CLAHQR failure.  NL .GT. NTINY = 11 is
-*     .    required and NL .LE. NMIN = ILAENV(ISPEC=12,...) is recom-
+*     .    through a rare CLAHQR failure.  NL > NTINY = 11 is
+*     .    required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom-
 *     .    mended.  (The default value of NMIN is 75.)  Using NL = 49
 *     .    allows up to six simultaneous shifts and a 16-by-16
 *     .    deflation window.  ====
diff --git a/lapack-netlib/SRC/cla_gbrcond_c.f b/lapack-netlib/SRC/cla_gbrcond_c.f
index 123aee26e..c382ac210 100644
--- a/lapack-netlib/SRC/cla_gbrcond_c.f
+++ b/lapack-netlib/SRC/cla_gbrcond_c.f
@@ -132,13 +132,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is REAL array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/cla_gbrcond_x.f b/lapack-netlib/SRC/cla_gbrcond_x.f
index d04aa7fb8..46991ea14 100644
--- a/lapack-netlib/SRC/cla_gbrcond_x.f
+++ b/lapack-netlib/SRC/cla_gbrcond_x.f
@@ -125,13 +125,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is REAL array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/cla_gbrfsx_extended.f b/lapack-netlib/SRC/cla_gbrfsx_extended.f
index 888ecd4f7..9f066137b 100644
--- a/lapack-netlib/SRC/cla_gbrfsx_extended.f
+++ b/lapack-netlib/SRC/cla_gbrfsx_extended.f
@@ -65,19 +65,19 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] TRANS_TYPE
 *> \verbatim
 *>          TRANS_TYPE is INTEGER
 *>     Specifies the transposition operation on A.
-*>     The value is defined by ILATRANS(T) where T is a CHARACTER and
-*>     T    = 'N':  No transpose
+*>     The value is defined by ILATRANS(T) where T is a CHARACTER and T
+*>          = 'N':  No transpose
 *>          = 'T':  Transpose
 *>          = 'C':  Conjugate transpose
 *> \endverbatim
@@ -269,7 +269,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/cla_gercond_c.f b/lapack-netlib/SRC/cla_gercond_c.f
index aabdc0bb9..1a2e8230e 100644
--- a/lapack-netlib/SRC/cla_gercond_c.f
+++ b/lapack-netlib/SRC/cla_gercond_c.f
@@ -21,7 +21,7 @@
 *       REAL FUNCTION CLA_GERCOND_C( TRANS, N, A, LDA, AF, LDAF, IPIV, C,
 *                                    CAPPLY, INFO, WORK, RWORK )
 *
-*       .. Scalar Aguments ..
+*       .. Scalar Arguments ..
 *       CHARACTER          TRANS
 *       LOGICAL            CAPPLY
 *       INTEGER            N, LDA, LDAF, INFO
@@ -114,13 +114,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is REAL array, dimension (N).
 *>     Workspace.
@@ -147,7 +147,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
-*     .. Scalar Aguments ..
+*     .. Scalar Arguments ..
       CHARACTER          TRANS
       LOGICAL            CAPPLY
       INTEGER            N, LDA, LDAF, INFO
diff --git a/lapack-netlib/SRC/cla_gercond_x.f b/lapack-netlib/SRC/cla_gercond_x.f
index 6dce99f62..46e9b039f 100644
--- a/lapack-netlib/SRC/cla_gercond_x.f
+++ b/lapack-netlib/SRC/cla_gercond_x.f
@@ -107,13 +107,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is REAL array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/cla_gerfsx_extended.f b/lapack-netlib/SRC/cla_gerfsx_extended.f
index 2e0596334..d231733e6 100644
--- a/lapack-netlib/SRC/cla_gerfsx_extended.f
+++ b/lapack-netlib/SRC/cla_gerfsx_extended.f
@@ -65,19 +65,19 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] TRANS_TYPE
 *> \verbatim
 *>          TRANS_TYPE is INTEGER
 *>     Specifies the transposition operation on A.
-*>     The value is defined by ILATRANS(T) where T is a CHARACTER and
-*>     T    = 'N':  No transpose
+*>     The value is defined by ILATRANS(T) where T is a CHARACTER and T
+*>          = 'N':  No transpose
 *>          = 'T':  Transpose
 *>          = 'C':  Conjugate transpose
 *> \endverbatim
@@ -257,7 +257,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERRS_C is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERRS_C is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERRS_C(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/cla_hercond_c.f b/lapack-netlib/SRC/cla_hercond_c.f
index a5ebaf8a2..5f26822af 100644
--- a/lapack-netlib/SRC/cla_hercond_c.f
+++ b/lapack-netlib/SRC/cla_hercond_c.f
@@ -110,13 +110,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is REAL array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/cla_hercond_x.f b/lapack-netlib/SRC/cla_hercond_x.f
index f0004102f..91c80a668 100644
--- a/lapack-netlib/SRC/cla_hercond_x.f
+++ b/lapack-netlib/SRC/cla_hercond_x.f
@@ -103,13 +103,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is REAL array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/cla_herfsx_extended.f b/lapack-netlib/SRC/cla_herfsx_extended.f
index c69589dfa..d1aa8462c 100644
--- a/lapack-netlib/SRC/cla_herfsx_extended.f
+++ b/lapack-netlib/SRC/cla_herfsx_extended.f
@@ -66,11 +66,11 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -254,7 +254,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/cla_porcond_c.f b/lapack-netlib/SRC/cla_porcond_c.f
index 7a2bcfe63..c2356590f 100644
--- a/lapack-netlib/SRC/cla_porcond_c.f
+++ b/lapack-netlib/SRC/cla_porcond_c.f
@@ -102,13 +102,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is REAL array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/cla_porcond_x.f b/lapack-netlib/SRC/cla_porcond_x.f
index f0844ec89..a5ff3aa61 100644
--- a/lapack-netlib/SRC/cla_porcond_x.f
+++ b/lapack-netlib/SRC/cla_porcond_x.f
@@ -95,13 +95,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is REAL array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/cla_porfsx_extended.f b/lapack-netlib/SRC/cla_porfsx_extended.f
index 3a3409c9e..545bdc445 100644
--- a/lapack-netlib/SRC/cla_porfsx_extended.f
+++ b/lapack-netlib/SRC/cla_porfsx_extended.f
@@ -65,11 +65,11 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -246,7 +246,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/cla_porpvgrw.f b/lapack-netlib/SRC/cla_porpvgrw.f
index bd2e7af1c..f10299c5a 100644
--- a/lapack-netlib/SRC/cla_porpvgrw.f
+++ b/lapack-netlib/SRC/cla_porpvgrw.f
@@ -85,7 +85,7 @@
 *>     The leading dimension of the array AF.  LDAF >= max(1,N).
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is REAL array, dimension (2*N)
 *> \endverbatim
diff --git a/lapack-netlib/SRC/cla_syrcond_c.f b/lapack-netlib/SRC/cla_syrcond_c.f
index fc52bf23b..e59e83aa6 100644
--- a/lapack-netlib/SRC/cla_syrcond_c.f
+++ b/lapack-netlib/SRC/cla_syrcond_c.f
@@ -110,13 +110,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is REAL array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/cla_syrcond_x.f b/lapack-netlib/SRC/cla_syrcond_x.f
index f8fb566e7..3edf58f83 100644
--- a/lapack-netlib/SRC/cla_syrcond_x.f
+++ b/lapack-netlib/SRC/cla_syrcond_x.f
@@ -103,13 +103,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is REAL array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/cla_syrfsx_extended.f b/lapack-netlib/SRC/cla_syrfsx_extended.f
index 5d2fa0cbb..92243abcb 100644
--- a/lapack-netlib/SRC/cla_syrfsx_extended.f
+++ b/lapack-netlib/SRC/cla_syrfsx_extended.f
@@ -66,11 +66,11 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -254,7 +254,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/cla_syrpvgrw.f b/lapack-netlib/SRC/cla_syrpvgrw.f
index ccea462c7..15e55ea7d 100644
--- a/lapack-netlib/SRC/cla_syrpvgrw.f
+++ b/lapack-netlib/SRC/cla_syrpvgrw.f
@@ -102,7 +102,7 @@
 *>     as determined by CSYTRF.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is REAL array, dimension (2*N)
 *> \endverbatim
diff --git a/lapack-netlib/SRC/cla_wwaddw.f b/lapack-netlib/SRC/cla_wwaddw.f
index 9267c6df2..08e45ac79 100644
--- a/lapack-netlib/SRC/cla_wwaddw.f
+++ b/lapack-netlib/SRC/cla_wwaddw.f
@@ -36,7 +36,7 @@
 *>    CLA_WWADDW adds a vector W into a doubled-single vector (X, Y).
 *>
 *>    This works for all extant IBM's hex and binary floating point
-*>    arithmetics, but not for decimal.
+*>    arithmetic, but not for decimal.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/clahef_aa.f b/lapack-netlib/SRC/clahef_aa.f
index 88bc3d216..934aa92f9 100644
--- a/lapack-netlib/SRC/clahef_aa.f
+++ b/lapack-netlib/SRC/clahef_aa.f
@@ -288,8 +288,9 @@
 *
 *              Swap A(I1, I2+1:N) with A(I2, I2+1:N)
 *
-               CALL CSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA,
-     $                           A( J1+I2-1, I2+1 ), LDA )
+               IF( I2.LT.M )
+     $            CALL CSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA,
+     $                              A( J1+I2-1, I2+1 ), LDA )
 *
 *              Swap A(I1, I1) with A(I2,I2)
 *
@@ -329,13 +330,15 @@
 *           Compute L(J+2, J+1) = WORK( 3:N ) / T(J, J+1),
 *            where A(J, J+1) = T(J, J+1) and A(J+2:N, J) = L(J+2:N, J+1)
 *
-            IF( A( K, J+1 ).NE.ZERO ) THEN
-               ALPHA = ONE / A( K, J+1 )
-               CALL CCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA )
-               CALL CSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA )
-            ELSE
-               CALL CLASET( 'Full', 1, M-J-1, ZERO, ZERO,
-     $                      A( K, J+2 ), LDA)
+            IF( J.LT.(M-1) ) THEN
+               IF( A( K, J+1 ).NE.ZERO ) THEN
+                  ALPHA = ONE / A( K, J+1 )
+                  CALL CCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA )
+                  CALL CSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA )
+               ELSE
+                  CALL CLASET( 'Full', 1, M-J-1, ZERO, ZERO,
+     $                         A( K, J+2 ), LDA)
+               END IF
             END IF
          END IF
          J = J + 1
@@ -440,8 +443,9 @@
 *
 *              Swap A(I2+1:N, I1) with A(I2+1:N, I2)
 *
-               CALL CSWAP( M-I2, A( I2+1, J1+I1-1 ), 1,
-     $                           A( I2+1, J1+I2-1 ), 1 )
+               IF( I2.LT.M )
+     $            CALL CSWAP( M-I2, A( I2+1, J1+I1-1 ), 1,
+     $                              A( I2+1, J1+I2-1 ), 1 )
 *
 *              Swap A(I1, I1) with A(I2, I2)
 *
@@ -481,13 +485,15 @@
 *           Compute L(J+2, J+1) = WORK( 3:N ) / T(J, J+1),
 *            where A(J, J+1) = T(J, J+1) and A(J+2:N, J) = L(J+2:N, J+1)
 *
-            IF( A( J+1, K ).NE.ZERO ) THEN
-               ALPHA = ONE / A( J+1, K )
-               CALL CCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 )
-               CALL CSCAL( M-J-1, ALPHA, A( J+2, K ), 1 )
-            ELSE
-               CALL CLASET( 'Full', M-J-1, 1, ZERO, ZERO,
-     $                      A( J+2, K ), LDA )
+            IF( J.LT.(M-1) ) THEN
+               IF( A( J+1, K ).NE.ZERO ) THEN
+                  ALPHA = ONE / A( J+1, K )
+                  CALL CCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 )
+                  CALL CSCAL( M-J-1, ALPHA, A( J+2, K ), 1 )
+               ELSE
+                  CALL CLASET( 'Full', M-J-1, 1, ZERO, ZERO,
+     $                         A( J+2, K ), LDA )
+               END IF
             END IF
          END IF
          J = J + 1
diff --git a/lapack-netlib/SRC/clahef_rk.f b/lapack-netlib/SRC/clahef_rk.f
index 4d9dfbe8e..cc4603e9b 100644
--- a/lapack-netlib/SRC/clahef_rk.f
+++ b/lapack-netlib/SRC/clahef_rk.f
@@ -331,7 +331,7 @@
 *        of A and working backwards, and compute the matrix W = U12*D
 *        for use in updating A11 (note that conjg(W) is actually stored)
 *
-*        Initilize the first entry of array E, where superdiagonal
+*        Initialize the first entry of array E, where superdiagonal
 *        elements of D are stored
 *
          E( 1 ) = CZERO
@@ -789,7 +789,7 @@
 *        of A and working forwards, and compute the matrix W = L21*D
 *        for use in updating A22 (note that conjg(W) is actually stored)
 *
-*        Initilize the unused last entry of the subdiagonal array E.
+*        Initialize the unused last entry of the subdiagonal array E.
 *
          E( N ) = CZERO
 *
diff --git a/lapack-netlib/SRC/clahqr.f b/lapack-netlib/SRC/clahqr.f
index de2b3938b..ef50b5a56 100644
--- a/lapack-netlib/SRC/clahqr.f
+++ b/lapack-netlib/SRC/clahqr.f
@@ -138,26 +138,26 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>           =   0: successful exit
-*>          .GT. 0: if INFO = i, CLAHQR failed to compute all the
+*>           = 0:  successful exit
+*>           > 0:  if INFO = i, CLAHQR failed to compute all the
 *>                  eigenvalues ILO to IHI in a total of 30 iterations
 *>                  per eigenvalue; elements i+1:ihi of W contain
 *>                  those eigenvalues which have been successfully
 *>                  computed.
 *>
-*>                  If INFO .GT. 0 and WANTT is .FALSE., then on exit,
+*>                  If INFO > 0 and WANTT is .FALSE., then on exit,
 *>                  the remaining unconverged eigenvalues are the
 *>                  eigenvalues of the upper Hessenberg matrix
-*>                  rows and columns ILO thorugh INFO of the final,
+*>                  rows and columns ILO through INFO of the final,
 *>                  output value of H.
 *>
-*>                  If INFO .GT. 0 and WANTT is .TRUE., then on exit
+*>                  If INFO > 0 and WANTT is .TRUE., then on exit
 *>          (*)       (initial value of H)*U  = U*(final value of H)
-*>                  where U is an orthognal matrix.    The final
+*>                  where U is an orthogonal matrix.    The final
 *>                  value of H is upper Hessenberg and triangular in
 *>                  rows and columns INFO+1 through IHI.
 *>
-*>                  If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+*>                  If INFO > 0 and WANTZ is .TRUE., then on exit
 *>                      (final value of Z)  = (initial value of Z)*U
 *>                  where U is the orthogonal matrix in (*)
 *>                  (regardless of the value of WANTT.)
diff --git a/lapack-netlib/SRC/clamswlq.f b/lapack-netlib/SRC/clamswlq.f
index f2f9ab7f9..f6909b666 100644
--- a/lapack-netlib/SRC/clamswlq.f
+++ b/lapack-netlib/SRC/clamswlq.f
@@ -1,3 +1,4 @@
+*> \brief \b CLAMSWLQ
 *
 *  Definition:
 *  ===========
diff --git a/lapack-netlib/SRC/clamtsqr.f b/lapack-netlib/SRC/clamtsqr.f
index 77d09a573..c71e4aa7d 100644
--- a/lapack-netlib/SRC/clamtsqr.f
+++ b/lapack-netlib/SRC/clamtsqr.f
@@ -1,3 +1,4 @@
+*> \brief \b CLAMTSQR
 *
 *  Definition:
 *  ===========
diff --git a/lapack-netlib/SRC/clangb.f b/lapack-netlib/SRC/clangb.f
index 14a163ea7..9818360fe 100644
--- a/lapack-netlib/SRC/clangb.f
+++ b/lapack-netlib/SRC/clangb.f
@@ -130,6 +130,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM
       INTEGER            KL, KU, LDAB, N
@@ -147,14 +148,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J, K, L
-      REAL               SCALE, SUM, VALUE, TEMP
+      REAL               SUM, VALUE, TEMP
+*     ..
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLASSQ
+      EXTERNAL           CLASSQ, SCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN, SQRT
@@ -207,15 +211,22 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
-         SCALE = ZERO
-         SUM = ONE
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
          DO 90 J = 1, N
             L = MAX( 1, J-KU )
             K = KU + 1 - J + L
-            CALL CLASSQ( MIN( N, J+KL )-L+1, AB( K, J ), 1, SCALE, SUM )
+            COLSSQ( 1 ) = ZERO
+            COLSSQ( 2 ) = ONE
+            CALL CLASSQ( MIN( N, J+KL )-L+1, AB( K, J ), 1,
+     $                   COLSSQ( 1 ), COLSSQ( 2 ) )
+            CALL SCOMBSSQ( SSQ, COLSSQ )
    90    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       CLANGB = VALUE
diff --git a/lapack-netlib/SRC/clange.f b/lapack-netlib/SRC/clange.f
index 50f705a18..00895c8bc 100644
--- a/lapack-netlib/SRC/clange.f
+++ b/lapack-netlib/SRC/clange.f
@@ -120,6 +120,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM
       INTEGER            LDA, M, N
@@ -137,14 +138,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J
-      REAL               SCALE, SUM, VALUE, TEMP
+      REAL               SUM, VALUE, TEMP
+*     ..
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLASSQ
+      EXTERNAL           CLASSQ, SCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MIN, SQRT
@@ -196,13 +200,19 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
-         SCALE = ZERO
-         SUM = ONE
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
          DO 90 J = 1, N
-            CALL CLASSQ( M, A( 1, J ), 1, SCALE, SUM )
+            COLSSQ( 1 ) = ZERO
+            COLSSQ( 2 ) = ONE
+            CALL CLASSQ( M, A( 1, J ), 1, COLSSQ( 1 ), COLSSQ( 2 ) )
+            CALL SCOMBSSQ( SSQ, COLSSQ )
    90    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       CLANGE = VALUE
diff --git a/lapack-netlib/SRC/clanhb.f b/lapack-netlib/SRC/clanhb.f
index 2b034b19b..f78de23df 100644
--- a/lapack-netlib/SRC/clanhb.f
+++ b/lapack-netlib/SRC/clanhb.f
@@ -137,6 +137,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            K, LDAB, N
@@ -154,14 +155,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J, L
-      REAL               ABSA, SCALE, SUM, VALUE
+      REAL               ABSA, SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLASSQ
+      EXTERNAL           CLASSQ, SCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN, REAL, SQRT
@@ -233,39 +237,57 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          IF( K.GT.0 ) THEN
             IF( LSAME( UPLO, 'U' ) ) THEN
                DO 110 J = 2, N
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
                   CALL CLASSQ( MIN( J-1, K ), AB( MAX( K+2-J, 1 ), J ),
-     $                         1, SCALE, SUM )
+     $                         1, COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   110          CONTINUE
                L = K + 1
             ELSE
                DO 120 J = 1, N - 1
-                  CALL CLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE,
-     $                         SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL CLASSQ( MIN( N-J, K ), AB( 2, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   120          CONTINUE
                L = 1
             END IF
-            SUM = 2*SUM
+            SSQ( 2 ) = 2*SSQ( 2 )
          ELSE
             L = 1
          END IF
+*
+*        Sum diagonal
+*
+         COLSSQ( 1 ) = ZERO
+         COLSSQ( 2 ) = ONE
          DO 130 J = 1, N
             IF( REAL( AB( L, J ) ).NE.ZERO ) THEN
                ABSA = ABS( REAL( AB( L, J ) ) )
-               IF( SCALE.LT.ABSA ) THEN
-                  SUM = ONE + SUM*( SCALE / ABSA )**2
-                  SCALE = ABSA
+               IF( COLSSQ( 1 ).LT.ABSA ) THEN
+                  COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2
+                  COLSSQ( 1 ) = ABSA
                ELSE
-                  SUM = SUM + ( ABSA / SCALE )**2
+                  COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2
                END IF
             END IF
   130    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         CALL SCOMBSSQ( SSQ, COLSSQ )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       CLANHB = VALUE
diff --git a/lapack-netlib/SRC/clanhe.f b/lapack-netlib/SRC/clanhe.f
index 101d778eb..33d6c8b01 100644
--- a/lapack-netlib/SRC/clanhe.f
+++ b/lapack-netlib/SRC/clanhe.f
@@ -129,6 +129,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            LDA, N
@@ -146,14 +147,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J
-      REAL               ABSA, SCALE, SUM, VALUE
+      REAL               ABSA, SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLASSQ
+      EXTERNAL           CLASSQ, SCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, REAL, SQRT
@@ -223,31 +227,48 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          IF( LSAME( UPLO, 'U' ) ) THEN
             DO 110 J = 2, N
-               CALL CLASSQ( J-1, A( 1, J ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL CLASSQ( J-1, A( 1, J ), 1,
+     $                      COLSSQ( 1 ), COLSSQ( 2 ) )
+               CALL SCOMBSSQ( SSQ, COLSSQ )
   110       CONTINUE
          ELSE
             DO 120 J = 1, N - 1
-               CALL CLASSQ( N-J, A( J+1, J ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL CLASSQ( N-J, A( J+1, J ), 1,
+     $                      COLSSQ( 1 ), COLSSQ( 2 ) )
+               CALL SCOMBSSQ( SSQ, COLSSQ )
   120       CONTINUE
          END IF
-         SUM = 2*SUM
+         SSQ( 2 ) = 2*SSQ( 2 )
+*
+*        Sum diagonal
+*
          DO 130 I = 1, N
             IF( REAL( A( I, I ) ).NE.ZERO ) THEN
                ABSA = ABS( REAL( A( I, I ) ) )
-               IF( SCALE.LT.ABSA ) THEN
-                  SUM = ONE + SUM*( SCALE / ABSA )**2
-                  SCALE = ABSA
+               IF( SSQ( 1 ).LT.ABSA ) THEN
+                  SSQ( 2 ) = ONE + SSQ( 2 )*( SSQ( 1 ) / ABSA )**2
+                  SSQ( 1 ) = ABSA
                ELSE
-                  SUM = SUM + ( ABSA / SCALE )**2
+                  SSQ( 2 ) = SSQ( 2 ) + ( ABSA / SSQ( 1 ) )**2
                END IF
             END IF
   130    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       CLANHE = VALUE
diff --git a/lapack-netlib/SRC/clanhp.f b/lapack-netlib/SRC/clanhp.f
index c8927d503..e0e23abc7 100644
--- a/lapack-netlib/SRC/clanhp.f
+++ b/lapack-netlib/SRC/clanhp.f
@@ -122,6 +122,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            N
@@ -139,14 +140,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J, K
-      REAL               ABSA, SCALE, SUM, VALUE
+      REAL               ABSA, SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLASSQ
+      EXTERNAL           CLASSQ, SCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, REAL, SQRT
@@ -225,31 +229,48 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          K = 2
          IF( LSAME( UPLO, 'U' ) ) THEN
             DO 110 J = 2, N
-               CALL CLASSQ( J-1, AP( K ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL CLASSQ( J-1, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) )
+               CALL SCOMBSSQ( SSQ, COLSSQ )
                K = K + J
   110       CONTINUE
          ELSE
             DO 120 J = 1, N - 1
-               CALL CLASSQ( N-J, AP( K ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL CLASSQ( N-J, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) )
+               CALL SCOMBSSQ( SSQ, COLSSQ )
                K = K + N - J + 1
   120       CONTINUE
          END IF
-         SUM = 2*SUM
+         SSQ( 2 ) = 2*SSQ( 2 )
+*
+*        Sum diagonal
+*
          K = 1
+         COLSSQ( 1 ) = ZERO
+         COLSSQ( 2 ) = ONE
          DO 130 I = 1, N
             IF( REAL( AP( K ) ).NE.ZERO ) THEN
                ABSA = ABS( REAL( AP( K ) ) )
-               IF( SCALE.LT.ABSA ) THEN
-                  SUM = ONE + SUM*( SCALE / ABSA )**2
-                  SCALE = ABSA
+               IF( COLSSQ( 1 ).LT.ABSA ) THEN
+                  COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2
+                  COLSSQ( 1 ) = ABSA
                ELSE
-                  SUM = SUM + ( ABSA / SCALE )**2
+                  COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2
                END IF
             END IF
             IF( LSAME( UPLO, 'U' ) ) THEN
@@ -258,7 +279,8 @@
                K = K + N - I + 1
             END IF
   130    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         CALL SCOMBSSQ( SSQ, COLSSQ )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       CLANHP = VALUE
diff --git a/lapack-netlib/SRC/clanhs.f b/lapack-netlib/SRC/clanhs.f
index 35623b73d..661b4f901 100644
--- a/lapack-netlib/SRC/clanhs.f
+++ b/lapack-netlib/SRC/clanhs.f
@@ -114,6 +114,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM
       INTEGER            LDA, N
@@ -131,14 +132,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J
-      REAL               SCALE, SUM, VALUE
+      REAL               SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLASSQ
+      EXTERNAL           CLASSQ, SCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MIN, SQRT
@@ -190,13 +194,20 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
-         SCALE = ZERO
-         SUM = ONE
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
          DO 90 J = 1, N
-            CALL CLASSQ( MIN( N, J+1 ), A( 1, J ), 1, SCALE, SUM )
+            COLSSQ( 1 ) = ZERO
+            COLSSQ( 2 ) = ONE
+            CALL CLASSQ( MIN( N, J+1 ), A( 1, J ), 1,
+     $                   COLSSQ( 1 ), COLSSQ( 2 ) )
+            CALL SCOMBSSQ( SSQ, COLSSQ )
    90    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       CLANHS = VALUE
diff --git a/lapack-netlib/SRC/clansb.f b/lapack-netlib/SRC/clansb.f
index fbc50674c..1085fd880 100644
--- a/lapack-netlib/SRC/clansb.f
+++ b/lapack-netlib/SRC/clansb.f
@@ -135,6 +135,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            K, LDAB, N
@@ -152,14 +153,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J, L
-      REAL               ABSA, SCALE, SUM, VALUE
+      REAL               ABSA, SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLASSQ
+      EXTERNAL           CLASSQ, SCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN, SQRT
@@ -227,29 +231,47 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          IF( K.GT.0 ) THEN
             IF( LSAME( UPLO, 'U' ) ) THEN
                DO 110 J = 2, N
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
                   CALL CLASSQ( MIN( J-1, K ), AB( MAX( K+2-J, 1 ), J ),
-     $                         1, SCALE, SUM )
+     $                         1, COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   110          CONTINUE
                L = K + 1
             ELSE
                DO 120 J = 1, N - 1
-                  CALL CLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE,
-     $                         SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL CLASSQ( MIN( N-J, K ), AB( 2, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   120          CONTINUE
                L = 1
             END IF
-            SUM = 2*SUM
+            SSQ( 2 ) = 2*SSQ( 2 )
          ELSE
             L = 1
          END IF
-         CALL CLASSQ( N, AB( L, 1 ), LDAB, SCALE, SUM )
-         VALUE = SCALE*SQRT( SUM )
+*
+*        Sum diagonal
+*
+         COLSSQ( 1 ) = ZERO
+         COLSSQ( 2 ) = ONE
+         CALL CLASSQ( N, AB( L, 1 ), LDAB, COLSSQ( 1 ), COLSSQ( 2 ) )
+         CALL SCOMBSSQ( SSQ, COLSSQ )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       CLANSB = VALUE
diff --git a/lapack-netlib/SRC/clansp.f b/lapack-netlib/SRC/clansp.f
index fd64366c6..628dc0a75 100644
--- a/lapack-netlib/SRC/clansp.f
+++ b/lapack-netlib/SRC/clansp.f
@@ -120,6 +120,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            N
@@ -137,14 +138,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J, K
-      REAL               ABSA, SCALE, SUM, VALUE
+      REAL               ABSA, SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLASSQ
+      EXTERNAL           CLASSQ, SCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, AIMAG, REAL, SQRT
@@ -219,40 +223,57 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          K = 2
          IF( LSAME( UPLO, 'U' ) ) THEN
             DO 110 J = 2, N
-               CALL CLASSQ( J-1, AP( K ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL CLASSQ( J-1, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) )
+               CALL SCOMBSSQ( SSQ, COLSSQ )
                K = K + J
   110       CONTINUE
          ELSE
             DO 120 J = 1, N - 1
-               CALL CLASSQ( N-J, AP( K ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL CLASSQ( N-J, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) )
+               CALL SCOMBSSQ( SSQ, COLSSQ )
                K = K + N - J + 1
   120       CONTINUE
          END IF
-         SUM = 2*SUM
+         SSQ( 2 ) = 2*SSQ( 2 )
+*
+*        Sum diagonal
+*
          K = 1
+         COLSSQ( 1 ) = ZERO
+         COLSSQ( 2 ) = ONE
          DO 130 I = 1, N
             IF( REAL( AP( K ) ).NE.ZERO ) THEN
                ABSA = ABS( REAL( AP( K ) ) )
-               IF( SCALE.LT.ABSA ) THEN
-                  SUM = ONE + SUM*( SCALE / ABSA )**2
-                  SCALE = ABSA
+               IF( COLSSQ( 1 ).LT.ABSA ) THEN
+                  COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2
+                  COLSSQ( 1 ) = ABSA
                ELSE
-                  SUM = SUM + ( ABSA / SCALE )**2
+                  COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2
                END IF
             END IF
             IF( AIMAG( AP( K ) ).NE.ZERO ) THEN
                ABSA = ABS( AIMAG( AP( K ) ) )
-               IF( SCALE.LT.ABSA ) THEN
-                  SUM = ONE + SUM*( SCALE / ABSA )**2
-                  SCALE = ABSA
+               IF( COLSSQ( 1 ).LT.ABSA ) THEN
+                  COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2
+                  COLSSQ( 1 ) = ABSA
                ELSE
-                  SUM = SUM + ( ABSA / SCALE )**2
+                  COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2
                END IF
             END IF
             IF( LSAME( UPLO, 'U' ) ) THEN
@@ -261,7 +282,8 @@
                K = K + N - I + 1
             END IF
   130    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         CALL SCOMBSSQ( SSQ, COLSSQ )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       CLANSP = VALUE
diff --git a/lapack-netlib/SRC/clansy.f b/lapack-netlib/SRC/clansy.f
index 3aa787410..537fb7ba9 100644
--- a/lapack-netlib/SRC/clansy.f
+++ b/lapack-netlib/SRC/clansy.f
@@ -128,6 +128,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            LDA, N
@@ -145,14 +146,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J
-      REAL               ABSA, SCALE, SUM, VALUE
+      REAL               ABSA, SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLASSQ
+      EXTERNAL           CLASSQ, SCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, SQRT
@@ -218,21 +222,39 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          IF( LSAME( UPLO, 'U' ) ) THEN
             DO 110 J = 2, N
-               CALL CLASSQ( J-1, A( 1, J ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL CLASSQ( J-1, A( 1, J ), 1, COLSSQ(1), COLSSQ(2) )
+               CALL SCOMBSSQ( SSQ, COLSSQ )
   110       CONTINUE
          ELSE
             DO 120 J = 1, N - 1
-               CALL CLASSQ( N-J, A( J+1, J ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL CLASSQ( N-J, A( J+1, J ), 1, COLSSQ(1), COLSSQ(2) )
+               CALL SCOMBSSQ( SSQ, COLSSQ )
   120       CONTINUE
          END IF
-         SUM = 2*SUM
-         CALL CLASSQ( N, A, LDA+1, SCALE, SUM )
-         VALUE = SCALE*SQRT( SUM )
+         SSQ( 2 ) = 2*SSQ( 2 )
+*
+*        Sum diagonal
+*
+         COLSSQ( 1 ) = ZERO
+         COLSSQ( 2 ) = ONE
+         CALL CLASSQ( N, A, LDA+1, COLSSQ( 1 ), COLSSQ( 2 ) )
+         CALL SCOMBSSQ( SSQ, COLSSQ )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       CLANSY = VALUE
diff --git a/lapack-netlib/SRC/clantb.f b/lapack-netlib/SRC/clantb.f
index 4b4361c79..8066d0ef6 100644
--- a/lapack-netlib/SRC/clantb.f
+++ b/lapack-netlib/SRC/clantb.f
@@ -146,6 +146,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          DIAG, NORM, UPLO
       INTEGER            K, LDAB, N
@@ -164,14 +165,17 @@
 *     .. Local Scalars ..
       LOGICAL            UDIAG
       INTEGER            I, J, L
-      REAL               SCALE, SUM, VALUE
+      REAL               SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLASSQ
+      EXTERNAL           CLASSQ, SCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN, SQRT
@@ -313,46 +317,61 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
          IF( LSAME( UPLO, 'U' ) ) THEN
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = N
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = N
                IF( K.GT.0 ) THEN
                   DO 280 J = 2, N
+                     COLSSQ( 1 ) = ZERO
+                     COLSSQ( 2 ) = ONE
                      CALL CLASSQ( MIN( J-1, K ),
-     $                            AB( MAX( K+2-J, 1 ), J ), 1, SCALE,
-     $                            SUM )
+     $                            AB( MAX( K+2-J, 1 ), J ), 1,
+     $                            COLSSQ( 1 ), COLSSQ( 2 ) )
+                     CALL SCOMBSSQ( SSQ, COLSSQ )
   280             CONTINUE
                END IF
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                DO 290 J = 1, N
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
                   CALL CLASSQ( MIN( J, K+1 ), AB( MAX( K+2-J, 1 ), J ),
-     $                         1, SCALE, SUM )
+     $                         1, COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   290          CONTINUE
             END IF
          ELSE
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = N
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = N
                IF( K.GT.0 ) THEN
                   DO 300 J = 1, N - 1
-                     CALL CLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE,
-     $                            SUM )
+                     COLSSQ( 1 ) = ZERO
+                     COLSSQ( 2 ) = ONE
+                     CALL CLASSQ( MIN( N-J, K ), AB( 2, J ), 1,
+     $                            COLSSQ( 1 ), COLSSQ( 2 ) )
+                     CALL SCOMBSSQ( SSQ, COLSSQ )
   300             CONTINUE
                END IF
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                DO 310 J = 1, N
-                  CALL CLASSQ( MIN( N-J+1, K+1 ), AB( 1, J ), 1, SCALE,
-     $                         SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL CLASSQ( MIN( N-J+1, K+1 ), AB( 1, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   310          CONTINUE
             END IF
          END IF
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       CLANTB = VALUE
diff --git a/lapack-netlib/SRC/clantp.f b/lapack-netlib/SRC/clantp.f
index 148ac5436..b0c48eb46 100644
--- a/lapack-netlib/SRC/clantp.f
+++ b/lapack-netlib/SRC/clantp.f
@@ -130,6 +130,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          DIAG, NORM, UPLO
       INTEGER            N
@@ -148,14 +149,17 @@
 *     .. Local Scalars ..
       LOGICAL            UDIAG
       INTEGER            I, J, K
-      REAL               SCALE, SUM, VALUE
+      REAL               SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLASSQ
+      EXTERNAL           CLASSQ, SCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, SQRT
@@ -308,45 +312,64 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
          IF( LSAME( UPLO, 'U' ) ) THEN
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = N
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = N
                K = 2
                DO 280 J = 2, N
-                  CALL CLASSQ( J-1, AP( K ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL CLASSQ( J-1, AP( K ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
                   K = K + J
   280          CONTINUE
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                K = 1
                DO 290 J = 1, N
-                  CALL CLASSQ( J, AP( K ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL CLASSQ( J, AP( K ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
                   K = K + J
   290          CONTINUE
             END IF
          ELSE
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = N
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = N
                K = 2
                DO 300 J = 1, N - 1
-                  CALL CLASSQ( N-J, AP( K ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL CLASSQ( N-J, AP( K ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
                   K = K + N - J + 1
   300          CONTINUE
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                K = 1
                DO 310 J = 1, N
-                  CALL CLASSQ( N-J+1, AP( K ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL CLASSQ( N-J+1, AP( K ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
                   K = K + N - J + 1
   310          CONTINUE
             END IF
          END IF
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       CLANTP = VALUE
diff --git a/lapack-netlib/SRC/clantr.f b/lapack-netlib/SRC/clantr.f
index 4e1843d3d..3b361cc97 100644
--- a/lapack-netlib/SRC/clantr.f
+++ b/lapack-netlib/SRC/clantr.f
@@ -147,6 +147,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          DIAG, NORM, UPLO
       INTEGER            LDA, M, N
@@ -165,14 +166,17 @@
 *     .. Local Scalars ..
       LOGICAL            UDIAG
       INTEGER            I, J
-      REAL               SCALE, SUM, VALUE
+      REAL               SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CLASSQ
+      EXTERNAL           CLASSQ, SCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MIN, SQRT
@@ -283,7 +287,7 @@
             END IF
          ELSE
             IF( LSAME( DIAG, 'U' ) ) THEN
-               DO 210 I = 1, N
+               DO 210 I = 1, MIN( M, N )
                   WORK( I ) = ONE
   210          CONTINUE
                DO 220 I = N + 1, M
@@ -313,38 +317,56 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
          IF( LSAME( UPLO, 'U' ) ) THEN
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = MIN( M, N )
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = MIN( M, N )
                DO 290 J = 2, N
-                  CALL CLASSQ( MIN( M, J-1 ), A( 1, J ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL CLASSQ( MIN( M, J-1 ), A( 1, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   290          CONTINUE
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                DO 300 J = 1, N
-                  CALL CLASSQ( MIN( M, J ), A( 1, J ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL CLASSQ( MIN( M, J ), A( 1, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   300          CONTINUE
             END IF
          ELSE
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = MIN( M, N )
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = MIN( M, N )
                DO 310 J = 1, N
-                  CALL CLASSQ( M-J, A( MIN( M, J+1 ), J ), 1, SCALE,
-     $                         SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL CLASSQ( M-J, A( MIN( M, J+1 ), J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   310          CONTINUE
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                DO 320 J = 1, N
-                  CALL CLASSQ( M-J+1, A( J, J ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL CLASSQ( M-J+1, A( J, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   320          CONTINUE
             END IF
          END IF
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       CLANTR = VALUE
diff --git a/lapack-netlib/SRC/claqps.f b/lapack-netlib/SRC/claqps.f
index f47e852a0..d0b7efcd5 100644
--- a/lapack-netlib/SRC/claqps.f
+++ b/lapack-netlib/SRC/claqps.f
@@ -127,7 +127,7 @@
 *> \param[in,out] AUXV
 *> \verbatim
 *>          AUXV is COMPLEX array, dimension (NB)
-*>          Auxiliar vector.
+*>          Auxiliary vector.
 *> \endverbatim
 *>
 *> \param[in,out] F
diff --git a/lapack-netlib/SRC/claqr0.f b/lapack-netlib/SRC/claqr0.f
index b61c9f1e9..2f0ea20db 100644
--- a/lapack-netlib/SRC/claqr0.f
+++ b/lapack-netlib/SRC/claqr0.f
@@ -66,7 +66,7 @@
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
-*>           The order of the matrix H.  N .GE. 0.
+*>           The order of the matrix H.  N >= 0.
 *> \endverbatim
 *>
 *> \param[in] ILO
@@ -78,12 +78,12 @@
 *> \verbatim
 *>          IHI is INTEGER
 *>           It is assumed that H is already upper triangular in rows
-*>           and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1,
+*>           and columns 1:ILO-1 and IHI+1:N and, if ILO > 1,
 *>           H(ILO,ILO-1) is zero. ILO and IHI are normally set by a
 *>           previous call to CGEBAL, and then passed to CGEHRD when the
 *>           matrix output by CGEBAL is reduced to Hessenberg form.
 *>           Otherwise, ILO and IHI should be set to 1 and N,
-*>           respectively.  If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+*>           respectively.  If N > 0, then 1 <= ILO <= IHI <= N.
 *>           If N = 0, then ILO = 1 and IHI = 0.
 *> \endverbatim
 *>
@@ -95,17 +95,17 @@
 *>           contains the upper triangular matrix T from the Schur
 *>           decomposition (the Schur form). If INFO = 0 and WANT is
 *>           .FALSE., then the contents of H are unspecified on exit.
-*>           (The output value of H when INFO.GT.0 is given under the
+*>           (The output value of H when INFO > 0 is given under the
 *>           description of INFO below.)
 *>
-*>           This subroutine may explicitly set H(i,j) = 0 for i.GT.j and
+*>           This subroutine may explicitly set H(i,j) = 0 for i > j and
 *>           j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N.
 *> \endverbatim
 *>
 *> \param[in] LDH
 *> \verbatim
 *>          LDH is INTEGER
-*>           The leading dimension of the array H. LDH .GE. max(1,N).
+*>           The leading dimension of the array H. LDH >= max(1,N).
 *> \endverbatim
 *>
 *> \param[out] W
@@ -127,7 +127,7 @@
 *>          IHIZ is INTEGER
 *>           Specify the rows of Z to which transformations must be
 *>           applied if WANTZ is .TRUE..
-*>           1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N.
+*>           1 <= ILOZ <= ILO; IHI <= IHIZ <= N.
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -137,7 +137,7 @@
 *>           If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is
 *>           replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the
 *>           orthogonal Schur factor of H(ILO:IHI,ILO:IHI).
-*>           (The output value of Z when INFO.GT.0 is given under
+*>           (The output value of Z when INFO > 0 is given under
 *>           the description of INFO below.)
 *> \endverbatim
 *>
@@ -145,7 +145,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>           The leading dimension of the array Z.  if WANTZ is .TRUE.
-*>           then LDZ.GE.MAX(1,IHIZ).  Otherwize, LDZ.GE.1.
+*>           then LDZ >= MAX(1,IHIZ).  Otherwise, LDZ >= 1.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -158,7 +158,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>           The dimension of the array WORK.  LWORK .GE. max(1,N)
+*>           The dimension of the array WORK.  LWORK >= max(1,N)
 *>           is sufficient, but LWORK typically as large as 6*N may
 *>           be required for optimal performance.  A workspace query
 *>           to determine the optimal workspace size is recommended.
@@ -174,19 +174,19 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>             =  0:  successful exit
-*>           .GT. 0:  if INFO = i, CLAQR0 failed to compute all of
+*>             = 0:  successful exit
+*>             > 0:  if INFO = i, CLAQR0 failed to compute all of
 *>                the eigenvalues.  Elements 1:ilo-1 and i+1:n of WR
 *>                and WI contain those eigenvalues which have been
 *>                successfully computed.  (Failures are rare.)
 *>
-*>                If INFO .GT. 0 and WANT is .FALSE., then on exit,
+*>                If INFO > 0 and WANT is .FALSE., then on exit,
 *>                the remaining unconverged eigenvalues are the eigen-
 *>                values of the upper Hessenberg matrix rows and
 *>                columns ILO through INFO of the final, output
 *>                value of H.
 *>
-*>                If INFO .GT. 0 and WANTT is .TRUE., then on exit
+*>                If INFO > 0 and WANTT is .TRUE., then on exit
 *>
 *>           (*)  (initial value of H)*U  = U*(final value of H)
 *>
@@ -194,7 +194,7 @@
 *>                value of  H is upper Hessenberg and triangular in
 *>                rows and columns INFO+1 through IHI.
 *>
-*>                If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+*>                If INFO > 0 and WANTZ is .TRUE., then on exit
 *>
 *>                  (final value of Z(ILO:IHI,ILOZ:IHIZ)
 *>                   =  (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U
@@ -202,7 +202,7 @@
 *>                where U is the unitary matrix in (*) (regard-
 *>                less of the value of WANTT.)
 *>
-*>                If INFO .GT. 0 and WANTZ is .FALSE., then Z is not
+*>                If INFO > 0 and WANTZ is .FALSE., then Z is not
 *>                accessed.
 *> \endverbatim
 *
@@ -639,7 +639,7 @@
                   END IF
                END IF
 *
-*              ==== Use up to NS of the the smallest magnatiude
+*              ==== Use up to NS of the the smallest magnitude
 *              .    shifts.  If there aren't NS shifts available,
 *              .    then use them all, possibly dropping one to
 *              .    make the number of shifts even. ====
diff --git a/lapack-netlib/SRC/claqr1.f b/lapack-netlib/SRC/claqr1.f
index 977947196..87d53871a 100644
--- a/lapack-netlib/SRC/claqr1.f
+++ b/lapack-netlib/SRC/claqr1.f
@@ -64,7 +64,7 @@
 *> \verbatim
 *>          LDH is INTEGER
 *>              The leading dimension of H as declared in
-*>              the calling procedure.  LDH.GE.N
+*>              the calling procedure.  LDH >= N
 *> \endverbatim
 *>
 *> \param[in] S1
diff --git a/lapack-netlib/SRC/claqr2.f b/lapack-netlib/SRC/claqr2.f
index 03e9760cf..fc282b2d6 100644
--- a/lapack-netlib/SRC/claqr2.f
+++ b/lapack-netlib/SRC/claqr2.f
@@ -102,7 +102,7 @@
 *> \param[in] NW
 *> \verbatim
 *>          NW is INTEGER
-*>          Deflation window size.  1 .LE. NW .LE. (KBOT-KTOP+1).
+*>          Deflation window size.  1 <= NW <= (KBOT-KTOP+1).
 *> \endverbatim
 *>
 *> \param[in,out] H
@@ -120,7 +120,7 @@
 *> \verbatim
 *>          LDH is INTEGER
 *>          Leading dimension of H just as declared in the calling
-*>          subroutine.  N .LE. LDH
+*>          subroutine.  N <= LDH
 *> \endverbatim
 *>
 *> \param[in] ILOZ
@@ -132,7 +132,7 @@
 *> \verbatim
 *>          IHIZ is INTEGER
 *>          Specify the rows of Z to which transformations must be
-*>          applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N.
+*>          applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N.
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -148,7 +148,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>          The leading dimension of Z just as declared in the
-*>          calling subroutine.  1 .LE. LDZ.
+*>          calling subroutine.  1 <= LDZ.
 *> \endverbatim
 *>
 *> \param[out] NS
@@ -185,13 +185,13 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of V just as declared in the
-*>          calling subroutine.  NW .LE. LDV
+*>          calling subroutine.  NW <= LDV
 *> \endverbatim
 *>
 *> \param[in] NH
 *> \verbatim
 *>          NH is INTEGER
-*>          The number of columns of T.  NH.GE.NW.
+*>          The number of columns of T.  NH >= NW.
 *> \endverbatim
 *>
 *> \param[out] T
@@ -203,14 +203,14 @@
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of T just as declared in the
-*>          calling subroutine.  NW .LE. LDT
+*>          calling subroutine.  NW <= LDT
 *> \endverbatim
 *>
 *> \param[in] NV
 *> \verbatim
 *>          NV is INTEGER
 *>          The number of rows of work array WV available for
-*>          workspace.  NV.GE.NW.
+*>          workspace.  NV >= NW.
 *> \endverbatim
 *>
 *> \param[out] WV
@@ -222,7 +222,7 @@
 *> \verbatim
 *>          LDWV is INTEGER
 *>          The leading dimension of W just as declared in the
-*>          calling subroutine.  NW .LE. LDV
+*>          calling subroutine.  NW <= LDV
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/claqr3.f b/lapack-netlib/SRC/claqr3.f
index 660a58376..84d57d4d6 100644
--- a/lapack-netlib/SRC/claqr3.f
+++ b/lapack-netlib/SRC/claqr3.f
@@ -99,7 +99,7 @@
 *> \param[in] NW
 *> \verbatim
 *>          NW is INTEGER
-*>          Deflation window size.  1 .LE. NW .LE. (KBOT-KTOP+1).
+*>          Deflation window size.  1 <= NW <= (KBOT-KTOP+1).
 *> \endverbatim
 *>
 *> \param[in,out] H
@@ -117,7 +117,7 @@
 *> \verbatim
 *>          LDH is INTEGER
 *>          Leading dimension of H just as declared in the calling
-*>          subroutine.  N .LE. LDH
+*>          subroutine.  N <= LDH
 *> \endverbatim
 *>
 *> \param[in] ILOZ
@@ -129,7 +129,7 @@
 *> \verbatim
 *>          IHIZ is INTEGER
 *>          Specify the rows of Z to which transformations must be
-*>          applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N.
+*>          applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N.
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -145,7 +145,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>          The leading dimension of Z just as declared in the
-*>          calling subroutine.  1 .LE. LDZ.
+*>          calling subroutine.  1 <= LDZ.
 *> \endverbatim
 *>
 *> \param[out] NS
@@ -182,13 +182,13 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of V just as declared in the
-*>          calling subroutine.  NW .LE. LDV
+*>          calling subroutine.  NW <= LDV
 *> \endverbatim
 *>
 *> \param[in] NH
 *> \verbatim
 *>          NH is INTEGER
-*>          The number of columns of T.  NH.GE.NW.
+*>          The number of columns of T.  NH >= NW.
 *> \endverbatim
 *>
 *> \param[out] T
@@ -200,14 +200,14 @@
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of T just as declared in the
-*>          calling subroutine.  NW .LE. LDT
+*>          calling subroutine.  NW <= LDT
 *> \endverbatim
 *>
 *> \param[in] NV
 *> \verbatim
 *>          NV is INTEGER
 *>          The number of rows of work array WV available for
-*>          workspace.  NV.GE.NW.
+*>          workspace.  NV >= NW.
 *> \endverbatim
 *>
 *> \param[out] WV
@@ -219,7 +219,7 @@
 *> \verbatim
 *>          LDWV is INTEGER
 *>          The leading dimension of W just as declared in the
-*>          calling subroutine.  NW .LE. LDV
+*>          calling subroutine.  NW <= LDV
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/claqr4.f b/lapack-netlib/SRC/claqr4.f
index 647fa6774..fba286df7 100644
--- a/lapack-netlib/SRC/claqr4.f
+++ b/lapack-netlib/SRC/claqr4.f
@@ -74,7 +74,7 @@
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
-*>           The order of the matrix H.  N .GE. 0.
+*>           The order of the matrix H.  N >= 0.
 *> \endverbatim
 *>
 *> \param[in] ILO
@@ -86,12 +86,12 @@
 *> \verbatim
 *>          IHI is INTEGER
 *>           It is assumed that H is already upper triangular in rows
-*>           and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1,
+*>           and columns 1:ILO-1 and IHI+1:N and, if ILO > 1,
 *>           H(ILO,ILO-1) is zero. ILO and IHI are normally set by a
 *>           previous call to CGEBAL, and then passed to CGEHRD when the
 *>           matrix output by CGEBAL is reduced to Hessenberg form.
 *>           Otherwise, ILO and IHI should be set to 1 and N,
-*>           respectively.  If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+*>           respectively.  If N > 0, then 1 <= ILO <= IHI <= N.
 *>           If N = 0, then ILO = 1 and IHI = 0.
 *> \endverbatim
 *>
@@ -103,17 +103,17 @@
 *>           contains the upper triangular matrix T from the Schur
 *>           decomposition (the Schur form). If INFO = 0 and WANT is
 *>           .FALSE., then the contents of H are unspecified on exit.
-*>           (The output value of H when INFO.GT.0 is given under the
+*>           (The output value of H when INFO > 0 is given under the
 *>           description of INFO below.)
 *>
-*>           This subroutine may explicitly set H(i,j) = 0 for i.GT.j and
+*>           This subroutine may explicitly set H(i,j) = 0 for i > j and
 *>           j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N.
 *> \endverbatim
 *>
 *> \param[in] LDH
 *> \verbatim
 *>          LDH is INTEGER
-*>           The leading dimension of the array H. LDH .GE. max(1,N).
+*>           The leading dimension of the array H. LDH >= max(1,N).
 *> \endverbatim
 *>
 *> \param[out] W
@@ -135,7 +135,7 @@
 *>          IHIZ is INTEGER
 *>           Specify the rows of Z to which transformations must be
 *>           applied if WANTZ is .TRUE..
-*>           1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N.
+*>           1 <= ILOZ <= ILO; IHI <= IHIZ <= N.
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -145,7 +145,7 @@
 *>           If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is
 *>           replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the
 *>           orthogonal Schur factor of H(ILO:IHI,ILO:IHI).
-*>           (The output value of Z when INFO.GT.0 is given under
+*>           (The output value of Z when INFO > 0 is given under
 *>           the description of INFO below.)
 *> \endverbatim
 *>
@@ -153,7 +153,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>           The leading dimension of the array Z.  if WANTZ is .TRUE.
-*>           then LDZ.GE.MAX(1,IHIZ).  Otherwize, LDZ.GE.1.
+*>           then LDZ >= MAX(1,IHIZ).  Otherwise, LDZ >= 1.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -166,7 +166,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>           The dimension of the array WORK.  LWORK .GE. max(1,N)
+*>           The dimension of the array WORK.  LWORK >= max(1,N)
 *>           is sufficient, but LWORK typically as large as 6*N may
 *>           be required for optimal performance.  A workspace query
 *>           to determine the optimal workspace size is recommended.
@@ -182,19 +182,19 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>             =  0:  successful exit
-*>           .GT. 0:  if INFO = i, CLAQR4 failed to compute all of
+*>             = 0:  successful exit
+*>             > 0:  if INFO = i, CLAQR4 failed to compute all of
 *>                the eigenvalues.  Elements 1:ilo-1 and i+1:n of WR
 *>                and WI contain those eigenvalues which have been
 *>                successfully computed.  (Failures are rare.)
 *>
-*>                If INFO .GT. 0 and WANT is .FALSE., then on exit,
+*>                If INFO > 0 and WANT is .FALSE., then on exit,
 *>                the remaining unconverged eigenvalues are the eigen-
 *>                values of the upper Hessenberg matrix rows and
 *>                columns ILO through INFO of the final, output
 *>                value of H.
 *>
-*>                If INFO .GT. 0 and WANTT is .TRUE., then on exit
+*>                If INFO > 0 and WANTT is .TRUE., then on exit
 *>
 *>           (*)  (initial value of H)*U  = U*(final value of H)
 *>
@@ -202,7 +202,7 @@
 *>                value of  H is upper Hessenberg and triangular in
 *>                rows and columns INFO+1 through IHI.
 *>
-*>                If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+*>                If INFO > 0 and WANTZ is .TRUE., then on exit
 *>
 *>                  (final value of Z(ILO:IHI,ILOZ:IHIZ)
 *>                   =  (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U
@@ -210,7 +210,7 @@
 *>                where U is the unitary matrix in (*) (regard-
 *>                less of the value of WANTT.)
 *>
-*>                If INFO .GT. 0 and WANTZ is .FALSE., then Z is not
+*>                If INFO > 0 and WANTZ is .FALSE., then Z is not
 *>                accessed.
 *> \endverbatim
 *
@@ -643,7 +643,7 @@
                   END IF
                END IF
 *
-*              ==== Use up to NS of the the smallest magnatiude
+*              ==== Use up to NS of the the smallest magnitude
 *              .    shifts.  If there aren't NS shifts available,
 *              .    then use them all, possibly dropping one to
 *              .    make the number of shifts even. ====
diff --git a/lapack-netlib/SRC/claqr5.f b/lapack-netlib/SRC/claqr5.f
index 4c897895d..e4317a3ad 100644
--- a/lapack-netlib/SRC/claqr5.f
+++ b/lapack-netlib/SRC/claqr5.f
@@ -125,7 +125,7 @@
 *> \verbatim
 *>          LDH is INTEGER
 *>             LDH is the leading dimension of H just as declared in the
-*>             calling procedure.  LDH.GE.MAX(1,N).
+*>             calling procedure.  LDH >= MAX(1,N).
 *> \endverbatim
 *>
 *> \param[in] ILOZ
@@ -137,7 +137,7 @@
 *> \verbatim
 *>          IHIZ is INTEGER
 *>             Specify the rows of Z to which transformations must be
-*>             applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N
+*>             applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -153,7 +153,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>             LDA is the leading dimension of Z just as declared in
-*>             the calling procedure. LDZ.GE.N.
+*>             the calling procedure. LDZ >= N.
 *> \endverbatim
 *>
 *> \param[out] V
@@ -165,7 +165,7 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>             LDV is the leading dimension of V as declared in the
-*>             calling procedure.  LDV.GE.3.
+*>             calling procedure.  LDV >= 3.
 *> \endverbatim
 *>
 *> \param[out] U
@@ -177,33 +177,14 @@
 *> \verbatim
 *>          LDU is INTEGER
 *>             LDU is the leading dimension of U just as declared in the
-*>             in the calling subroutine.  LDU.GE.3*NSHFTS-3.
-*> \endverbatim
-*>
-*> \param[in] NH
-*> \verbatim
-*>          NH is INTEGER
-*>             NH is the number of columns in array WH available for
-*>             workspace. NH.GE.1.
-*> \endverbatim
-*>
-*> \param[out] WH
-*> \verbatim
-*>          WH is COMPLEX array, dimension (LDWH,NH)
-*> \endverbatim
-*>
-*> \param[in] LDWH
-*> \verbatim
-*>          LDWH is INTEGER
-*>             Leading dimension of WH just as declared in the
-*>             calling procedure.  LDWH.GE.3*NSHFTS-3.
+*>             in the calling subroutine.  LDU >= 3*NSHFTS-3.
 *> \endverbatim
 *>
 *> \param[in] NV
 *> \verbatim
 *>          NV is INTEGER
 *>             NV is the number of rows in WV agailable for workspace.
-*>             NV.GE.1.
+*>             NV >= 1.
 *> \endverbatim
 *>
 *> \param[out] WV
@@ -215,9 +196,28 @@
 *> \verbatim
 *>          LDWV is INTEGER
 *>             LDWV is the leading dimension of WV as declared in the
-*>             in the calling subroutine.  LDWV.GE.NV.
+*>             in the calling subroutine.  LDWV >= NV.
 *> \endverbatim
 *
+*> \param[in] NH
+*> \verbatim
+*>          NH is INTEGER
+*>             NH is the number of columns in array WH available for
+*>             workspace. NH >= 1.
+*> \endverbatim
+*>
+*> \param[out] WH
+*> \verbatim
+*>          WH is COMPLEX array, dimension (LDWH,NH)
+*> \endverbatim
+*>
+*> \param[in] LDWH
+*> \verbatim
+*>          LDWH is INTEGER
+*>             Leading dimension of WH just as declared in the
+*>             calling procedure.  LDWH >= 3*NSHFTS-3.
+*> \endverbatim
+*>
 *  Authors:
 *  ========
 *
diff --git a/lapack-netlib/SRC/clarfb.f b/lapack-netlib/SRC/clarfb.f
index 8fdd5c89c..a4d429c09 100644
--- a/lapack-netlib/SRC/clarfb.f
+++ b/lapack-netlib/SRC/clarfb.f
@@ -92,6 +92,8 @@
 *>          K is INTEGER
 *>          The order of the matrix T (= the number of elementary
 *>          reflectors whose product defines the block reflector).
+*>          If SIDE = 'L', M >= K >= 0;
+*>          if SIDE = 'R', N >= K >= 0.
 *> \endverbatim
 *>
 *> \param[in] V
diff --git a/lapack-netlib/SRC/clarfx.f b/lapack-netlib/SRC/clarfx.f
index 1111c80f7..ad284883d 100644
--- a/lapack-netlib/SRC/clarfx.f
+++ b/lapack-netlib/SRC/clarfx.f
@@ -94,7 +94,7 @@
 *> \param[in] LDC
 *> \verbatim
 *>          LDC is INTEGER
-*>          The leading dimension of the array C. LDA >= max(1,M).
+*>          The leading dimension of the array C. LDC >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/clarfy.f b/lapack-netlib/SRC/clarfy.f
index a5743858c..fccd136a8 100644
--- a/lapack-netlib/SRC/clarfy.f
+++ b/lapack-netlib/SRC/clarfy.f
@@ -103,7 +103,7 @@
 *
 *> \date December 2016
 *
-*> \ingroup complex_eig
+*> \ingroup complexOTHERauxiliary
 *
 *  =====================================================================
       SUBROUTINE CLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
diff --git a/lapack-netlib/SRC/clarrv.f b/lapack-netlib/SRC/clarrv.f
index 72fe1f948..a45f55ac3 100644
--- a/lapack-netlib/SRC/clarrv.f
+++ b/lapack-netlib/SRC/clarrv.f
@@ -143,7 +143,7 @@
 *>          RTOL2 is REAL
 *>           Parameters for bisection.
 *>           An interval [LEFT,RIGHT] has converged if
-*>           RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
+*>           RIGHT-LEFT < MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
 *> \endverbatim
 *>
 *> \param[in,out] W
diff --git a/lapack-netlib/SRC/classq.f b/lapack-netlib/SRC/classq.f
index 28398596f..92e407ff3 100644
--- a/lapack-netlib/SRC/classq.f
+++ b/lapack-netlib/SRC/classq.f
@@ -41,7 +41,7 @@
 *> where x( i ) = abs( X( 1 + ( i - 1 )*INCX ) ). The value of sumsq is
 *> assumed to be at least unity and the value of ssq will then satisfy
 *>
-*>    1.0 .le. ssq .le. ( sumsq + 2*n ).
+*>    1.0 <= ssq <= ( sumsq + 2*n ).
 *>
 *> scale is assumed to be non-negative and scl returns the value
 *>
@@ -65,7 +65,7 @@
 *>
 *> \param[in] X
 *> \verbatim
-*>          X is COMPLEX array, dimension (N)
+*>          X is COMPLEX array, dimension (1+(N-1)*INCX)
 *>          The vector x as described above.
 *>             x( i )  = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/claswlq.f b/lapack-netlib/SRC/claswlq.f
index 5fa2276e8..dcbdc0d52 100644
--- a/lapack-netlib/SRC/claswlq.f
+++ b/lapack-netlib/SRC/claswlq.f
@@ -1,3 +1,4 @@
+*> \brief \b CLASWLQ
 *
 *  Definition:
 *  ===========
@@ -18,9 +19,20 @@
 *>
 *> \verbatim
 *>
-*>          CLASWLQ computes a blocked Short-Wide LQ factorization of a
-*>          M-by-N matrix A, where N >= M:
-*>          A = L * Q
+*> CLASWLQ computes a blocked Tall-Skinny LQ factorization of
+*> a complex M-by-N matrix A for M <= N:
+*>
+*>    A = ( L 0 ) *  Q,
+*>
+*> where:
+*>
+*>    Q is a n-by-N orthogonal matrix, stored on exit in an implicit
+*>    form in the elements above the digonal of the array A and in
+*>    the elemenst of the array T;
+*>    L is an lower-triangular M-by-M matrix stored on exit in
+*>    the elements on and below the diagonal of the array A.
+*>    0 is a M-by-(N-M) zero matrix, if M < N, and is not stored.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -150,7 +162,7 @@
       SUBROUTINE CLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
      $                  INFO)
 *
-*  -- LAPACK computational routine (version 3.7.1) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *     June 2017
diff --git a/lapack-netlib/SRC/clasyf_aa.f b/lapack-netlib/SRC/clasyf_aa.f
index 1bc96ee1b..a44a8f5b1 100644
--- a/lapack-netlib/SRC/clasyf_aa.f
+++ b/lapack-netlib/SRC/clasyf_aa.f
@@ -84,7 +84,7 @@
 *>
 *> \param[in,out] A
 *> \verbatim
-*>          A is REAL array, dimension (LDA,M) for
+*>          A is COMPLEX array, dimension (LDA,M) for
 *>          the first panel, while dimension (LDA,M+1) for the
 *>          remaining panels.
 *>
@@ -112,7 +112,7 @@
 *>
 *> \param[in,out] H
 *> \verbatim
-*>          H is REAL workspace, dimension (LDH,NB).
+*>          H is COMPLEX workspace, dimension (LDH,NB).
 *>
 *> \endverbatim
 *>
@@ -124,7 +124,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL workspace, dimension (M).
+*>          WORK is COMPLEX workspace, dimension (M).
 *> \endverbatim
 *>
 *
@@ -284,8 +284,9 @@
 *
 *              Swap A(I1, I2+1:M) with A(I2, I2+1:M)
 *
-               CALL CSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA,
-     $                           A( J1+I2-1, I2+1 ), LDA )
+               IF( I2.LT.M )
+     $            CALL CSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA,
+     $                              A( J1+I2-1, I2+1 ), LDA )
 *
 *              Swap A(I1, I1) with A(I2,I2)
 *
@@ -325,13 +326,15 @@
 *           Compute L(J+2, J+1) = WORK( 3:M ) / T(J, J+1),
 *            where A(J, J+1) = T(J, J+1) and A(J+2:M, J) = L(J+2:M, J+1)
 *
-            IF( A( K, J+1 ).NE.ZERO ) THEN
-               ALPHA = ONE / A( K, J+1 )
-               CALL CCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA )
-               CALL CSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA )
-            ELSE
-               CALL CLASET( 'Full', 1, M-J-1, ZERO, ZERO,
-     $                      A( K, J+2 ), LDA)
+            IF( J.LT.(M-1) ) THEN
+               IF( A( K, J+1 ).NE.ZERO ) THEN
+                  ALPHA = ONE / A( K, J+1 )
+                  CALL CCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA )
+                  CALL CSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA )
+               ELSE
+                  CALL CLASET( 'Full', 1, M-J-1, ZERO, ZERO,
+     $                         A( K, J+2 ), LDA)
+               END IF
             END IF
          END IF
          J = J + 1
@@ -432,8 +435,9 @@
 *
 *              Swap A(I2+1:M, I1) with A(I2+1:M, I2)
 *
-               CALL CSWAP( M-I2, A( I2+1, J1+I1-1 ), 1,
-     $                           A( I2+1, J1+I2-1 ), 1 )
+               IF( I2.LT.M )
+     $            CALL CSWAP( M-I2, A( I2+1, J1+I1-1 ), 1,
+     $                              A( I2+1, J1+I2-1 ), 1 )
 *
 *              Swap A(I1, I1) with A(I2, I2)
 *
@@ -473,13 +477,15 @@
 *           Compute L(J+2, J+1) = WORK( 3:M ) / T(J, J+1),
 *            where A(J, J+1) = T(J, J+1) and A(J+2:M, J) = L(J+2:M, J+1)
 *
-            IF( A( J+1, K ).NE.ZERO ) THEN
-               ALPHA = ONE / A( J+1, K )
-               CALL CCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 )
-               CALL CSCAL( M-J-1, ALPHA, A( J+2, K ), 1 )
-            ELSE
-               CALL CLASET( 'Full', M-J-1, 1, ZERO, ZERO,
-     $                      A( J+2, K ), LDA )
+            IF( J.LT.(M-1) ) THEN
+               IF( A( J+1, K ).NE.ZERO ) THEN
+                  ALPHA = ONE / A( J+1, K )
+                  CALL CCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 )
+                  CALL CSCAL( M-J-1, ALPHA, A( J+2, K ), 1 )
+               ELSE
+                  CALL CLASET( 'Full', M-J-1, 1, ZERO, ZERO,
+     $                         A( J+2, K ), LDA )
+               END IF
             END IF
          END IF
          J = J + 1
diff --git a/lapack-netlib/SRC/clasyf_rk.f b/lapack-netlib/SRC/clasyf_rk.f
index 0700c5cc2..bd7a0fb45 100644
--- a/lapack-netlib/SRC/clasyf_rk.f
+++ b/lapack-netlib/SRC/clasyf_rk.f
@@ -330,7 +330,7 @@
 *        of A and working backwards, and compute the matrix W = U12*D
 *        for use in updating A11
 *
-*        Initilize the first entry of array E, where superdiagonal
+*        Initialize the first entry of array E, where superdiagonal
 *        elements of D are stored
 *
          E( 1 ) = CZERO
@@ -658,7 +658,7 @@
 *        of A and working forwards, and compute the matrix W = L21*D
 *        for use in updating A22
 *
-*        Initilize the unused last entry of the subdiagonal array E.
+*        Initialize the unused last entry of the subdiagonal array E.
 *
          E( N ) = CZERO
 *
diff --git a/lapack-netlib/SRC/clatdf.f b/lapack-netlib/SRC/clatdf.f
index 357f66422..557830d1c 100644
--- a/lapack-netlib/SRC/clatdf.f
+++ b/lapack-netlib/SRC/clatdf.f
@@ -261,7 +261,7 @@
 *
 *        Solve for U- part, lockahead for RHS(N) = +-1. This is not done
 *        In BSOLVE and will hopefully give us a better estimate because
-*        any ill-conditioning of the original matrix is transfered to U
+*        any ill-conditioning of the original matrix is transferred to U
 *        and not to L. U(N, N) is an approximation to sigma_min(LU).
 *
          CALL CCOPY( N-1, RHS, 1, WORK, 1 )
diff --git a/lapack-netlib/SRC/clatsqr.f b/lapack-netlib/SRC/clatsqr.f
index dab5774c1..e9c6d77c2 100644
--- a/lapack-netlib/SRC/clatsqr.f
+++ b/lapack-netlib/SRC/clatsqr.f
@@ -1,3 +1,4 @@
+*> \brief \b CLATSQR
 *
 *  Definition:
 *  ===========
@@ -18,9 +19,23 @@
 *>
 *> \verbatim
 *>
-*> SLATSQR computes a blocked Tall-Skinny QR factorization of
-*> an M-by-N matrix A, where M >= N:
-*> A = Q * R .
+*> CLATSQR computes a blocked Tall-Skinny QR factorization of
+*> a complex M-by-N matrix A for M >= N:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a M-by-M orthogonal matrix, stored on exit in an implicit
+*>    form in the elements below the digonal of the array A and in
+*>    the elemenst of the array T;
+*>
+*>    R is an upper-triangular N-by-N matrix, stored on exit in
+*>    the elements on and above the diagonal of the array A.
+*>
+*>    0 is a (M-N)-by-N zero matrix, and is not stored.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -149,10 +164,10 @@
       SUBROUTINE CLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK,
      $                    LWORK, INFO)
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER           INFO, LDA, M, N, MB, NB, LDT, LWORK
diff --git a/lapack-netlib/SRC/claunhr_col_getrfnp.f b/lapack-netlib/SRC/claunhr_col_getrfnp.f
new file mode 100644
index 000000000..66b9c0407
--- /dev/null
+++ b/lapack-netlib/SRC/claunhr_col_getrfnp.f
@@ -0,0 +1,248 @@
+*> \brief \b CLAUNHR_COL_GETRFNP
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CLAUNHR_COL_GETRFNP + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/claunhr_col_getrfnp.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/claunhr_col_getrfnp.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/claunhr_col_getrfnp.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CLAUNHR_COL_GETRFNP( M, N, A, LDA, D, INFO )
+*
+*       .. Scalar Arguments ..
+*       INTEGER            INFO, LDA, M, N
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX            A( LDA, * ), D( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CLAUNHR_COL_GETRFNP computes the modified LU factorization without
+*> pivoting of a complex general M-by-N matrix A. The factorization has
+*> the form:
+*>
+*>     A - S = L * U,
+*>
+*> where:
+*>    S is a m-by-n diagonal sign matrix with the diagonal D, so that
+*>    D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+*>    as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+*>    i-1 steps of Gaussian elimination. This means that the diagonal
+*>    element at each step of "modified" Gaussian elimination is
+*>    at least one in absolute value (so that division-by-zero not
+*>    not possible during the division by the diagonal element);
+*>
+*>    L is a M-by-N lower triangular matrix with unit diagonal elements
+*>    (lower trapezoidal if M > N);
+*>
+*>    and U is a M-by-N upper triangular matrix
+*>    (upper trapezoidal if M < N).
+*>
+*> This routine is an auxiliary routine used in the Householder
+*> reconstruction routine CUNHR_COL. In CUNHR_COL, this routine is
+*> applied to an M-by-N matrix A with orthonormal columns, where each
+*> element is bounded by one in absolute value. With the choice of
+*> the matrix S above, one can show that the diagonal element at each
+*> step of Gaussian elimination is the largest (in absolute value) in
+*> the column on or below the diagonal, so that no pivoting is required
+*> for numerical stability [1].
+*>
+*> For more details on the Householder reconstruction algorithm,
+*> including the modified LU factorization, see [1].
+*>
+*> This is the blocked right-looking version of the algorithm,
+*> calling Level 3 BLAS to update the submatrix. To factorize a block,
+*> this routine calls the recursive routine CLAUNHR_COL_GETRFNP2.
+*>
+*> [1] "Reconstructing Householder vectors from tall-skinny QR",
+*>     G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+*>     E. Solomonik, J. Parallel Distrib. Comput.,
+*>     vol. 85, pp. 3-31, 2015.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix to be factored.
+*>          On exit, the factors L and U from the factorization
+*>          A-S=L*U; the unit diagonal elements of L are not stored.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is COMPLEX array, dimension min(M,N)
+*>          The diagonal elements of the diagonal M-by-N sign matrix S,
+*>          D(i) = S(i,i), where 1 <= i <= min(M,N). The elements can be
+*>          only ( +1.0, 0.0 ) or (-1.0, 0.0 ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup complexGEcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2019, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE CLAUNHR_COL_GETRFNP( M, N, A, LDA, D, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, LDA, M, N
+*     ..
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), D( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX            CONE
+      PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            IINFO, J, JB, NB
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CGEMM, CLAUNHR_COL_GETRFNP2, CTRSM, XERBLA
+*     ..
+*     .. External Functions ..
+      INTEGER            ILAENV
+      EXTERNAL           ILAENV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -4
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CLAUNHR_COL_GETRFNP', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 )
+     $   RETURN
+*
+*     Determine the block size for this environment.
+*
+
+      NB = ILAENV( 1, 'CLAUNHR_COL_GETRFNP', ' ', M, N, -1, -1 )
+
+      IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN
+*
+*        Use unblocked code.
+*
+         CALL CLAUNHR_COL_GETRFNP2( M, N, A, LDA, D, INFO )
+      ELSE
+*
+*        Use blocked code.
+*
+         DO J = 1, MIN( M, N ), NB
+            JB = MIN( MIN( M, N )-J+1, NB )
+*
+*           Factor diagonal and subdiagonal blocks.
+*
+            CALL CLAUNHR_COL_GETRFNP2( M-J+1, JB, A( J, J ), LDA,
+     $                                 D( J ), IINFO )
+*
+            IF( J+JB.LE.N ) THEN
+*
+*              Compute block row of U.
+*
+               CALL CTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB,
+     $                     N-J-JB+1, CONE, A( J, J ), LDA, A( J, J+JB ),
+     $                     LDA )
+               IF( J+JB.LE.M ) THEN
+*
+*                 Update trailing submatrix.
+*
+                  CALL CGEMM( 'No transpose', 'No transpose', M-J-JB+1,
+     $                        N-J-JB+1, JB, -CONE, A( J+JB, J ), LDA,
+     $                        A( J, J+JB ), LDA, CONE, A( J+JB, J+JB ),
+     $                        LDA )
+               END IF
+            END IF
+         END DO
+      END IF
+      RETURN
+*
+*     End of CLAUNHR_COL_GETRFNP
+*
+      END
diff --git a/lapack-netlib/SRC/claunhr_col_getrfnp2.f b/lapack-netlib/SRC/claunhr_col_getrfnp2.f
new file mode 100644
index 000000000..82fc329ee
--- /dev/null
+++ b/lapack-netlib/SRC/claunhr_col_getrfnp2.f
@@ -0,0 +1,314 @@
+*> \brief \b CLAUNHR_COL_GETRFNP2
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CLAUNHR_COL_GETRFNP2 + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/claunhr_col_getrfnp2.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/claunhr_col_getrfnp2.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/claunhr_col_getrfnp2.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       RECURSIVE SUBROUTINE CLAUNHR_COL_GETRFNP2( M, N, A, LDA, D, INFO )
+*
+*       .. Scalar Arguments ..
+*       INTEGER            INFO, LDA, M, N
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX            A( LDA, * ), D( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CLAUNHR_COL_GETRFNP2 computes the modified LU factorization without
+*> pivoting of a complex general M-by-N matrix A. The factorization has
+*> the form:
+*>
+*>     A - S = L * U,
+*>
+*> where:
+*>    S is a m-by-n diagonal sign matrix with the diagonal D, so that
+*>    D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+*>    as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+*>    i-1 steps of Gaussian elimination. This means that the diagonal
+*>    element at each step of "modified" Gaussian elimination is at
+*>    least one in absolute value (so that division-by-zero not
+*>    possible during the division by the diagonal element);
+*>
+*>    L is a M-by-N lower triangular matrix with unit diagonal elements
+*>    (lower trapezoidal if M > N);
+*>
+*>    and U is a M-by-N upper triangular matrix
+*>    (upper trapezoidal if M < N).
+*>
+*> This routine is an auxiliary routine used in the Householder
+*> reconstruction routine CUNHR_COL. In CUNHR_COL, this routine is
+*> applied to an M-by-N matrix A with orthonormal columns, where each
+*> element is bounded by one in absolute value. With the choice of
+*> the matrix S above, one can show that the diagonal element at each
+*> step of Gaussian elimination is the largest (in absolute value) in
+*> the column on or below the diagonal, so that no pivoting is required
+*> for numerical stability [1].
+*>
+*> For more details on the Householder reconstruction algorithm,
+*> including the modified LU factorization, see [1].
+*>
+*> This is the recursive version of the LU factorization algorithm.
+*> Denote A - S by B. The algorithm divides the matrix B into four
+*> submatrices:
+*>
+*>        [  B11 | B12  ]  where B11 is n1 by n1,
+*>    B = [ -----|----- ]        B21 is (m-n1) by n1,
+*>        [  B21 | B22  ]        B12 is n1 by n2,
+*>                               B22 is (m-n1) by n2,
+*>                               with n1 = min(m,n)/2, n2 = n-n1.
+*>
+*>
+*> The subroutine calls itself to factor B11, solves for B21,
+*> solves for B12, updates B22, then calls itself to factor B22.
+*>
+*> For more details on the recursive LU algorithm, see [2].
+*>
+*> CLAUNHR_COL_GETRFNP2 is called to factorize a block by the blocked
+*> routine CLAUNHR_COL_GETRFNP, which uses blocked code calling
+*. Level 3 BLAS to update the submatrix. However, CLAUNHR_COL_GETRFNP2
+*> is self-sufficient and can be used without CLAUNHR_COL_GETRFNP.
+*>
+*> [1] "Reconstructing Householder vectors from tall-skinny QR",
+*>     G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+*>     E. Solomonik, J. Parallel Distrib. Comput.,
+*>     vol. 85, pp. 3-31, 2015.
+*>
+*> [2] "Recursion leads to automatic variable blocking for dense linear
+*>     algebra algorithms", F. Gustavson, IBM J. of Res. and Dev.,
+*>     vol. 41, no. 6, pp. 737-755, 1997.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix to be factored.
+*>          On exit, the factors L and U from the factorization
+*>          A-S=L*U; the unit diagonal elements of L are not stored.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is COMPLEX array, dimension min(M,N)
+*>          The diagonal elements of the diagonal M-by-N sign matrix S,
+*>          D(i) = S(i,i), where 1 <= i <= min(M,N). The elements can be
+*>          only ( +1.0, 0.0 ) or (-1.0, 0.0 ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup complexGEcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2019, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      RECURSIVE SUBROUTINE CLAUNHR_COL_GETRFNP2( M, N, A, LDA, D, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, LDA, M, N
+*     ..
+*     .. Array Arguments ..
+      COMPLEX         A( LDA, * ), D( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ONE
+      PARAMETER          ( ONE = 1.0E+0 )
+      COMPLEX            CONE
+      PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      REAL               SFMIN
+      INTEGER            I, IINFO, N1, N2
+      COMPLEX            Z
+*     ..
+*     .. External Functions ..
+      REAL               SLAMCH
+      EXTERNAL           SLAMCH
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CGEMM, CSCAL, CTRSM, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, REAL, CMPLX, AIMAG, SIGN, MAX, MIN
+*     ..
+*     .. Statement Functions ..
+      DOUBLE PRECISION   CABS1
+*     ..
+*     .. Statement Function definitions ..
+      CABS1( Z ) = ABS( REAL( Z ) ) + ABS( AIMAG( Z ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters
+*
+      INFO = 0
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -4
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CLAUNHR_COL_GETRFNP2', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 )
+     $   RETURN
+
+      IF ( M.EQ.1 ) THEN
+*
+*        One row case, (also recursion termination case),
+*        use unblocked code
+*
+*        Transfer the sign
+*
+         D( 1 ) = CMPLX( -SIGN( ONE, REAL( A( 1, 1 ) ) ) )
+*
+*        Construct the row of U
+*
+         A( 1, 1 ) = A( 1, 1 ) - D( 1 )
+*
+      ELSE IF( N.EQ.1 ) THEN
+*
+*        One column case, (also recursion termination case),
+*        use unblocked code
+*
+*        Transfer the sign
+*
+         D( 1 ) = CMPLX( -SIGN( ONE, REAL( A( 1, 1 ) ) ) )
+*
+*        Construct the row of U
+*
+         A( 1, 1 ) = A( 1, 1 ) - D( 1 )
+*
+*        Scale the elements 2:M of the column
+*
+*        Determine machine safe minimum
+*
+         SFMIN = SLAMCH('S')
+*
+*        Construct the subdiagonal elements of L
+*
+         IF( CABS1( A( 1, 1 ) ) .GE. SFMIN ) THEN
+            CALL CSCAL( M-1, CONE / A( 1, 1 ), A( 2, 1 ), 1 )
+         ELSE
+            DO I = 2, M
+               A( I, 1 ) = A( I, 1 ) / A( 1, 1 )
+            END DO
+         END IF
+*
+      ELSE
+*
+*        Divide the matrix B into four submatrices
+*
+         N1 = MIN( M, N ) / 2
+         N2 = N-N1
+
+*
+*        Factor B11, recursive call
+*
+         CALL CLAUNHR_COL_GETRFNP2( N1, N1, A, LDA, D, IINFO )
+*
+*        Solve for B21
+*
+         CALL CTRSM( 'R', 'U', 'N', 'N', M-N1, N1, CONE, A, LDA,
+     $               A( N1+1, 1 ), LDA )
+*
+*        Solve for B12
+*
+         CALL CTRSM( 'L', 'L', 'N', 'U', N1, N2, CONE, A, LDA,
+     $               A( 1, N1+1 ), LDA )
+*
+*        Update B22, i.e. compute the Schur complement
+*        B22 := B22 - B21*B12
+*
+         CALL CGEMM( 'N', 'N', M-N1, N2, N1, -CONE, A( N1+1, 1 ), LDA,
+     $               A( 1, N1+1 ), LDA, CONE, A( N1+1, N1+1 ), LDA )
+*
+*        Factor B22, recursive call
+*
+         CALL CLAUNHR_COL_GETRFNP2( M-N1, N2, A( N1+1, N1+1 ), LDA,
+     $                              D( N1+1 ), IINFO )
+*
+      END IF
+      RETURN
+*
+*     End of CLAUNHR_COL_GETRFNP2
+*
+      END
diff --git a/lapack-netlib/SRC/cporfsx.f b/lapack-netlib/SRC/cporfsx.f
index 872bad36c..3a2db7135 100644
--- a/lapack-netlib/SRC/cporfsx.f
+++ b/lapack-netlib/SRC/cporfsx.f
@@ -44,7 +44,7 @@
 *> \verbatim
 *>
 *>    CPORFSX improves the computed solution to a system of linear
-*>    equations when the coefficient matrix is symmetric positive
+*>    equations when the coefficient matrix is Hermitian positive
 *>    definite, and provides error bounds and backward error estimates
 *>    for the solution.  In addition to normwise error bound, the code
 *>    provides maximum componentwise error bound if possible.  See
@@ -103,7 +103,7 @@
 *> \param[in] A
 *> \verbatim
 *>          A is COMPLEX array, dimension (LDA,N)
-*>     The symmetric matrix A.  If UPLO = 'U', the leading N-by-N
+*>     The Hermitian matrix A.  If UPLO = 'U', the leading N-by-N
 *>     upper triangular part of A contains the upper triangular part
 *>     of the matrix A, and the strictly lower triangular part of A
 *>     is not referenced.  If UPLO = 'L', the leading N-by-N lower
@@ -134,7 +134,7 @@
 *> \param[in,out] S
 *> \verbatim
 *>          S is REAL array, dimension (N)
-*>     The row scale factors for A.  If EQUED = 'Y', A is multiplied on
+*>     The scale factors for A.  If EQUED = 'Y', A is multiplied on
 *>     the left and right by diag(S).  S is an input argument if FACT =
 *>     'F'; otherwise, S is an output argument.  If FACT = 'F' and EQUED
 *>     = 'Y', each element of S must be positive.  If S is output, each
@@ -262,7 +262,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -298,14 +298,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -313,9 +313,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/cposvxx.f b/lapack-netlib/SRC/cposvxx.f
index 64d1b67fa..57c2d3feb 100644
--- a/lapack-netlib/SRC/cposvxx.f
+++ b/lapack-netlib/SRC/cposvxx.f
@@ -45,7 +45,7 @@
 *>
 *>    CPOSVXX uses the Cholesky factorization A = U**T*U or A = L*L**T
 *>    to compute the solution to a complex system of linear equations
-*>    A * X = B, where A is an N-by-N symmetric positive definite matrix
+*>    A * X = B, where A is an N-by-N Hermitian positive definite matrix
 *>    and X and B are N-by-NRHS matrices.
 *>
 *>    If requested, both normwise and maximum componentwise error bounds
@@ -157,7 +157,7 @@
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX array, dimension (LDA,N)
-*>     On entry, the symmetric matrix A, except if FACT = 'F' and EQUED =
+*>     On entry, the Hermitian matrix A, except if FACT = 'F' and EQUED =
 *>     'Y', then A must contain the equilibrated matrix
 *>     diag(S)*A*diag(S).  If UPLO = 'U', the leading N-by-N upper
 *>     triangular part of A contains the upper triangular part of the
@@ -365,7 +365,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -401,14 +401,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -416,9 +416,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/cpotrf2.f b/lapack-netlib/SRC/cpotrf2.f
index 789843c41..ed4f12cba 100644
--- a/lapack-netlib/SRC/cpotrf2.f
+++ b/lapack-netlib/SRC/cpotrf2.f
@@ -24,7 +24,7 @@
 *>
 *> \verbatim
 *>
-*> CPOTRF2 computes the Cholesky factorization of a real symmetric
+*> CPOTRF2 computes the Cholesky factorization of a Hermitian
 *> positive definite matrix A using the recursive algorithm.
 *>
 *> The factorization has the form
@@ -63,7 +63,7 @@
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX array, dimension (LDA,N)
-*>          On entry, the symmetric matrix A.  If UPLO = 'U', the leading
+*>          On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
 *>          N-by-N upper triangular part of A contains the upper
 *>          triangular part of the matrix A, and the strictly lower
 *>          triangular part of A is not referenced.  If UPLO = 'L', the
diff --git a/lapack-netlib/SRC/cstemr.f b/lapack-netlib/SRC/cstemr.f
index 22ac842c9..8fb8131d8 100644
--- a/lapack-netlib/SRC/cstemr.f
+++ b/lapack-netlib/SRC/cstemr.f
@@ -250,13 +250,13 @@
 *> \param[in,out] TRYRAC
 *> \verbatim
 *>          TRYRAC is LOGICAL
-*>          If TRYRAC.EQ..TRUE., indicates that the code should check whether
+*>          If TRYRAC = .TRUE., indicates that the code should check whether
 *>          the tridiagonal matrix defines its eigenvalues to high relative
 *>          accuracy.  If so, the code uses relative-accuracy preserving
 *>          algorithms that might be (a bit) slower depending on the matrix.
 *>          If the matrix does not define its eigenvalues to high relative
 *>          accuracy, the code can uses possibly faster algorithms.
-*>          If TRYRAC.EQ..FALSE., the code is not required to guarantee
+*>          If TRYRAC = .FALSE., the code is not required to guarantee
 *>          relatively accurate eigenvalues and can use the fastest possible
 *>          techniques.
 *>          On exit, a .TRUE. TRYRAC will be set to .FALSE. if the matrix
diff --git a/lapack-netlib/SRC/csycon_3.f b/lapack-netlib/SRC/csycon_3.f
index 47d52dd15..5c1cb0491 100644
--- a/lapack-netlib/SRC/csycon_3.f
+++ b/lapack-netlib/SRC/csycon_3.f
@@ -19,7 +19,7 @@
 *  ===========
 *
 *       SUBROUTINE CSYCON_3( UPLO, N, A, LDA, E, IPIV, ANORM, RCOND,
-*                            WORK, IWORK, INFO )
+*                            WORK, INFO )
 *
 *       .. Scalar Arguments ..
 *       CHARACTER          UPLO
@@ -27,7 +27,7 @@
 *       REAL               ANORM, RCOND
 *       ..
 *       .. Array Arguments ..
-*       INTEGER            IPIV( * ), IWORK( * )
+*       INTEGER            IPIV( * )
 *       COMPLEX            A( LDA, * ), E ( * ), WORK( * )
 *       ..
 *
@@ -129,11 +129,6 @@
 *>          WORK is COMPLEX array, dimension (2*N)
 *> \endverbatim
 *>
-*> \param[out] IWORK
-*> \verbatim
-*>          IWORK is INTEGER array, dimension (N)
-*> \endverbatim
-*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
diff --git a/lapack-netlib/SRC/csyconvf.f b/lapack-netlib/SRC/csyconvf.f
index 77ecf46b5..fd5a5e47f 100644
--- a/lapack-netlib/SRC/csyconvf.f
+++ b/lapack-netlib/SRC/csyconvf.f
@@ -294,7 +294,7 @@
 *
 *           Convert PERMUTATIONS and IPIV
 *
-*           Apply permutaions to submatrices of upper part of A
+*           Apply permutations to submatrices of upper part of A
 *           in factorization order where i decreases from N to 1
 *
             I = N
@@ -347,7 +347,7 @@
 *
 *           Revert PERMUTATIONS and IPIV
 *
-*           Apply permutaions to submatrices of upper part of A
+*           Apply permutations to submatrices of upper part of A
 *           in reverse factorization order where i increases from 1 to N
 *
             I = 1
@@ -438,7 +438,7 @@
 *
 *           Convert PERMUTATIONS and IPIV
 *
-*           Apply permutaions to submatrices of lower part of A
+*           Apply permutations to submatrices of lower part of A
 *           in factorization order where k increases from 1 to N
 *
             I = 1
@@ -491,7 +491,7 @@
 *
 *           Revert PERMUTATIONS and IPIV
 *
-*           Apply permutaions to submatrices of lower part of A
+*           Apply permutations to submatrices of lower part of A
 *           in reverse factorization order where i decreases from N to 1
 *
             I = N
diff --git a/lapack-netlib/SRC/csyconvf_rook.f b/lapack-netlib/SRC/csyconvf_rook.f
index 1146a97c5..7ede26863 100644
--- a/lapack-netlib/SRC/csyconvf_rook.f
+++ b/lapack-netlib/SRC/csyconvf_rook.f
@@ -285,7 +285,7 @@
 *
 *           Convert PERMUTATIONS
 *
-*           Apply permutaions to submatrices of upper part of A
+*           Apply permutations to submatrices of upper part of A
 *           in factorization order where i decreases from N to 1
 *
             I = N
@@ -336,7 +336,7 @@
 *
 *           Revert PERMUTATIONS
 *
-*           Apply permutaions to submatrices of upper part of A
+*           Apply permutations to submatrices of upper part of A
 *           in reverse factorization order where i increases from 1 to N
 *
             I = 1
@@ -426,7 +426,7 @@
 *
 *           Convert PERMUTATIONS
 *
-*           Apply permutaions to submatrices of lower part of A
+*           Apply permutations to submatrices of lower part of A
 *           in factorization order where i increases from 1 to N
 *
             I = 1
@@ -477,7 +477,7 @@
 *
 *           Revert PERMUTATIONS
 *
-*           Apply permutaions to submatrices of lower part of A
+*           Apply permutations to submatrices of lower part of A
 *           in reverse factorization order where i decreases from N to 1
 *
             I = N
diff --git a/lapack-netlib/SRC/csyrfsx.f b/lapack-netlib/SRC/csyrfsx.f
index 7323ba8eb..4d1bc3ccc 100644
--- a/lapack-netlib/SRC/csyrfsx.f
+++ b/lapack-netlib/SRC/csyrfsx.f
@@ -271,7 +271,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -307,14 +307,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -322,9 +322,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/csysv_aa.f b/lapack-netlib/SRC/csysv_aa.f
index 87be734cc..2081644b1 100644
--- a/lapack-netlib/SRC/csysv_aa.f
+++ b/lapack-netlib/SRC/csysv_aa.f
@@ -42,7 +42,7 @@
 *> matrices.
 *>
 *> Aasen's algorithm is used to factor A as
-*>    A = U * T * U**T,  if UPLO = 'U', or
+*>    A = U**T * T * U,  if UPLO = 'U', or
 *>    A = L * T * L**T,  if UPLO = 'L',
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is symmetric tridiagonal. The factored
@@ -75,7 +75,7 @@
 *>
 *> \param[in,out] A
 *> \verbatim
-*>          A is REAL array, dimension (LDA,N)
+*>          A is COMPLEX array, dimension (LDA,N)
 *>          On entry, the symmetric matrix A.  If UPLO = 'U', the leading
 *>          N-by-N upper triangular part of A contains the upper
 *>          triangular part of the matrix A, and the strictly lower
@@ -86,7 +86,7 @@
 *>
 *>          On exit, if INFO = 0, the tridiagonal matrix T and the
 *>          multipliers used to obtain the factor U or L from the
-*>          factorization A = U*T*U**T or A = L*T*L**T as computed by
+*>          factorization A = U**T*T*U or A = L*T*L**T as computed by
 *>          CSYTRF.
 *> \endverbatim
 *>
@@ -106,7 +106,7 @@
 *>
 *> \param[in,out] B
 *> \verbatim
-*>          B is REAL array, dimension (LDB,NRHS)
+*>          B is COMPLEX array, dimension (LDB,NRHS)
 *>          On entry, the N-by-NRHS right hand side matrix B.
 *>          On exit, if INFO = 0, the N-by-NRHS solution matrix X.
 *> \endverbatim
@@ -119,7 +119,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension (MAX(1,LWORK))
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
@@ -230,7 +230,7 @@
          RETURN
       END IF
 *
-*     Compute the factorization A = U*T*U**T or A = L*T*L**T.
+*     Compute the factorization A = U**T*T*U or A = L*T*L**T.
 *
       CALL CSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
       IF( INFO.EQ.0 ) THEN
diff --git a/lapack-netlib/SRC/csysv_aa_2stage.f b/lapack-netlib/SRC/csysv_aa_2stage.f
index a13349824..c5c328c63 100644
--- a/lapack-netlib/SRC/csysv_aa_2stage.f
+++ b/lapack-netlib/SRC/csysv_aa_2stage.f
@@ -43,8 +43,8 @@
 *> matrices.
 *>
 *> Aasen's 2-stage algorithm is used to factor A as
-*>    A = U * T * U**H,  if UPLO = 'U', or
-*>    A = L * T * L**H,  if UPLO = 'L',
+*>    A = U**T * T * U,  if UPLO = 'U', or
+*>    A = L * T * L**T,  if UPLO = 'L',
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is symmetric and band. The matrix T is
 *> then LU-factored with partial pivoting. The factored form of A
@@ -257,7 +257,7 @@
       END IF
 *
 *
-*     Compute the factorization A = U*T*U**H or A = L*T*L**H.
+*     Compute the factorization A = U**T*T*U or A = L*T*L**T.
 *
       CALL CSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2,
      $                       WORK, LWORK, INFO )
diff --git a/lapack-netlib/SRC/csysvxx.f b/lapack-netlib/SRC/csysvxx.f
index 2fd2c8771..7a9aee105 100644
--- a/lapack-netlib/SRC/csysvxx.f
+++ b/lapack-netlib/SRC/csysvxx.f
@@ -378,7 +378,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -414,14 +414,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -429,9 +429,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/csytf2_rk.f b/lapack-netlib/SRC/csytf2_rk.f
index 3b5e53a03..7e39c2dfd 100644
--- a/lapack-netlib/SRC/csytf2_rk.f
+++ b/lapack-netlib/SRC/csytf2_rk.f
@@ -321,7 +321,7 @@
 *
 *        Factorize A as U*D*U**T using the upper triangle of A
 *
-*        Initilize the first entry of array E, where superdiagonal
+*        Initialize the first entry of array E, where superdiagonal
 *        elements of D are stored
 *
          E( 1 ) = CZERO
@@ -632,7 +632,7 @@
 *
 *        Factorize A as L*D*L**T using the lower triangle of A
 *
-*        Initilize the unused last entry of the subdiagonal array E.
+*        Initialize the unused last entry of the subdiagonal array E.
 *
          E( N ) = CZERO
 *
diff --git a/lapack-netlib/SRC/csytrf.f b/lapack-netlib/SRC/csytrf.f
index c389725e9..af913b8f4 100644
--- a/lapack-netlib/SRC/csytrf.f
+++ b/lapack-netlib/SRC/csytrf.f
@@ -43,7 +43,7 @@
 *>
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and D is symmetric and block diagonal with
-*> with 1-by-1 and 2-by-2 diagonal blocks.
+*> 1-by-1 and 2-by-2 diagonal blocks.
 *>
 *> This is the blocked version of the algorithm, calling Level 3 BLAS.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/csytrf_aa.f b/lapack-netlib/SRC/csytrf_aa.f
index 2f185b0c7..427235bda 100644
--- a/lapack-netlib/SRC/csytrf_aa.f
+++ b/lapack-netlib/SRC/csytrf_aa.f
@@ -37,7 +37,7 @@
 *> CSYTRF_AA computes the factorization of a complex symmetric matrix A
 *> using the Aasen's algorithm.  The form of the factorization is
 *>
-*>    A = U*T*U**T  or  A = L*T*L**T
+*>    A = U**T*T*U  or  A = L*T*L**T
 *>
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is a complex symmetric tridiagonal matrix.
@@ -63,7 +63,7 @@
 *>
 *> \param[in,out] A
 *> \verbatim
-*>          A is REAL array, dimension (LDA,N)
+*>          A is COMPLEX array, dimension (LDA,N)
 *>          On entry, the symmetric matrix A.  If UPLO = 'U', the leading
 *>          N-by-N upper triangular part of A contains the upper
 *>          triangular part of the matrix A, and the strictly lower
@@ -94,7 +94,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is REAL array, dimension (MAX(1,LWORK))
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
 *>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
 *> \endverbatim
 *>
@@ -223,7 +223,7 @@
       IF( UPPER ) THEN
 *
 *        .....................................................
-*        Factorize A as L*D*L**T using the upper triangle of A
+*        Factorize A as U**T*D*U using the upper triangle of A
 *        .....................................................
 *
 *        Copy first row A(1, 1:N) into H(1:n) (stored in WORK(1:N))
@@ -256,7 +256,7 @@
      $                   A( MAX(1, J), J+1 ), LDA,
      $                   IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) )
 *
-*        Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot)
+*        Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot)
 *
          DO J2 = J+2, MIN(N, J+JB+1)
             IPIV( J2 ) = IPIV( J2 ) + J
@@ -375,7 +375,7 @@
      $                   A( J+1, MAX(1, J) ), LDA,
      $                   IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) )
 *
-*        Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot)
+*        Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot)
 *
          DO J2 = J+2, MIN(N, J+JB+1)
             IPIV( J2 ) = IPIV( J2 ) + J
diff --git a/lapack-netlib/SRC/csytrf_aa_2stage.f b/lapack-netlib/SRC/csytrf_aa_2stage.f
index 0d0bd156c..0946d61b0 100644
--- a/lapack-netlib/SRC/csytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/csytrf_aa_2stage.f
@@ -38,7 +38,7 @@
 *> CSYTRF_AA_2STAGE computes the factorization of a complex symmetric matrix A
 *> using the Aasen's algorithm.  The form of the factorization is
 *>
-*>    A = U*T*U**T  or  A = L*T*L**T
+*>    A = U**T*T*U  or  A = L*T*L**T
 *>
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is a complex symmetric band matrix with the
@@ -275,7 +275,7 @@
       IF( UPPER ) THEN
 *
 *        .....................................................
-*        Factorize A as L*D*L**T using the upper triangle of A
+*        Factorize A as U**T*D*U using the upper triangle of A
 *        .....................................................
 *
          DO J = 0, NT-1
@@ -448,12 +448,14 @@ c               END IF
 *                    > Apply pivots to previous columns of L
                      CALL CSWAP( K-1, A( (J+1)*NB+1, I1 ), 1, 
      $                                A( (J+1)*NB+1, I2 ), 1 )
-*                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)               
-                     CALL CSWAP( I2-I1-1, A( I1, I1+1 ), LDA,
-     $                                    A( I1+1, I2 ), 1 )
+*                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)
+                     IF( I2.GT.(I1+1) )
+     $                  CALL CSWAP( I2-I1-1, A( I1, I1+1 ), LDA,
+     $                                       A( I1+1, I2 ), 1 )
 *                    > Swap A(I2+1:M, I1) with A(I2+1:M, I2)
-                     CALL CSWAP( N-I2, A( I1, I2+1 ), LDA,
-     $                                 A( I2, I2+1 ), LDA ) 
+                     IF( I2.LT.N )
+     $                  CALL CSWAP( N-I2, A( I1, I2+1 ), LDA,
+     $                                    A( I2, I2+1 ), LDA ) 
 *                    > Swap A(I1, I1) with A(I2, I2)
                      PIV = A( I1, I1 )
                      A( I1, I1 ) = A( I2, I2 )
@@ -637,11 +639,13 @@ c               END IF
                      CALL CSWAP( K-1, A( I1, (J+1)*NB+1 ), LDA, 
      $                                A( I2, (J+1)*NB+1 ), LDA )
 *                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)               
-                     CALL CSWAP( I2-I1-1, A( I1+1, I1 ), 1,
-     $                                    A( I2, I1+1 ), LDA )
+                     IF( I2.GT.(I1+1) )
+     $                  CALL CSWAP( I2-I1-1, A( I1+1, I1 ), 1,
+     $                                       A( I2, I1+1 ), LDA )
 *                    > Swap A(I2+1:M, I1) with A(I2+1:M, I2)
-                     CALL CSWAP( N-I2, A( I2+1, I1 ), 1,
-     $                                 A( I2+1, I2 ), 1 ) 
+                     IF( I2.LT.N )
+     $                  CALL CSWAP( N-I2, A( I2+1, I1 ), 1,
+     $                                    A( I2+1, I2 ), 1 ) 
 *                    > Swap A(I1, I1) with A(I2, I2)
                      PIV = A( I1, I1 )
                      A( I1, I1 ) = A( I2, I2 )
diff --git a/lapack-netlib/SRC/csytri2.f b/lapack-netlib/SRC/csytri2.f
index 4bd8e4f99..8bee149c4 100644
--- a/lapack-netlib/SRC/csytri2.f
+++ b/lapack-netlib/SRC/csytri2.f
@@ -62,7 +62,7 @@
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX array, dimension (LDA,N)
-*>          On entry, the NB diagonal matrix D and the multipliers
+*>          On entry, the block diagonal matrix D and the multipliers
 *>          used to obtain the factor U or L as computed by CSYTRF.
 *>
 *>          On exit, if INFO = 0, the (symmetric) inverse of the original
@@ -82,7 +82,7 @@
 *> \param[in] IPIV
 *> \verbatim
 *>          IPIV is INTEGER array, dimension (N)
-*>          Details of the interchanges and the NB structure of D
+*>          Details of the interchanges and the block structure of D
 *>          as determined by CSYTRF.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/csytrs2.f b/lapack-netlib/SRC/csytrs2.f
index 1002b5461..93f2d6a1b 100644
--- a/lapack-netlib/SRC/csytrs2.f
+++ b/lapack-netlib/SRC/csytrs2.f
@@ -36,7 +36,7 @@
 *>
 *> \verbatim
 *>
-*> CSYTRS2 solves a system of linear equations A*X = B with a COMPLEX
+*> CSYTRS2 solves a system of linear equations A*X = B with a complex
 *> symmetric matrix A using the factorization A = U*D*U**T or
 *> A = L*D*L**T computed by CSYTRF and converted by CSYCONV.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/csytrs_aa.f b/lapack-netlib/SRC/csytrs_aa.f
index 7cf950492..981f8722a 100644
--- a/lapack-netlib/SRC/csytrs_aa.f
+++ b/lapack-netlib/SRC/csytrs_aa.f
@@ -37,7 +37,7 @@
 *> \verbatim
 *>
 *> CSYTRS_AA solves a system of linear equations A*X = B with a complex
-*> symmetric matrix A using the factorization A = U*T*U**T or
+*> symmetric matrix A using the factorization A = U**T*T*U or
 *> A = L*T*L**T computed by CSYTRF_AA.
 *> \endverbatim
 *
@@ -49,7 +49,7 @@
 *>          UPLO is CHARACTER*1
 *>          Specifies whether the details of the factorization are stored
 *>          as an upper or lower triangular matrix.
-*>          = 'U':  Upper triangular, form is A = U*T*U**T;
+*>          = 'U':  Upper triangular, form is A = U**T*T*U;
 *>          = 'L':  Lower triangular, form is A = L*T*L**T.
 *> \endverbatim
 *>
@@ -68,7 +68,7 @@
 *>
 *> \param[in] A
 *> \verbatim
-*>          A is REAL array, dimension (LDA,N)
+*>          A is COMPLEX array, dimension (LDA,N)
 *>          Details of factors computed by CSYTRF_AA.
 *> \endverbatim
 *>
@@ -86,7 +86,7 @@
 *>
 *> \param[in,out] B
 *> \verbatim
-*>          B is REAL array, dimension (LDB,NRHS)
+*>          B is COMPLEX array, dimension (LDB,NRHS)
 *>          On entry, the right hand side matrix B.
 *>          On exit, the solution matrix X.
 *> \endverbatim
@@ -97,14 +97,16 @@
 *>          The leading dimension of the array B.  LDB >= max(1,N).
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE array, dimension (MAX(1,LWORK))
+*>          WORK is COMPLEX array, dimension (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
-*>          LWORK is INTEGER, LWORK >= MAX(1,3*N-2).
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK. LWORK >= max(1,3*N-2).
+*> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
@@ -198,22 +200,29 @@
 *
       IF( UPPER ) THEN
 *
-*        Solve A*X = B, where A = U*T*U**T.
+*        Solve A*X = B, where A = U**T*T*U.
+*
+*        1) Forward substitution with U**T
+*
+         IF( N.GT.1 ) THEN
+*
+*           Pivot, P**T * B -> B
 *
-*        Pivot, P**T * B
+            DO K = 1, N
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+            END DO
 *
-         DO K = 1, N
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $          CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-         END DO
+*           Compute U**T \ B -> B    [ (U**T \P**T * B) ]
 *
-*        Compute (U \P**T * B) -> B    [ (U \P**T * B) ]
+            CALL CTRSM( 'L', 'U', 'T', 'U', N-1, NRHS, ONE, A( 1, 2 ),
+     $                  LDA, B( 2, 1 ), LDB)
+         END IF
 *
-         CALL CTRSM('L', 'U', 'T', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA,
-     $               B( 2, 1 ), LDB)
+*        2) Solve with triangular matrix T
 *
-*        Compute T \ B -> B   [ T \ (U \P**T * B) ]
+*        Compute T \ B -> B   [ T \ (U**T \P**T * B) ]
 *
          CALL CLACPY( 'F', 1, N, A( 1, 1 ), LDA+1, WORK( N ), 1)
          IF( N.GT.1 ) THEN
@@ -223,35 +232,48 @@
          CALL CGTSV( N, NRHS, WORK( 1 ), WORK( N ), WORK( 2*N ), B, LDB,
      $               INFO )
 *
-*        Compute (U**T \ B) -> B   [ U**T \ (T \ (U \P**T * B) ) ]
+*        3) Backward substitution with U
+*
+         IF( N.GT.1 ) THEN
 *
-         CALL CTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA,
-     $               B( 2, 1 ), LDB)
+*           Compute U \ B -> B   [ U \ (T \ (U**T \P**T * B) ) ]
 *
-*        Pivot, P * B  [ P * (U**T \ (T \ (U \P**T * B) )) ]
+            CALL CTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ),
+     $                  LDA, B( 2, 1 ), LDB)
 *
-         DO K = N, 1, -1
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-         END DO
+*           Pivot, P * B -> B  [ P * (U**T \ (T \ (U \P**T * B) )) ]
+*
+            DO K = N, 1, -1
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+            END DO
+         END IF
 *
       ELSE
 *
 *        Solve A*X = B, where A = L*T*L**T.
 *
-*        Pivot, P**T * B
+*        1) Forward substitution with L
 *
-         DO K = 1, N
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-         END DO
+         IF( N.GT.1 ) THEN
+*
+*           Pivot, P**T * B -> B
+*
+            DO K = 1, N
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+            END DO
+*
+*           Compute L \ B -> B    [ (L \P**T * B) ]
+*
+            CALL CTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1 ),
+     $                  LDA, B( 2, 1 ), LDB)
+         END IF
 *
-*        Compute (L \P**T * B) -> B    [ (L \P**T * B) ]
+*        2) Solve with triangular matrix T
 *
-         CALL CTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA,
-     $               B( 2, 1 ), LDB)
 *
 *        Compute T \ B -> B   [ T \ (L \P**T * B) ]
 *
@@ -263,18 +285,23 @@
          CALL CGTSV( N, NRHS, WORK( 1 ), WORK(N), WORK( 2*N ), B, LDB,
      $               INFO)
 *
-*        Compute (L**T \ B) -> B   [ L**T \ (T \ (L \P**T * B) ) ]
+*        3) Backward substitution with L**T
+*
+         IF( N.GT.1 ) THEN
 *
-         CALL CTRSM( 'L', 'L', 'T', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA,
-     $              B( 2, 1 ), LDB)
+*           Compute (L**T \ B) -> B   [ L**T \ (T \ (L \P**T * B) ) ]
 *
-*        Pivot, P * B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
+            CALL CTRSM( 'L', 'L', 'T', 'U', N-1, NRHS, ONE, A( 2, 1 ),
+     $                  LDA, B( 2, 1 ), LDB)
 *
-         DO K = N, 1, -1
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-         END DO
+*           Pivot, P * B -> B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
+*
+            DO K = N, 1, -1
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL CSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+            END DO
+         END IF
 *
       END IF
 *
diff --git a/lapack-netlib/SRC/csytrs_aa_2stage.f b/lapack-netlib/SRC/csytrs_aa_2stage.f
index d025c08fe..581910933 100644
--- a/lapack-netlib/SRC/csytrs_aa_2stage.f
+++ b/lapack-netlib/SRC/csytrs_aa_2stage.f
@@ -36,7 +36,7 @@
 *> \verbatim
 *>
 *> CSYTRS_AA_2STAGE solves a system of linear equations A*X = B with a complex
-*> symmetric matrix A using the factorization A = U*T*U**T or
+*> symmetric matrix A using the factorization A = U**T*T*U or
 *> A = L*T*L**T computed by CSYTRF_AA_2STAGE.
 *> \endverbatim
 *
@@ -48,7 +48,7 @@
 *>          UPLO is CHARACTER*1
 *>          Specifies whether the details of the factorization are stored
 *>          as an upper or lower triangular matrix.
-*>          = 'U':  Upper triangular, form is A = U*T*U**T;
+*>          = 'U':  Upper triangular, form is A = U**T*T*U;
 *>          = 'L':  Lower triangular, form is A = L*T*L**T.
 *> \endverbatim
 *>
@@ -208,15 +208,15 @@
 *
       IF( UPPER ) THEN
 *
-*        Solve A*X = B, where A = U*T*U**T.
+*        Solve A*X = B, where A = U**T*T*U.
 *
          IF( N.GT.NB ) THEN
 *
-*           Pivot, P**T * B
+*           Pivot, P**T * B -> B
 *
             CALL CLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 )
 *
-*           Compute (U**T \P**T * B) -> B    [ (U**T \P**T * B) ]
+*           Compute (U**T \ B) -> B    [ (U**T \P**T * B) ]
 *
             CALL CTRSM( 'L', 'U', 'T', 'U', N-NB, NRHS, ONE, A(1, NB+1),
      $                 LDA, B(NB+1, 1), LDB)
@@ -234,7 +234,7 @@
             CALL CTRSM( 'L', 'U', 'N', 'U', N-NB, NRHS, ONE, A(1, NB+1),
      $                  LDA, B(NB+1, 1), LDB)
 *
-*           Pivot, P * B  [ P * (U \ (T \ (U**T \P**T * B) )) ]
+*           Pivot, P * B -> B  [ P * (U \ (T \ (U**T \P**T * B) )) ]
 *
             CALL CLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 )
 *
@@ -246,11 +246,11 @@
 *
          IF( N.GT.NB ) THEN
 *
-*           Pivot, P**T * B
+*           Pivot, P**T * B -> B
 *
             CALL CLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 )
 *
-*           Compute (L \P**T * B) -> B    [ (L \P**T * B) ]
+*           Compute (L \ B) -> B    [ (L \P**T * B) ]
 *
             CALL CTRSM( 'L', 'L', 'N', 'U', N-NB, NRHS, ONE, A(NB+1, 1),
      $                 LDA, B(NB+1, 1), LDB)
@@ -268,7 +268,7 @@
             CALL CTRSM( 'L', 'L', 'T', 'U', N-NB, NRHS, ONE, A(NB+1, 1),
      $                  LDA, B(NB+1, 1), LDB)
 *
-*           Pivot, P * B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
+*           Pivot, P * B -> B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
 *
             CALL CLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 )
 *
diff --git a/lapack-netlib/SRC/ctgsy2.f b/lapack-netlib/SRC/ctgsy2.f
index 66a8980d0..5ccdfb1e1 100644
--- a/lapack-netlib/SRC/ctgsy2.f
+++ b/lapack-netlib/SRC/ctgsy2.f
@@ -67,7 +67,7 @@
 *>             R  * B**H + L  * E**H  = scale * -F
 *>
 *> This case is used to compute an estimate of Dif[(A, D), (B, E)] =
-*> = sigma_min(Z) using reverse communicaton with CLACON.
+*> = sigma_min(Z) using reverse communication with CLACON.
 *>
 *> CTGSY2 also (IJOB >= 1) contributes to the computation in CTGSYL
 *> of an upper bound on the separation between to matrix pairs. Then
@@ -81,7 +81,7 @@
 *> \param[in] TRANS
 *> \verbatim
 *>          TRANS is CHARACTER*1
-*>          = 'N', solve the generalized Sylvester equation (1).
+*>          = 'N': solve the generalized Sylvester equation (1).
 *>          = 'T': solve the 'transposed' system (3).
 *> \endverbatim
 *>
@@ -89,13 +89,13 @@
 *> \verbatim
 *>          IJOB is INTEGER
 *>          Specifies what kind of functionality to be performed.
-*>          =0: solve (1) only.
-*>          =1: A contribution from this subsystem to a Frobenius
-*>              norm-based estimate of the separation between two matrix
-*>              pairs is computed. (look ahead strategy is used).
-*>          =2: A contribution from this subsystem to a Frobenius
-*>              norm-based estimate of the separation between two matrix
-*>              pairs is computed. (SGECON on sub-systems is used.)
+*>          = 0: solve (1) only.
+*>          = 1: A contribution from this subsystem to a Frobenius
+*>               norm-based estimate of the separation between two matrix
+*>               pairs is computed. (look ahead strategy is used).
+*>          = 2: A contribution from this subsystem to a Frobenius
+*>               norm-based estimate of the separation between two matrix
+*>               pairs is computed. (SGECON on sub-systems is used.)
 *>          Not referenced if TRANS = 'T'.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/ctplqt.f b/lapack-netlib/SRC/ctplqt.f
index cb4d419b9..39893df48 100644
--- a/lapack-netlib/SRC/ctplqt.f
+++ b/lapack-netlib/SRC/ctplqt.f
@@ -1,3 +1,5 @@
+*> \brief \b CTPLQT
+*
 *  Definition:
 *  ===========
 *
diff --git a/lapack-netlib/SRC/ctplqt2.f b/lapack-netlib/SRC/ctplqt2.f
index b16d6149a..d18452aec 100644
--- a/lapack-netlib/SRC/ctplqt2.f
+++ b/lapack-netlib/SRC/ctplqt2.f
@@ -1,3 +1,5 @@
+*> \brief \b CTPLQT2
+*
 *  Definition:
 *  ===========
 *
diff --git a/lapack-netlib/SRC/ctpmlqt.f b/lapack-netlib/SRC/ctpmlqt.f
index cb5f033ca..5899a5335 100644
--- a/lapack-netlib/SRC/ctpmlqt.f
+++ b/lapack-netlib/SRC/ctpmlqt.f
@@ -1,3 +1,5 @@
+*> \brief \b CTPMLQT
+*
 *  Definition:
 *  ===========
 *
@@ -77,7 +79,7 @@
 *>
 *> \param[in] V
 *> \verbatim
-*>          V is COMPLEX array, dimension (LDA,K)
+*>          V is COMPLEX array, dimension (LDV,K)
 *>          The i-th row must contain the vector which defines the
 *>          elementary reflector H(i), for i = 1,2,...,k, as returned by
 *>          DTPLQT in B.  See Further Details.
diff --git a/lapack-netlib/SRC/ctpmqrt.f b/lapack-netlib/SRC/ctpmqrt.f
index fd3d1b109..8d4a36ca8 100644
--- a/lapack-netlib/SRC/ctpmqrt.f
+++ b/lapack-netlib/SRC/ctpmqrt.f
@@ -94,7 +94,7 @@
 *>
 *> \param[in] V
 *> \verbatim
-*>          V is COMPLEX array, dimension (LDA,K)
+*>          V is COMPLEX array, dimension (LDV,K)
 *>          The i-th column must contain the vector which defines the
 *>          elementary reflector H(i), for i = 1,2,...,k, as returned by
 *>          CTPQRT in B.  See Further Details.
diff --git a/lapack-netlib/SRC/ctprfb.f b/lapack-netlib/SRC/ctprfb.f
index 1538deb56..0f45edaf8 100644
--- a/lapack-netlib/SRC/ctprfb.f
+++ b/lapack-netlib/SRC/ctprfb.f
@@ -152,8 +152,8 @@
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.
-*>          If SIDE = 'L', LDC >= max(1,K);
-*>          If SIDE = 'R', LDC >= max(1,M).
+*>          If SIDE = 'L', LDA >= max(1,K);
+*>          If SIDE = 'R', LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[in,out] B
diff --git a/lapack-netlib/SRC/cungtsqr.f b/lapack-netlib/SRC/cungtsqr.f
new file mode 100644
index 000000000..bc5305cf9
--- /dev/null
+++ b/lapack-netlib/SRC/cungtsqr.f
@@ -0,0 +1,307 @@
+*> \brief \b CUNGTSQR
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CUNGTSQR + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cuntsqr.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zungtsqr.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zungtsqr.f">
+*> [TXT]</a>
+*>
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CUNGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
+*      $                     INFO )
+*
+*       .. Scalar Arguments ..
+*       INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX           A( LDA, * ), T( LDT, * ), WORK( * )
+*       ..
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CUNGTSQR generates an M-by-N complex matrix Q_out with orthonormal
+*> columns, which are the first N columns of a product of comlpex unitary
+*> matrices of order M which are returned by CLATSQR
+*>
+*>      Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+*>
+*> See the documentation for CLATSQR.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB
+*> \verbatim
+*>          MB is INTEGER
+*>          The row block size used by DLATSQR to return
+*>          arrays A and T. MB > N.
+*>          (Note that if MB > M, then M is used instead of MB
+*>          as the row block size).
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          The column block size used by CLATSQR to return
+*>          arrays A and T. NB >= 1.
+*>          (Note that if NB > N, then N is used instead of NB
+*>          as the column block size).
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>
+*>          On entry:
+*>
+*>             The elements on and above the diagonal are not accessed.
+*>             The elements below the diagonal represent the unit
+*>             lower-trapezoidal blocked matrix V computed by CLATSQR
+*>             that defines the input matrices Q_in(k) (ones on the
+*>             diagonal are not stored) (same format as the output A
+*>             below the diagonal in CLATSQR).
+*>
+*>          On exit:
+*>
+*>             The array A contains an M-by-N orthonormal matrix Q_out,
+*>             i.e the columns of A are orthogonal unit vectors.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is COMPLEX array,
+*>          dimension (LDT, N * NIRB)
+*>          where NIRB = Number_of_input_row_blocks
+*>                     = MAX( 1, CEIL((M-N)/(MB-N)) )
+*>          Let NICB = Number_of_input_col_blocks
+*>                   = CEIL(N/NB)
+*>
+*>          The upper-triangular block reflectors used to define the
+*>          input matrices Q_in(k), k=(1:NIRB*NICB). The block
+*>          reflectors are stored in compact form in NIRB block
+*>          reflector sequences. Each of NIRB block reflector sequences
+*>          is stored in a larger NB-by-N column block of T and consists
+*>          of NICB smaller NB-by-NB upper-triangular column blocks.
+*>          (same format as the output T in CLATSQR).
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T.
+*>          LDT >= max(1,min(NB1,N)).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          (workspace) COMPLEX array, dimension (MAX(2,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          The dimension of the array WORK.  LWORK >= (M+NB)*N.
+*>          If LWORK = -1, then a workspace query is assumed.
+*>          The routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup comlexOTHERcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2019, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE CUNGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
+*     ..
+*     .. Array Arguments ..
+      COMPLEX        A( LDA, * ), T( LDT, * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX            CONE, CZERO
+      PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ),
+     $                     CZERO = ( 0.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY
+      INTEGER            IINFO, LDC, LWORKOPT, LC, LW, NBLOCAL, J
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CCOPY, CLAMTSQR, CLASET, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          CMPLX, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters
+*
+      LQUERY  = LWORK.EQ.-1
+      INFO = 0
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
+         INFO = -2
+      ELSE IF( MB.LE.N ) THEN
+         INFO = -3
+      ELSE IF( NB.LT.1 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -6
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
+         INFO = -8
+      ELSE
+*
+*        Test the input LWORK for the dimension of the array WORK.
+*        This workspace is used to store array C(LDC, N) and WORK(LWORK)
+*        in the call to CLAMTSQR. See the documentation for CLAMTSQR.
+*
+         IF( LWORK.LT.2 .AND. (.NOT.LQUERY) ) THEN
+            INFO = -10
+         ELSE
+*
+*           Set block size for column blocks
+*
+            NBLOCAL = MIN( NB, N )
+*
+*           LWORK = -1, then set the size for the array C(LDC,N)
+*           in CLAMTSQR call and set the optimal size of the work array
+*           WORK(LWORK) in CLAMTSQR call.
+*
+            LDC = M
+            LC = LDC*N
+            LW = N * NBLOCAL
+*
+            LWORKOPT = LC+LW
+*
+            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
+               INFO = -10
+            END IF
+         END IF
+*
+      END IF
+*
+*     Handle error in the input parameters and return workspace query.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CUNGTSQR', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+         WORK( 1 ) = CMPLX( LWORKOPT )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 ) THEN
+         WORK( 1 ) = CMPLX( LWORKOPT )
+         RETURN
+      END IF
+*
+*     (1) Form explicitly the tall-skinny M-by-N left submatrix Q1_in
+*     of M-by-M orthogonal matrix Q_in, which is implicitly stored in
+*     the subdiagonal part of input array A and in the input array T.
+*     Perform by the following operation using the routine CLAMTSQR.
+*
+*         Q1_in = Q_in * ( I ), where I is a N-by-N identity matrix,
+*                        ( 0 )        0 is a (M-N)-by-N zero matrix.
+*
+*     (1a) Form M-by-N matrix in the array WORK(1:LDC*N) with ones
+*     on the diagonal and zeros elsewhere.
+*
+      CALL CLASET( 'F', M, N, CZERO, CONE, WORK, LDC )
+*
+*     (1b)  On input, WORK(1:LDC*N) stores ( I );
+*                                          ( 0 )
+*
+*           On output, WORK(1:LDC*N) stores Q1_in.
+*
+      CALL CLAMTSQR( 'L', 'N', M, N, N, MB, NBLOCAL, A, LDA, T, LDT,
+     $               WORK, LDC, WORK( LC+1 ), LW, IINFO )
+*
+*     (2) Copy the result from the part of the work array (1:M,1:N)
+*     with the leading dimension LDC that starts at WORK(1) into
+*     the output array A(1:M,1:N) column-by-column.
+*
+      DO J = 1, N
+         CALL CCOPY( M, WORK( (J-1)*LDC + 1 ), 1, A( 1, J ), 1 )
+      END DO
+*
+      WORK( 1 ) = CMPLX( LWORKOPT )
+      RETURN
+*
+*     End of CUNGTSQR
+*
+      END
\ No newline at end of file
diff --git a/lapack-netlib/SRC/cunhr_col.f b/lapack-netlib/SRC/cunhr_col.f
new file mode 100644
index 000000000..15c31491e
--- /dev/null
+++ b/lapack-netlib/SRC/cunhr_col.f
@@ -0,0 +1,441 @@
+*> \brief \b CUNHR_COL
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CUNHR_COL + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/cunhr_col.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/cunhr_col.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/cunhr_col.f">
+*> [TXT]</a>
+*>
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CUNHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO )
+*
+*       .. Scalar Arguments ..
+*       INTEGER           INFO, LDA, LDT, M, N, NB
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX           A( LDA, * ), D( * ), T( LDT, * )
+*       ..
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*>  CUNHR_COL takes an M-by-N complex matrix Q_in with orthonormal columns
+*>  as input, stored in A, and performs Householder Reconstruction (HR),
+*>  i.e. reconstructs Householder vectors V(i) implicitly representing
+*>  another M-by-N matrix Q_out, with the property that Q_in = Q_out*S,
+*>  where S is an N-by-N diagonal matrix with diagonal entries
+*>  equal to +1 or -1. The Householder vectors (columns V(i) of V) are
+*>  stored in A on output, and the diagonal entries of S are stored in D.
+*>  Block reflectors are also returned in T
+*>  (same output format as CGEQRT).
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          The column block size to be used in the reconstruction
+*>          of Householder column vector blocks in the array A and
+*>          corresponding block reflectors in the array T. NB >= 1.
+*>          (Note that if NB > N, then N is used instead of NB
+*>          as the column block size.)
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>
+*>          On entry:
+*>
+*>             The array A contains an M-by-N orthonormal matrix Q_in,
+*>             i.e the columns of A are orthogonal unit vectors.
+*>
+*>          On exit:
+*>
+*>             The elements below the diagonal of A represent the unit
+*>             lower-trapezoidal matrix V of Householder column vectors
+*>             V(i). The unit diagonal entries of V are not stored
+*>             (same format as the output below the diagonal in A from
+*>             CGEQRT). The matrix T and the matrix V stored on output
+*>             in A implicitly define Q_out.
+*>
+*>             The elements above the diagonal contain the factor U
+*>             of the "modified" LU-decomposition:
+*>                Q_in - ( S ) = V * U
+*>                       ( 0 )
+*>             where 0 is a (M-N)-by-(M-N) zero matrix.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] T
+*> \verbatim
+*>          T is COMPLEX array,
+*>          dimension (LDT, N)
+*>
+*>          Let NOCB = Number_of_output_col_blocks
+*>                   = CEIL(N/NB)
+*>
+*>          On exit, T(1:NB, 1:N) contains NOCB upper-triangular
+*>          block reflectors used to define Q_out stored in compact
+*>          form as a sequence of upper-triangular NB-by-NB column
+*>          blocks (same format as the output T in CGEQRT).
+*>          The matrix T and the matrix V stored on output in A
+*>          implicitly define Q_out. NOTE: The lower triangles
+*>          below the upper-triangular blcoks will be filled with
+*>          zeros. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T.
+*>          LDT >= max(1,min(NB,N)).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is COMPLEX array, dimension min(M,N).
+*>          The elements can be only plus or minus one.
+*>
+*>          D(i) is constructed as D(i) = -SIGN(Q_in_i(i,i)), where
+*>          1 <= i <= min(M,N), and Q_in_i is Q_in after performing
+*>          i-1 steps of “modified” Gaussian elimination.
+*>          See Further Details.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*> The computed M-by-M unitary factor Q_out is defined implicitly as
+*> a product of unitary matrices Q_out(i). Each Q_out(i) is stored in
+*> the compact WY-representation format in the corresponding blocks of
+*> matrices V (stored in A) and T.
+*>
+*> The M-by-N unit lower-trapezoidal matrix V stored in the M-by-N
+*> matrix A contains the column vectors V(i) in NB-size column
+*> blocks VB(j). For example, VB(1) contains the columns
+*> V(1), V(2), ... V(NB). NOTE: The unit entries on
+*> the diagonal of Y are not stored in A.
+*>
+*> The number of column blocks is
+*>
+*>     NOCB = Number_of_output_col_blocks = CEIL(N/NB)
+*>
+*> where each block is of order NB except for the last block, which
+*> is of order LAST_NB = N - (NOCB-1)*NB.
+*>
+*> For example, if M=6,  N=5 and NB=2, the matrix V is
+*>
+*>
+*>     V = (    VB(1),   VB(2), VB(3) ) =
+*>
+*>       = (   1                      )
+*>         ( v21    1                 )
+*>         ( v31  v32    1            )
+*>         ( v41  v42  v43   1        )
+*>         ( v51  v52  v53  v54    1  )
+*>         ( v61  v62  v63  v54   v65 )
+*>
+*>
+*> For each of the column blocks VB(i), an upper-triangular block
+*> reflector TB(i) is computed. These blocks are stored as
+*> a sequence of upper-triangular column blocks in the NB-by-N
+*> matrix T. The size of each TB(i) block is NB-by-NB, except
+*> for the last block, whose size is LAST_NB-by-LAST_NB.
+*>
+*> For example, if M=6,  N=5 and NB=2, the matrix T is
+*>
+*>     T  = (    TB(1),    TB(2), TB(3) ) =
+*>
+*>        = ( t11  t12  t13  t14   t15  )
+*>          (      t22       t24        )
+*>
+*>
+*> The M-by-M factor Q_out is given as a product of NOCB
+*> unitary M-by-M matrices Q_out(i).
+*>
+*>     Q_out = Q_out(1) * Q_out(2) * ... * Q_out(NOCB),
+*>
+*> where each matrix Q_out(i) is given by the WY-representation
+*> using corresponding blocks from the matrices V and T:
+*>
+*>     Q_out(i) = I - VB(i) * TB(i) * (VB(i))**T,
+*>
+*> where I is the identity matrix. Here is the formula with matrix
+*> dimensions:
+*>
+*>  Q(i){M-by-M} = I{M-by-M} -
+*>    VB(i){M-by-INB} * TB(i){INB-by-INB} * (VB(i))**T {INB-by-M},
+*>
+*> where INB = NB, except for the last block NOCB
+*> for which INB=LAST_NB.
+*>
+*> =====
+*> NOTE:
+*> =====
+*>
+*> If Q_in is the result of doing a QR factorization
+*> B = Q_in * R_in, then:
+*>
+*> B = (Q_out*S) * R_in = Q_out * (S * R_in) = O_out * R_out.
+*>
+*> So if one wants to interpret Q_out as the result
+*> of the QR factorization of B, then corresponding R_out
+*> should be obtained by R_out = S * R_in, i.e. some rows of R_in
+*> should be multiplied by -1.
+*>
+*> For the details of the algorithm, see [1].
+*>
+*> [1] "Reconstructing Householder vectors from tall-skinny QR",
+*>     G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+*>     E. Solomonik, J. Parallel Distrib. Comput.,
+*>     vol. 85, pp. 3-31, 2015.
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup complexOTHERcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November   2019, Igor Kozachenko,
+*>            Computer Science Division,
+*>            University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE CUNHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER           INFO, LDA, LDT, M, N, NB
+*     ..
+*     .. Array Arguments ..
+      COMPLEX           A( LDA, * ), D( * ), T( LDT, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX            CONE, CZERO
+      PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ),
+     $                     CZERO = ( 0.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, IINFO, J, JB, JBTEMP1, JBTEMP2, JNB,
+     $                   NPLUSONE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CCOPY, CLAUNHR_COL_GETRFNP, CSCAL, CTRSM,
+     $                   XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters
+*
+      INFO = 0
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 .OR. N.GT.M ) THEN
+         INFO = -2
+      ELSE IF( NB.LT.1 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -5
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
+         INFO = -7
+      END IF
+*
+*     Handle error in the input parameters.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'CUNHR_COL', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 ) THEN
+         RETURN
+      END IF
+*
+*     On input, the M-by-N matrix A contains the unitary
+*     M-by-N matrix Q_in.
+*
+*     (1) Compute the unit lower-trapezoidal V (ones on the diagonal
+*     are not stored) by performing the "modified" LU-decomposition.
+*
+*     Q_in - ( S ) = V * U = ( V1 ) * U,
+*            ( 0 )           ( V2 )
+*
+*     where 0 is an (M-N)-by-N zero matrix.
+*
+*     (1-1) Factor V1 and U.
+
+      CALL CLAUNHR_COL_GETRFNP( N, N, A, LDA, D, IINFO )
+*
+*     (1-2) Solve for V2.
+*
+      IF( M.GT.N ) THEN
+         CALL CTRSM( 'R', 'U', 'N', 'N', M-N, N, CONE, A, LDA,
+     $               A( N+1, 1 ), LDA )
+      END IF
+*
+*     (2) Reconstruct the block reflector T stored in T(1:NB, 1:N)
+*     as a sequence of upper-triangular blocks with NB-size column
+*     blocking.
+*
+*     Loop over the column blocks of size NB of the array A(1:M,1:N)
+*     and the array T(1:NB,1:N), JB is the column index of a column
+*     block, JNB is the column block size at each step JB.
+*
+      NPLUSONE = N + 1
+      DO JB = 1, N, NB
+*
+*        (2-0) Determine the column block size JNB.
+*
+         JNB = MIN( NPLUSONE-JB, NB )
+*
+*        (2-1) Copy the upper-triangular part of the current JNB-by-JNB
+*        diagonal block U(JB) (of the N-by-N matrix U) stored
+*        in A(JB:JB+JNB-1,JB:JB+JNB-1) into the upper-triangular part
+*        of the current JNB-by-JNB block T(1:JNB,JB:JB+JNB-1)
+*        column-by-column, total JNB*(JNB+1)/2 elements.
+*
+         JBTEMP1 = JB - 1
+         DO J = JB, JB+JNB-1
+            CALL CCOPY( J-JBTEMP1, A( JB, J ), 1, T( 1, J ), 1 )
+         END DO
+*
+*        (2-2) Perform on the upper-triangular part of the current
+*        JNB-by-JNB diagonal block U(JB) (of the N-by-N matrix U) stored
+*        in T(1:JNB,JB:JB+JNB-1) the following operation in place:
+*        (-1)*U(JB)*S(JB), i.e the result will be stored in the upper-
+*        triangular part of T(1:JNB,JB:JB+JNB-1). This multiplication
+*        of the JNB-by-JNB diagonal block U(JB) by the JNB-by-JNB
+*        diagonal block S(JB) of the N-by-N sign matrix S from the
+*        right means changing the sign of each J-th column of the block
+*        U(JB) according to the sign of the diagonal element of the block
+*        S(JB), i.e. S(J,J) that is stored in the array element D(J).
+*
+         DO J = JB, JB+JNB-1
+            IF( D( J ).EQ.CONE ) THEN
+               CALL CSCAL( J-JBTEMP1, -CONE, T( 1, J ), 1 )
+            END IF
+         END DO
+*
+*        (2-3) Perform the triangular solve for the current block
+*        matrix X(JB):
+*
+*               X(JB) * (A(JB)**T) = B(JB), where:
+*
+*               A(JB)**T  is a JNB-by-JNB unit upper-triangular
+*                         coefficient block, and A(JB)=V1(JB), which
+*                         is a JNB-by-JNB unit lower-triangular block
+*                         stored in A(JB:JB+JNB-1,JB:JB+JNB-1).
+*                         The N-by-N matrix V1 is the upper part
+*                         of the M-by-N lower-trapezoidal matrix V
+*                         stored in A(1:M,1:N);
+*
+*               B(JB)     is a JNB-by-JNB  upper-triangular right-hand
+*                         side block, B(JB) = (-1)*U(JB)*S(JB), and
+*                         B(JB) is stored in T(1:JNB,JB:JB+JNB-1);
+*
+*               X(JB)     is a JNB-by-JNB upper-triangular solution
+*                         block, X(JB) is the upper-triangular block
+*                         reflector T(JB), and X(JB) is stored
+*                         in T(1:JNB,JB:JB+JNB-1).
+*
+*             In other words, we perform the triangular solve for the
+*             upper-triangular block T(JB):
+*
+*               T(JB) * (V1(JB)**T) = (-1)*U(JB)*S(JB).
+*
+*             Even though the blocks X(JB) and B(JB) are upper-
+*             triangular, the routine CTRSM will access all JNB**2
+*             elements of the square T(1:JNB,JB:JB+JNB-1). Therefore,
+*             we need to set to zero the elements of the block
+*             T(1:JNB,JB:JB+JNB-1) below the diagonal before the call
+*             to CTRSM.
+*
+*        (2-3a) Set the elements to zero.
+*
+         JBTEMP2 = JB - 2
+         DO J = JB, JB+JNB-2
+            DO I = J-JBTEMP2, NB
+               T( I, J ) = CZERO
+            END DO
+         END DO
+*
+*        (2-3b) Perform the triangular solve.
+*
+         CALL CTRSM( 'R', 'L', 'C', 'U', JNB, JNB, CONE,
+     $               A( JB, JB ), LDA, T( 1, JB ), LDT )
+*
+      END DO
+*
+      RETURN
+*
+*     End of CUNHR_COL
+*
+      END
\ No newline at end of file
diff --git a/lapack-netlib/SRC/dbdsqr.f b/lapack-netlib/SRC/dbdsqr.f
index 93db95e7a..7d47fa282 100644
--- a/lapack-netlib/SRC/dbdsqr.f
+++ b/lapack-netlib/SRC/dbdsqr.f
@@ -166,7 +166,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE PRECISION array, dimension (4*N)
+*>          WORK is DOUBLE PRECISION array, dimension (4*(N-1))
 *> \endverbatim
 *>
 *> \param[out] INFO
diff --git a/lapack-netlib/SRC/dbdsvdx.f b/lapack-netlib/SRC/dbdsvdx.f
index 96fdb3d61..10d97a71f 100644
--- a/lapack-netlib/SRC/dbdsvdx.f
+++ b/lapack-netlib/SRC/dbdsvdx.f
@@ -165,7 +165,7 @@
 *>
 *> \param[out] Z
 *> \verbatim
-*>          Z is DOUBLE PRECISION array, dimension (2*N,K) )
+*>          Z is DOUBLE PRECISION array, dimension (2*N,K)
 *>          If JOBZ = 'V', then if INFO = 0 the first NS columns of Z
 *>          contain the singular vectors of the matrix B corresponding to
 *>          the selected singular values, with U in rows 1 to N and V
diff --git a/lapack-netlib/SRC/dcombssq.f b/lapack-netlib/SRC/dcombssq.f
new file mode 100644
index 000000000..79f6d95c9
--- /dev/null
+++ b/lapack-netlib/SRC/dcombssq.f
@@ -0,0 +1,92 @@
+*> \brief \b DCOMBSSQ adds two scaled sum of squares quantities.
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DCOMBSSQ( V1, V2 )
+*
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   V1( 2 ), V2( 2 )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DCOMBSSQ adds two scaled sum of squares quantities, V1 := V1 + V2.
+*> That is,
+*>
+*>    V1_scale**2 * V1_sumsq := V1_scale**2 * V1_sumsq
+*>                            + V2_scale**2 * V2_sumsq
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in,out] V1
+*> \verbatim
+*>          V1 is DOUBLE PRECISION array, dimension (2).
+*>          The first scaled sum.
+*>          V1(1) = V1_scale, V1(2) = V1_sumsq.
+*> \endverbatim
+*>
+*> \param[in] V2
+*> \verbatim
+*>          V2 is DOUBLE PRECISION array, dimension (2).
+*>          The second scaled sum.
+*>          V2(1) = V2_scale, V2(2) = V2_sumsq.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2018
+*
+*> \ingroup OTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE DCOMBSSQ( V1, V2 )
+*
+*  -- LAPACK auxiliary routine (version 3.7.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2018
+*
+*     .. Array Arguments ..
+      DOUBLE PRECISION   V1( 2 ), V2( 2 )
+*     ..
+*
+* =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER          ( ZERO = 0.0D+0 )
+*     ..
+*     .. Executable Statements ..
+*
+      IF( V1( 1 ).GE.V2( 1 ) ) THEN
+         IF( V1( 1 ).NE.ZERO ) THEN
+            V1( 2 ) = V1( 2 ) + ( V2( 1 ) / V1( 1 ) )**2 * V2( 2 )
+         END IF
+      ELSE
+         V1( 2 ) = V2( 2 ) + ( V1( 1 ) / V2( 1 ) )**2 * V1( 2 )
+         V1( 1 ) = V2( 1 )
+      END IF
+      RETURN
+*
+*     End of DCOMBSSQ
+*
+      END
diff --git a/lapack-netlib/SRC/dgbrfsx.f b/lapack-netlib/SRC/dgbrfsx.f
index fb52d643f..76afb2d6a 100644
--- a/lapack-netlib/SRC/dgbrfsx.f
+++ b/lapack-netlib/SRC/dgbrfsx.f
@@ -308,7 +308,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -344,14 +344,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension (NPARAMS)
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -359,9 +359,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/dgbsvxx.f b/lapack-netlib/SRC/dgbsvxx.f
index 819d20c6d..058b20686 100644
--- a/lapack-netlib/SRC/dgbsvxx.f
+++ b/lapack-netlib/SRC/dgbsvxx.f
@@ -431,7 +431,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -467,14 +467,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension (NPARAMS)
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -482,9 +482,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the extra-precise refinement algorithm.
+*>            = 1.0:  Use the extra-precise refinement algorithm.
 *>              (other values are reserved for future use)
 *>
 *>       PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual
diff --git a/lapack-netlib/SRC/dgebak.f b/lapack-netlib/SRC/dgebak.f
index 45a86ee57..10a78aa1a 100644
--- a/lapack-netlib/SRC/dgebak.f
+++ b/lapack-netlib/SRC/dgebak.f
@@ -47,10 +47,10 @@
 *> \verbatim
 *>          JOB is CHARACTER*1
 *>          Specifies the type of backward transformation required:
-*>          = 'N', do nothing, return immediately;
-*>          = 'P', do backward transformation for permutation only;
-*>          = 'S', do backward transformation for scaling only;
-*>          = 'B', do backward transformations for both permutation and
+*>          = 'N': do nothing, return immediately;
+*>          = 'P': do backward transformation for permutation only;
+*>          = 'S': do backward transformation for scaling only;
+*>          = 'B': do backward transformations for both permutation and
 *>                 scaling.
 *>          JOB must be the same as the argument JOB supplied to DGEBAL.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dgeesx.f b/lapack-netlib/SRC/dgeesx.f
index 26042a5f9..a08104d3d 100644
--- a/lapack-netlib/SRC/dgeesx.f
+++ b/lapack-netlib/SRC/dgeesx.f
@@ -583,7 +583,9 @@
                      IF( N.GT.I+1 )
      $                  CALL DSWAP( N-I-1, A( I, I+2 ), LDA,
      $                              A( I+1, I+2 ), LDA )
-                     CALL DSWAP( N, VS( 1, I ), 1, VS( 1, I+1 ), 1 )
+                     IF( WANTVS ) THEN
+                       CALL DSWAP( N, VS( 1, I ), 1, VS( 1, I+1 ), 1 )
+                     END IF
                      A( I, I+1 ) = A( I+1, I )
                      A( I+1, I ) = ZERO
                   END IF
diff --git a/lapack-netlib/SRC/dgejsv.f b/lapack-netlib/SRC/dgejsv.f
index 25ed248d0..a30cfab87 100644
--- a/lapack-netlib/SRC/dgejsv.f
+++ b/lapack-netlib/SRC/dgejsv.f
@@ -82,7 +82,7 @@
 *>             desirable, then this option is advisable. The input matrix A
 *>             is preprocessed with QR factorization with FULL (row and
 *>             column) pivoting.
-*>       = 'G'  Computation as with 'F' with an additional estimate of the
+*>       = 'G': Computation as with 'F' with an additional estimate of the
 *>             condition number of B, where A=D*B. If A has heavily weighted
 *>             rows, then using this condition number gives too pessimistic
 *>             error bound.
@@ -133,7 +133,7 @@
 *>        specified range. If A .NE. 0 is scaled so that the largest singular
 *>        value of c*A is around DSQRT(BIG), BIG=SLAMCH('O'), then JOBR issues
 *>        the licence to kill columns of A whose norm in c*A is less than
-*>        DSQRT(SFMIN) (for JOBR.EQ.'R'), or less than SMALL=SFMIN/EPSLN,
+*>        DSQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN,
 *>        where SFMIN=SLAMCH('S'), EPSLN=SLAMCH('E').
 *>       = 'N': Do not kill small columns of c*A. This option assumes that
 *>             BLAS and QR factorizations and triangular solvers are
@@ -230,7 +230,7 @@
 *>          If JOBU = 'F', then U contains on exit the M-by-M matrix of
 *>                         the left singular vectors, including an ONB
 *>                         of the orthogonal complement of the Range(A).
-*>          If JOBU = 'W'  .AND. (JOBV.EQ.'V' .AND. JOBT.EQ.'T' .AND. M.EQ.N),
+*>          If JOBU = 'W'  .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N),
 *>                         then U is used as workspace if the procedure
 *>                         replaces A with A^t. In that case, [V] is computed
 *>                         in U as left singular vectors of A^t and then
@@ -252,7 +252,7 @@
 *>          V is DOUBLE PRECISION array, dimension ( LDV, N )
 *>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
 *>                         the right singular vectors;
-*>          If JOBV = 'W', AND (JOBU.EQ.'U' AND JOBT.EQ.'T' AND M.EQ.N),
+*>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
 *>                         then V is used as workspace if the pprocedure
 *>                         replaces A with A^t. In that case, [U] is computed
 *>                         in V as right singular vectors of A^t and then
@@ -272,13 +272,13 @@
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION array, dimension (LWORK)
-*>          On exit, if N.GT.0 .AND. M.GT.0 (else not referenced),
+*>          On exit, if N > 0 .AND. M > 0 (else not referenced),
 *>          WORK(1) = SCALE = WORK(2) / WORK(1) is the scaling factor such
 *>                    that SCALE*SVA(1:N) are the computed singular values
 *>                    of A. (See the description of SVA().)
 *>          WORK(2) = See the description of WORK(1).
 *>          WORK(3) = SCONDA is an estimate for the condition number of
-*>                    column equilibrated A. (If JOBA .EQ. 'E' or 'G')
+*>                    column equilibrated A. (If JOBA = 'E' or 'G')
 *>                    SCONDA is an estimate of DSQRT(||(R^t * R)^(-1)||_1).
 *>                    It is computed using DPOCON. It holds
 *>                    N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
@@ -297,7 +297,7 @@
 *>                    triangular factor in the first QR factorization.
 *>          WORK(5) = an estimate of the scaled condition number of the
 *>                    triangular factor in the second QR factorization.
-*>          The following two parameters are computed if JOBT .EQ. 'T'.
+*>          The following two parameters are computed if JOBT = 'T'.
 *>          They are provided for a developer/implementer who is familiar
 *>          with the details of the method.
 *>
@@ -313,8 +313,8 @@
 *>          Length of WORK to confirm proper allocation of work space.
 *>          LWORK depends on the job:
 *>
-*>          If only SIGMA is needed ( JOBU.EQ.'N', JOBV.EQ.'N' ) and
-*>            -> .. no scaled condition estimate required (JOBE.EQ.'N'):
+*>          If only SIGMA is needed (JOBU = 'N', JOBV = 'N') and
+*>            -> .. no scaled condition estimate required (JOBE = 'N'):
 *>               LWORK >= max(2*M+N,4*N+1,7). This is the minimal requirement.
 *>               ->> For optimal performance (blocked code) the optimal value
 *>               is LWORK >= max(2*M+N,3*N+(N+1)*NB,7). Here NB is the optimal
@@ -330,7 +330,7 @@
 *>               LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DGEQRF),
 *>                                                     N+N*N+LWORK(DPOCON),7).
 *>
-*>          If SIGMA and the right singular vectors are needed (JOBV.EQ.'V'),
+*>          If SIGMA and the right singular vectors are needed (JOBV = 'V'),
 *>            -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7).
 *>            -> For optimal performance, LWORK >= max(2*M+N,3*N+(N+1)*NB,7),
 *>               where NB is the optimal block size for DGEQP3, DGEQRF, DGELQF,
@@ -341,19 +341,19 @@
 *>          If SIGMA and the left singular vectors are needed
 *>            -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7).
 *>            -> For optimal performance:
-*>               if JOBU.EQ.'U' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,7),
-*>               if JOBU.EQ.'F' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,N+M*NB,7),
+*>               if JOBU = 'U' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,7),
+*>               if JOBU = 'F' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,N+M*NB,7),
 *>               where NB is the optimal block size for DGEQP3, DGEQRF, DORMQR.
 *>               In general, the optimal length LWORK is computed as
 *>               LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DPOCON),
 *>                        2*N+LWORK(DGEQRF), N+LWORK(DORMQR)).
-*>               Here LWORK(DORMQR) equals N*NB (for JOBU.EQ.'U') or
-*>               M*NB (for JOBU.EQ.'F').
+*>               Here LWORK(DORMQR) equals N*NB (for JOBU = 'U') or
+*>               M*NB (for JOBU = 'F').
 *>
-*>          If the full SVD is needed: (JOBU.EQ.'U' or JOBU.EQ.'F') and
-*>            -> if JOBV.EQ.'V'
+*>          If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and
+*>            -> if JOBV = 'V'
 *>               the minimal requirement is LWORK >= max(2*M+N,6*N+2*N*N).
-*>            -> if JOBV.EQ.'J' the minimal requirement is
+*>            -> if JOBV = 'J' the minimal requirement is
 *>               LWORK >= max(2*M+N, 4*N+N*N,2*N+N*N+6).
 *>            -> For optimal performance, LWORK should be additionally
 *>               larger than N+M*NB, where NB is the optimal block size
@@ -369,7 +369,7 @@
 *>                     of JOBA and JOBR.
 *>          IWORK(2) = the number of the computed nonzero singular values
 *>          IWORK(3) = if nonzero, a warning message:
-*>                     If IWORK(3).EQ.1 then some of the column norms of A
+*>                     If IWORK(3) = 1 then some of the column norms of A
 *>                     were denormalized floats. The requested high accuracy
 *>                     is not warranted by the data.
 *> \endverbatim
@@ -377,10 +377,10 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>           < 0  : if INFO = -i, then the i-th argument had an illegal value.
-*>           = 0 :  successful exit;
-*>           > 0 :  DGEJSV  did not converge in the maximal allowed number
-*>                  of sweeps. The computed values may be inaccurate.
+*>           < 0:  if INFO = -i, then the i-th argument had an illegal value.
+*>           = 0:  successful exit;
+*>           > 0:  DGEJSV  did not converge in the maximal allowed number
+*>                 of sweeps. The computed values may be inaccurate.
 *> \endverbatim
 *
 *  Authors:
@@ -953,7 +953,7 @@
       IF ( L2ABER ) THEN
 *        Standard absolute error bound suffices. All sigma_i with
 *        sigma_i < N*EPSLN*||A|| are flushed to zero. This is an
-*        agressive enforcement of lower numerical rank by introducing a
+*        aggressive enforcement of lower numerical rank by introducing a
 *        backward error of the order of N*EPSLN*||A||.
          TEMP1 = DSQRT(DBLE(N))*EPSLN
          DO 3001 p = 2, N
@@ -965,7 +965,7 @@
  3001    CONTINUE
  3002    CONTINUE
       ELSE IF ( L2RANK ) THEN
-*        .. similarly as above, only slightly more gentle (less agressive).
+*        .. similarly as above, only slightly more gentle (less aggressive).
 *        Sudden drop on the diagonal of R1 is used as the criterion for
 *        close-to-rank-deficient.
          TEMP1 = DSQRT(SFMIN)
@@ -1294,7 +1294,7 @@
             CALL DPOCON('Lower',NR,WORK(2*N+1),NR,ONE,TEMP1,
      $                   WORK(2*N+NR*NR+1),IWORK(M+2*N+1),IERR)
             CONDR1 = ONE / DSQRT(TEMP1)
-*           .. here need a second oppinion on the condition number
+*           .. here need a second opinion on the condition number
 *           .. then assume worst case scenario
 *           R1 is OK for inverse <=> CONDR1 .LT. DBLE(N)
 *           more conservative    <=> CONDR1 .LT. DSQRT(DBLE(N))
@@ -1335,7 +1335,7 @@
             ELSE
 *
 *              .. ill-conditioned case: second QRF with pivoting
-*              Note that windowed pivoting would be equaly good
+*              Note that windowed pivoting would be equally good
 *              numerically, and more run-time efficient. So, in
 *              an optimal implementation, the next call to DGEQP3
 *              should be replaced with eg. CALL SGEQPX (ACM TOMS #782)
@@ -1388,7 +1388,7 @@
 *
                IF ( CONDR2 .GE. COND_OK ) THEN
 *                 .. save the Householder vectors used for Q3
-*                 (this overwrittes the copy of R2, as it will not be
+*                 (this overwrites the copy of R2, as it will not be
 *                 needed in this branch, but it does not overwritte the
 *                 Huseholder vectors of Q2.).
                   CALL DLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N )
@@ -1638,7 +1638,7 @@
 *
 *        This branch deploys a preconditioned Jacobi SVD with explicitly
 *        accumulated rotations. It is included as optional, mainly for
-*        experimental purposes. It does perfom well, and can also be used.
+*        experimental purposes. It does perform well, and can also be used.
 *        In this implementation, this branch will be automatically activated
 *        if the  condition number sigma_max(A) / sigma_min(A) is predicted
 *        to be greater than the overflow threshold. This is because the
diff --git a/lapack-netlib/SRC/dgelq.f b/lapack-netlib/SRC/dgelq.f
index ece645079..fc14d892f 100644
--- a/lapack-netlib/SRC/dgelq.f
+++ b/lapack-netlib/SRC/dgelq.f
@@ -1,3 +1,4 @@
+*> \brief \b DGELQ
 *
 *  Definition:
 *  ===========
@@ -17,7 +18,17 @@
 *  =============
 *>
 *> \verbatim
-*> DGELQ computes a LQ factorization of an M-by-N matrix A.
+*>
+*> DGELQ computes an LQ factorization of a real M-by-N matrix A:
+*>
+*>    A = ( L 0 ) *  Q
+*>
+*> where:
+*>
+*>    Q is a N-by-N orthogonal matrix;
+*>    L is an lower-triangular M-by-M matrix;
+*>    0 is a M-by-(N-M) zero matrix, if M < N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -138,7 +149,7 @@
 *> \verbatim
 *>
 *> These details are particular for this LAPACK implementation. Users should not 
-*> take them for granted. These details may change in the future, and are unlikely not
+*> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
 *>
@@ -159,10 +170,10 @@
       SUBROUTINE DGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N, TSIZE, LWORK
diff --git a/lapack-netlib/SRC/dgelq2.f b/lapack-netlib/SRC/dgelq2.f
index 04aa57fc1..a6c835de4 100644
--- a/lapack-netlib/SRC/dgelq2.f
+++ b/lapack-netlib/SRC/dgelq2.f
@@ -33,8 +33,16 @@
 *>
 *> \verbatim
 *>
-*> DGELQ2 computes an LQ factorization of a real m by n matrix A:
-*> A = L * Q.
+*> DGELQ2 computes an LQ factorization of a real m-by-n matrix A:
+*>
+*>    A = ( L 0 ) *  Q
+*>
+*> where:
+*>
+*>    Q is a n-by-n orthogonal matrix;
+*>    L is an lower-triangular m-by-m matrix;
+*>    0 is a m-by-(n-m) zero matrix, if m < n.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -96,7 +104,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup doubleGEcomputational
 *
@@ -121,10 +129,10 @@
 *  =====================================================================
       SUBROUTINE DGELQ2( M, N, A, LDA, TAU, WORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N
diff --git a/lapack-netlib/SRC/dgelqf.f b/lapack-netlib/SRC/dgelqf.f
index 834c47168..4b11761f6 100644
--- a/lapack-netlib/SRC/dgelqf.f
+++ b/lapack-netlib/SRC/dgelqf.f
@@ -34,7 +34,15 @@
 *> \verbatim
 *>
 *> DGELQF computes an LQ factorization of a real M-by-N matrix A:
-*> A = L * Q.
+*>
+*>    A = ( L 0 ) *  Q
+*>
+*> where:
+*>
+*>    Q is a N-by-N orthogonal matrix;
+*>    L is an lower-triangular M-by-M matrix;
+*>    0 is a M-by-(N-M) zero matrix, if M < N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -110,7 +118,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup doubleGEcomputational
 *
@@ -135,10 +143,10 @@
 *  =====================================================================
       SUBROUTINE DGELQF( M, N, A, LDA, TAU, WORK, LWORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, LWORK, M, N
diff --git a/lapack-netlib/SRC/dgemlq.f b/lapack-netlib/SRC/dgemlq.f
index bb6b2868f..dea693c24 100644
--- a/lapack-netlib/SRC/dgemlq.f
+++ b/lapack-netlib/SRC/dgemlq.f
@@ -1,3 +1,4 @@
+*> \brief \b DGEMLQ
 *
 *  Definition:
 *  ===========
@@ -144,7 +145,7 @@
 *> \verbatim
 *>
 *> These details are particular for this LAPACK implementation. Users should not 
-*> take them for granted. These details may change in the future, and are unlikely not
+*> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
 *>
diff --git a/lapack-netlib/SRC/dgemqr.f b/lapack-netlib/SRC/dgemqr.f
index 8509b13d9..0f7a42233 100644
--- a/lapack-netlib/SRC/dgemqr.f
+++ b/lapack-netlib/SRC/dgemqr.f
@@ -1,3 +1,4 @@
+*> \brief \b DGEMQR
 *
 *  Definition:
 *  ===========
@@ -144,7 +145,7 @@
 *> \verbatim
 *>
 *> These details are particular for this LAPACK implementation. Users should not 
-*> take them for granted. These details may change in the future, and are unlikely not
+*> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
 *>
diff --git a/lapack-netlib/SRC/dgeqr.f b/lapack-netlib/SRC/dgeqr.f
index d0a1a18f9..0bff5d1f9 100644
--- a/lapack-netlib/SRC/dgeqr.f
+++ b/lapack-netlib/SRC/dgeqr.f
@@ -1,3 +1,4 @@
+*> \brief \b DGEQR
 *
 *  Definition:
 *  ===========
@@ -17,7 +18,18 @@
 *  =============
 *>
 *> \verbatim
-*> DGEQR computes a QR factorization of an M-by-N matrix A.
+*>
+*> DGEQR computes a QR factorization of a real M-by-N matrix A:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a M-by-M orthogonal matrix;
+*>    R is an upper-triangular N-by-N matrix;
+*>    0 is a (M-N)-by-N zero matrix, if M > N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -138,7 +150,7 @@
 *> \verbatim
 *>
 *> These details are particular for this LAPACK implementation. Users should not 
-*> take them for granted. These details may change in the future, and are unlikely not
+*> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
 *>
@@ -160,10 +172,10 @@
       SUBROUTINE DGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N, TSIZE, LWORK
diff --git a/lapack-netlib/SRC/dgeqr2.f b/lapack-netlib/SRC/dgeqr2.f
index c1e91e9bd..9ce1feb25 100644
--- a/lapack-netlib/SRC/dgeqr2.f
+++ b/lapack-netlib/SRC/dgeqr2.f
@@ -33,8 +33,17 @@
 *>
 *> \verbatim
 *>
-*> DGEQR2 computes a QR factorization of a real m by n matrix A:
-*> A = Q * R.
+*> DGEQR2 computes a QR factorization of a real m-by-n matrix A:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a m-by-m orthogonal matrix;
+*>    R is an upper-triangular n-by-n matrix;
+*>    0 is a (m-n)-by-n zero matrix, if m > n.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -96,7 +105,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup doubleGEcomputational
 *
@@ -121,10 +130,10 @@
 *  =====================================================================
       SUBROUTINE DGEQR2( M, N, A, LDA, TAU, WORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N
diff --git a/lapack-netlib/SRC/dgeqr2p.f b/lapack-netlib/SRC/dgeqr2p.f
index 921f79921..9b81ccb33 100644
--- a/lapack-netlib/SRC/dgeqr2p.f
+++ b/lapack-netlib/SRC/dgeqr2p.f
@@ -33,8 +33,18 @@
 *>
 *> \verbatim
 *>
-*> DGEQR2P computes a QR factorization of a real m by n matrix A:
-*> A = Q * R. The diagonal entries of R are nonnegative.
+*> DGEQR2P computes a QR factorization of a real m-by-n matrix A:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a m-by-m orthogonal matrix;
+*>    R is an upper-triangular n-by-n matrix with nonnegative diagonal
+*>    entries;
+*>    0 is a (m-n)-by-n zero matrix, if m > n.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -97,7 +107,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup doubleGEcomputational
 *
@@ -124,10 +134,10 @@
 *  =====================================================================
       SUBROUTINE DGEQR2P( M, N, A, LDA, TAU, WORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N
diff --git a/lapack-netlib/SRC/dgeqrf.f b/lapack-netlib/SRC/dgeqrf.f
index 83d7d8dd7..98666221f 100644
--- a/lapack-netlib/SRC/dgeqrf.f
+++ b/lapack-netlib/SRC/dgeqrf.f
@@ -34,7 +34,16 @@
 *> \verbatim
 *>
 *> DGEQRF computes a QR factorization of a real M-by-N matrix A:
-*> A = Q * R.
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a M-by-M orthogonal matrix;
+*>    R is an upper-triangular N-by-N matrix;
+*>    0 is a (M-N)-by-N zero matrix, if M > N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -111,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup doubleGEcomputational
 *
@@ -136,10 +145,10 @@
 *  =====================================================================
       SUBROUTINE DGEQRF( M, N, A, LDA, TAU, WORK, LWORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, LWORK, M, N
diff --git a/lapack-netlib/SRC/dgeqrfp.f b/lapack-netlib/SRC/dgeqrfp.f
index d182f98c9..5cf4069ed 100644
--- a/lapack-netlib/SRC/dgeqrfp.f
+++ b/lapack-netlib/SRC/dgeqrfp.f
@@ -33,8 +33,18 @@
 *>
 *> \verbatim
 *>
-*> DGEQRFP computes a QR factorization of a real M-by-N matrix A:
-*> A = Q * R. The diagonal entries of R are nonnegative.
+*> DGEQR2P computes a QR factorization of a real M-by-N matrix A:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a M-by-M orthogonal matrix;
+*>    R is an upper-triangular N-by-N matrix with nonnegative diagonal
+*>    entries;
+*>    0 is a (M-N)-by-N zero matrix, if M > N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -112,7 +122,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup doubleGEcomputational
 *
@@ -139,10 +149,10 @@
 *  =====================================================================
       SUBROUTINE DGEQRFP( M, N, A, LDA, TAU, WORK, LWORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, LWORK, M, N
diff --git a/lapack-netlib/SRC/dgerfsx.f b/lapack-netlib/SRC/dgerfsx.f
index aafca8d10..495ea1726 100644
--- a/lapack-netlib/SRC/dgerfsx.f
+++ b/lapack-netlib/SRC/dgerfsx.f
@@ -283,7 +283,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -319,14 +319,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension (NPARAMS)
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -334,9 +334,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/dgesc2.f b/lapack-netlib/SRC/dgesc2.f
index 2f01a762f..72d8a38f0 100644
--- a/lapack-netlib/SRC/dgesc2.f
+++ b/lapack-netlib/SRC/dgesc2.f
@@ -90,7 +90,7 @@
 *> \verbatim
 *>          SCALE is DOUBLE PRECISION
 *>          On exit, SCALE contains the scale factor. SCALE is chosen
-*>          0 <= SCALE <= 1 to prevent owerflow in the solution.
+*>          0 <= SCALE <= 1 to prevent overflow in the solution.
 *> \endverbatim
 *
 *  Authors:
@@ -151,7 +151,7 @@
 *     ..
 *     .. Executable Statements ..
 *
-*      Set constant to control owerflow
+*      Set constant to control overflow
 *
       EPS = DLAMCH( 'P' )
       SMLNUM = DLAMCH( 'S' ) / EPS
diff --git a/lapack-netlib/SRC/dgesdd.f b/lapack-netlib/SRC/dgesdd.f
index 926607f98..0218900d2 100644
--- a/lapack-netlib/SRC/dgesdd.f
+++ b/lapack-netlib/SRC/dgesdd.f
@@ -322,7 +322,7 @@
 *
             IF( WNTQN ) THEN
 *              dbdsdc needs only 4*N (or 6*N for uplo=L for LAPACK <= 3.6)
-*              keep 7*N for backwards compatability.
+*              keep 7*N for backwards compatibility.
                BDSPAC = 7*N
             ELSE
                BDSPAC = 3*N*N + 4*N
@@ -448,7 +448,7 @@
 *
             IF( WNTQN ) THEN
 *              dbdsdc needs only 4*N (or 6*N for uplo=L for LAPACK <= 3.6)
-*              keep 7*N for backwards compatability.
+*              keep 7*N for backwards compatibility.
                BDSPAC = 7*M
             ELSE
                BDSPAC = 3*M*M + 4*M
diff --git a/lapack-netlib/SRC/dgesvdq.f b/lapack-netlib/SRC/dgesvdq.f
new file mode 100644
index 000000000..e495d2bf9
--- /dev/null
+++ b/lapack-netlib/SRC/dgesvdq.f
@@ -0,0 +1,1385 @@
+*> \brief <b> DGESVDQ computes the singular value decomposition (SVD) with a QR-Preconditioned QR SVD Method for GE matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DGESVDQ + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dgesvdq.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dgesvdq.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dgesvdq.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE DGESVDQ( JOBA, JOBP, JOBR, JOBU, JOBV, M, N, A, LDA,
+*                          S, U, LDU, V, LDV, NUMRANK, IWORK, LIWORK,
+*                          WORK, LWORK, RWORK, LRWORK, INFO )
+*
+*     .. Scalar Arguments ..
+*      IMPLICIT    NONE
+*      CHARACTER   JOBA, JOBP, JOBR, JOBU, JOBV
+*      INTEGER     M, N, LDA, LDU, LDV, NUMRANK, LIWORK, LWORK, LRWORK,
+*                  INFO
+*     ..
+*     .. Array Arguments ..
+*      DOUBLE PRECISION  A( LDA, * ), U( LDU, * ), V( LDV, * ), WORK( * )
+*      DOUBLE PRECISION  S( * ), RWORK( * )
+*      INTEGER     IWORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DGESVDQ computes the singular value decomposition (SVD) of a real
+*> M-by-N matrix A, where M >= N. The SVD of A is written as
+*>                                    [++]   [xx]   [x0]   [xx]
+*>              A = U * SIGMA * V^*,  [++] = [xx] * [ox] * [xx]
+*>                                    [++]   [xx]
+*> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+*> matrix, and V is an N-by-N orthogonal matrix. The diagonal elements
+*> of SIGMA are the singular values of A. The columns of U and V are the
+*> left and the right singular vectors of A, respectively.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBA
+*> \verbatim
+*>  JOBA is CHARACTER*1
+*>  Specifies the level of accuracy in the computed SVD
+*>  = 'A' The requested accuracy corresponds to having the backward
+*>        error bounded by || delta A ||_F <= f(m,n) * EPS * || A ||_F,
+*>        where EPS = DLAMCH('Epsilon'). This authorises DGESVDQ to
+*>        truncate the computed triangular factor in a rank revealing
+*>        QR factorization whenever the truncated part is below the
+*>        threshold of the order of EPS * ||A||_F. This is aggressive
+*>        truncation level.
+*>  = 'M' Similarly as with 'A', but the truncation is more gentle: it
+*>        is allowed only when there is a drop on the diagonal of the
+*>        triangular factor in the QR factorization. This is medium
+*>        truncation level.
+*>  = 'H' High accuracy requested. No numerical rank determination based
+*>        on the rank revealing QR factorization is attempted.
+*>  = 'E' Same as 'H', and in addition the condition number of column
+*>        scaled A is estimated and returned in  RWORK(1).
+*>        N^(-1/4)*RWORK(1) <= ||pinv(A_scaled)||_2 <= N^(1/4)*RWORK(1)
+*> \endverbatim
+*>
+*> \param[in] JOBP
+*> \verbatim
+*>  JOBP is CHARACTER*1
+*>  = 'P' The rows of A are ordered in decreasing order with respect to
+*>        ||A(i,:)||_\infty. This enhances numerical accuracy at the cost
+*>        of extra data movement. Recommended for numerical robustness.
+*>  = 'N' No row pivoting.
+*> \endverbatim
+*>
+*> \param[in] JOBR
+*> \verbatim
+*>          JOBR is CHARACTER*1
+*>          = 'T' After the initial pivoted QR factorization, DGESVD is applied to
+*>          the transposed R**T of the computed triangular factor R. This involves
+*>          some extra data movement (matrix transpositions). Useful for
+*>          experiments, research and development.
+*>          = 'N' The triangular factor R is given as input to DGESVD. This may be
+*>          preferred as it involves less data movement.
+*> \endverbatim
+*>
+*> \param[in] JOBU
+*> \verbatim
+*>          JOBU is CHARACTER*1
+*>          = 'A' All M left singular vectors are computed and returned in the
+*>          matrix U. See the description of U.
+*>          = 'S' or 'U' N = min(M,N) left singular vectors are computed and returned
+*>          in the matrix U. See the description of U.
+*>          = 'R' Numerical rank NUMRANK is determined and only NUMRANK left singular
+*>          vectors are computed and returned in the matrix U.
+*>          = 'F' The N left singular vectors are returned in factored form as the
+*>          product of the Q factor from the initial QR factorization and the
+*>          N left singular vectors of (R**T , 0)**T. If row pivoting is used,
+*>          then the necessary information on the row pivoting is stored in
+*>          IWORK(N+1:N+M-1).
+*>          = 'N' The left singular vectors are not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBV
+*> \verbatim
+*>          JOBV is CHARACTER*1
+*>          = 'A', 'V' All N right singular vectors are computed and returned in
+*>          the matrix V.
+*>          = 'R' Numerical rank NUMRANK is determined and only NUMRANK right singular
+*>          vectors are computed and returned in the matrix V. This option is
+*>          allowed only if JOBU = 'R' or JOBU = 'N'; otherwise it is illegal.
+*>          = 'N' The right singular vectors are not computed.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the input matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the input matrix A.  M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array of dimensions LDA x N
+*>          On entry, the input matrix A.
+*>          On exit, if JOBU .NE. 'N' or JOBV .NE. 'N', the lower triangle of A contains
+*>          the Householder vectors as stored by DGEQP3. If JOBU = 'F', these Householder
+*>          vectors together with WORK(1:N) can be used to restore the Q factors from
+*>          the initial pivoted QR factorization of A. See the description of U.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER.
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] S
+*> \verbatim
+*>          S is DOUBLE PRECISION array of dimension N.
+*>          The singular values of A, ordered so that S(i) >= S(i+1).
+*> \endverbatim
+*>
+*> \param[out] U
+*> \verbatim
+*>          U is DOUBLE PRECISION array, dimension
+*>          LDU x M if JOBU = 'A'; see the description of LDU. In this case,
+*>          on exit, U contains the M left singular vectors.
+*>          LDU x N if JOBU = 'S', 'U', 'R' ; see the description of LDU. In this
+*>          case, U contains the leading N or the leading NUMRANK left singular vectors.
+*>          LDU x N if JOBU = 'F' ; see the description of LDU. In this case U
+*>          contains N x N orthogonal matrix that can be used to form the left
+*>          singular vectors.
+*>          If JOBU = 'N', U is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDU
+*> \verbatim
+*>          LDU is INTEGER.
+*>          The leading dimension of the array U.
+*>          If JOBU = 'A', 'S', 'U', 'R',  LDU >= max(1,M).
+*>          If JOBU = 'F',                 LDU >= max(1,N).
+*>          Otherwise,                     LDU >= 1.
+*> \endverbatim
+*>
+*> \param[out] V
+*> \verbatim
+*>          V is DOUBLE PRECISION array, dimension
+*>          LDV x N if JOBV = 'A', 'V', 'R' or if JOBA = 'E' .
+*>          If JOBV = 'A', or 'V',  V contains the N-by-N orthogonal matrix  V**T;
+*>          If JOBV = 'R', V contains the first NUMRANK rows of V**T (the right
+*>          singular vectors, stored rowwise, of the NUMRANK largest singular values).
+*>          If JOBV = 'N' and JOBA = 'E', V is used as a workspace.
+*>          If JOBV = 'N', and JOBA.NE.'E', V is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If JOBV = 'A', 'V', 'R',  or JOBA = 'E', LDV >= max(1,N).
+*>          Otherwise,                               LDV >= 1.
+*> \endverbatim
+*>
+*> \param[out] NUMRANK
+*> \verbatim
+*>          NUMRANK is INTEGER
+*>          NUMRANK is the numerical rank first determined after the rank
+*>          revealing QR factorization, following the strategy specified by the
+*>          value of JOBA. If JOBV = 'R' and JOBU = 'R', only NUMRANK
+*>          leading singular values and vectors are then requested in the call
+*>          of DGESVD. The final value of NUMRANK might be further reduced if
+*>          some singular values are computed as zeros.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (max(1, LIWORK)).
+*>          On exit, IWORK(1:N) contains column pivoting permutation of the
+*>          rank revealing QR factorization.
+*>          If JOBP = 'P', IWORK(N+1:N+M-1) contains the indices of the sequence
+*>          of row swaps used in row pivoting. These can be used to restore the
+*>          left singular vectors in the case JOBU = 'F'.
+*>
+*>          If LIWORK, LWORK, or LRWORK = -1, then on exit, if INFO = 0,
+*>          LIWORK(1) returns the minimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*>          LIWORK is INTEGER
+*>          The dimension of the array IWORK.
+*>          LIWORK >= N + M - 1,     if JOBP = 'P' and JOBA .NE. 'E';
+*>          LIWORK >= N              if JOBP = 'N' and JOBA .NE. 'E';
+*>          LIWORK >= N + M - 1 + N, if JOBP = 'P' and JOBA = 'E';
+*>          LIWORK >= N + N          if JOBP = 'N' and JOBA = 'E'.
+*
+*>          If LIWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates and returns the optimal and minimal sizes
+*>          for the WORK, IWORK, and RWORK arrays, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array, dimension (max(2, LWORK)), used as a workspace.
+*>          On exit, if, on entry, LWORK.NE.-1, WORK(1:N) contains parameters
+*>          needed to recover the Q factor from the QR factorization computed by
+*>          DGEQP3.
+*>
+*>          If LIWORK, LWORK, or LRWORK = -1, then on exit, if INFO = 0,
+*>          WORK(1) returns the optimal LWORK, and
+*>          WORK(2) returns the minimal LWORK.
+*> \endverbatim
+*>
+*> \param[in,out] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK. It is determined as follows:
+*>          Let  LWQP3 = 3*N+1,  LWCON = 3*N, and let
+*>          LWORQ = { MAX( N, 1 ),  if JOBU = 'R', 'S', or 'U'
+*>                  { MAX( M, 1 ),  if JOBU = 'A'
+*>          LWSVD = MAX( 5*N, 1 )
+*>          LWLQF = MAX( N/2, 1 ), LWSVD2 = MAX( 5*(N/2), 1 ), LWORLQ = MAX( N, 1 ),
+*>          LWQRF = MAX( N/2, 1 ), LWORQ2 = MAX( N, 1 )
+*>          Then the minimal value of LWORK is:
+*>          = MAX( N + LWQP3, LWSVD )        if only the singular values are needed;
+*>          = MAX( N + LWQP3, LWCON, LWSVD ) if only the singular values are needed,
+*>                                   and a scaled condition estimate requested;
+*>
+*>          = N + MAX( LWQP3, LWSVD, LWORQ ) if the singular values and the left
+*>                                   singular vectors are requested;
+*>          = N + MAX( LWQP3, LWCON, LWSVD, LWORQ ) if the singular values and the left
+*>                                   singular vectors are requested, and also
+*>                                   a scaled condition estimate requested;
+*>
+*>          = N + MAX( LWQP3, LWSVD )        if the singular values and the right
+*>                                   singular vectors are requested;
+*>          = N + MAX( LWQP3, LWCON, LWSVD ) if the singular values and the right
+*>                                   singular vectors are requested, and also
+*>                                   a scaled condition etimate requested;
+*>
+*>          = N + MAX( LWQP3, LWSVD, LWORQ ) if the full SVD is requested with JOBV = 'R';
+*>                                   independent of JOBR;
+*>          = N + MAX( LWQP3, LWCON, LWSVD, LWORQ ) if the full SVD is requested,
+*>                                   JOBV = 'R' and, also a scaled condition
+*>                                   estimate requested; independent of JOBR;
+*>          = MAX( N + MAX( LWQP3, LWSVD, LWORQ ),
+*>         N + MAX( LWQP3, N/2+LWLQF, N/2+LWSVD2, N/2+LWORLQ, LWORQ) ) if the
+*>                         full SVD is requested with JOBV = 'A' or 'V', and
+*>                         JOBR ='N'
+*>          = MAX( N + MAX( LWQP3, LWCON, LWSVD, LWORQ ),
+*>         N + MAX( LWQP3, LWCON, N/2+LWLQF, N/2+LWSVD2, N/2+LWORLQ, LWORQ ) )
+*>                         if the full SVD is requested with JOBV = 'A' or 'V', and
+*>                         JOBR ='N', and also a scaled condition number estimate
+*>                         requested.
+*>          = MAX( N + MAX( LWQP3, LWSVD, LWORQ ),
+*>         N + MAX( LWQP3, N/2+LWQRF, N/2+LWSVD2, N/2+LWORQ2, LWORQ ) ) if the
+*>                         full SVD is requested with JOBV = 'A', 'V', and JOBR ='T'
+*>          = MAX( N + MAX( LWQP3, LWCON, LWSVD, LWORQ ),
+*>         N + MAX( LWQP3, LWCON, N/2+LWQRF, N/2+LWSVD2, N/2+LWORQ2, LWORQ ) )
+*>                         if the full SVD is requested with JOBV = 'A' or 'V', and
+*>                         JOBR ='T', and also a scaled condition number estimate
+*>                         requested.
+*>          Finally, LWORK must be at least two: LWORK = MAX( 2, LWORK ).
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates and returns the optimal and minimal sizes
+*>          for the WORK, IWORK, and RWORK arrays, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (max(1, LRWORK)).
+*>          On exit,
+*>          1. If JOBA = 'E', RWORK(1) contains an estimate of the condition
+*>          number of column scaled A. If A = C * D where D is diagonal and C
+*>          has unit columns in the Euclidean norm, then, assuming full column rank,
+*>          N^(-1/4) * RWORK(1) <= ||pinv(C)||_2 <= N^(1/4) * RWORK(1).
+*>          Otherwise, RWORK(1) = -1.
+*>          2. RWORK(2) contains the number of singular values computed as
+*>          exact zeros in DGESVD applied to the upper triangular or trapeziodal
+*>          R (from the initial QR factorization). In case of early exit (no call to
+*>          DGESVD, such as in the case of zero matrix) RWORK(2) = -1.
+*>
+*>          If LIWORK, LWORK, or LRWORK = -1, then on exit, if INFO = 0,
+*>          RWORK(1) returns the minimal LRWORK.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*>          LRWORK is INTEGER.
+*>          The dimension of the array RWORK.
+*>          If JOBP ='P', then LRWORK >= MAX(2, M).
+*>          Otherwise, LRWORK >= 2
+*
+*>          If LRWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates and returns the optimal and minimal sizes
+*>          for the WORK, IWORK, and RWORK arrays, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>          > 0:  if DBDSQR did not converge, INFO specifies how many superdiagonals
+*>          of an intermediate bidiagonal form B (computed in DGESVD) did not
+*>          converge to zero.
+*> \endverbatim
+*
+*> \par Further Details:
+*  ========================
+*>
+*> \verbatim
+*>
+*>   1. The data movement (matrix transpose) is coded using simple nested
+*>   DO-loops because BLAS and LAPACK do not provide corresponding subroutines.
+*>   Those DO-loops are easily identified in this source code - by the CONTINUE
+*>   statements labeled with 11**. In an optimized version of this code, the
+*>   nested DO loops should be replaced with calls to an optimized subroutine.
+*>   2. This code scales A by 1/SQRT(M) if the largest ABS(A(i,j)) could cause
+*>   column norm overflow. This is the minial precaution and it is left to the
+*>   SVD routine (CGESVD) to do its own preemptive scaling if potential over-
+*>   or underflows are detected. To avoid repeated scanning of the array A,
+*>   an optimal implementation would do all necessary scaling before calling
+*>   CGESVD and the scaling in CGESVD can be switched off.
+*>   3. Other comments related to code optimization are given in comments in the
+*>   code, enlosed in [[double brackets]].
+*> \endverbatim
+*
+*> \par Bugs, examples and comments
+*  ===========================
+*
+*> \verbatim
+*>  Please report all bugs and send interesting examples and/or comments to
+*>  drmac@math.hr. Thank you.
+*> \endverbatim
+*
+*> \par References
+*  ===============
+*
+*> \verbatim
+*>  [1] Zlatko Drmac, Algorithm 977: A QR-Preconditioned QR SVD Method for
+*>      Computing the SVD with High Accuracy. ACM Trans. Math. Softw.
+*>      44(1): 11:1-11:30 (2017)
+*>
+*>  SIGMA library, xGESVDQ section updated February 2016.
+*>  Developed and coded by Zlatko Drmac, Department of Mathematics
+*>  University of Zagreb, Croatia, drmac@math.hr
+*> \endverbatim
+*
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*> Developed and coded by Zlatko Drmac, Department of Mathematics
+*>  University of Zagreb, Croatia, drmac@math.hr
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2018
+*
+*> \ingroup doubleGEsing
+*
+*  =====================================================================
+      SUBROUTINE DGESVDQ( JOBA, JOBP, JOBR, JOBU, JOBV, M, N, A, LDA,
+     $                    S, U, LDU, V, LDV, NUMRANK, IWORK, LIWORK,
+     $                    WORK, LWORK, RWORK, LRWORK, INFO )
+*     .. Scalar Arguments ..
+      IMPLICIT    NONE
+      CHARACTER   JOBA, JOBP, JOBR, JOBU, JOBV
+      INTEGER     M, N, LDA, LDU, LDV, NUMRANK, LIWORK, LWORK, LRWORK,
+     $            INFO
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION A( LDA, * ), U( LDU, * ), V( LDV, * ), WORK( * )
+      DOUBLE PRECISION S( * ), RWORK( * )
+      INTEGER          IWORK( * )
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION ZERO,         ONE
+      PARAMETER      ( ZERO = 0.0D0, ONE = 1.0D0 )
+*     .. Local Scalars ..
+      INTEGER     IERR, IWOFF, NR, N1, OPTRATIO, p, q
+      INTEGER     LWCON, LWQP3, LWRK_DGELQF, LWRK_DGESVD, LWRK_DGESVD2,
+     $            LWRK_DGEQP3,  LWRK_DGEQRF, LWRK_DORMLQ, LWRK_DORMQR,
+     $            LWRK_DORMQR2, LWLQF, LWQRF, LWSVD, LWSVD2, LWORQ,
+     $            LWORQ2, LWORLQ, MINWRK, MINWRK2, OPTWRK, OPTWRK2,
+     $            IMINWRK, RMINWRK
+      LOGICAL     ACCLA,  ACCLM, ACCLH, ASCALED, CONDA, DNTWU,  DNTWV,
+     $            LQUERY, LSVC0, LSVEC, ROWPRM,  RSVEC, RTRANS, WNTUA,
+     $            WNTUF,  WNTUR, WNTUS, WNTVA,   WNTVR
+      DOUBLE PRECISION BIG, EPSLN, RTMP, SCONDA, SFMIN
+*     .. Local Arrays
+      DOUBLE PRECISION RDUMMY(1)
+*     ..
+*     .. External Subroutines (BLAS, LAPACK)
+      EXTERNAL    DGELQF, DGEQP3, DGEQRF, DGESVD, DLACPY, DLAPMT,
+     $            DLASCL, DLASET, DLASWP, DSCAL,  DPOCON, DORMLQ,
+     $            DORMQR, XERBLA
+*     ..
+*     .. External Functions (BLAS, LAPACK)
+      LOGICAL    LSAME
+      INTEGER    IDAMAX
+      DOUBLE PRECISION  DLANGE, DNRM2, DLAMCH
+      EXTERNAL    DLANGE, LSAME, IDAMAX, DNRM2, DLAMCH
+*     ..
+*     .. Intrinsic Functions ..
+*
+      INTRINSIC   ABS, MAX, MIN, DBLE, SQRT
+*
+*     Test the input arguments
+*
+      WNTUS  = LSAME( JOBU, 'S' ) .OR. LSAME( JOBU, 'U' )
+      WNTUR  = LSAME( JOBU, 'R' )
+      WNTUA  = LSAME( JOBU, 'A' )
+      WNTUF  = LSAME( JOBU, 'F' )
+      LSVC0  = WNTUS .OR. WNTUR .OR. WNTUA
+      LSVEC  = LSVC0 .OR. WNTUF
+      DNTWU  = LSAME( JOBU, 'N' )
+*
+      WNTVR  = LSAME( JOBV, 'R' )
+      WNTVA  = LSAME( JOBV, 'A' ) .OR. LSAME( JOBV, 'V' )
+      RSVEC  = WNTVR .OR. WNTVA
+      DNTWV  = LSAME( JOBV, 'N' )
+*
+      ACCLA  = LSAME( JOBA, 'A' )
+      ACCLM  = LSAME( JOBA, 'M' )
+      CONDA  = LSAME( JOBA, 'E' )
+      ACCLH  = LSAME( JOBA, 'H' ) .OR. CONDA
+*
+      ROWPRM = LSAME( JOBP, 'P' )
+      RTRANS = LSAME( JOBR, 'T' )
+*
+      IF ( ROWPRM ) THEN
+         IF ( CONDA ) THEN
+            IMINWRK = MAX( 1, N + M - 1 + N )
+         ELSE
+            IMINWRK = MAX( 1, N + M - 1 )
+         END IF
+         RMINWRK = MAX( 2, M )
+      ELSE
+         IF ( CONDA ) THEN
+            IMINWRK = MAX( 1, N + N )
+         ELSE
+            IMINWRK = MAX( 1, N )
+         END IF
+         RMINWRK = 2
+      END IF
+      LQUERY = (LIWORK .EQ. -1 .OR. LWORK .EQ. -1 .OR. LRWORK .EQ. -1)
+      INFO  = 0
+      IF ( .NOT. ( ACCLA .OR. ACCLM .OR. ACCLH ) ) THEN
+         INFO = -1
+      ELSE IF ( .NOT.( ROWPRM .OR. LSAME( JOBP, 'N' ) ) ) THEN
+          INFO = -2
+      ELSE IF ( .NOT.( RTRANS .OR. LSAME( JOBR, 'N' ) ) ) THEN
+          INFO = -3
+      ELSE IF ( .NOT.( LSVEC .OR. DNTWU ) ) THEN
+         INFO = -4
+      ELSE IF ( WNTUR .AND. WNTVA ) THEN
+         INFO = -5
+      ELSE IF ( .NOT.( RSVEC .OR. DNTWV )) THEN
+         INFO = -5
+      ELSE IF ( M.LT.0 ) THEN
+         INFO = -6
+      ELSE IF ( ( N.LT.0 ) .OR. ( N.GT.M ) ) THEN
+         INFO = -7
+      ELSE IF ( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -9
+      ELSE IF ( LDU.LT.1 .OR. ( LSVC0 .AND. LDU.LT.M ) .OR.
+     $       ( WNTUF .AND. LDU.LT.N ) ) THEN
+         INFO = -12
+      ELSE IF ( LDV.LT.1 .OR. ( RSVEC .AND. LDV.LT.N ) .OR.
+     $          ( CONDA .AND. LDV.LT.N ) ) THEN
+         INFO = -14
+      ELSE IF ( LIWORK .LT. IMINWRK .AND. .NOT. LQUERY ) THEN
+         INFO = -17
+      END IF
+*
+*
+      IF ( INFO .EQ. 0 ) THEN
+*        .. compute the minimal and the optimal workspace lengths
+*        [[The expressions for computing the minimal and the optimal
+*        values of LWORK are written with a lot of redundancy and
+*        can be simplified. However, this detailed form is easier for
+*        maintenance and modifications of the code.]]
+*
+*        .. minimal workspace length for DGEQP3 of an M x N matrix
+         LWQP3 = 3 * N + 1
+*        .. minimal workspace length for DORMQR to build left singular vectors
+         IF ( WNTUS .OR. WNTUR ) THEN
+             LWORQ  = MAX( N  , 1 )
+         ELSE IF ( WNTUA ) THEN
+             LWORQ = MAX( M , 1 )
+         END IF
+*        .. minimal workspace length for DPOCON of an N x N matrix
+         LWCON = 3 * N
+*        .. DGESVD of an N x N matrix
+         LWSVD = MAX( 5 * N, 1 )
+         IF ( LQUERY ) THEN
+             CALL DGEQP3( M, N, A, LDA, IWORK, RDUMMY, RDUMMY, -1,
+     $           IERR )
+             LWRK_DGEQP3 = INT( RDUMMY(1) )
+             IF ( WNTUS .OR. WNTUR ) THEN
+                 CALL DORMQR( 'L', 'N', M, N, N, A, LDA, RDUMMY, U,
+     $                LDU, RDUMMY, -1, IERR )
+                 LWRK_DORMQR = INT( RDUMMY(1) )
+             ELSE IF ( WNTUA ) THEN
+                 CALL DORMQR( 'L', 'N', M, M, N, A, LDA, RDUMMY, U,
+     $                LDU, RDUMMY, -1, IERR )
+                 LWRK_DORMQR = INT( RDUMMY(1) )
+             ELSE
+                 LWRK_DORMQR = 0
+             END IF
+         END IF
+         MINWRK = 2
+         OPTWRK = 2
+         IF ( .NOT. (LSVEC .OR. RSVEC )) THEN
+*            .. minimal and optimal sizes of the workspace if
+*            only the singular values are requested
+             IF ( CONDA ) THEN
+                MINWRK = MAX( N+LWQP3, LWCON, LWSVD )
+             ELSE
+                MINWRK = MAX( N+LWQP3, LWSVD )
+             END IF
+             IF ( LQUERY ) THEN
+                 CALL DGESVD( 'N', 'N', N, N, A, LDA, S, U, LDU,
+     $                V, LDV, RDUMMY, -1, IERR )
+                 LWRK_DGESVD = INT( RDUMMY(1) )
+                 IF ( CONDA ) THEN
+                    OPTWRK = MAX( N+LWRK_DGEQP3, N+LWCON, LWRK_DGESVD )
+                 ELSE
+                    OPTWRK = MAX( N+LWRK_DGEQP3, LWRK_DGESVD )
+                 END IF
+             END IF
+         ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN
+*            .. minimal and optimal sizes of the workspace if the
+*            singular values and the left singular vectors are requested
+             IF ( CONDA ) THEN
+                 MINWRK = N + MAX( LWQP3, LWCON, LWSVD, LWORQ )
+             ELSE
+                 MINWRK = N + MAX( LWQP3, LWSVD, LWORQ )
+             END IF
+             IF ( LQUERY ) THEN
+                IF ( RTRANS ) THEN
+                   CALL DGESVD( 'N', 'O', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, RDUMMY, -1, IERR )
+                ELSE
+                   CALL DGESVD( 'O', 'N', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, RDUMMY, -1, IERR )
+                END IF
+                LWRK_DGESVD = INT( RDUMMY(1) )
+                IF ( CONDA ) THEN
+                    OPTWRK = N + MAX( LWRK_DGEQP3, LWCON, LWRK_DGESVD,
+     $                               LWRK_DORMQR )
+                ELSE
+                    OPTWRK = N + MAX( LWRK_DGEQP3, LWRK_DGESVD,
+     $                               LWRK_DORMQR )
+                END IF
+             END IF
+         ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN
+*            .. minimal and optimal sizes of the workspace if the
+*            singular values and the right singular vectors are requested
+             IF ( CONDA ) THEN
+                 MINWRK = N + MAX( LWQP3, LWCON, LWSVD )
+             ELSE
+                 MINWRK = N + MAX( LWQP3, LWSVD )
+             END IF
+             IF ( LQUERY ) THEN
+                 IF ( RTRANS ) THEN
+                     CALL DGESVD( 'O', 'N', N, N, A, LDA, S, U, LDU,
+     $                    V, LDV, RDUMMY, -1, IERR )
+                 ELSE
+                     CALL DGESVD( 'N', 'O', N, N, A, LDA, S, U, LDU,
+     $                    V, LDV, RDUMMY, -1, IERR )
+                 END IF
+                 LWRK_DGESVD = INT( RDUMMY(1) )
+                 IF ( CONDA ) THEN
+                     OPTWRK = N + MAX( LWRK_DGEQP3, LWCON, LWRK_DGESVD )
+                 ELSE
+                     OPTWRK = N + MAX( LWRK_DGEQP3, LWRK_DGESVD )
+                 END IF
+             END IF
+         ELSE
+*            .. minimal and optimal sizes of the workspace if the
+*            full SVD is requested
+             IF ( RTRANS ) THEN
+                 MINWRK = MAX( LWQP3, LWSVD, LWORQ )
+                 IF ( CONDA ) MINWRK = MAX( MINWRK, LWCON )
+                 MINWRK = MINWRK + N
+                 IF ( WNTVA ) THEN
+*                   .. minimal workspace length for N x N/2 DGEQRF
+                    LWQRF  = MAX( N/2, 1 )
+*                   .. minimal workspace lengt for N/2 x N/2 DGESVD
+                    LWSVD2 = MAX( 5 * (N/2), 1 )
+                    LWORQ2 = MAX( N, 1 )
+                    MINWRK2 = MAX( LWQP3, N/2+LWQRF, N/2+LWSVD2,
+     $                        N/2+LWORQ2, LWORQ )
+                    IF ( CONDA ) MINWRK2 = MAX( MINWRK2, LWCON )
+                    MINWRK2 = N + MINWRK2
+                    MINWRK = MAX( MINWRK, MINWRK2 )
+                 END IF
+             ELSE
+                 MINWRK = MAX( LWQP3, LWSVD, LWORQ )
+                 IF ( CONDA ) MINWRK = MAX( MINWRK, LWCON )
+                 MINWRK = MINWRK + N
+                 IF ( WNTVA ) THEN
+*                   .. minimal workspace length for N/2 x N DGELQF
+                    LWLQF  = MAX( N/2, 1 )
+                    LWSVD2 = MAX( 5 * (N/2), 1 )
+                    LWORLQ = MAX( N , 1 )
+                    MINWRK2 = MAX( LWQP3, N/2+LWLQF, N/2+LWSVD2,
+     $                        N/2+LWORLQ, LWORQ )
+                    IF ( CONDA ) MINWRK2 = MAX( MINWRK2, LWCON )
+                    MINWRK2 = N + MINWRK2
+                    MINWRK = MAX( MINWRK, MINWRK2 )
+                 END IF
+             END IF
+             IF ( LQUERY ) THEN
+                IF ( RTRANS ) THEN
+                   CALL DGESVD( 'O', 'A', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, RDUMMY, -1, IERR )
+                   LWRK_DGESVD = INT( RDUMMY(1) )
+                   OPTWRK = MAX(LWRK_DGEQP3,LWRK_DGESVD,LWRK_DORMQR)
+                   IF ( CONDA ) OPTWRK = MAX( OPTWRK, LWCON )
+                   OPTWRK = N + OPTWRK
+                   IF ( WNTVA ) THEN
+                       CALL DGEQRF(N,N/2,U,LDU,RDUMMY,RDUMMY,-1,IERR)
+                       LWRK_DGEQRF = INT( RDUMMY(1) )
+                       CALL DGESVD( 'S', 'O', N/2,N/2, V,LDV, S, U,LDU,
+     $                      V, LDV, RDUMMY, -1, IERR )
+                       LWRK_DGESVD2 = INT( RDUMMY(1) )
+                       CALL DORMQR( 'R', 'C', N, N, N/2, U, LDU, RDUMMY,
+     $                      V, LDV, RDUMMY, -1, IERR )
+                       LWRK_DORMQR2 = INT( RDUMMY(1) )
+                       OPTWRK2 = MAX( LWRK_DGEQP3, N/2+LWRK_DGEQRF,
+     $                           N/2+LWRK_DGESVD2, N/2+LWRK_DORMQR2 )
+                       IF ( CONDA ) OPTWRK2 = MAX( OPTWRK2, LWCON )
+                       OPTWRK2 = N + OPTWRK2
+                       OPTWRK = MAX( OPTWRK, OPTWRK2 )
+                   END IF
+                ELSE
+                   CALL DGESVD( 'S', 'O', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, RDUMMY, -1, IERR )
+                   LWRK_DGESVD = INT( RDUMMY(1) )
+                   OPTWRK = MAX(LWRK_DGEQP3,LWRK_DGESVD,LWRK_DORMQR)
+                   IF ( CONDA ) OPTWRK = MAX( OPTWRK, LWCON )
+                   OPTWRK = N + OPTWRK
+                   IF ( WNTVA ) THEN
+                      CALL DGELQF(N/2,N,U,LDU,RDUMMY,RDUMMY,-1,IERR)
+                      LWRK_DGELQF = INT( RDUMMY(1) )
+                      CALL DGESVD( 'S','O', N/2,N/2, V, LDV, S, U, LDU,
+     $                     V, LDV, RDUMMY, -1, IERR )
+                      LWRK_DGESVD2 = INT( RDUMMY(1) )
+                      CALL DORMLQ( 'R', 'N', N, N, N/2, U, LDU, RDUMMY,
+     $                     V, LDV, RDUMMY,-1,IERR )
+                      LWRK_DORMLQ = INT( RDUMMY(1) )
+                      OPTWRK2 = MAX( LWRK_DGEQP3, N/2+LWRK_DGELQF,
+     $                           N/2+LWRK_DGESVD2, N/2+LWRK_DORMLQ )
+                       IF ( CONDA ) OPTWRK2 = MAX( OPTWRK2, LWCON )
+                       OPTWRK2 = N + OPTWRK2
+                       OPTWRK = MAX( OPTWRK, OPTWRK2 )
+                   END IF
+                END IF
+             END IF
+         END IF
+*
+         MINWRK = MAX( 2, MINWRK )
+         OPTWRK = MAX( 2, OPTWRK )
+         IF ( LWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = -19
+*
+      END IF
+*
+      IF (INFO .EQ. 0 .AND. LRWORK .LT. RMINWRK .AND. .NOT. LQUERY) THEN
+         INFO = -21
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DGESVDQ', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+*
+*     Return optimal workspace
+*
+          IWORK(1) = IMINWRK
+          WORK(1) = OPTWRK
+          WORK(2) = MINWRK
+          RWORK(1) = RMINWRK
+          RETURN
+      END IF
+*
+*     Quick return if the matrix is void.
+*
+      IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) ) THEN
+*     .. all output is void.
+         RETURN
+      END IF
+*
+      BIG = DLAMCH('O')
+      ASCALED = .FALSE.
+      IWOFF = 1
+      IF ( ROWPRM ) THEN
+            IWOFF = M
+*           .. reordering the rows in decreasing sequence in the
+*           ell-infinity norm - this enhances numerical robustness in
+*           the case of differently scaled rows.
+            DO 1904 p = 1, M
+*               RWORK(p) = ABS( A(p,ICAMAX(N,A(p,1),LDA)) )
+*               [[DLANGE will return NaN if an entry of the p-th row is Nan]]
+                RWORK(p) = DLANGE( 'M', 1, N, A(p,1), LDA, RDUMMY )
+*               .. check for NaN's and Inf's
+                IF ( ( RWORK(p) .NE. RWORK(p) ) .OR.
+     $               ( (RWORK(p)*ZERO) .NE. ZERO ) ) THEN
+                    INFO = -8
+                    CALL XERBLA( 'DGESVDQ', -INFO )
+                    RETURN
+                END IF
+ 1904       CONTINUE
+            DO 1952 p = 1, M - 1
+            q = IDAMAX( M-p+1, RWORK(p), 1 ) + p - 1
+            IWORK(N+p) = q
+            IF ( p .NE. q ) THEN
+               RTMP     = RWORK(p)
+               RWORK(p) = RWORK(q)
+               RWORK(q) = RTMP
+            END IF
+ 1952       CONTINUE
+*
+            IF ( RWORK(1) .EQ. ZERO ) THEN
+*              Quick return: A is the M x N zero matrix.
+               NUMRANK = 0
+               CALL DLASET( 'G', N, 1, ZERO, ZERO, S, N )
+               IF ( WNTUS ) CALL DLASET('G', M, N, ZERO, ONE, U, LDU)
+               IF ( WNTUA ) CALL DLASET('G', M, M, ZERO, ONE, U, LDU)
+               IF ( WNTVA ) CALL DLASET('G', N, N, ZERO, ONE, V, LDV)
+               IF ( WNTUF ) THEN
+                   CALL DLASET( 'G', N, 1, ZERO, ZERO, WORK, N )
+                   CALL DLASET( 'G', M, N, ZERO,  ONE, U, LDU )
+               END IF
+               DO 5001 p = 1, N
+                   IWORK(p) = p
+ 5001          CONTINUE
+               IF ( ROWPRM ) THEN
+                   DO 5002 p = N + 1, N + M - 1
+                       IWORK(p) = p - N
+ 5002              CONTINUE
+               END IF
+               IF ( CONDA ) RWORK(1) = -1
+               RWORK(2) = -1
+               RETURN
+            END IF
+*
+            IF ( RWORK(1) .GT. BIG / SQRT(DBLE(M)) ) THEN
+*               .. to prevent overflow in the QR factorization, scale the
+*               matrix by 1/sqrt(M) if too large entry detected
+                CALL DLASCL('G',0,0,SQRT(DBLE(M)),ONE, M,N, A,LDA, IERR)
+                ASCALED = .TRUE.
+            END IF
+            CALL DLASWP( N, A, LDA, 1, M-1, IWORK(N+1), 1 )
+      END IF
+*
+*    .. At this stage, preemptive scaling is done only to avoid column
+*    norms overflows during the QR factorization. The SVD procedure should
+*    have its own scaling to save the singular values from overflows and
+*    underflows. That depends on the SVD procedure.
+*
+      IF ( .NOT.ROWPRM ) THEN
+          RTMP = DLANGE( 'M', M, N, A, LDA, RDUMMY )
+          IF ( ( RTMP .NE. RTMP ) .OR.
+     $         ( (RTMP*ZERO) .NE. ZERO ) ) THEN
+               INFO = -8
+               CALL XERBLA( 'DGESVDQ', -INFO )
+               RETURN
+          END IF
+          IF ( RTMP .GT. BIG / SQRT(DBLE(M)) ) THEN
+*             .. to prevent overflow in the QR factorization, scale the
+*             matrix by 1/sqrt(M) if too large entry detected
+              CALL DLASCL('G',0,0, SQRT(DBLE(M)),ONE, M,N, A,LDA, IERR)
+              ASCALED = .TRUE.
+          END IF
+      END IF
+*
+*     .. QR factorization with column pivoting
+*
+*     A * P = Q * [ R ]
+*                 [ 0 ]
+*
+      DO 1963 p = 1, N
+*        .. all columns are free columns
+         IWORK(p) = 0
+ 1963 CONTINUE
+      CALL DGEQP3( M, N, A, LDA, IWORK, WORK, WORK(N+1), LWORK-N,
+     $      IERR )
+*
+*    If the user requested accuracy level allows truncation in the
+*    computed upper triangular factor, the matrix R is examined and,
+*    if possible, replaced with its leading upper trapezoidal part.
+*
+      EPSLN = DLAMCH('E')
+      SFMIN = DLAMCH('S')
+*     SMALL = SFMIN / EPSLN
+      NR = N
+*
+      IF ( ACCLA ) THEN
+*
+*        Standard absolute error bound suffices. All sigma_i with
+*        sigma_i < N*EPS*||A||_F are flushed to zero. This is an
+*        aggressive enforcement of lower numerical rank by introducing a
+*        backward error of the order of N*EPS*||A||_F.
+         NR = 1
+         RTMP = SQRT(DBLE(N))*EPSLN
+         DO 3001 p = 2, N
+            IF ( ABS(A(p,p)) .LT. (RTMP*ABS(A(1,1))) ) GO TO 3002
+               NR = NR + 1
+ 3001    CONTINUE
+ 3002    CONTINUE
+*
+      ELSEIF ( ACCLM ) THEN
+*        .. similarly as above, only slightly more gentle (less aggressive).
+*        Sudden drop on the diagonal of R is used as the criterion for being
+*        close-to-rank-deficient. The threshold is set to EPSLN=DLAMCH('E').
+*        [[This can be made more flexible by replacing this hard-coded value
+*        with a user specified threshold.]] Also, the values that underflow
+*        will be truncated.
+         NR = 1
+         DO 3401 p = 2, N
+            IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR.
+     $           ( ABS(A(p,p)) .LT. SFMIN ) ) GO TO 3402
+            NR = NR + 1
+ 3401    CONTINUE
+ 3402    CONTINUE
+*
+      ELSE
+*        .. RRQR not authorized to determine numerical rank except in the
+*        obvious case of zero pivots.
+*        .. inspect R for exact zeros on the diagonal;
+*        R(i,i)=0 => R(i:N,i:N)=0.
+         NR = 1
+         DO 3501 p = 2, N
+            IF ( ABS(A(p,p)) .EQ. ZERO ) GO TO 3502
+            NR = NR + 1
+ 3501    CONTINUE
+ 3502    CONTINUE
+*
+         IF ( CONDA ) THEN
+*           Estimate the scaled condition number of A. Use the fact that it is
+*           the same as the scaled condition number of R.
+*              .. V is used as workspace
+               CALL DLACPY( 'U', N, N, A, LDA, V, LDV )
+*              Only the leading NR x NR submatrix of the triangular factor
+*              is considered. Only if NR=N will this give a reliable error
+*              bound. However, even for NR < N, this can be used on an
+*              expert level and obtain useful information in the sense of
+*              perturbation theory.
+               DO 3053 p = 1, NR
+                  RTMP = DNRM2( p, V(1,p), 1 )
+                  CALL DSCAL( p, ONE/RTMP, V(1,p), 1 )
+ 3053          CONTINUE
+               IF ( .NOT. ( LSVEC .OR. RSVEC ) ) THEN
+                   CALL DPOCON( 'U', NR, V, LDV, ONE, RTMP,
+     $                  WORK, IWORK(N+IWOFF), IERR )
+               ELSE
+                   CALL DPOCON( 'U', NR, V, LDV, ONE, RTMP,
+     $                  WORK(N+1), IWORK(N+IWOFF), IERR )
+               END IF
+               SCONDA = ONE / SQRT(RTMP)
+*           For NR=N, SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1),
+*           N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+*           See the reference [1] for more details.
+         END IF
+*
+      ENDIF
+*
+      IF ( WNTUR ) THEN
+          N1 = NR
+      ELSE IF ( WNTUS .OR. WNTUF) THEN
+          N1 = N
+      ELSE IF ( WNTUA ) THEN
+          N1 = M
+      END IF
+*
+      IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN
+*.......................................................................
+*        .. only the singular values are requested
+*.......................................................................
+         IF ( RTRANS ) THEN
+*
+*         .. compute the singular values of R**T = [A](1:NR,1:N)**T
+*           .. set the lower triangle of [A] to [A](1:NR,1:N)**T and
+*           the upper triangle of [A] to zero.
+            DO 1146 p = 1, MIN( N, NR )
+               DO 1147 q = p + 1, N
+                  A(q,p) = A(p,q)
+                  IF ( q .LE. NR ) A(p,q) = ZERO
+ 1147          CONTINUE
+ 1146       CONTINUE
+*
+            CALL DGESVD( 'N', 'N', N, NR, A, LDA, S, U, LDU,
+     $           V, LDV, WORK, LWORK, INFO )
+*
+         ELSE
+*
+*           .. compute the singular values of R = [A](1:NR,1:N)
+*
+            IF ( NR .GT. 1 )
+     $          CALL DLASET( 'L', NR-1,NR-1, ZERO,ZERO, A(2,1), LDA )
+            CALL DGESVD( 'N', 'N', NR, N, A, LDA, S, U, LDU,
+     $           V, LDV, WORK, LWORK, INFO )
+*
+         END IF
+*
+      ELSE IF ( LSVEC .AND. ( .NOT. RSVEC) ) THEN
+*.......................................................................
+*       .. the singular values and the left singular vectors requested
+*.......................................................................""""""""
+         IF ( RTRANS ) THEN
+*            .. apply DGESVD to R**T
+*            .. copy R**T into [U] and overwrite [U] with the right singular
+*            vectors of R
+            DO 1192 p = 1, NR
+               DO 1193 q = p, N
+                  U(q,p) = A(p,q)
+ 1193          CONTINUE
+ 1192       CONTINUE
+            IF ( NR .GT. 1 )
+     $          CALL DLASET( 'U', NR-1,NR-1, ZERO,ZERO, U(1,2), LDU )
+*           .. the left singular vectors not computed, the NR right singular
+*           vectors overwrite [U](1:NR,1:NR) as transposed. These
+*           will be pre-multiplied by Q to build the left singular vectors of A.
+               CALL DGESVD( 'N', 'O', N, NR, U, LDU, S, U, LDU,
+     $              U, LDU, WORK(N+1), LWORK-N, INFO )
+*
+               DO 1119 p = 1, NR
+                   DO 1120 q = p + 1, NR
+                      RTMP   = U(q,p)
+                      U(q,p) = U(p,q)
+                      U(p,q) = RTMP
+ 1120              CONTINUE
+ 1119          CONTINUE
+*
+         ELSE
+*            .. apply DGESVD to R
+*            .. copy R into [U] and overwrite [U] with the left singular vectors
+             CALL DLACPY( 'U', NR, N, A, LDA, U, LDU )
+             IF ( NR .GT. 1 )
+     $         CALL DLASET( 'L', NR-1, NR-1, ZERO, ZERO, U(2,1), LDU )
+*            .. the right singular vectors not computed, the NR left singular
+*            vectors overwrite [U](1:NR,1:NR)
+                CALL DGESVD( 'O', 'N', NR, N, U, LDU, S, U, LDU,
+     $               V, LDV, WORK(N+1), LWORK-N, INFO )
+*               .. now [U](1:NR,1:NR) contains the NR left singular vectors of
+*               R. These will be pre-multiplied by Q to build the left singular
+*               vectors of A.
+         END IF
+*
+*           .. assemble the left singular vector matrix U of dimensions
+*              (M x NR) or (M x N) or (M x M).
+         IF ( ( NR .LT. M ) .AND. ( .NOT.WNTUF ) ) THEN
+             CALL DLASET('A', M-NR, NR, ZERO, ZERO, U(NR+1,1), LDU)
+             IF ( NR .LT. N1 ) THEN
+                CALL DLASET( 'A',NR,N1-NR,ZERO,ZERO,U(1,NR+1), LDU )
+                CALL DLASET( 'A',M-NR,N1-NR,ZERO,ONE,
+     $               U(NR+1,NR+1), LDU )
+             END IF
+         END IF
+*
+*           The Q matrix from the first QRF is built into the left singular
+*           vectors matrix U.
+*
+         IF ( .NOT.WNTUF )
+     $       CALL DORMQR( 'L', 'N', M, N1, N, A, LDA, WORK, U,
+     $            LDU, WORK(N+1), LWORK-N, IERR )
+         IF ( ROWPRM .AND. .NOT.WNTUF )
+     $          CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(N+1), -1 )
+*
+      ELSE IF ( RSVEC .AND. ( .NOT. LSVEC ) ) THEN
+*.......................................................................
+*       .. the singular values and the right singular vectors requested
+*.......................................................................
+          IF ( RTRANS ) THEN
+*            .. apply DGESVD to R**T
+*            .. copy R**T into V and overwrite V with the left singular vectors
+            DO 1165 p = 1, NR
+               DO 1166 q = p, N
+                  V(q,p) = (A(p,q))
+ 1166          CONTINUE
+ 1165       CONTINUE
+            IF ( NR .GT. 1 )
+     $          CALL DLASET( 'U', NR-1,NR-1, ZERO,ZERO, V(1,2), LDV )
+*           .. the left singular vectors of R**T overwrite V, the right singular
+*           vectors not computed
+            IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+               CALL DGESVD( 'O', 'N', N, NR, V, LDV, S, U, LDU,
+     $              U, LDU, WORK(N+1), LWORK-N, INFO )
+*
+               DO 1121 p = 1, NR
+                   DO 1122 q = p + 1, NR
+                      RTMP   = V(q,p)
+                      V(q,p) = V(p,q)
+                      V(p,q) = RTMP
+ 1122              CONTINUE
+ 1121          CONTINUE
+*
+               IF ( NR .LT. N ) THEN
+                   DO 1103 p = 1, NR
+                      DO 1104 q = NR + 1, N
+                          V(p,q) = V(q,p)
+ 1104                 CONTINUE
+ 1103              CONTINUE
+               END IF
+               CALL DLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+            ELSE
+*               .. need all N right singular vectors and NR < N
+*               [!] This is simple implementation that augments [V](1:N,1:NR)
+*               by padding a zero block. In the case NR << N, a more efficient
+*               way is to first use the QR factorization. For more details
+*               how to implement this, see the " FULL SVD " branch.
+                CALL DLASET('G', N, N-NR, ZERO, ZERO, V(1,NR+1), LDV)
+                CALL DGESVD( 'O', 'N', N, N, V, LDV, S, U, LDU,
+     $               U, LDU, WORK(N+1), LWORK-N, INFO )
+*
+                DO 1123 p = 1, N
+                   DO 1124 q = p + 1, N
+                      RTMP   = V(q,p)
+                      V(q,p) = V(p,q)
+                      V(p,q) = RTMP
+ 1124              CONTINUE
+ 1123           CONTINUE
+                CALL DLAPMT( .FALSE., N, N, V, LDV, IWORK )
+            END IF
+*
+          ELSE
+*            .. aply DGESVD to R
+*            .. copy R into V and overwrite V with the right singular vectors
+             CALL DLACPY( 'U', NR, N, A, LDA, V, LDV )
+             IF ( NR .GT. 1 )
+     $         CALL DLASET( 'L', NR-1, NR-1, ZERO, ZERO, V(2,1), LDV )
+*            .. the right singular vectors overwrite V, the NR left singular
+*            vectors stored in U(1:NR,1:NR)
+             IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+                CALL DGESVD( 'N', 'O', NR, N, V, LDV, S, U, LDU,
+     $               V, LDV, WORK(N+1), LWORK-N, INFO )
+                CALL DLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+*               .. now [V](1:NR,1:N) contains V(1:N,1:NR)**T
+             ELSE
+*               .. need all N right singular vectors and NR < N
+*               [!] This is simple implementation that augments [V](1:NR,1:N)
+*               by padding a zero block. In the case NR << N, a more efficient
+*               way is to first use the LQ factorization. For more details
+*               how to implement this, see the " FULL SVD " branch.
+                 CALL DLASET('G', N-NR, N, ZERO,ZERO, V(NR+1,1), LDV)
+                 CALL DGESVD( 'N', 'O', N, N, V, LDV, S, U, LDU,
+     $                V, LDV, WORK(N+1), LWORK-N, INFO )
+                 CALL DLAPMT( .FALSE., N, N, V, LDV, IWORK )
+             END IF
+*            .. now [V] contains the transposed matrix of the right singular
+*            vectors of A.
+          END IF
+*
+      ELSE
+*.......................................................................
+*       .. FULL SVD requested
+*.......................................................................
+         IF ( RTRANS ) THEN
+*
+*            .. apply DGESVD to R**T [[this option is left for R&D&T]]
+*
+            IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+*            .. copy R**T into [V] and overwrite [V] with the left singular
+*            vectors of R**T
+            DO 1168 p = 1, NR
+               DO 1169 q = p, N
+                  V(q,p) = A(p,q)
+ 1169          CONTINUE
+ 1168       CONTINUE
+            IF ( NR .GT. 1 )
+     $          CALL DLASET( 'U', NR-1,NR-1, ZERO,ZERO, V(1,2), LDV )
+*
+*           .. the left singular vectors of R**T overwrite [V], the NR right
+*           singular vectors of R**T stored in [U](1:NR,1:NR) as transposed
+               CALL DGESVD( 'O', 'A', N, NR, V, LDV, S, V, LDV,
+     $              U, LDU, WORK(N+1), LWORK-N, INFO )
+*              .. assemble V
+               DO 1115 p = 1, NR
+                  DO 1116 q = p + 1, NR
+                     RTMP   = V(q,p)
+                     V(q,p) = V(p,q)
+                     V(p,q) = RTMP
+ 1116             CONTINUE
+ 1115          CONTINUE
+               IF ( NR .LT. N ) THEN
+                   DO 1101 p = 1, NR
+                      DO 1102 q = NR+1, N
+                         V(p,q) = V(q,p)
+ 1102                 CONTINUE
+ 1101              CONTINUE
+               END IF
+               CALL DLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+*
+                DO 1117 p = 1, NR
+                   DO 1118 q = p + 1, NR
+                      RTMP   = U(q,p)
+                      U(q,p) = U(p,q)
+                      U(p,q) = RTMP
+ 1118              CONTINUE
+ 1117           CONTINUE
+*
+                IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                  CALL DLASET('A', M-NR,NR, ZERO,ZERO, U(NR+1,1), LDU)
+                  IF ( NR .LT. N1 ) THEN
+                     CALL DLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU)
+                     CALL DLASET( 'A',M-NR,N1-NR,ZERO,ONE,
+     $                    U(NR+1,NR+1), LDU )
+                  END IF
+               END IF
+*
+            ELSE
+*               .. need all N right singular vectors and NR < N
+*            .. copy R**T into [V] and overwrite [V] with the left singular
+*            vectors of R**T
+*               [[The optimal ratio N/NR for using QRF instead of padding
+*                 with zeros. Here hard coded to 2; it must be at least
+*                 two due to work space constraints.]]
+*               OPTRATIO = ILAENV(6, 'DGESVD', 'S' // 'O', NR,N,0,0)
+*               OPTRATIO = MAX( OPTRATIO, 2 )
+                OPTRATIO = 2
+                IF ( OPTRATIO*NR .GT. N ) THEN
+                   DO 1198 p = 1, NR
+                      DO 1199 q = p, N
+                         V(q,p) = A(p,q)
+ 1199                 CONTINUE
+ 1198              CONTINUE
+                   IF ( NR .GT. 1 )
+     $             CALL DLASET('U',NR-1,NR-1, ZERO,ZERO, V(1,2),LDV)
+*
+                   CALL DLASET('A',N,N-NR,ZERO,ZERO,V(1,NR+1),LDV)
+                   CALL DGESVD( 'O', 'A', N, N, V, LDV, S, V, LDV,
+     $                  U, LDU, WORK(N+1), LWORK-N, INFO )
+*
+                   DO 1113 p = 1, N
+                      DO 1114 q = p + 1, N
+                         RTMP   = V(q,p)
+                         V(q,p) = V(p,q)
+                         V(p,q) = RTMP
+ 1114                 CONTINUE
+ 1113              CONTINUE
+                   CALL DLAPMT( .FALSE., N, N, V, LDV, IWORK )
+*              .. assemble the left singular vector matrix U of dimensions
+*              (M x N1), i.e. (M x N) or (M x M).
+*
+                   DO 1111 p = 1, N
+                      DO 1112 q = p + 1, N
+                         RTMP   = U(q,p)
+                         U(q,p) = U(p,q)
+                         U(p,q) = RTMP
+ 1112                 CONTINUE
+ 1111              CONTINUE
+*
+                   IF ( ( N .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                      CALL DLASET('A',M-N,N,ZERO,ZERO,U(N+1,1),LDU)
+                      IF ( N .LT. N1 ) THEN
+                        CALL DLASET('A',N,N1-N,ZERO,ZERO,U(1,N+1),LDU)
+                        CALL DLASET('A',M-N,N1-N,ZERO,ONE,
+     $                       U(N+1,N+1), LDU )
+                      END IF
+                   END IF
+                ELSE
+*                  .. copy R**T into [U] and overwrite [U] with the right
+*                  singular vectors of R
+                   DO 1196 p = 1, NR
+                      DO 1197 q = p, N
+                         U(q,NR+p) = A(p,q)
+ 1197                 CONTINUE
+ 1196              CONTINUE
+                   IF ( NR .GT. 1 )
+     $             CALL DLASET('U',NR-1,NR-1,ZERO,ZERO,U(1,NR+2),LDU)
+                   CALL DGEQRF( N, NR, U(1,NR+1), LDU, WORK(N+1),
+     $                  WORK(N+NR+1), LWORK-N-NR, IERR )
+                   DO 1143 p = 1, NR
+                       DO 1144 q = 1, N
+                           V(q,p) = U(p,NR+q)
+ 1144                  CONTINUE
+ 1143              CONTINUE
+                  CALL DLASET('U',NR-1,NR-1,ZERO,ZERO,V(1,2),LDV)
+                  CALL DGESVD( 'S', 'O', NR, NR, V, LDV, S, U, LDU,
+     $                 V,LDV, WORK(N+NR+1),LWORK-N-NR, INFO )
+                  CALL DLASET('A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV)
+                  CALL DLASET('A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV)
+                  CALL DLASET('A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV)
+                  CALL DORMQR('R','C', N, N, NR, U(1,NR+1), LDU,
+     $                 WORK(N+1),V,LDV,WORK(N+NR+1),LWORK-N-NR,IERR)
+                  CALL DLAPMT( .FALSE., N, N, V, LDV, IWORK )
+*                 .. assemble the left singular vector matrix U of dimensions
+*                 (M x NR) or (M x N) or (M x M).
+                  IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                     CALL DLASET('A',M-NR,NR,ZERO,ZERO,U(NR+1,1),LDU)
+                     IF ( NR .LT. N1 ) THEN
+                     CALL DLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU)
+                     CALL DLASET( 'A',M-NR,N1-NR,ZERO,ONE,
+     $                    U(NR+1,NR+1),LDU)
+                     END IF
+                  END IF
+                END IF
+            END IF
+*
+         ELSE
+*
+*            .. apply DGESVD to R [[this is the recommended option]]
+*
+             IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+*                .. copy R into [V] and overwrite V with the right singular vectors
+                 CALL DLACPY( 'U', NR, N, A, LDA, V, LDV )
+                IF ( NR .GT. 1 )
+     $          CALL DLASET( 'L', NR-1,NR-1, ZERO,ZERO, V(2,1), LDV )
+*               .. the right singular vectors of R overwrite [V], the NR left
+*               singular vectors of R stored in [U](1:NR,1:NR)
+                CALL DGESVD( 'S', 'O', NR, N, V, LDV, S, U, LDU,
+     $               V, LDV, WORK(N+1), LWORK-N, INFO )
+                CALL DLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+*               .. now [V](1:NR,1:N) contains V(1:N,1:NR)**T
+*               .. assemble the left singular vector matrix U of dimensions
+*              (M x NR) or (M x N) or (M x M).
+               IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                  CALL DLASET('A', M-NR,NR, ZERO,ZERO, U(NR+1,1), LDU)
+                  IF ( NR .LT. N1 ) THEN
+                     CALL DLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU)
+                     CALL DLASET( 'A',M-NR,N1-NR,ZERO,ONE,
+     $                    U(NR+1,NR+1), LDU )
+                  END IF
+               END IF
+*
+             ELSE
+*              .. need all N right singular vectors and NR < N
+*              .. the requested number of the left singular vectors
+*               is then N1 (N or M)
+*               [[The optimal ratio N/NR for using LQ instead of padding
+*                 with zeros. Here hard coded to 2; it must be at least
+*                 two due to work space constraints.]]
+*               OPTRATIO = ILAENV(6, 'DGESVD', 'S' // 'O', NR,N,0,0)
+*               OPTRATIO = MAX( OPTRATIO, 2 )
+               OPTRATIO = 2
+               IF ( OPTRATIO * NR .GT. N ) THEN
+                  CALL DLACPY( 'U', NR, N, A, LDA, V, LDV )
+                  IF ( NR .GT. 1 )
+     $            CALL DLASET('L', NR-1,NR-1, ZERO,ZERO, V(2,1),LDV)
+*              .. the right singular vectors of R overwrite [V], the NR left
+*                 singular vectors of R stored in [U](1:NR,1:NR)
+                  CALL DLASET('A', N-NR,N, ZERO,ZERO, V(NR+1,1),LDV)
+                  CALL DGESVD( 'S', 'O', N, N, V, LDV, S, U, LDU,
+     $                 V, LDV, WORK(N+1), LWORK-N, INFO )
+                  CALL DLAPMT( .FALSE., N, N, V, LDV, IWORK )
+*                 .. now [V] contains the transposed matrix of the right
+*                 singular vectors of A. The leading N left singular vectors
+*                 are in [U](1:N,1:N)
+*                 .. assemble the left singular vector matrix U of dimensions
+*                 (M x N1), i.e. (M x N) or (M x M).
+                  IF ( ( N .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                      CALL DLASET('A',M-N,N,ZERO,ZERO,U(N+1,1),LDU)
+                      IF ( N .LT. N1 ) THEN
+                        CALL DLASET('A',N,N1-N,ZERO,ZERO,U(1,N+1),LDU)
+                        CALL DLASET( 'A',M-N,N1-N,ZERO,ONE,
+     $                       U(N+1,N+1), LDU )
+                      END IF
+                  END IF
+               ELSE
+                  CALL DLACPY( 'U', NR, N, A, LDA, U(NR+1,1), LDU )
+                  IF ( NR .GT. 1 )
+     $            CALL DLASET('L',NR-1,NR-1,ZERO,ZERO,U(NR+2,1),LDU)
+                  CALL DGELQF( NR, N, U(NR+1,1), LDU, WORK(N+1),
+     $                 WORK(N+NR+1), LWORK-N-NR, IERR )
+                  CALL DLACPY('L',NR,NR,U(NR+1,1),LDU,V,LDV)
+                  IF ( NR .GT. 1 )
+     $            CALL DLASET('U',NR-1,NR-1,ZERO,ZERO,V(1,2),LDV)
+                  CALL DGESVD( 'S', 'O', NR, NR, V, LDV, S, U, LDU,
+     $                 V, LDV, WORK(N+NR+1), LWORK-N-NR, INFO )
+                  CALL DLASET('A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV)
+                  CALL DLASET('A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV)
+                  CALL DLASET('A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV)
+                  CALL DORMLQ('R','N',N,N,NR,U(NR+1,1),LDU,WORK(N+1),
+     $                 V, LDV, WORK(N+NR+1),LWORK-N-NR,IERR)
+                  CALL DLAPMT( .FALSE., N, N, V, LDV, IWORK )
+*               .. assemble the left singular vector matrix U of dimensions
+*              (M x NR) or (M x N) or (M x M).
+                  IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                     CALL DLASET('A',M-NR,NR,ZERO,ZERO,U(NR+1,1),LDU)
+                     IF ( NR .LT. N1 ) THEN
+                     CALL DLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU)
+                     CALL DLASET( 'A',M-NR,N1-NR,ZERO,ONE,
+     $                    U(NR+1,NR+1), LDU )
+                     END IF
+                  END IF
+               END IF
+             END IF
+*        .. end of the "R**T or R" branch
+         END IF
+*
+*           The Q matrix from the first QRF is built into the left singular
+*           vectors matrix U.
+*
+         IF ( .NOT. WNTUF )
+     $       CALL DORMQR( 'L', 'N', M, N1, N, A, LDA, WORK, U,
+     $            LDU, WORK(N+1), LWORK-N, IERR )
+         IF ( ROWPRM .AND. .NOT.WNTUF )
+     $          CALL DLASWP( N1, U, LDU, 1, M-1, IWORK(N+1), -1 )
+*
+*     ... end of the "full SVD" branch
+      END IF
+*
+*     Check whether some singular values are returned as zeros, e.g.
+*     due to underflow, and update the numerical rank.
+      p = NR
+      DO 4001 q = p, 1, -1
+          IF ( S(q) .GT. ZERO ) GO TO 4002
+          NR = NR - 1
+ 4001 CONTINUE
+ 4002 CONTINUE
+*
+*     .. if numerical rank deficiency is detected, the truncated
+*     singular values are set to zero.
+      IF ( NR .LT. N ) CALL DLASET( 'G', N-NR,1, ZERO,ZERO, S(NR+1), N )
+*     .. undo scaling; this may cause overflow in the largest singular
+*     values.
+      IF ( ASCALED )
+     $   CALL DLASCL( 'G',0,0, ONE,SQRT(DBLE(M)), NR,1, S, N, IERR )
+      IF ( CONDA ) RWORK(1) = SCONDA
+      RWORK(2) = p - NR
+*     .. p-NR is the number of singular values that are computed as
+*     exact zeros in DGESVD() applied to the (possibly truncated)
+*     full row rank triangular (trapezoidal) factor of A.
+      NUMRANK = NR
+*
+      RETURN
+*
+*     End of DGESVDQ
+*
+      END
diff --git a/lapack-netlib/SRC/dgesvj.f b/lapack-netlib/SRC/dgesvj.f
index 2cbc5ce0e..cf7aac982 100644
--- a/lapack-netlib/SRC/dgesvj.f
+++ b/lapack-netlib/SRC/dgesvj.f
@@ -90,13 +90,13 @@
 *>          JOBV is CHARACTER*1
 *>          Specifies whether to compute the right singular vectors, that
 *>          is, the matrix V:
-*>          = 'V' : the matrix V is computed and returned in the array V
-*>          = 'A' : the Jacobi rotations are applied to the MV-by-N
+*>          = 'V':  the matrix V is computed and returned in the array V
+*>          = 'A':  the Jacobi rotations are applied to the MV-by-N
 *>                  array V. In other words, the right singular vector
 *>                  matrix V is not computed explicitly, instead it is
 *>                  applied to an MV-by-N matrix initially stored in the
 *>                  first MV rows of V.
-*>          = 'N' : the matrix V is not computed and the array V is not
+*>          = 'N':  the matrix V is not computed and the array V is not
 *>                  referenced
 *> \endverbatim
 *>
@@ -118,8 +118,8 @@
 *>          A is DOUBLE PRECISION array, dimension (LDA,N)
 *>          On entry, the M-by-N matrix A.
 *>          On exit :
-*>          If JOBU .EQ. 'U' .OR. JOBU .EQ. 'C' :
-*>                 If INFO .EQ. 0 :
+*>          If JOBU = 'U' .OR. JOBU = 'C' :
+*>                 If INFO = 0 :
 *>                 RANKA orthonormal columns of U are returned in the
 *>                 leading RANKA columns of the array A. Here RANKA <= N
 *>                 is the number of computed singular values of A that are
@@ -129,9 +129,9 @@
 *>                 in the array WORK as RANKA=NINT(WORK(2)). Also see the
 *>                 descriptions of SVA and WORK. The computed columns of U
 *>                 are mutually numerically orthogonal up to approximately
-*>                 TOL=DSQRT(M)*EPS (default); or TOL=CTOL*EPS (JOBU.EQ.'C'),
+*>                 TOL=DSQRT(M)*EPS (default); or TOL=CTOL*EPS (JOBU = 'C'),
 *>                 see the description of JOBU.
-*>                 If INFO .GT. 0 :
+*>                 If INFO > 0 :
 *>                 the procedure DGESVJ did not converge in the given number
 *>                 of iterations (sweeps). In that case, the computed
 *>                 columns of U may not be orthogonal up to TOL. The output
@@ -140,8 +140,8 @@
 *>                 input matrix A in the sense that the residual
 *>                 ||A-SCALE*U*SIGMA*V^T||_2 / ||A||_2 is small.
 *>
-*>          If JOBU .EQ. 'N' :
-*>                 If INFO .EQ. 0 :
+*>          If JOBU = 'N' :
+*>                 If INFO = 0 :
 *>                 Note that the left singular vectors are 'for free' in the
 *>                 one-sided Jacobi SVD algorithm. However, if only the
 *>                 singular values are needed, the level of numerical
@@ -150,7 +150,7 @@
 *>                 numerically orthogonal up to approximately M*EPS. Thus,
 *>                 on exit, A contains the columns of U scaled with the
 *>                 corresponding singular values.
-*>                 If INFO .GT. 0 :
+*>                 If INFO > 0 :
 *>                 the procedure DGESVJ did not converge in the given number
 *>                 of iterations (sweeps).
 *> \endverbatim
@@ -165,9 +165,9 @@
 *> \verbatim
 *>          SVA is DOUBLE PRECISION array, dimension (N)
 *>          On exit :
-*>          If INFO .EQ. 0 :
+*>          If INFO = 0 :
 *>          depending on the value SCALE = WORK(1), we have:
-*>                 If SCALE .EQ. ONE :
+*>                 If SCALE = ONE :
 *>                 SVA(1:N) contains the computed singular values of A.
 *>                 During the computation SVA contains the Euclidean column
 *>                 norms of the iterated matrices in the array A.
@@ -175,7 +175,7 @@
 *>                 The singular values of A are SCALE*SVA(1:N), and this
 *>                 factored representation is due to the fact that some of the
 *>                 singular values of A might underflow or overflow.
-*>          If INFO .GT. 0 :
+*>          If INFO > 0 :
 *>          the procedure DGESVJ did not converge in the given number of
 *>          iterations (sweeps) and SCALE*SVA(1:N) may not be accurate.
 *> \endverbatim
@@ -183,7 +183,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV .EQ. 'A', then the product of Jacobi rotations in DGESVJ
+*>          If JOBV = 'A', then the product of Jacobi rotations in DGESVJ
 *>          is applied to the first MV rows of V. See the description of JOBV.
 *> \endverbatim
 *>
@@ -201,16 +201,16 @@
 *> \param[in] LDV
 *> \verbatim
 *>          LDV is INTEGER
-*>          The leading dimension of the array V, LDV .GE. 1.
-*>          If JOBV .EQ. 'V', then LDV .GE. max(1,N).
-*>          If JOBV .EQ. 'A', then LDV .GE. max(1,MV) .
+*>          The leading dimension of the array V, LDV >= 1.
+*>          If JOBV = 'V', then LDV >= max(1,N).
+*>          If JOBV = 'A', then LDV >= max(1,MV) .
 *> \endverbatim
 *>
 *> \param[in,out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION array, dimension (LWORK)
 *>          On entry :
-*>          If JOBU .EQ. 'C' :
+*>          If JOBU = 'C' :
 *>          WORK(1) = CTOL, where CTOL defines the threshold for convergence.
 *>                    The process stops if all columns of A are mutually
 *>                    orthogonal up to CTOL*EPS, EPS=DLAMCH('E').
@@ -230,7 +230,7 @@
 *>          WORK(5) = max_{i.NE.j} |COS(A(:,i),A(:,j))| in the last sweep.
 *>                    This is useful information in cases when DGESVJ did
 *>                    not converge, as it can be used to estimate whether
-*>                    the output is stil useful and for post festum analysis.
+*>                    the output is still useful and for post festum analysis.
 *>          WORK(6) = the largest absolute value over all sines of the
 *>                    Jacobi rotation angles in the last sweep. It can be
 *>                    useful for a post festum analysis.
@@ -245,9 +245,9 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0 : successful exit.
-*>          < 0 : if INFO = -i, then the i-th argument had an illegal value
-*>          > 0 : DGESVJ did not converge in the maximal allowed number (30)
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, then the i-th argument had an illegal value
+*>          > 0:  DGESVJ did not converge in the maximal allowed number (30)
 *>                of sweeps. The output may still be useful. See the
 *>                description of WORK.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dgesvxx.f b/lapack-netlib/SRC/dgesvxx.f
index afcd05d8e..21b56f61c 100644
--- a/lapack-netlib/SRC/dgesvxx.f
+++ b/lapack-netlib/SRC/dgesvxx.f
@@ -411,7 +411,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -447,14 +447,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension (NPARAMS)
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -462,9 +462,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the extra-precise refinement algorithm.
+*>            = 1.0:  Use the extra-precise refinement algorithm.
 *>              (other values are reserved for future use)
 *>
 *>       PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual
diff --git a/lapack-netlib/SRC/dgetc2.f b/lapack-netlib/SRC/dgetc2.f
index 0896a7013..5bf5b890f 100644
--- a/lapack-netlib/SRC/dgetc2.f
+++ b/lapack-netlib/SRC/dgetc2.f
@@ -85,7 +85,7 @@
 *> \verbatim
 *>          INFO is INTEGER
 *>           = 0: successful exit
-*>           > 0: if INFO = k, U(k, k) is likely to produce owerflow if
+*>           > 0: if INFO = k, U(k, k) is likely to produce overflow if
 *>                we try to solve for x in Ax = b. So U is perturbed to
 *>                avoid the overflow.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dgetsls.f b/lapack-netlib/SRC/dgetsls.f
index 3b44a40ab..dfc72c8b2 100644
--- a/lapack-netlib/SRC/dgetsls.f
+++ b/lapack-netlib/SRC/dgetsls.f
@@ -1,3 +1,5 @@
+*> \brief \b DGETSLS
+*
 *  Definition:
 *  ===========
 *
diff --git a/lapack-netlib/SRC/dggesx.f b/lapack-netlib/SRC/dggesx.f
index 47022fbdf..0e57d636e 100644
--- a/lapack-netlib/SRC/dggesx.f
+++ b/lapack-netlib/SRC/dggesx.f
@@ -131,10 +131,10 @@
 *> \verbatim
 *>          SENSE is CHARACTER*1
 *>          Determines which reciprocal condition numbers are computed.
-*>          = 'N' : None are computed;
-*>          = 'E' : Computed for average of selected eigenvalues only;
-*>          = 'V' : Computed for selected deflating subspaces only;
-*>          = 'B' : Computed for both.
+*>          = 'N':  None are computed;
+*>          = 'E':  Computed for average of selected eigenvalues only;
+*>          = 'V':  Computed for selected deflating subspaces only;
+*>          = 'B':  Computed for both.
 *>          If SENSE = 'E', 'V', or 'B', SORT must equal 'S'.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/dgsvj0.f b/lapack-netlib/SRC/dgsvj0.f
index 4fd38d37e..318e369fd 100644
--- a/lapack-netlib/SRC/dgsvj0.f
+++ b/lapack-netlib/SRC/dgsvj0.f
@@ -117,7 +117,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV .EQ. 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -125,9 +125,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is DOUBLE PRECISION array, dimension (LDV,N)
-*>          If JOBV .EQ. 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV .EQ. 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
@@ -136,8 +136,8 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of the array V,  LDV >= 1.
-*>          If JOBV = 'V', LDV .GE. N.
-*>          If JOBV = 'A', LDV .GE. MV.
+*>          If JOBV = 'V', LDV >= N.
+*>          If JOBV = 'A', LDV >= MV.
 *> \endverbatim
 *>
 *> \param[in] EPS
@@ -157,7 +157,7 @@
 *>          TOL is DOUBLE PRECISION
 *>          TOL is the threshold for Jacobi rotations. For a pair
 *>          A(:,p), A(:,q) of pivot columns, the Jacobi rotation is
-*>          applied only if DABS(COS(angle(A(:,p),A(:,q)))) .GT. TOL.
+*>          applied only if DABS(COS(angle(A(:,p),A(:,q)))) > TOL.
 *> \endverbatim
 *>
 *> \param[in] NSWEEP
@@ -175,14 +175,14 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          LWORK is the dimension of WORK. LWORK .GE. M.
+*>          LWORK is the dimension of WORK. LWORK >= M.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0 : successful exit.
-*>          < 0 : if INFO = -i, then the i-th argument had an illegal value
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, then the i-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
@@ -1045,7 +1045,7 @@
 
  1993 CONTINUE
 *     end i=1:NSWEEP loop
-* #:) Reaching this point means that the procedure has comleted the given
+* #:) Reaching this point means that the procedure has completed the given
 *     number of iterations.
       INFO = NSWEEP - 1
       GO TO 1995
diff --git a/lapack-netlib/SRC/dgsvj1.f b/lapack-netlib/SRC/dgsvj1.f
index 376682c7f..35a93619f 100644
--- a/lapack-netlib/SRC/dgsvj1.f
+++ b/lapack-netlib/SRC/dgsvj1.f
@@ -61,7 +61,7 @@
 *> In terms of the columns of A, the first N1 columns are rotated 'against'
 *> the remaining N-N1 columns, trying to increase the angle between the
 *> corresponding subspaces. The off-diagonal block is N1-by(N-N1) and it is
-*> tiled using quadratic tiles of side KBL. Here, KBL is a tunning parmeter.
+*> tiled using quadratic tiles of side KBL. Here, KBL is a tunning parameter.
 *> The number of sweeps is given in NSWEEP and the orthogonality threshold
 *> is given in TOL.
 *> \endverbatim
@@ -147,27 +147,27 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV .EQ. 'A', then MV rows of V are post-multipled by a
-*>                           sequence of Jacobi rotations.
-*>          If JOBV = 'N',   then MV is not referenced.
+*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>                         sequence of Jacobi rotations.
+*>          If JOBV = 'N', then MV is not referenced.
 *> \endverbatim
 *>
 *> \param[in,out] V
 *> \verbatim
 *>          V is DOUBLE PRECISION array, dimension (LDV,N)
-*>          If JOBV .EQ. 'V' then N rows of V are post-multipled by a
-*>                           sequence of Jacobi rotations.
-*>          If JOBV .EQ. 'A' then MV rows of V are post-multipled by a
-*>                           sequence of Jacobi rotations.
-*>          If JOBV = 'N',   then V is not referenced.
+*>          If JOBV = 'V', then N rows of V are post-multipled by a
+*>                         sequence of Jacobi rotations.
+*>          If JOBV = 'A', then MV rows of V are post-multipled by a
+*>                         sequence of Jacobi rotations.
+*>          If JOBV = 'N', then V is not referenced.
 *> \endverbatim
 *>
 *> \param[in] LDV
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of the array V,  LDV >= 1.
-*>          If JOBV = 'V', LDV .GE. N.
-*>          If JOBV = 'A', LDV .GE. MV.
+*>          If JOBV = 'V', LDV >= N.
+*>          If JOBV = 'A', LDV >= MV.
 *> \endverbatim
 *>
 *> \param[in] EPS
@@ -187,7 +187,7 @@
 *>          TOL is DOUBLE PRECISION
 *>          TOL is the threshold for Jacobi rotations. For a pair
 *>          A(:,p), A(:,q) of pivot columns, the Jacobi rotation is
-*>          applied only if DABS(COS(angle(A(:,p),A(:,q)))) .GT. TOL.
+*>          applied only if DABS(COS(angle(A(:,p),A(:,q)))) > TOL.
 *> \endverbatim
 *>
 *> \param[in] NSWEEP
@@ -205,14 +205,14 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          LWORK is the dimension of WORK. LWORK .GE. M.
+*>          LWORK is the dimension of WORK. LWORK >= M.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0 : successful exit.
-*>          < 0 : if INFO = -i, then the i-th argument had an illegal value
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, then the i-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/dhseqr.f b/lapack-netlib/SRC/dhseqr.f
index 4444b955f..b4fc3af90 100644
--- a/lapack-netlib/SRC/dhseqr.f
+++ b/lapack-netlib/SRC/dhseqr.f
@@ -70,7 +70,7 @@
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
-*>           The order of the matrix H.  N .GE. 0.
+*>           The order of the matrix H.  N >= 0.
 *> \endverbatim
 *>
 *> \param[in] ILO
@@ -87,7 +87,7 @@
 *>           set by a previous call to DGEBAL, and then passed to ZGEHRD
 *>           when the matrix output by DGEBAL is reduced to Hessenberg
 *>           form. Otherwise ILO and IHI should be set to 1 and N
-*>           respectively.  If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+*>           respectively.  If N > 0, then 1 <= ILO <= IHI <= N.
 *>           If N = 0, then ILO = 1 and IHI = 0.
 *> \endverbatim
 *>
@@ -100,20 +100,20 @@
 *>           (the Schur form); 2-by-2 diagonal blocks (corresponding to
 *>           complex conjugate pairs of eigenvalues) are returned in
 *>           standard form, with H(i,i) = H(i+1,i+1) and
-*>           H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and JOB = 'E', the
+*>           H(i+1,i)*H(i,i+1) < 0. If INFO = 0 and JOB = 'E', the
 *>           contents of H are unspecified on exit.  (The output value of
-*>           H when INFO.GT.0 is given under the description of INFO
+*>           H when INFO > 0 is given under the description of INFO
 *>           below.)
 *>
 *>           Unlike earlier versions of DHSEQR, this subroutine may
-*>           explicitly H(i,j) = 0 for i.GT.j and j = 1, 2, ... ILO-1
+*>           explicitly H(i,j) = 0 for i > j and j = 1, 2, ... ILO-1
 *>           or j = IHI+1, IHI+2, ... N.
 *> \endverbatim
 *>
 *> \param[in] LDH
 *> \verbatim
 *>          LDH is INTEGER
-*>           The leading dimension of the array H. LDH .GE. max(1,N).
+*>           The leading dimension of the array H. LDH >= max(1,N).
 *> \endverbatim
 *>
 *> \param[out] WR
@@ -128,8 +128,8 @@
 *>           The real and imaginary parts, respectively, of the computed
 *>           eigenvalues. If two eigenvalues are computed as a complex
 *>           conjugate pair, they are stored in consecutive elements of
-*>           WR and WI, say the i-th and (i+1)th, with WI(i) .GT. 0 and
-*>           WI(i+1) .LT. 0. If JOB = 'S', the eigenvalues are stored in
+*>           WR and WI, say the i-th and (i+1)th, with WI(i) > 0 and
+*>           WI(i+1) < 0. If JOB = 'S', the eigenvalues are stored in
 *>           the same order as on the diagonal of the Schur form returned
 *>           in H, with WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2
 *>           diagonal block, WI(i) = sqrt(-H(i+1,i)*H(i,i+1)) and
@@ -148,7 +148,7 @@
 *>           if INFO = 0, Z contains Q*Z.
 *>           Normally Q is the orthogonal matrix generated by DORGHR
 *>           after the call to DGEHRD which formed the Hessenberg matrix
-*>           H. (The output value of Z when INFO.GT.0 is given under
+*>           H. (The output value of Z when INFO > 0 is given under
 *>           the description of INFO below.)
 *> \endverbatim
 *>
@@ -156,7 +156,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>           The leading dimension of the array Z.  if COMPZ = 'I' or
-*>           COMPZ = 'V', then LDZ.GE.MAX(1,N).  Otherwize, LDZ.GE.1.
+*>           COMPZ = 'V', then LDZ >= MAX(1,N).  Otherwise, LDZ >= 1.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -169,7 +169,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>           The dimension of the array WORK.  LWORK .GE. max(1,N)
+*>           The dimension of the array WORK.  LWORK >= max(1,N)
 *>           is sufficient and delivers very good and sometimes
 *>           optimal performance.  However, LWORK as large as 11*N
 *>           may be required for optimal performance.  A workspace
@@ -187,21 +187,21 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>             =  0:  successful exit
-*>           .LT. 0:  if INFO = -i, the i-th argument had an illegal
+*>             = 0:  successful exit
+*>             < 0:  if INFO = -i, the i-th argument had an illegal
 *>                    value
-*>           .GT. 0:  if INFO = i, DHSEQR failed to compute all of
+*>             > 0:  if INFO = i, DHSEQR failed to compute all of
 *>                the eigenvalues.  Elements 1:ilo-1 and i+1:n of WR
 *>                and WI contain those eigenvalues which have been
 *>                successfully computed.  (Failures are rare.)
 *>
-*>                If INFO .GT. 0 and JOB = 'E', then on exit, the
+*>                If INFO > 0 and JOB = 'E', then on exit, the
 *>                remaining unconverged eigenvalues are the eigen-
 *>                values of the upper Hessenberg matrix rows and
 *>                columns ILO through INFO of the final, output
 *>                value of H.
 *>
-*>                If INFO .GT. 0 and JOB   = 'S', then on exit
+*>                If INFO > 0 and JOB   = 'S', then on exit
 *>
 *>           (*)  (initial value of H)*U  = U*(final value of H)
 *>
@@ -209,19 +209,19 @@
 *>                value of H is upper Hessenberg and quasi-triangular
 *>                in rows and columns INFO+1 through IHI.
 *>
-*>                If INFO .GT. 0 and COMPZ = 'V', then on exit
+*>                If INFO > 0 and COMPZ = 'V', then on exit
 *>
 *>                  (final value of Z)  =  (initial value of Z)*U
 *>
 *>                where U is the orthogonal matrix in (*) (regard-
 *>                less of the value of JOB.)
 *>
-*>                If INFO .GT. 0 and COMPZ = 'I', then on exit
+*>                If INFO > 0 and COMPZ = 'I', then on exit
 *>                      (final value of Z)  = U
 *>                where U is the orthogonal matrix in (*) (regard-
 *>                less of the value of JOB.)
 *>
-*>                If INFO .GT. 0 and COMPZ = 'N', then Z is not
+*>                If INFO > 0 and COMPZ = 'N', then Z is not
 *>                accessed.
 *> \endverbatim
 *
@@ -261,8 +261,8 @@
 *>                      This depends on ILO, IHI and NS.  NS is the
 *>                      number of simultaneous shifts returned
 *>                      by ILAENV(ISPEC=15).  (See ISPEC=15 below.)
-*>                      The default for (IHI-ILO+1).LE.500 is NS.
-*>                      The default for (IHI-ILO+1).GT.500 is 3*NS/2.
+*>                      The default for (IHI-ILO+1) <= 500 is NS.
+*>                      The default for (IHI-ILO+1) >  500 is 3*NS/2.
 *>
 *>            ISPEC=14: Nibble crossover point. (See IPARMQ for
 *>                      details.)  Default: 14% of deflation window
@@ -341,8 +341,8 @@
       PARAMETER          ( NTINY = 11 )
 *
 *     ==== NL allocates some local workspace to help small matrices
-*     .    through a rare DLAHQR failure.  NL .GT. NTINY = 11 is
-*     .    required and NL .LE. NMIN = ILAENV(ISPEC=12,...) is recom-
+*     .    through a rare DLAHQR failure.  NL > NTINY = 11 is
+*     .    required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom-
 *     .    mended.  (The default value of NMIN is 75.)  Using NL = 49
 *     .    allows up to six simultaneous shifts and a 16-by-16
 *     .    deflation window.  ====
diff --git a/lapack-netlib/SRC/dla_gbrcond.f b/lapack-netlib/SRC/dla_gbrcond.f
index e9713c9ca..c9eebcbea 100644
--- a/lapack-netlib/SRC/dla_gbrcond.f
+++ b/lapack-netlib/SRC/dla_gbrcond.f
@@ -141,13 +141,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION array, dimension (5*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] IWORK
+*> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/dla_gbrfsx_extended.f b/lapack-netlib/SRC/dla_gbrfsx_extended.f
index 12b2a32a4..282a63f1c 100644
--- a/lapack-netlib/SRC/dla_gbrfsx_extended.f
+++ b/lapack-netlib/SRC/dla_gbrfsx_extended.f
@@ -66,19 +66,19 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] TRANS_TYPE
 *> \verbatim
 *>          TRANS_TYPE is INTEGER
 *>     Specifies the transposition operation on A.
-*>     The value is defined by ILATRANS(T) where T is a CHARACTER and
-*>     T    = 'N':  No transpose
+*>     The value is defined by ILATRANS(T) where T is a CHARACTER and T
+*>          = 'N':  No transpose
 *>          = 'T':  Transpose
 *>          = 'C':  Conjugate transpose
 *> \endverbatim
@@ -270,7 +270,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/dla_gercond.f b/lapack-netlib/SRC/dla_gercond.f
index aa93ca5a4..6f7d70a6a 100644
--- a/lapack-netlib/SRC/dla_gercond.f
+++ b/lapack-netlib/SRC/dla_gercond.f
@@ -123,13 +123,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION array, dimension (3*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] IWORK
+*> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/dla_gerfsx_extended.f b/lapack-netlib/SRC/dla_gerfsx_extended.f
index 082f810f0..4cb9ef4f9 100644
--- a/lapack-netlib/SRC/dla_gerfsx_extended.f
+++ b/lapack-netlib/SRC/dla_gerfsx_extended.f
@@ -64,19 +64,19 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] TRANS_TYPE
 *> \verbatim
 *>          TRANS_TYPE is INTEGER
 *>     Specifies the transposition operation on A.
-*>     The value is defined by ILATRANS(T) where T is a CHARACTER and
-*>     T    = 'N':  No transpose
+*>     The value is defined by ILATRANS(T) where T is a CHARACTER and T
+*>          = 'N':  No transpose
 *>          = 'T':  Transpose
 *>          = 'C':  Conjugate transpose
 *> \endverbatim
@@ -256,7 +256,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERRS_C is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERRS_C is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERRS_C(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/dla_porcond.f b/lapack-netlib/SRC/dla_porcond.f
index 498e707e3..b2f9c4b1e 100644
--- a/lapack-netlib/SRC/dla_porcond.f
+++ b/lapack-netlib/SRC/dla_porcond.f
@@ -113,13 +113,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION array, dimension (3*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] IWORK
+*> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/dla_porfsx_extended.f b/lapack-netlib/SRC/dla_porfsx_extended.f
index 8c0d6bebd..ece9b00ed 100644
--- a/lapack-netlib/SRC/dla_porfsx_extended.f
+++ b/lapack-netlib/SRC/dla_porfsx_extended.f
@@ -65,11 +65,11 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -246,7 +246,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/dla_porpvgrw.f b/lapack-netlib/SRC/dla_porpvgrw.f
index 4fe1a1922..8a6f9e1a7 100644
--- a/lapack-netlib/SRC/dla_porpvgrw.f
+++ b/lapack-netlib/SRC/dla_porpvgrw.f
@@ -85,7 +85,7 @@
 *>     The leading dimension of the array AF.  LDAF >= max(1,N).
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION array, dimension (2*N)
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dla_syrcond.f b/lapack-netlib/SRC/dla_syrcond.f
index 91d557145..23ed82588 100644
--- a/lapack-netlib/SRC/dla_syrcond.f
+++ b/lapack-netlib/SRC/dla_syrcond.f
@@ -119,13 +119,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION array, dimension (3*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] IWORK
+*> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/dla_syrfsx_extended.f b/lapack-netlib/SRC/dla_syrfsx_extended.f
index f54d15194..b390600c0 100644
--- a/lapack-netlib/SRC/dla_syrfsx_extended.f
+++ b/lapack-netlib/SRC/dla_syrfsx_extended.f
@@ -67,11 +67,11 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -255,7 +255,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/dla_syrpvgrw.f b/lapack-netlib/SRC/dla_syrpvgrw.f
index c2e5cb018..5ba03093b 100644
--- a/lapack-netlib/SRC/dla_syrpvgrw.f
+++ b/lapack-netlib/SRC/dla_syrpvgrw.f
@@ -101,7 +101,7 @@
 *>     as determined by DSYTRF.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION array, dimension (2*N)
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dla_wwaddw.f b/lapack-netlib/SRC/dla_wwaddw.f
index 99a86c553..4f50540d6 100644
--- a/lapack-netlib/SRC/dla_wwaddw.f
+++ b/lapack-netlib/SRC/dla_wwaddw.f
@@ -36,7 +36,7 @@
 *>    DLA_WWADDW adds a vector W into a doubled-single vector (X, Y).
 *>
 *>    This works for all extant IBM's hex and binary floating point
-*>    arithmetics, but not for decimal.
+*>    arithmetic, but not for decimal.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/dlaed4.f b/lapack-netlib/SRC/dlaed4.f
index e7dc839df..033438d73 100644
--- a/lapack-netlib/SRC/dlaed4.f
+++ b/lapack-netlib/SRC/dlaed4.f
@@ -82,7 +82,7 @@
 *> \param[out] DELTA
 *> \verbatim
 *>          DELTA is DOUBLE PRECISION array, dimension (N)
-*>         If N .GT. 2, DELTA contains (D(j) - lambda_I) in its  j-th
+*>         If N > 2, DELTA contains (D(j) - lambda_I) in its  j-th
 *>         component.  If N = 1, then DELTA(1) = 1. If N = 2, see DLAED5
 *>         for detail. The vector DELTA contains the information necessary
 *>         to construct the eigenvectors by DLAED3 and DLAED9.
diff --git a/lapack-netlib/SRC/dlaed8.f b/lapack-netlib/SRC/dlaed8.f
index c053347b1..f64679dc0 100644
--- a/lapack-netlib/SRC/dlaed8.f
+++ b/lapack-netlib/SRC/dlaed8.f
@@ -353,7 +353,7 @@
          Z( I ) = W( INDX( I ) )
    40 CONTINUE
 *
-*     Calculate the allowable deflation tolerence
+*     Calculate the allowable deflation tolerance
 *
       IMAX = IDAMAX( N, Z, 1 )
       JMAX = IDAMAX( N, D, 1 )
diff --git a/lapack-netlib/SRC/dlagtf.f b/lapack-netlib/SRC/dlagtf.f
index 4b257c64f..b92c84f39 100644
--- a/lapack-netlib/SRC/dlagtf.f
+++ b/lapack-netlib/SRC/dlagtf.f
@@ -125,7 +125,7 @@
 *>          then IN(k) = 1, otherwise IN(k) = 0. The element IN(n)
 *>          returns the smallest positive integer j such that
 *>
-*>             abs( u(j,j) ).le. norm( (T - lambda*I)(j) )*TOL,
+*>             abs( u(j,j) ) <= norm( (T - lambda*I)(j) )*TOL,
 *>
 *>          where norm( A(j) ) denotes the sum of the absolute values of
 *>          the jth row of the matrix A. If no such j exists then IN(n)
@@ -137,8 +137,8 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0   : successful exit
-*>          .lt. 0: if INFO = -k, the kth argument had an illegal value
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -k, the kth argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/dlagts.f b/lapack-netlib/SRC/dlagts.f
index 926075827..cbd35ae14 100644
--- a/lapack-netlib/SRC/dlagts.f
+++ b/lapack-netlib/SRC/dlagts.f
@@ -122,12 +122,12 @@
 *> \param[in,out] TOL
 *> \verbatim
 *>          TOL is DOUBLE PRECISION
-*>          On entry, with  JOB .lt. 0, TOL should be the minimum
+*>          On entry, with  JOB < 0, TOL should be the minimum
 *>          perturbation to be made to very small diagonal elements of U.
 *>          TOL should normally be chosen as about eps*norm(U), where eps
 *>          is the relative machine precision, but if TOL is supplied as
 *>          non-positive, then it is reset to eps*max( abs( u(i,j) ) ).
-*>          If  JOB .gt. 0  then TOL is not referenced.
+*>          If  JOB > 0  then TOL is not referenced.
 *>
 *>          On exit, TOL is changed as described above, only if TOL is
 *>          non-positive on entry. Otherwise TOL is unchanged.
@@ -136,14 +136,14 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0   : successful exit
-*>          .lt. 0: if INFO = -i, the i-th argument had an illegal value
-*>          .gt. 0: overflow would occur when computing the INFO(th)
-*>                  element of the solution vector x. This can only occur
-*>                  when JOB is supplied as positive and either means
-*>                  that a diagonal element of U is very small, or that
-*>                  the elements of the right-hand side vector y are very
-*>                  large.
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*>          > 0:  overflow would occur when computing the INFO(th)
+*>                element of the solution vector x. This can only occur
+*>                when JOB is supplied as positive and either means
+*>                that a diagonal element of U is very small, or that
+*>                the elements of the right-hand side vector y are very
+*>                large.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/dlahqr.f b/lapack-netlib/SRC/dlahqr.f
index f7365d21e..e863829ec 100644
--- a/lapack-netlib/SRC/dlahqr.f
+++ b/lapack-netlib/SRC/dlahqr.f
@@ -150,26 +150,26 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>           =   0: successful exit
-*>          .GT. 0: If INFO = i, DLAHQR failed to compute all the
+*>           = 0:  successful exit
+*>           > 0:  If INFO = i, DLAHQR failed to compute all the
 *>                  eigenvalues ILO to IHI in a total of 30 iterations
 *>                  per eigenvalue; elements i+1:ihi of WR and WI
 *>                  contain those eigenvalues which have been
 *>                  successfully computed.
 *>
-*>                  If INFO .GT. 0 and WANTT is .FALSE., then on exit,
+*>                  If INFO > 0 and WANTT is .FALSE., then on exit,
 *>                  the remaining unconverged eigenvalues are the
 *>                  eigenvalues of the upper Hessenberg matrix rows
-*>                  and columns ILO thorugh INFO of the final, output
+*>                  and columns ILO through INFO of the final, output
 *>                  value of H.
 *>
-*>                  If INFO .GT. 0 and WANTT is .TRUE., then on exit
+*>                  If INFO > 0 and WANTT is .TRUE., then on exit
 *>          (*)       (initial value of H)*U  = U*(final value of H)
-*>                  where U is an orthognal matrix.    The final
+*>                  where U is an orthogonal matrix.    The final
 *>                  value of H is upper Hessenberg and triangular in
 *>                  rows and columns INFO+1 through IHI.
 *>
-*>                  If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+*>                  If INFO > 0 and WANTZ is .TRUE., then on exit
 *>                      (final value of Z)  = (initial value of Z)*U
 *>                  where U is the orthogonal matrix in (*)
 *>                  (regardless of the value of WANTT.)
diff --git a/lapack-netlib/SRC/dlaln2.f b/lapack-netlib/SRC/dlaln2.f
index a094b737b..0c94ea308 100644
--- a/lapack-netlib/SRC/dlaln2.f
+++ b/lapack-netlib/SRC/dlaln2.f
@@ -49,7 +49,7 @@
 *> the first column of each being the real part and the second
 *> being the imaginary part.
 *>
-*> "s" is a scaling factor (.LE. 1), computed by DLALN2, which is
+*> "s" is a scaling factor (<= 1), computed by DLALN2, which is
 *> so chosen that X can be computed without overflow.  X is further
 *> scaled if necessary to assure that norm(ca A - w D)*norm(X) is less
 *> than overflow.
diff --git a/lapack-netlib/SRC/dlamswlq.f b/lapack-netlib/SRC/dlamswlq.f
index 19e32f888..306c3d3de 100644
--- a/lapack-netlib/SRC/dlamswlq.f
+++ b/lapack-netlib/SRC/dlamswlq.f
@@ -1,3 +1,4 @@
+*> \brief \b DLAMSWLQ
 *
 *  Definition:
 *  ===========
diff --git a/lapack-netlib/SRC/dlamtsqr.f b/lapack-netlib/SRC/dlamtsqr.f
index 6af89d28e..41a067780 100644
--- a/lapack-netlib/SRC/dlamtsqr.f
+++ b/lapack-netlib/SRC/dlamtsqr.f
@@ -1,3 +1,4 @@
+*> \brief \b DLAMTSQR
 *
 *  Definition:
 *  ===========
diff --git a/lapack-netlib/SRC/dlangb.f b/lapack-netlib/SRC/dlangb.f
index 078573b87..0c4f938f7 100644
--- a/lapack-netlib/SRC/dlangb.f
+++ b/lapack-netlib/SRC/dlangb.f
@@ -129,6 +129,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM
       INTEGER            KL, KU, LDAB, N
@@ -139,22 +140,24 @@
 *
 * =====================================================================
 *
-*
 *     .. Parameters ..
       DOUBLE PRECISION   ONE, ZERO
       PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0 )
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J, K, L
-      DOUBLE PRECISION   SCALE, SUM, VALUE, TEMP
+      DOUBLE PRECISION   SUM, VALUE, TEMP
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           DLASSQ
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLASSQ, DCOMBSSQ
+*     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN, SQRT
 *     ..
@@ -206,15 +209,22 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
-         SCALE = ZERO
-         SUM = ONE
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
          DO 90 J = 1, N
             L = MAX( 1, J-KU )
             K = KU + 1 - J + L
-            CALL DLASSQ( MIN( N, J+KL )-L+1, AB( K, J ), 1, SCALE, SUM )
+            COLSSQ( 1 ) = ZERO
+            COLSSQ( 2 ) = ONE
+            CALL DLASSQ( MIN( N, J+KL )-L+1, AB( K, J ), 1,
+     $                   COLSSQ( 1 ), COLSSQ( 2 ) )
+            CALL DCOMBSSQ( SSQ, COLSSQ )
    90    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       DLANGB = VALUE
diff --git a/lapack-netlib/SRC/dlange.f b/lapack-netlib/SRC/dlange.f
index 9dbf45e81..6b32fbefd 100644
--- a/lapack-netlib/SRC/dlange.f
+++ b/lapack-netlib/SRC/dlange.f
@@ -119,6 +119,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM
       INTEGER            LDA, M, N
@@ -135,10 +136,13 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J
-      DOUBLE PRECISION   SCALE, SUM, VALUE, TEMP
+      DOUBLE PRECISION   SUM, VALUE, TEMP
+*     ..
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           DLASSQ
+      EXTERNAL           DLASSQ, DCOMBSSQ
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
@@ -194,13 +198,19 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
-         SCALE = ZERO
-         SUM = ONE
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
          DO 90 J = 1, N
-            CALL DLASSQ( M, A( 1, J ), 1, SCALE, SUM )
+            COLSSQ( 1 ) = ZERO
+            COLSSQ( 2 ) = ONE
+            CALL DLASSQ( M, A( 1, J ), 1, COLSSQ( 1 ), COLSSQ( 2 ) )
+            CALL DCOMBSSQ( SSQ, COLSSQ )
    90    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       DLANGE = VALUE
diff --git a/lapack-netlib/SRC/dlanhs.f b/lapack-netlib/SRC/dlanhs.f
index 691dbc21e..a859d2216 100644
--- a/lapack-netlib/SRC/dlanhs.f
+++ b/lapack-netlib/SRC/dlanhs.f
@@ -113,6 +113,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM
       INTEGER            LDA, N
@@ -129,15 +130,18 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J
-      DOUBLE PRECISION   SCALE, SUM, VALUE
+      DOUBLE PRECISION   SUM, VALUE
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           DLASSQ
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLASSQ, DCOMBSSQ
+*     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MIN, SQRT
 *     ..
@@ -188,13 +192,20 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
-         SCALE = ZERO
-         SUM = ONE
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
          DO 90 J = 1, N
-            CALL DLASSQ( MIN( N, J+1 ), A( 1, J ), 1, SCALE, SUM )
+            COLSSQ( 1 ) = ZERO
+            COLSSQ( 2 ) = ONE
+            CALL DLASSQ( MIN( N, J+1 ), A( 1, J ), 1,
+     $                   COLSSQ( 1 ), COLSSQ( 2 ) )
+            CALL DCOMBSSQ( SSQ, COLSSQ )
    90    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       DLANHS = VALUE
diff --git a/lapack-netlib/SRC/dlansb.f b/lapack-netlib/SRC/dlansb.f
index 4ccf5f27e..a82dc41b1 100644
--- a/lapack-netlib/SRC/dlansb.f
+++ b/lapack-netlib/SRC/dlansb.f
@@ -134,6 +134,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            K, LDAB, N
@@ -150,15 +151,18 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J, L
-      DOUBLE PRECISION   ABSA, SCALE, SUM, VALUE
+      DOUBLE PRECISION   ABSA, SUM, VALUE
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           DLASSQ
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLASSQ, DCOMBSSQ
+*     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN, SQRT
 *     ..
@@ -225,29 +229,47 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          IF( K.GT.0 ) THEN
             IF( LSAME( UPLO, 'U' ) ) THEN
                DO 110 J = 2, N
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
                   CALL DLASSQ( MIN( J-1, K ), AB( MAX( K+2-J, 1 ), J ),
-     $                         1, SCALE, SUM )
+     $                         1, COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   110          CONTINUE
                L = K + 1
             ELSE
                DO 120 J = 1, N - 1
-                  CALL DLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE,
-     $                         SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL DLASSQ( MIN( N-J, K ), AB( 2, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   120          CONTINUE
                L = 1
             END IF
-            SUM = 2*SUM
+            SSQ( 2 ) = 2*SSQ( 2 )
          ELSE
             L = 1
          END IF
-         CALL DLASSQ( N, AB( L, 1 ), LDAB, SCALE, SUM )
-         VALUE = SCALE*SQRT( SUM )
+*
+*        Sum diagonal
+*
+         COLSSQ( 1 ) = ZERO
+         COLSSQ( 2 ) = ONE
+         CALL DLASSQ( N, AB( L, 1 ), LDAB, COLSSQ( 1 ), COLSSQ( 2 ) )
+         CALL DCOMBSSQ( SSQ, COLSSQ )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       DLANSB = VALUE
diff --git a/lapack-netlib/SRC/dlansp.f b/lapack-netlib/SRC/dlansp.f
index a1829db75..b6ad1ffcf 100644
--- a/lapack-netlib/SRC/dlansp.f
+++ b/lapack-netlib/SRC/dlansp.f
@@ -119,6 +119,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            N
@@ -135,15 +136,18 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J, K
-      DOUBLE PRECISION   ABSA, SCALE, SUM, VALUE
+      DOUBLE PRECISION   ABSA, SUM, VALUE
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           DLASSQ
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLASSQ, DCOMBSSQ
+*     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, SQRT
 *     ..
@@ -217,31 +221,48 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          K = 2
          IF( LSAME( UPLO, 'U' ) ) THEN
             DO 110 J = 2, N
-               CALL DLASSQ( J-1, AP( K ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL DLASSQ( J-1, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) )
+               CALL DCOMBSSQ( SSQ, COLSSQ )
                K = K + J
   110       CONTINUE
          ELSE
             DO 120 J = 1, N - 1
-               CALL DLASSQ( N-J, AP( K ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL DLASSQ( N-J, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) )
+               CALL DCOMBSSQ( SSQ, COLSSQ )
                K = K + N - J + 1
   120       CONTINUE
          END IF
-         SUM = 2*SUM
+         SSQ( 2 ) = 2*SSQ( 2 )
+*
+*        Sum diagonal
+*
          K = 1
+         COLSSQ( 1 ) = ZERO
+         COLSSQ( 2 ) = ONE
          DO 130 I = 1, N
             IF( AP( K ).NE.ZERO ) THEN
                ABSA = ABS( AP( K ) )
-               IF( SCALE.LT.ABSA ) THEN
-                  SUM = ONE + SUM*( SCALE / ABSA )**2
-                  SCALE = ABSA
+               IF( COLSSQ( 1 ).LT.ABSA ) THEN
+                  COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2
+                  COLSSQ( 1 ) = ABSA
                ELSE
-                  SUM = SUM + ( ABSA / SCALE )**2
+                  COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2
                END IF
             END IF
             IF( LSAME( UPLO, 'U' ) ) THEN
@@ -250,7 +271,8 @@
                K = K + N - I + 1
             END IF
   130    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         CALL DCOMBSSQ( SSQ, COLSSQ )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       DLANSP = VALUE
diff --git a/lapack-netlib/SRC/dlansy.f b/lapack-netlib/SRC/dlansy.f
index 2372fce0a..87d514c11 100644
--- a/lapack-netlib/SRC/dlansy.f
+++ b/lapack-netlib/SRC/dlansy.f
@@ -127,6 +127,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            LDA, N
@@ -143,15 +144,18 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J
-      DOUBLE PRECISION   ABSA, SCALE, SUM, VALUE
+      DOUBLE PRECISION   ABSA, SUM, VALUE
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           DLASSQ
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLASSQ, DCOMBSSQ
+*     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, SQRT
 *     ..
@@ -216,21 +220,39 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          IF( LSAME( UPLO, 'U' ) ) THEN
             DO 110 J = 2, N
-               CALL DLASSQ( J-1, A( 1, J ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL DLASSQ( J-1, A( 1, J ), 1, COLSSQ(1), COLSSQ(2) )
+               CALL DCOMBSSQ( SSQ, COLSSQ )
   110       CONTINUE
          ELSE
             DO 120 J = 1, N - 1
-               CALL DLASSQ( N-J, A( J+1, J ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL DLASSQ( N-J, A( J+1, J ), 1, COLSSQ(1), COLSSQ(2) )
+               CALL DCOMBSSQ( SSQ, COLSSQ )
   120       CONTINUE
          END IF
-         SUM = 2*SUM
-         CALL DLASSQ( N, A, LDA+1, SCALE, SUM )
-         VALUE = SCALE*SQRT( SUM )
+         SSQ( 2 ) = 2*SSQ( 2 )
+*
+*        Sum diagonal
+*
+         COLSSQ( 1 ) = ZERO
+         COLSSQ( 2 ) = ONE
+         CALL DLASSQ( N, A, LDA+1, COLSSQ( 1 ), COLSSQ( 2 ) )
+         CALL DCOMBSSQ( SSQ, COLSSQ )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       DLANSY = VALUE
diff --git a/lapack-netlib/SRC/dlantb.f b/lapack-netlib/SRC/dlantb.f
index 3d2bfe7e4..0d46f6cc8 100644
--- a/lapack-netlib/SRC/dlantb.f
+++ b/lapack-netlib/SRC/dlantb.f
@@ -145,6 +145,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          DIAG, NORM, UPLO
       INTEGER            K, LDAB, N
@@ -162,15 +163,18 @@
 *     .. Local Scalars ..
       LOGICAL            UDIAG
       INTEGER            I, J, L
-      DOUBLE PRECISION   SCALE, SUM, VALUE
+      DOUBLE PRECISION   SUM, VALUE
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           DLASSQ
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLASSQ, DCOMBSSQ
+*     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN, SQRT
 *     ..
@@ -311,46 +315,61 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
          IF( LSAME( UPLO, 'U' ) ) THEN
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = N
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = N
                IF( K.GT.0 ) THEN
                   DO 280 J = 2, N
+                     COLSSQ( 1 ) = ZERO
+                     COLSSQ( 2 ) = ONE
                      CALL DLASSQ( MIN( J-1, K ),
-     $                            AB( MAX( K+2-J, 1 ), J ), 1, SCALE,
-     $                            SUM )
+     $                            AB( MAX( K+2-J, 1 ), J ), 1,
+     $                            COLSSQ( 1 ), COLSSQ( 2 ) )
+                     CALL DCOMBSSQ( SSQ, COLSSQ )
   280             CONTINUE
                END IF
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                DO 290 J = 1, N
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
                   CALL DLASSQ( MIN( J, K+1 ), AB( MAX( K+2-J, 1 ), J ),
-     $                         1, SCALE, SUM )
+     $                         1, COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   290          CONTINUE
             END IF
          ELSE
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = N
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = N
                IF( K.GT.0 ) THEN
                   DO 300 J = 1, N - 1
-                     CALL DLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE,
-     $                            SUM )
+                     COLSSQ( 1 ) = ZERO
+                     COLSSQ( 2 ) = ONE
+                     CALL DLASSQ( MIN( N-J, K ), AB( 2, J ), 1,
+     $                            COLSSQ( 1 ), COLSSQ( 2 ) )
+                     CALL DCOMBSSQ( SSQ, COLSSQ )
   300             CONTINUE
                END IF
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                DO 310 J = 1, N
-                  CALL DLASSQ( MIN( N-J+1, K+1 ), AB( 1, J ), 1, SCALE,
-     $                         SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL DLASSQ( MIN( N-J+1, K+1 ), AB( 1, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   310          CONTINUE
             END IF
          END IF
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       DLANTB = VALUE
diff --git a/lapack-netlib/SRC/dlantp.f b/lapack-netlib/SRC/dlantp.f
index f84a9e9d7..a7b89dec7 100644
--- a/lapack-netlib/SRC/dlantp.f
+++ b/lapack-netlib/SRC/dlantp.f
@@ -129,6 +129,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          DIAG, NORM, UPLO
       INTEGER            N
@@ -146,15 +147,18 @@
 *     .. Local Scalars ..
       LOGICAL            UDIAG
       INTEGER            I, J, K
-      DOUBLE PRECISION   SCALE, SUM, VALUE
+      DOUBLE PRECISION   SUM, VALUE
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           DLASSQ
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLASSQ, DCOMBSSQ
+*     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, SQRT
 *     ..
@@ -306,45 +310,64 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
          IF( LSAME( UPLO, 'U' ) ) THEN
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = N
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = N
                K = 2
                DO 280 J = 2, N
-                  CALL DLASSQ( J-1, AP( K ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL DLASSQ( J-1, AP( K ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
                   K = K + J
   280          CONTINUE
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                K = 1
                DO 290 J = 1, N
-                  CALL DLASSQ( J, AP( K ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL DLASSQ( J, AP( K ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
                   K = K + J
   290          CONTINUE
             END IF
          ELSE
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = N
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = N
                K = 2
                DO 300 J = 1, N - 1
-                  CALL DLASSQ( N-J, AP( K ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL DLASSQ( N-J, AP( K ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
                   K = K + N - J + 1
   300          CONTINUE
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                K = 1
                DO 310 J = 1, N
-                  CALL DLASSQ( N-J+1, AP( K ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL DLASSQ( N-J+1, AP( K ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
                   K = K + N - J + 1
   310          CONTINUE
             END IF
          END IF
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       DLANTP = VALUE
diff --git a/lapack-netlib/SRC/dlantr.f b/lapack-netlib/SRC/dlantr.f
index 8585b2f68..adc7da4c4 100644
--- a/lapack-netlib/SRC/dlantr.f
+++ b/lapack-netlib/SRC/dlantr.f
@@ -146,6 +146,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          DIAG, NORM, UPLO
       INTEGER            LDA, M, N
@@ -163,15 +164,18 @@
 *     .. Local Scalars ..
       LOGICAL            UDIAG
       INTEGER            I, J
-      DOUBLE PRECISION   SCALE, SUM, VALUE
+      DOUBLE PRECISION   SUM, VALUE
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           DLASSQ
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLASSQ, DCOMBSSQ
+*     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MIN, SQRT
 *     ..
@@ -281,7 +285,7 @@
             END IF
          ELSE
             IF( LSAME( DIAG, 'U' ) ) THEN
-               DO 210 I = 1, N
+               DO 210 I = 1, MIN( M, N )
                   WORK( I ) = ONE
   210          CONTINUE
                DO 220 I = N + 1, M
@@ -311,38 +315,56 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
          IF( LSAME( UPLO, 'U' ) ) THEN
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = MIN( M, N )
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = MIN( M, N )
                DO 290 J = 2, N
-                  CALL DLASSQ( MIN( M, J-1 ), A( 1, J ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL DLASSQ( MIN( M, J-1 ), A( 1, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   290          CONTINUE
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                DO 300 J = 1, N
-                  CALL DLASSQ( MIN( M, J ), A( 1, J ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL DLASSQ( MIN( M, J ), A( 1, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   300          CONTINUE
             END IF
          ELSE
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = MIN( M, N )
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = MIN( M, N )
                DO 310 J = 1, N
-                  CALL DLASSQ( M-J, A( MIN( M, J+1 ), J ), 1, SCALE,
-     $                         SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL DLASSQ( M-J, A( MIN( M, J+1 ), J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   310          CONTINUE
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                DO 320 J = 1, N
-                  CALL DLASSQ( M-J+1, A( J, J ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL DLASSQ( M-J+1, A( J, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   320          CONTINUE
             END IF
          END IF
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       DLANTR = VALUE
diff --git a/lapack-netlib/SRC/dlanv2.f b/lapack-netlib/SRC/dlanv2.f
index 91fa14ff2..d68481f7e 100644
--- a/lapack-netlib/SRC/dlanv2.f
+++ b/lapack-netlib/SRC/dlanv2.f
@@ -161,7 +161,6 @@
       IF( C.EQ.ZERO ) THEN
          CS = ONE
          SN = ZERO
-         GO TO 10
 *
       ELSE IF( B.EQ.ZERO ) THEN
 *
@@ -174,12 +173,12 @@
          A = TEMP
          B = -C
          C = ZERO
-         GO TO 10
+*
       ELSE IF( ( A-D ).EQ.ZERO .AND. SIGN( ONE, B ).NE.SIGN( ONE, C ) )
      $          THEN
          CS = ONE
          SN = ZERO
-         GO TO 10
+*
       ELSE
 *
          TEMP = A - D
@@ -207,6 +206,7 @@
             SN = C / TAU
             B = B - C
             C = ZERO
+*
          ELSE
 *
 *           Complex eigenvalues, or real (almost) equal eigenvalues.
@@ -268,8 +268,6 @@
          END IF
 *
       END IF
-*
-   10 CONTINUE
 *
 *     Store eigenvalues in (RT1R,RT1I) and (RT2R,RT2I).
 *
diff --git a/lapack-netlib/SRC/dlaorhr_col_getrfnp.f b/lapack-netlib/SRC/dlaorhr_col_getrfnp.f
new file mode 100644
index 000000000..6a7c629e8
--- /dev/null
+++ b/lapack-netlib/SRC/dlaorhr_col_getrfnp.f
@@ -0,0 +1,248 @@
+*> \brief \b DLAORHR_COL_GETRFNP
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DLAORHR_COL_GETRFNP + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dlaorhr_col_getrfnp.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dlaorhr_col_getrfnp.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dlaorhr_col_getrfnp.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DLAORHR_COL_GETRFNP( M, N, A, LDA, D, INFO )
+*
+*       .. Scalar Arguments ..
+*       INTEGER            INFO, LDA, M, N
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   A( LDA, * ), D( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLAORHR_COL_GETRFNP computes the modified LU factorization without
+*> pivoting of a real general M-by-N matrix A. The factorization has
+*> the form:
+*>
+*>     A - S = L * U,
+*>
+*> where:
+*>    S is a m-by-n diagonal sign matrix with the diagonal D, so that
+*>    D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+*>    as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+*>    i-1 steps of Gaussian elimination. This means that the diagonal
+*>    element at each step of "modified" Gaussian elimination is
+*>    at least one in absolute value (so that division-by-zero not
+*>    not possible during the division by the diagonal element);
+*>
+*>    L is a M-by-N lower triangular matrix with unit diagonal elements
+*>    (lower trapezoidal if M > N);
+*>
+*>    and U is a M-by-N upper triangular matrix
+*>    (upper trapezoidal if M < N).
+*>
+*> This routine is an auxiliary routine used in the Householder
+*> reconstruction routine DORHR_COL. In DORHR_COL, this routine is
+*> applied to an M-by-N matrix A with orthonormal columns, where each
+*> element is bounded by one in absolute value. With the choice of
+*> the matrix S above, one can show that the diagonal element at each
+*> step of Gaussian elimination is the largest (in absolute value) in
+*> the column on or below the diagonal, so that no pivoting is required
+*> for numerical stability [1].
+*>
+*> For more details on the Householder reconstruction algorithm,
+*> including the modified LU factorization, see [1].
+*>
+*> This is the blocked right-looking version of the algorithm,
+*> calling Level 3 BLAS to update the submatrix. To factorize a block,
+*> this routine calls the recursive routine DLAORHR_COL_GETRFNP2.
+*>
+*> [1] "Reconstructing Householder vectors from tall-skinny QR",
+*>     G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+*>     E. Solomonik, J. Parallel Distrib. Comput.,
+*>     vol. 85, pp. 3-31, 2015.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix to be factored.
+*>          On exit, the factors L and U from the factorization
+*>          A-S=L*U; the unit diagonal elements of L are not stored.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is DOUBLE PRECISION array, dimension min(M,N)
+*>          The diagonal elements of the diagonal M-by-N sign matrix S,
+*>          D(i) = S(i,i), where 1 <= i <= min(M,N). The elements can
+*>          be only plus or minus one.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup doubleGEcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2019, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DLAORHR_COL_GETRFNP( M, N, A, LDA, D, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, LDA, M, N
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), D( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE
+      PARAMETER          ( ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            IINFO, J, JB, NB
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DGEMM, DLAORHR_COL_GETRFNP2, DTRSM, XERBLA
+*     ..
+*     .. External Functions ..
+      INTEGER            ILAENV
+      EXTERNAL           ILAENV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -4
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DLAORHR_COL_GETRFNP', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 )
+     $   RETURN
+*
+*     Determine the block size for this environment.
+*
+
+      NB = ILAENV( 1, 'DLAORHR_COL_GETRFNP', ' ', M, N, -1, -1 )
+
+      IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN
+*
+*        Use unblocked code.
+*
+         CALL DLAORHR_COL_GETRFNP2( M, N, A, LDA, D, INFO )
+      ELSE
+*
+*        Use blocked code.
+*
+         DO J = 1, MIN( M, N ), NB
+            JB = MIN( MIN( M, N )-J+1, NB )
+*
+*           Factor diagonal and subdiagonal blocks.
+*
+            CALL DLAORHR_COL_GETRFNP2( M-J+1, JB, A( J, J ), LDA,
+     $                                 D( J ), IINFO )
+*
+            IF( J+JB.LE.N ) THEN
+*
+*              Compute block row of U.
+*
+               CALL DTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB,
+     $                     N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ),
+     $                     LDA )
+               IF( J+JB.LE.M ) THEN
+*
+*                 Update trailing submatrix.
+*
+                  CALL DGEMM( 'No transpose', 'No transpose', M-J-JB+1,
+     $                        N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA,
+     $                        A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ),
+     $                        LDA )
+               END IF
+            END IF
+         END DO
+      END IF
+      RETURN
+*
+*     End of DLAORHR_COL_GETRFNP
+*
+      END
diff --git a/lapack-netlib/SRC/dlaorhr_col_getrfnp2.f b/lapack-netlib/SRC/dlaorhr_col_getrfnp2.f
new file mode 100644
index 000000000..f7781f2e5
--- /dev/null
+++ b/lapack-netlib/SRC/dlaorhr_col_getrfnp2.f
@@ -0,0 +1,305 @@
+*> \brief \b DLAORHR_COL_GETRFNP2
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DLAORHR_GETRF2NP + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dlaorhr_col_getrfnp2.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dlaorhr_col_getrfnp2.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dlaorhr_col_getrfnp2.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       RECURSIVE SUBROUTINE DLAORHR_COL_GETRFNP2( M, N, A, LDA, D, INFO )
+*
+*       .. Scalar Arguments ..
+*       INTEGER            INFO, LDA, M, N
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION   A( LDA, * ), D( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLAORHR_COL_GETRFNP2 computes the modified LU factorization without
+*> pivoting of a real general M-by-N matrix A. The factorization has
+*> the form:
+*>
+*>     A - S = L * U,
+*>
+*> where:
+*>    S is a m-by-n diagonal sign matrix with the diagonal D, so that
+*>    D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+*>    as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+*>    i-1 steps of Gaussian elimination. This means that the diagonal
+*>    element at each step of "modified" Gaussian elimination is at
+*>    least one in absolute value (so that division-by-zero not
+*>    possible during the division by the diagonal element);
+*>
+*>    L is a M-by-N lower triangular matrix with unit diagonal elements
+*>    (lower trapezoidal if M > N);
+*>
+*>    and U is a M-by-N upper triangular matrix
+*>    (upper trapezoidal if M < N).
+*>
+*> This routine is an auxiliary routine used in the Householder
+*> reconstruction routine DORHR_COL. In DORHR_COL, this routine is
+*> applied to an M-by-N matrix A with orthonormal columns, where each
+*> element is bounded by one in absolute value. With the choice of
+*> the matrix S above, one can show that the diagonal element at each
+*> step of Gaussian elimination is the largest (in absolute value) in
+*> the column on or below the diagonal, so that no pivoting is required
+*> for numerical stability [1].
+*>
+*> For more details on the Householder reconstruction algorithm,
+*> including the modified LU factorization, see [1].
+*>
+*> This is the recursive version of the LU factorization algorithm.
+*> Denote A - S by B. The algorithm divides the matrix B into four
+*> submatrices:
+*>
+*>        [  B11 | B12  ]  where B11 is n1 by n1,
+*>    B = [ -----|----- ]        B21 is (m-n1) by n1,
+*>        [  B21 | B22  ]        B12 is n1 by n2,
+*>                               B22 is (m-n1) by n2,
+*>                               with n1 = min(m,n)/2, n2 = n-n1.
+*>
+*>
+*> The subroutine calls itself to factor B11, solves for B21,
+*> solves for B12, updates B22, then calls itself to factor B22.
+*>
+*> For more details on the recursive LU algorithm, see [2].
+*>
+*> DLAORHR_COL_GETRFNP2 is called to factorize a block by the blocked
+*> routine DLAORHR_COL_GETRFNP, which uses blocked code calling
+*. Level 3 BLAS to update the submatrix. However, DLAORHR_COL_GETRFNP2
+*> is self-sufficient and can be used without DLAORHR_COL_GETRFNP.
+*>
+*> [1] "Reconstructing Householder vectors from tall-skinny QR",
+*>     G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+*>     E. Solomonik, J. Parallel Distrib. Comput.,
+*>     vol. 85, pp. 3-31, 2015.
+*>
+*> [2] "Recursion leads to automatic variable blocking for dense linear
+*>     algebra algorithms", F. Gustavson, IBM J. of Res. and Dev.,
+*>     vol. 41, no. 6, pp. 737-755, 1997.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix to be factored.
+*>          On exit, the factors L and U from the factorization
+*>          A-S=L*U; the unit diagonal elements of L are not stored.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is DOUBLE PRECISION array, dimension min(M,N)
+*>          The diagonal elements of the diagonal M-by-N sign matrix S,
+*>          D(i) = S(i,i), where 1 <= i <= min(M,N). The elements can
+*>          be only plus or minus one.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup doubleGEcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2019, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      RECURSIVE SUBROUTINE DLAORHR_COL_GETRFNP2( M, N, A, LDA, D, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, LDA, M, N
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), D( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE
+      PARAMETER          ( ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      DOUBLE PRECISION   SFMIN
+      INTEGER            I, IINFO, N1, N2
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION   DLAMCH
+      EXTERNAL           DLAMCH
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DGEMM, DSCAL, DTRSM, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DSIGN, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters
+*
+      INFO = 0
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -4
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DLAORHR_COL_GETRFNP2', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 )
+     $   RETURN
+
+      IF ( M.EQ.1 ) THEN
+*
+*        One row case, (also recursion termination case),
+*        use unblocked code
+*
+*        Transfer the sign
+*
+         D( 1 ) = -DSIGN( ONE, A( 1, 1 ) )
+*
+*        Construct the row of U
+*
+         A( 1, 1 ) = A( 1, 1 ) - D( 1 )
+*
+      ELSE IF( N.EQ.1 ) THEN
+*
+*        One column case, (also recursion termination case),
+*        use unblocked code
+*
+*        Transfer the sign
+*
+         D( 1 ) = -DSIGN( ONE, A( 1, 1 ) )
+*
+*        Construct the row of U
+*
+         A( 1, 1 ) = A( 1, 1 ) - D( 1 )
+*
+*        Scale the elements 2:M of the column
+*
+*        Determine machine safe minimum
+*
+         SFMIN = DLAMCH('S')
+*
+*        Construct the subdiagonal elements of L
+*
+         IF( ABS( A( 1, 1 ) ) .GE. SFMIN ) THEN
+            CALL DSCAL( M-1, ONE / A( 1, 1 ), A( 2, 1 ), 1 )
+         ELSE
+            DO I = 2, M
+               A( I, 1 ) = A( I, 1 ) / A( 1, 1 )
+            END DO
+         END IF
+*
+      ELSE
+*
+*        Divide the matrix B into four submatrices
+*
+         N1 = MIN( M, N ) / 2
+         N2 = N-N1
+
+*
+*        Factor B11, recursive call
+*
+         CALL DLAORHR_COL_GETRFNP2( N1, N1, A, LDA, D, IINFO )
+*
+*        Solve for B21
+*
+         CALL DTRSM( 'R', 'U', 'N', 'N', M-N1, N1, ONE, A, LDA,
+     $               A( N1+1, 1 ), LDA )
+*
+*        Solve for B12
+*
+         CALL DTRSM( 'L', 'L', 'N', 'U', N1, N2, ONE, A, LDA,
+     $               A( 1, N1+1 ), LDA )
+*
+*        Update B22, i.e. compute the Schur complement
+*        B22 := B22 - B21*B12
+*
+         CALL DGEMM( 'N', 'N', M-N1, N2, N1, -ONE, A( N1+1, 1 ), LDA,
+     $               A( 1, N1+1 ), LDA, ONE, A( N1+1, N1+1 ), LDA )
+*
+*        Factor B22, recursive call
+*
+         CALL DLAORHR_COL_GETRFNP2( M-N1, N2, A( N1+1, N1+1 ), LDA,
+     $                              D( N1+1 ), IINFO )
+*
+      END IF
+      RETURN
+*
+*     End of DLAORHR_COL_GETRFNP2
+*
+      END
diff --git a/lapack-netlib/SRC/dlaqps.f b/lapack-netlib/SRC/dlaqps.f
index 395d8e0b1..0009de951 100644
--- a/lapack-netlib/SRC/dlaqps.f
+++ b/lapack-netlib/SRC/dlaqps.f
@@ -127,7 +127,7 @@
 *> \param[in,out] AUXV
 *> \verbatim
 *>          AUXV is DOUBLE PRECISION array, dimension (NB)
-*>          Auxiliar vector.
+*>          Auxiliary vector.
 *> \endverbatim
 *>
 *> \param[in,out] F
diff --git a/lapack-netlib/SRC/dlaqr0.f b/lapack-netlib/SRC/dlaqr0.f
index 247d4ef30..f362c096c 100644
--- a/lapack-netlib/SRC/dlaqr0.f
+++ b/lapack-netlib/SRC/dlaqr0.f
@@ -67,7 +67,7 @@
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
-*>           The order of the matrix H.  N .GE. 0.
+*>           The order of the matrix H.  N >= 0.
 *> \endverbatim
 *>
 *> \param[in] ILO
@@ -79,12 +79,12 @@
 *> \verbatim
 *>          IHI is INTEGER
 *>           It is assumed that H is already upper triangular in rows
-*>           and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1,
+*>           and columns 1:ILO-1 and IHI+1:N and, if ILO > 1,
 *>           H(ILO,ILO-1) is zero. ILO and IHI are normally set by a
 *>           previous call to DGEBAL, and then passed to DGEHRD when the
 *>           matrix output by DGEBAL is reduced to Hessenberg form.
 *>           Otherwise, ILO and IHI should be set to 1 and N,
-*>           respectively.  If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+*>           respectively.  If N > 0, then 1 <= ILO <= IHI <= N.
 *>           If N = 0, then ILO = 1 and IHI = 0.
 *> \endverbatim
 *>
@@ -97,19 +97,19 @@
 *>           decomposition (the Schur form); 2-by-2 diagonal blocks
 *>           (corresponding to complex conjugate pairs of eigenvalues)
 *>           are returned in standard form, with H(i,i) = H(i+1,i+1)
-*>           and H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and WANTT is
+*>           and H(i+1,i)*H(i,i+1) < 0. If INFO = 0 and WANTT is
 *>           .FALSE., then the contents of H are unspecified on exit.
-*>           (The output value of H when INFO.GT.0 is given under the
+*>           (The output value of H when INFO > 0 is given under the
 *>           description of INFO below.)
 *>
-*>           This subroutine may explicitly set H(i,j) = 0 for i.GT.j and
+*>           This subroutine may explicitly set H(i,j) = 0 for i > j and
 *>           j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N.
 *> \endverbatim
 *>
 *> \param[in] LDH
 *> \verbatim
 *>          LDH is INTEGER
-*>           The leading dimension of the array H. LDH .GE. max(1,N).
+*>           The leading dimension of the array H. LDH >= max(1,N).
 *> \endverbatim
 *>
 *> \param[out] WR
@@ -125,7 +125,7 @@
 *>           and WI(ILO:IHI). If two eigenvalues are computed as a
 *>           complex conjugate pair, they are stored in consecutive
 *>           elements of WR and WI, say the i-th and (i+1)th, with
-*>           WI(i) .GT. 0 and WI(i+1) .LT. 0. If WANTT is .TRUE., then
+*>           WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., then
 *>           the eigenvalues are stored in the same order as on the
 *>           diagonal of the Schur form returned in H, with
 *>           WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2 diagonal
@@ -143,7 +143,7 @@
 *>          IHIZ is INTEGER
 *>           Specify the rows of Z to which transformations must be
 *>           applied if WANTZ is .TRUE..
-*>           1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N.
+*>           1 <= ILOZ <= ILO; IHI <= IHIZ <= N.
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -153,7 +153,7 @@
 *>           If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is
 *>           replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the
 *>           orthogonal Schur factor of H(ILO:IHI,ILO:IHI).
-*>           (The output value of Z when INFO.GT.0 is given under
+*>           (The output value of Z when INFO > 0 is given under
 *>           the description of INFO below.)
 *> \endverbatim
 *>
@@ -161,7 +161,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>           The leading dimension of the array Z.  if WANTZ is .TRUE.
-*>           then LDZ.GE.MAX(1,IHIZ).  Otherwize, LDZ.GE.1.
+*>           then LDZ >= MAX(1,IHIZ).  Otherwise, LDZ >= 1.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -174,7 +174,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>           The dimension of the array WORK.  LWORK .GE. max(1,N)
+*>           The dimension of the array WORK.  LWORK >= max(1,N)
 *>           is sufficient, but LWORK typically as large as 6*N may
 *>           be required for optimal performance.  A workspace query
 *>           to determine the optimal workspace size is recommended.
@@ -190,19 +190,19 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>             =  0:  successful exit
-*>           .GT. 0:  if INFO = i, DLAQR0 failed to compute all of
+*>             = 0:  successful exit
+*>             > 0:  if INFO = i, DLAQR0 failed to compute all of
 *>                the eigenvalues.  Elements 1:ilo-1 and i+1:n of WR
 *>                and WI contain those eigenvalues which have been
 *>                successfully computed.  (Failures are rare.)
 *>
-*>                If INFO .GT. 0 and WANT is .FALSE., then on exit,
+*>                If INFO > 0 and WANT is .FALSE., then on exit,
 *>                the remaining unconverged eigenvalues are the eigen-
 *>                values of the upper Hessenberg matrix rows and
 *>                columns ILO through INFO of the final, output
 *>                value of H.
 *>
-*>                If INFO .GT. 0 and WANTT is .TRUE., then on exit
+*>                If INFO > 0 and WANTT is .TRUE., then on exit
 *>
 *>           (*)  (initial value of H)*U  = U*(final value of H)
 *>
@@ -210,7 +210,7 @@
 *>                value of H is upper Hessenberg and quasi-triangular
 *>                in rows and columns INFO+1 through IHI.
 *>
-*>                If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+*>                If INFO > 0 and WANTZ is .TRUE., then on exit
 *>
 *>                  (final value of Z(ILO:IHI,ILOZ:IHIZ)
 *>                   =  (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U
@@ -218,7 +218,7 @@
 *>                where U is the orthogonal matrix in (*) (regard-
 *>                less of the value of WANTT.)
 *>
-*>                If INFO .GT. 0 and WANTZ is .FALSE., then Z is not
+*>                If INFO > 0 and WANTZ is .FALSE., then Z is not
 *>                accessed.
 *> \endverbatim
 *
@@ -678,7 +678,7 @@
                   END IF
                END IF
 *
-*              ==== Use up to NS of the the smallest magnatiude
+*              ==== Use up to NS of the the smallest magnitude
 *              .    shifts.  If there aren't NS shifts available,
 *              .    then use them all, possibly dropping one to
 *              .    make the number of shifts even. ====
diff --git a/lapack-netlib/SRC/dlaqr1.f b/lapack-netlib/SRC/dlaqr1.f
index 795b072ab..4ccf997e7 100644
--- a/lapack-netlib/SRC/dlaqr1.f
+++ b/lapack-netlib/SRC/dlaqr1.f
@@ -69,7 +69,7 @@
 *> \verbatim
 *>          LDH is INTEGER
 *>              The leading dimension of H as declared in
-*>              the calling procedure.  LDH.GE.N
+*>              the calling procedure.  LDH >= N
 *> \endverbatim
 *>
 *> \param[in] SR1
diff --git a/lapack-netlib/SRC/dlaqr2.f b/lapack-netlib/SRC/dlaqr2.f
index 431b3f123..01fdf3046 100644
--- a/lapack-netlib/SRC/dlaqr2.f
+++ b/lapack-netlib/SRC/dlaqr2.f
@@ -103,7 +103,7 @@
 *> \param[in] NW
 *> \verbatim
 *>          NW is INTEGER
-*>          Deflation window size.  1 .LE. NW .LE. (KBOT-KTOP+1).
+*>          Deflation window size.  1 <= NW <= (KBOT-KTOP+1).
 *> \endverbatim
 *>
 *> \param[in,out] H
@@ -121,7 +121,7 @@
 *> \verbatim
 *>          LDH is INTEGER
 *>          Leading dimension of H just as declared in the calling
-*>          subroutine.  N .LE. LDH
+*>          subroutine.  N <= LDH
 *> \endverbatim
 *>
 *> \param[in] ILOZ
@@ -133,7 +133,7 @@
 *> \verbatim
 *>          IHIZ is INTEGER
 *>          Specify the rows of Z to which transformations must be
-*>          applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N.
+*>          applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N.
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -149,7 +149,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>          The leading dimension of Z just as declared in the
-*>          calling subroutine.  1 .LE. LDZ.
+*>          calling subroutine.  1 <= LDZ.
 *> \endverbatim
 *>
 *> \param[out] NS
@@ -194,13 +194,13 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of V just as declared in the
-*>          calling subroutine.  NW .LE. LDV
+*>          calling subroutine.  NW <= LDV
 *> \endverbatim
 *>
 *> \param[in] NH
 *> \verbatim
 *>          NH is INTEGER
-*>          The number of columns of T.  NH.GE.NW.
+*>          The number of columns of T.  NH >= NW.
 *> \endverbatim
 *>
 *> \param[out] T
@@ -212,14 +212,14 @@
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of T just as declared in the
-*>          calling subroutine.  NW .LE. LDT
+*>          calling subroutine.  NW <= LDT
 *> \endverbatim
 *>
 *> \param[in] NV
 *> \verbatim
 *>          NV is INTEGER
 *>          The number of rows of work array WV available for
-*>          workspace.  NV.GE.NW.
+*>          workspace.  NV >= NW.
 *> \endverbatim
 *>
 *> \param[out] WV
@@ -231,7 +231,7 @@
 *> \verbatim
 *>          LDWV is INTEGER
 *>          The leading dimension of W just as declared in the
-*>          calling subroutine.  NW .LE. LDV
+*>          calling subroutine.  NW <= LDV
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/dlaqr3.f b/lapack-netlib/SRC/dlaqr3.f
index aa23617c3..1dbf55c9e 100644
--- a/lapack-netlib/SRC/dlaqr3.f
+++ b/lapack-netlib/SRC/dlaqr3.f
@@ -100,7 +100,7 @@
 *> \param[in] NW
 *> \verbatim
 *>          NW is INTEGER
-*>          Deflation window size.  1 .LE. NW .LE. (KBOT-KTOP+1).
+*>          Deflation window size.  1 <= NW <= (KBOT-KTOP+1).
 *> \endverbatim
 *>
 *> \param[in,out] H
@@ -118,7 +118,7 @@
 *> \verbatim
 *>          LDH is INTEGER
 *>          Leading dimension of H just as declared in the calling
-*>          subroutine.  N .LE. LDH
+*>          subroutine.  N <= LDH
 *> \endverbatim
 *>
 *> \param[in] ILOZ
@@ -130,7 +130,7 @@
 *> \verbatim
 *>          IHIZ is INTEGER
 *>          Specify the rows of Z to which transformations must be
-*>          applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N.
+*>          applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N.
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -146,7 +146,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>          The leading dimension of Z just as declared in the
-*>          calling subroutine.  1 .LE. LDZ.
+*>          calling subroutine.  1 <= LDZ.
 *> \endverbatim
 *>
 *> \param[out] NS
@@ -191,13 +191,13 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of V just as declared in the
-*>          calling subroutine.  NW .LE. LDV
+*>          calling subroutine.  NW <= LDV
 *> \endverbatim
 *>
 *> \param[in] NH
 *> \verbatim
 *>          NH is INTEGER
-*>          The number of columns of T.  NH.GE.NW.
+*>          The number of columns of T.  NH >= NW.
 *> \endverbatim
 *>
 *> \param[out] T
@@ -209,14 +209,14 @@
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of T just as declared in the
-*>          calling subroutine.  NW .LE. LDT
+*>          calling subroutine.  NW <= LDT
 *> \endverbatim
 *>
 *> \param[in] NV
 *> \verbatim
 *>          NV is INTEGER
 *>          The number of rows of work array WV available for
-*>          workspace.  NV.GE.NW.
+*>          workspace.  NV >= NW.
 *> \endverbatim
 *>
 *> \param[out] WV
@@ -228,7 +228,7 @@
 *> \verbatim
 *>          LDWV is INTEGER
 *>          The leading dimension of W just as declared in the
-*>          calling subroutine.  NW .LE. LDV
+*>          calling subroutine.  NW <= LDV
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/dlaqr4.f b/lapack-netlib/SRC/dlaqr4.f
index 89b9b7f20..454bf9608 100644
--- a/lapack-netlib/SRC/dlaqr4.f
+++ b/lapack-netlib/SRC/dlaqr4.f
@@ -74,7 +74,7 @@
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
-*>           The order of the matrix H.  N .GE. 0.
+*>           The order of the matrix H.  N >= 0.
 *> \endverbatim
 *>
 *> \param[in] ILO
@@ -86,12 +86,12 @@
 *> \verbatim
 *>          IHI is INTEGER
 *>           It is assumed that H is already upper triangular in rows
-*>           and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1,
+*>           and columns 1:ILO-1 and IHI+1:N and, if ILO > 1,
 *>           H(ILO,ILO-1) is zero. ILO and IHI are normally set by a
 *>           previous call to DGEBAL, and then passed to DGEHRD when the
 *>           matrix output by DGEBAL is reduced to Hessenberg form.
 *>           Otherwise, ILO and IHI should be set to 1 and N,
-*>           respectively.  If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+*>           respectively.  If N > 0, then 1 <= ILO <= IHI <= N.
 *>           If N = 0, then ILO = 1 and IHI = 0.
 *> \endverbatim
 *>
@@ -104,19 +104,19 @@
 *>           decomposition (the Schur form); 2-by-2 diagonal blocks
 *>           (corresponding to complex conjugate pairs of eigenvalues)
 *>           are returned in standard form, with H(i,i) = H(i+1,i+1)
-*>           and H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and WANTT is
+*>           and H(i+1,i)*H(i,i+1) < 0. If INFO = 0 and WANTT is
 *>           .FALSE., then the contents of H are unspecified on exit.
-*>           (The output value of H when INFO.GT.0 is given under the
+*>           (The output value of H when INFO > 0 is given under the
 *>           description of INFO below.)
 *>
-*>           This subroutine may explicitly set H(i,j) = 0 for i.GT.j and
+*>           This subroutine may explicitly set H(i,j) = 0 for i > j and
 *>           j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N.
 *> \endverbatim
 *>
 *> \param[in] LDH
 *> \verbatim
 *>          LDH is INTEGER
-*>           The leading dimension of the array H. LDH .GE. max(1,N).
+*>           The leading dimension of the array H. LDH >= max(1,N).
 *> \endverbatim
 *>
 *> \param[out] WR
@@ -132,7 +132,7 @@
 *>           and WI(ILO:IHI). If two eigenvalues are computed as a
 *>           complex conjugate pair, they are stored in consecutive
 *>           elements of WR and WI, say the i-th and (i+1)th, with
-*>           WI(i) .GT. 0 and WI(i+1) .LT. 0. If WANTT is .TRUE., then
+*>           WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., then
 *>           the eigenvalues are stored in the same order as on the
 *>           diagonal of the Schur form returned in H, with
 *>           WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2 diagonal
@@ -150,7 +150,7 @@
 *>          IHIZ is INTEGER
 *>           Specify the rows of Z to which transformations must be
 *>           applied if WANTZ is .TRUE..
-*>           1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N.
+*>           1 <= ILOZ <= ILO; IHI <= IHIZ <= N.
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -160,7 +160,7 @@
 *>           If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is
 *>           replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the
 *>           orthogonal Schur factor of H(ILO:IHI,ILO:IHI).
-*>           (The output value of Z when INFO.GT.0 is given under
+*>           (The output value of Z when INFO > 0 is given under
 *>           the description of INFO below.)
 *> \endverbatim
 *>
@@ -168,7 +168,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>           The leading dimension of the array Z.  if WANTZ is .TRUE.
-*>           then LDZ.GE.MAX(1,IHIZ).  Otherwize, LDZ.GE.1.
+*>           then LDZ >= MAX(1,IHIZ).  Otherwise, LDZ >= 1.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -181,7 +181,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>           The dimension of the array WORK.  LWORK .GE. max(1,N)
+*>           The dimension of the array WORK.  LWORK >= max(1,N)
 *>           is sufficient, but LWORK typically as large as 6*N may
 *>           be required for optimal performance.  A workspace query
 *>           to determine the optimal workspace size is recommended.
@@ -197,19 +197,19 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>             =  0:  successful exit
-*>           .GT. 0:  if INFO = i, DLAQR4 failed to compute all of
+*>             = 0:  successful exit
+*>             > 0:  if INFO = i, DLAQR4 failed to compute all of
 *>                the eigenvalues.  Elements 1:ilo-1 and i+1:n of WR
 *>                and WI contain those eigenvalues which have been
 *>                successfully computed.  (Failures are rare.)
 *>
-*>                If INFO .GT. 0 and WANT is .FALSE., then on exit,
+*>                If INFO > 0 and WANT is .FALSE., then on exit,
 *>                the remaining unconverged eigenvalues are the eigen-
 *>                values of the upper Hessenberg matrix rows and
 *>                columns ILO through INFO of the final, output
 *>                value of H.
 *>
-*>                If INFO .GT. 0 and WANTT is .TRUE., then on exit
+*>                If INFO > 0 and WANTT is .TRUE., then on exit
 *>
 *>           (*)  (initial value of H)*U  = U*(final value of H)
 *>
@@ -217,7 +217,7 @@
 *>                value of  H is upper Hessenberg and triangular in
 *>                rows and columns INFO+1 through IHI.
 *>
-*>                If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+*>                If INFO > 0 and WANTZ is .TRUE., then on exit
 *>
 *>                  (final value of Z(ILO:IHI,ILOZ:IHIZ)
 *>                   =  (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U
@@ -225,7 +225,7 @@
 *>                where U is the orthogonal matrix in (*) (regard-
 *>                less of the value of WANTT.)
 *>
-*>                If INFO .GT. 0 and WANTZ is .FALSE., then Z is not
+*>                If INFO > 0 and WANTZ is .FALSE., then Z is not
 *>                accessed.
 *> \endverbatim
 *
@@ -677,7 +677,7 @@
                   END IF
                END IF
 *
-*              ==== Use up to NS of the the smallest magnatiude
+*              ==== Use up to NS of the the smallest magnitude
 *              .    shifts.  If there aren't NS shifts available,
 *              .    then use them all, possibly dropping one to
 *              .    make the number of shifts even. ====
diff --git a/lapack-netlib/SRC/dlaqr5.f b/lapack-netlib/SRC/dlaqr5.f
index 5cc4eda1a..f58db9c89 100644
--- a/lapack-netlib/SRC/dlaqr5.f
+++ b/lapack-netlib/SRC/dlaqr5.f
@@ -133,7 +133,7 @@
 *> \verbatim
 *>          LDH is INTEGER
 *>             LDH is the leading dimension of H just as declared in the
-*>             calling procedure.  LDH.GE.MAX(1,N).
+*>             calling procedure.  LDH >= MAX(1,N).
 *> \endverbatim
 *>
 *> \param[in] ILOZ
@@ -145,7 +145,7 @@
 *> \verbatim
 *>          IHIZ is INTEGER
 *>             Specify the rows of Z to which transformations must be
-*>             applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N
+*>             applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -161,7 +161,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>             LDA is the leading dimension of Z just as declared in
-*>             the calling procedure. LDZ.GE.N.
+*>             the calling procedure. LDZ >= N.
 *> \endverbatim
 *>
 *> \param[out] V
@@ -173,7 +173,7 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>             LDV is the leading dimension of V as declared in the
-*>             calling procedure.  LDV.GE.3.
+*>             calling procedure.  LDV >= 3.
 *> \endverbatim
 *>
 *> \param[out] U
@@ -185,33 +185,14 @@
 *> \verbatim
 *>          LDU is INTEGER
 *>             LDU is the leading dimension of U just as declared in the
-*>             in the calling subroutine.  LDU.GE.3*NSHFTS-3.
-*> \endverbatim
-*>
-*> \param[in] NH
-*> \verbatim
-*>          NH is INTEGER
-*>             NH is the number of columns in array WH available for
-*>             workspace. NH.GE.1.
-*> \endverbatim
-*>
-*> \param[out] WH
-*> \verbatim
-*>          WH is DOUBLE PRECISION array, dimension (LDWH,NH)
-*> \endverbatim
-*>
-*> \param[in] LDWH
-*> \verbatim
-*>          LDWH is INTEGER
-*>             Leading dimension of WH just as declared in the
-*>             calling procedure.  LDWH.GE.3*NSHFTS-3.
+*>             in the calling subroutine.  LDU >= 3*NSHFTS-3.
 *> \endverbatim
 *>
 *> \param[in] NV
 *> \verbatim
 *>          NV is INTEGER
 *>             NV is the number of rows in WV agailable for workspace.
-*>             NV.GE.1.
+*>             NV >= 1.
 *> \endverbatim
 *>
 *> \param[out] WV
@@ -223,9 +204,28 @@
 *> \verbatim
 *>          LDWV is INTEGER
 *>             LDWV is the leading dimension of WV as declared in the
-*>             in the calling subroutine.  LDWV.GE.NV.
+*>             in the calling subroutine.  LDWV >= NV.
 *> \endverbatim
 *
+*> \param[in] NH
+*> \verbatim
+*>          NH is INTEGER
+*>             NH is the number of columns in array WH available for
+*>             workspace. NH >= 1.
+*> \endverbatim
+*>
+*> \param[out] WH
+*> \verbatim
+*>          WH is DOUBLE PRECISION array, dimension (LDWH,NH)
+*> \endverbatim
+*>
+*> \param[in] LDWH
+*> \verbatim
+*>          LDWH is INTEGER
+*>             Leading dimension of WH just as declared in the
+*>             calling procedure.  LDWH >= 3*NSHFTS-3.
+*> \endverbatim
+*>
 *  Authors:
 *  ========
 *
diff --git a/lapack-netlib/SRC/dlarfb.f b/lapack-netlib/SRC/dlarfb.f
index 5b2cc2ba8..e63641213 100644
--- a/lapack-netlib/SRC/dlarfb.f
+++ b/lapack-netlib/SRC/dlarfb.f
@@ -92,6 +92,8 @@
 *>          K is INTEGER
 *>          The order of the matrix T (= the number of elementary
 *>          reflectors whose product defines the block reflector).
+*>          If SIDE = 'L', M >= K >= 0;
+*>          if SIDE = 'R', N >= K >= 0.
 *> \endverbatim
 *>
 *> \param[in] V
diff --git a/lapack-netlib/SRC/dlarfx.f b/lapack-netlib/SRC/dlarfx.f
index 260d367d4..a9e4496f9 100644
--- a/lapack-netlib/SRC/dlarfx.f
+++ b/lapack-netlib/SRC/dlarfx.f
@@ -94,7 +94,7 @@
 *> \param[in] LDC
 *> \verbatim
 *>          LDC is INTEGER
-*>          The leading dimension of the array C. LDA >= (1,M).
+*>          The leading dimension of the array C. LDC >= (1,M).
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/dlarfy.f b/lapack-netlib/SRC/dlarfy.f
index a0b0ebb31..3000b38bc 100644
--- a/lapack-netlib/SRC/dlarfy.f
+++ b/lapack-netlib/SRC/dlarfy.f
@@ -103,7 +103,7 @@
 *
 *> \date December 2016
 *
-*> \ingroup double_eig
+*> \ingroup doubleOTHERauxiliary
 *
 *  =====================================================================
       SUBROUTINE DLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
diff --git a/lapack-netlib/SRC/dlarrb.f b/lapack-netlib/SRC/dlarrb.f
index 2b6389e25..ddf3888b9 100644
--- a/lapack-netlib/SRC/dlarrb.f
+++ b/lapack-netlib/SRC/dlarrb.f
@@ -91,7 +91,7 @@
 *>          RTOL2 is DOUBLE PRECISION
 *>          Tolerance for the convergence of the bisection intervals.
 *>          An interval [LEFT,RIGHT] has converged if
-*>          RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
+*>          RIGHT-LEFT < MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
 *>          where GAP is the (estimated) distance to the nearest
 *>          eigenvalue.
 *> \endverbatim
@@ -117,7 +117,7 @@
 *>          WGAP is DOUBLE PRECISION array, dimension (N-1)
 *>          On input, the (estimated) gaps between consecutive
 *>          eigenvalues of L D L^T, i.e., WGAP(I-OFFSET) is the gap between
-*>          eigenvalues I and I+1. Note that if IFIRST.EQ.ILAST
+*>          eigenvalues I and I+1. Note that if IFIRST = ILAST
 *>          then WGAP(IFIRST-OFFSET) must be set to ZERO.
 *>          On output, these gaps are refined.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dlarre.f b/lapack-netlib/SRC/dlarre.f
index 0613efbc3..ce55442e2 100644
--- a/lapack-netlib/SRC/dlarre.f
+++ b/lapack-netlib/SRC/dlarre.f
@@ -150,7 +150,7 @@
 *>          RTOL2 is DOUBLE PRECISION
 *>           Parameters for bisection.
 *>           An interval [LEFT,RIGHT] has converged if
-*>           RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
+*>           RIGHT-LEFT < MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
 *> \endverbatim
 *>
 *> \param[in] SPLTOL
diff --git a/lapack-netlib/SRC/dlarrj.f b/lapack-netlib/SRC/dlarrj.f
index 097ba9f77..a4bfb210c 100644
--- a/lapack-netlib/SRC/dlarrj.f
+++ b/lapack-netlib/SRC/dlarrj.f
@@ -85,7 +85,7 @@
 *>          RTOL is DOUBLE PRECISION
 *>          Tolerance for the convergence of the bisection intervals.
 *>          An interval [LEFT,RIGHT] has converged if
-*>          RIGHT-LEFT.LT.RTOL*MAX(|LEFT|,|RIGHT|).
+*>          RIGHT-LEFT < RTOL*MAX(|LEFT|,|RIGHT|).
 *> \endverbatim
 *>
 *> \param[in] OFFSET
diff --git a/lapack-netlib/SRC/dlarrv.f b/lapack-netlib/SRC/dlarrv.f
index cace17c0e..4a59a2bbf 100644
--- a/lapack-netlib/SRC/dlarrv.f
+++ b/lapack-netlib/SRC/dlarrv.f
@@ -149,7 +149,7 @@
 *>          RTOL2 is DOUBLE PRECISION
 *>           Parameters for bisection.
 *>           An interval [LEFT,RIGHT] has converged if
-*>           RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
+*>           RIGHT-LEFT < MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
 *> \endverbatim
 *>
 *> \param[in,out] W
diff --git a/lapack-netlib/SRC/dlasd7.f b/lapack-netlib/SRC/dlasd7.f
index e0ddedeb5..66f665cf8 100644
--- a/lapack-netlib/SRC/dlasd7.f
+++ b/lapack-netlib/SRC/dlasd7.f
@@ -400,7 +400,7 @@
          VL( I ) = VLW( IDXI )
    50 CONTINUE
 *
-*     Calculate the allowable deflation tolerence
+*     Calculate the allowable deflation tolerance
 *
       EPS = DLAMCH( 'Epsilon' )
       TOL = MAX( ABS( ALPHA ), ABS( BETA ) )
diff --git a/lapack-netlib/SRC/dlasr.f b/lapack-netlib/SRC/dlasr.f
index 6059c6293..f707970e4 100644
--- a/lapack-netlib/SRC/dlasr.f
+++ b/lapack-netlib/SRC/dlasr.f
@@ -175,7 +175,7 @@
 *> \verbatim
 *>          A is DOUBLE PRECISION array, dimension (LDA,N)
 *>          The M-by-N matrix A.  On exit, A is overwritten by P*A if
-*>          SIDE = 'R' or by A*P**T if SIDE = 'L'.
+*>          SIDE = 'L' or by A*P**T if SIDE = 'R'.
 *> \endverbatim
 *>
 *> \param[in] LDA
diff --git a/lapack-netlib/SRC/dlassq.f b/lapack-netlib/SRC/dlassq.f
index 885395e3c..5922360f9 100644
--- a/lapack-netlib/SRC/dlassq.f
+++ b/lapack-netlib/SRC/dlassq.f
@@ -60,7 +60,7 @@
 *>
 *> \param[in] X
 *> \verbatim
-*>          X is DOUBLE PRECISION array, dimension (N)
+*>          X is DOUBLE PRECISION array, dimension (1+(N-1)*INCX)
 *>          The vector for which a scaled sum of squares is computed.
 *>             x( i )  = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dlaswlq.f b/lapack-netlib/SRC/dlaswlq.f
index 6e4ca20fd..619a1f1a2 100644
--- a/lapack-netlib/SRC/dlaswlq.f
+++ b/lapack-netlib/SRC/dlaswlq.f
@@ -1,3 +1,4 @@
+*> \brief \b DLASWLQ
 *
 *  Definition:
 *  ===========
@@ -18,9 +19,20 @@
 *>
 *> \verbatim
 *>
-*>          DLASWLQ computes a blocked Short-Wide LQ factorization of a
-*>          M-by-N matrix A, where N >= M:
-*>          A = L * Q
+*> DLASWLQ computes a blocked Tall-Skinny LQ factorization of
+*> a real M-by-N matrix A for M <= N:
+*>
+*>    A = ( L 0 ) *  Q,
+*>
+*> where:
+*>
+*>    Q is a n-by-N orthogonal matrix, stored on exit in an implicit
+*>    form in the elements above the digonal of the array A and in
+*>    the elemenst of the array T;
+*>    L is an lower-triangular M-by-M matrix stored on exit in
+*>    the elements on and below the diagonal of the array A.
+*>    0 is a M-by-(N-M) zero matrix, if M < N, and is not stored.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -150,7 +162,7 @@
       SUBROUTINE DLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
      $                  INFO)
 *
-*  -- LAPACK computational routine (version 3.7.1) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *     June 2017
diff --git a/lapack-netlib/SRC/dlasyf_aa.f b/lapack-netlib/SRC/dlasyf_aa.f
index 6b75e46e0..793537e04 100644
--- a/lapack-netlib/SRC/dlasyf_aa.f
+++ b/lapack-netlib/SRC/dlasyf_aa.f
@@ -284,8 +284,9 @@
 *
 *              Swap A(I1, I2+1:M) with A(I2, I2+1:M)
 *
-               CALL DSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA,
-     $                           A( J1+I2-1, I2+1 ), LDA )
+               IF( I2.LT.M )
+     $            CALL DSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA,
+     $                              A( J1+I2-1, I2+1 ), LDA )
 *
 *              Swap A(I1, I1) with A(I2,I2)
 *
@@ -325,13 +326,15 @@
 *           Compute L(J+2, J+1) = WORK( 3:M ) / T(J, J+1),
 *            where A(J, J+1) = T(J, J+1) and A(J+2:M, J) = L(J+2:M, J+1)
 *
-            IF( A( K, J+1 ).NE.ZERO ) THEN
-               ALPHA = ONE / A( K, J+1 )
-               CALL DCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA )
-               CALL DSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA )
-            ELSE
-               CALL DLASET( 'Full', 1, M-J-1, ZERO, ZERO,
-     $                      A( K, J+2 ), LDA)
+            IF( J.LT.(M-1) ) THEN
+               IF( A( K, J+1 ).NE.ZERO ) THEN
+                  ALPHA = ONE / A( K, J+1 )
+                  CALL DCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA )
+                  CALL DSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA )
+               ELSE
+                  CALL DLASET( 'Full', 1, M-J-1, ZERO, ZERO,
+     $                         A( K, J+2 ), LDA)
+               END IF
             END IF
          END IF
          J = J + 1
@@ -432,8 +435,9 @@
 *
 *              Swap A(I2+1:M, I1) with A(I2+1:M, I2)
 *
-               CALL DSWAP( M-I2, A( I2+1, J1+I1-1 ), 1,
-     $                           A( I2+1, J1+I2-1 ), 1 )
+               IF( I2.LT.M )
+     $            CALL DSWAP( M-I2, A( I2+1, J1+I1-1 ), 1,
+     $                              A( I2+1, J1+I2-1 ), 1 )
 *
 *              Swap A(I1, I1) with A(I2, I2)
 *
@@ -473,13 +477,15 @@
 *           Compute L(J+2, J+1) = WORK( 3:M ) / T(J, J+1),
 *            where A(J, J+1) = T(J, J+1) and A(J+2:M, J) = L(J+2:M, J+1)
 *
-            IF( A( J+1, K ).NE.ZERO ) THEN
-               ALPHA = ONE / A( J+1, K )
-               CALL DCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 )
-               CALL DSCAL( M-J-1, ALPHA, A( J+2, K ), 1 )
-            ELSE
-               CALL DLASET( 'Full', M-J-1, 1, ZERO, ZERO,
-     $                      A( J+2, K ), LDA )
+            IF( J.LT.(M-1) ) THEN
+               IF( A( J+1, K ).NE.ZERO ) THEN
+                  ALPHA = ONE / A( J+1, K )
+                  CALL DCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 )
+                  CALL DSCAL( M-J-1, ALPHA, A( J+2, K ), 1 )
+               ELSE
+                  CALL DLASET( 'Full', M-J-1, 1, ZERO, ZERO,
+     $                         A( J+2, K ), LDA )
+               END IF
             END IF
          END IF
          J = J + 1
diff --git a/lapack-netlib/SRC/dlasyf_rk.f b/lapack-netlib/SRC/dlasyf_rk.f
index 209b4c89d..d581eeedc 100644
--- a/lapack-netlib/SRC/dlasyf_rk.f
+++ b/lapack-netlib/SRC/dlasyf_rk.f
@@ -321,7 +321,7 @@
 *        of A and working backwards, and compute the matrix W = U12*D
 *        for use in updating A11
 *
-*        Initilize the first entry of array E, where superdiagonal
+*        Initialize the first entry of array E, where superdiagonal
 *        elements of D are stored
 *
          E( 1 ) = ZERO
@@ -649,7 +649,7 @@
 *        of A and working forwards, and compute the matrix W = L21*D
 *        for use in updating A22
 *
-*        Initilize the unused last entry of the subdiagonal array E.
+*        Initialize the unused last entry of the subdiagonal array E.
 *
          E( N ) = ZERO
 *
diff --git a/lapack-netlib/SRC/dlasyf_rook.f b/lapack-netlib/SRC/dlasyf_rook.f
index 49ee7a6c9..557032104 100644
--- a/lapack-netlib/SRC/dlasyf_rook.f
+++ b/lapack-netlib/SRC/dlasyf_rook.f
@@ -21,7 +21,7 @@
 *       SUBROUTINE DLASYF_ROOK( UPLO, N, NB, KB, A, LDA, IPIV, W, LDW, INFO )
 *
 *       .. Scalar Arguments ..
-*       CHARADLATER          UPLO
+*       CHARACTER          UPLO
 *       INTEGER            INFO, KB, LDA, LDW, N, NB
 *       ..
 *       .. Array Arguments ..
diff --git a/lapack-netlib/SRC/dlatdf.f b/lapack-netlib/SRC/dlatdf.f
index fd05059b3..8001e0830 100644
--- a/lapack-netlib/SRC/dlatdf.f
+++ b/lapack-netlib/SRC/dlatdf.f
@@ -85,7 +85,7 @@
 *>          RHS is DOUBLE PRECISION array, dimension (N)
 *>          On entry, RHS contains contributions from other subsystems.
 *>          On exit, RHS contains the solution of the subsystem with
-*>          entries acoording to the value of IJOB (see above).
+*>          entries according to the value of IJOB (see above).
 *> \endverbatim
 *>
 *> \param[in,out] RDSUM
@@ -260,7 +260,7 @@
 *
 *        Solve for U-part, look-ahead for RHS(N) = +-1. This is not done
 *        in BSOLVE and will hopefully give us a better estimate because
-*        any ill-conditioning of the original matrix is transfered to U
+*        any ill-conditioning of the original matrix is transferred to U
 *        and not to L. U(N, N) is an approximation to sigma_min(LU).
 *
          CALL DCOPY( N-1, RHS, 1, XP, 1 )
diff --git a/lapack-netlib/SRC/dlatsqr.f b/lapack-netlib/SRC/dlatsqr.f
index 1ce7c4de0..598d2938e 100644
--- a/lapack-netlib/SRC/dlatsqr.f
+++ b/lapack-netlib/SRC/dlatsqr.f
@@ -1,3 +1,4 @@
+*> \brief \b DLATSQR
 *
 *  Definition:
 *  ===========
@@ -19,8 +20,22 @@
 *> \verbatim
 *>
 *> DLATSQR computes a blocked Tall-Skinny QR factorization of
-*> an M-by-N matrix A, where M >= N:
-*> A = Q * R .
+*> a real M-by-N matrix A for M >= N:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a M-by-M orthogonal matrix, stored on exit in an implicit
+*>    form in the elements below the digonal of the array A and in
+*>    the elemenst of the array T;
+*>
+*>    R is an upper-triangular N-by-N matrix, stored on exit in
+*>    the elements on and above the diagonal of the array A.
+*>
+*>    0 is a (M-N)-by-N zero matrix, and is not stored.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -149,10 +164,10 @@
       SUBROUTINE DLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK,
      $                    LWORK, INFO)
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER           INFO, LDA, M, N, MB, NB, LDT, LWORK
diff --git a/lapack-netlib/SRC/dorgtsqr.f b/lapack-netlib/SRC/dorgtsqr.f
new file mode 100644
index 000000000..85b05b6b5
--- /dev/null
+++ b/lapack-netlib/SRC/dorgtsqr.f
@@ -0,0 +1,306 @@
+*> \brief \b DORGTSQR
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DORGTSQR + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dorgtsqr.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dorgtsqr.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dorgtsqr.f">
+*> [TXT]</a>
+*>
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DORGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
+*      $                     INFO )
+*
+*       .. Scalar Arguments ..
+*       INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION  A( LDA, * ), T( LDT, * ), WORK( * )
+*       ..
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DORGTSQR generates an M-by-N real matrix Q_out with orthonormal columns,
+*> which are the first N columns of a product of real orthogonal
+*> matrices of order M which are returned by DLATSQR
+*>
+*>      Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+*>
+*> See the documentation for DLATSQR.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB
+*> \verbatim
+*>          MB is INTEGER
+*>          The row block size used by DLATSQR to return
+*>          arrays A and T. MB > N.
+*>          (Note that if MB > M, then M is used instead of MB
+*>          as the row block size).
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          The column block size used by DLATSQR to return
+*>          arrays A and T. NB >= 1.
+*>          (Note that if NB > N, then N is used instead of NB
+*>          as the column block size).
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>
+*>          On entry:
+*>
+*>             The elements on and above the diagonal are not accessed.
+*>             The elements below the diagonal represent the unit
+*>             lower-trapezoidal blocked matrix V computed by DLATSQR
+*>             that defines the input matrices Q_in(k) (ones on the
+*>             diagonal are not stored) (same format as the output A
+*>             below the diagonal in DLATSQR).
+*>
+*>          On exit:
+*>
+*>             The array A contains an M-by-N orthonormal matrix Q_out,
+*>             i.e the columns of A are orthogonal unit vectors.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is DOUBLE PRECISION array,
+*>          dimension (LDT, N * NIRB)
+*>          where NIRB = Number_of_input_row_blocks
+*>                     = MAX( 1, CEIL((M-N)/(MB-N)) )
+*>          Let NICB = Number_of_input_col_blocks
+*>                   = CEIL(N/NB)
+*>
+*>          The upper-triangular block reflectors used to define the
+*>          input matrices Q_in(k), k=(1:NIRB*NICB). The block
+*>          reflectors are stored in compact form in NIRB block
+*>          reflector sequences. Each of NIRB block reflector sequences
+*>          is stored in a larger NB-by-N column block of T and consists
+*>          of NICB smaller NB-by-NB upper-triangular column blocks.
+*>          (same format as the output T in DLATSQR).
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T.
+*>          LDT >= max(1,min(NB1,N)).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          (workspace) DOUBLE PRECISION array, dimension (MAX(2,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          The dimension of the array WORK.  LWORK >= (M+NB)*N.
+*>          If LWORK = -1, then a workspace query is assumed.
+*>          The routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup doubleOTHERcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2019, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DORGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION  A( LDA, * ), T( LDT, * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE, ZERO
+      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY
+      INTEGER            IINFO, LDC, LWORKOPT, LC, LW, NBLOCAL, J
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DCOPY, DLAMTSQR, DLASET, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DBLE, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters
+*
+      LQUERY  = LWORK.EQ.-1
+      INFO = 0
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
+         INFO = -2
+      ELSE IF( MB.LE.N ) THEN
+         INFO = -3
+      ELSE IF( NB.LT.1 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -6
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
+         INFO = -8
+      ELSE
+*
+*        Test the input LWORK for the dimension of the array WORK.
+*        This workspace is used to store array C(LDC, N) and WORK(LWORK)
+*        in the call to DLAMTSQR. See the documentation for DLAMTSQR.
+*
+         IF( LWORK.LT.2 .AND. (.NOT.LQUERY) ) THEN
+            INFO = -10
+         ELSE
+*
+*           Set block size for column blocks
+*
+            NBLOCAL = MIN( NB, N )
+*
+*           LWORK = -1, then set the size for the array C(LDC,N)
+*           in DLAMTSQR call and set the optimal size of the work array
+*           WORK(LWORK) in DLAMTSQR call.
+*
+            LDC = M
+            LC = LDC*N
+            LW = N * NBLOCAL
+*
+            LWORKOPT = LC+LW
+*
+            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
+               INFO = -10
+            END IF
+         END IF
+*
+      END IF
+*
+*     Handle error in the input parameters and return workspace query.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DORGTSQR', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+         WORK( 1 ) = DBLE( LWORKOPT )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 ) THEN
+         WORK( 1 ) = DBLE( LWORKOPT )
+         RETURN
+      END IF
+*
+*     (1) Form explicitly the tall-skinny M-by-N left submatrix Q1_in
+*     of M-by-M orthogonal matrix Q_in, which is implicitly stored in
+*     the subdiagonal part of input array A and in the input array T.
+*     Perform by the following operation using the routine DLAMTSQR.
+*
+*         Q1_in = Q_in * ( I ), where I is a N-by-N identity matrix,
+*                        ( 0 )        0 is a (M-N)-by-N zero matrix.
+*
+*     (1a) Form M-by-N matrix in the array WORK(1:LDC*N) with ones
+*     on the diagonal and zeros elsewhere.
+*
+      CALL DLASET( 'F', M, N, ZERO, ONE, WORK, LDC )
+*
+*     (1b)  On input, WORK(1:LDC*N) stores ( I );
+*                                          ( 0 )
+*
+*           On output, WORK(1:LDC*N) stores Q1_in.
+*
+      CALL DLAMTSQR( 'L', 'N', M, N, N, MB, NBLOCAL, A, LDA, T, LDT,
+     $               WORK, LDC, WORK( LC+1 ), LW, IINFO )
+*
+*     (2) Copy the result from the part of the work array (1:M,1:N)
+*     with the leading dimension LDC that starts at WORK(1) into
+*     the output array A(1:M,1:N) column-by-column.
+*
+      DO J = 1, N
+         CALL DCOPY( M, WORK( (J-1)*LDC + 1 ), 1, A( 1, J ), 1 )
+      END DO
+*
+      WORK( 1 ) = DBLE( LWORKOPT )
+      RETURN
+*
+*     End of DORGTSQR
+*
+      END
\ No newline at end of file
diff --git a/lapack-netlib/SRC/dorhr_col.f b/lapack-netlib/SRC/dorhr_col.f
new file mode 100644
index 000000000..b5a65973d
--- /dev/null
+++ b/lapack-netlib/SRC/dorhr_col.f
@@ -0,0 +1,440 @@
+*> \brief \b DORHR_COL
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DORHR_COL + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dorhr_col.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dorhr_col.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dorhr_col.f">
+*> [TXT]</a>
+*>
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DORHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO )
+*
+*       .. Scalar Arguments ..
+*       INTEGER           INFO, LDA, LDT, M, N, NB
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION  A( LDA, * ), D( * ), T( LDT, * )
+*       ..
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*>  DORHR_COL takes an M-by-N real matrix Q_in with orthonormal columns
+*>  as input, stored in A, and performs Householder Reconstruction (HR),
+*>  i.e. reconstructs Householder vectors V(i) implicitly representing
+*>  another M-by-N matrix Q_out, with the property that Q_in = Q_out*S,
+*>  where S is an N-by-N diagonal matrix with diagonal entries
+*>  equal to +1 or -1. The Householder vectors (columns V(i) of V) are
+*>  stored in A on output, and the diagonal entries of S are stored in D.
+*>  Block reflectors are also returned in T
+*>  (same output format as DGEQRT).
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          The column block size to be used in the reconstruction
+*>          of Householder column vector blocks in the array A and
+*>          corresponding block reflectors in the array T. NB >= 1.
+*>          (Note that if NB > N, then N is used instead of NB
+*>          as the column block size.)
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>
+*>          On entry:
+*>
+*>             The array A contains an M-by-N orthonormal matrix Q_in,
+*>             i.e the columns of A are orthogonal unit vectors.
+*>
+*>          On exit:
+*>
+*>             The elements below the diagonal of A represent the unit
+*>             lower-trapezoidal matrix V of Householder column vectors
+*>             V(i). The unit diagonal entries of V are not stored
+*>             (same format as the output below the diagonal in A from
+*>             DGEQRT). The matrix T and the matrix V stored on output
+*>             in A implicitly define Q_out.
+*>
+*>             The elements above the diagonal contain the factor U
+*>             of the "modified" LU-decomposition:
+*>                Q_in - ( S ) = V * U
+*>                       ( 0 )
+*>             where 0 is a (M-N)-by-(M-N) zero matrix.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] T
+*> \verbatim
+*>          T is DOUBLE PRECISION array,
+*>          dimension (LDT, N)
+*>
+*>          Let NOCB = Number_of_output_col_blocks
+*>                   = CEIL(N/NB)
+*>
+*>          On exit, T(1:NB, 1:N) contains NOCB upper-triangular
+*>          block reflectors used to define Q_out stored in compact
+*>          form as a sequence of upper-triangular NB-by-NB column
+*>          blocks (same format as the output T in DGEQRT).
+*>          The matrix T and the matrix V stored on output in A
+*>          implicitly define Q_out. NOTE: The lower triangles
+*>          below the upper-triangular blcoks will be filled with
+*>          zeros. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T.
+*>          LDT >= max(1,min(NB,N)).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is DOUBLE PRECISION array, dimension min(M,N).
+*>          The elements can be only plus or minus one.
+*>
+*>          D(i) is constructed as D(i) = -SIGN(Q_in_i(i,i)), where
+*>          1 <= i <= min(M,N), and Q_in_i is Q_in after performing
+*>          i-1 steps of “modified” Gaussian elimination.
+*>          See Further Details.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*> The computed M-by-M orthogonal factor Q_out is defined implicitly as
+*> a product of orthogonal matrices Q_out(i). Each Q_out(i) is stored in
+*> the compact WY-representation format in the corresponding blocks of
+*> matrices V (stored in A) and T.
+*>
+*> The M-by-N unit lower-trapezoidal matrix V stored in the M-by-N
+*> matrix A contains the column vectors V(i) in NB-size column
+*> blocks VB(j). For example, VB(1) contains the columns
+*> V(1), V(2), ... V(NB). NOTE: The unit entries on
+*> the diagonal of Y are not stored in A.
+*>
+*> The number of column blocks is
+*>
+*>     NOCB = Number_of_output_col_blocks = CEIL(N/NB)
+*>
+*> where each block is of order NB except for the last block, which
+*> is of order LAST_NB = N - (NOCB-1)*NB.
+*>
+*> For example, if M=6,  N=5 and NB=2, the matrix V is
+*>
+*>
+*>     V = (    VB(1),   VB(2), VB(3) ) =
+*>
+*>       = (   1                      )
+*>         ( v21    1                 )
+*>         ( v31  v32    1            )
+*>         ( v41  v42  v43   1        )
+*>         ( v51  v52  v53  v54    1  )
+*>         ( v61  v62  v63  v54   v65 )
+*>
+*>
+*> For each of the column blocks VB(i), an upper-triangular block
+*> reflector TB(i) is computed. These blocks are stored as
+*> a sequence of upper-triangular column blocks in the NB-by-N
+*> matrix T. The size of each TB(i) block is NB-by-NB, except
+*> for the last block, whose size is LAST_NB-by-LAST_NB.
+*>
+*> For example, if M=6,  N=5 and NB=2, the matrix T is
+*>
+*>     T  = (    TB(1),    TB(2), TB(3) ) =
+*>
+*>        = ( t11  t12  t13  t14   t15  )
+*>          (      t22       t24        )
+*>
+*>
+*> The M-by-M factor Q_out is given as a product of NOCB
+*> orthogonal M-by-M matrices Q_out(i).
+*>
+*>     Q_out = Q_out(1) * Q_out(2) * ... * Q_out(NOCB),
+*>
+*> where each matrix Q_out(i) is given by the WY-representation
+*> using corresponding blocks from the matrices V and T:
+*>
+*>     Q_out(i) = I - VB(i) * TB(i) * (VB(i))**T,
+*>
+*> where I is the identity matrix. Here is the formula with matrix
+*> dimensions:
+*>
+*>  Q(i){M-by-M} = I{M-by-M} -
+*>    VB(i){M-by-INB} * TB(i){INB-by-INB} * (VB(i))**T {INB-by-M},
+*>
+*> where INB = NB, except for the last block NOCB
+*> for which INB=LAST_NB.
+*>
+*> =====
+*> NOTE:
+*> =====
+*>
+*> If Q_in is the result of doing a QR factorization
+*> B = Q_in * R_in, then:
+*>
+*> B = (Q_out*S) * R_in = Q_out * (S * R_in) = O_out * R_out.
+*>
+*> So if one wants to interpret Q_out as the result
+*> of the QR factorization of B, then corresponding R_out
+*> should be obtained by R_out = S * R_in, i.e. some rows of R_in
+*> should be multiplied by -1.
+*>
+*> For the details of the algorithm, see [1].
+*>
+*> [1] "Reconstructing Householder vectors from tall-skinny QR",
+*>     G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+*>     E. Solomonik, J. Parallel Distrib. Comput.,
+*>     vol. 85, pp. 3-31, 2015.
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup doubleOTHERcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November   2019, Igor Kozachenko,
+*>            Computer Science Division,
+*>            University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE DORHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER           INFO, LDA, LDT, M, N, NB
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION  A( LDA, * ), D( * ), T( LDT, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE, ZERO
+      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, IINFO, J, JB, JBTEMP1, JBTEMP2, JNB,
+     $                   NPLUSONE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DCOPY, DLAORHR_COL_GETRFNP, DSCAL, DTRSM,
+     $                   XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters
+*
+      INFO = 0
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 .OR. N.GT.M ) THEN
+         INFO = -2
+      ELSE IF( NB.LT.1 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -5
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
+         INFO = -7
+      END IF
+*
+*     Handle error in the input parameters.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DORHR_COL', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 ) THEN
+         RETURN
+      END IF
+*
+*     On input, the M-by-N matrix A contains the orthogonal
+*     M-by-N matrix Q_in.
+*
+*     (1) Compute the unit lower-trapezoidal V (ones on the diagonal
+*     are not stored) by performing the "modified" LU-decomposition.
+*
+*     Q_in - ( S ) = V * U = ( V1 ) * U,
+*            ( 0 )           ( V2 )
+*
+*     where 0 is an (M-N)-by-N zero matrix.
+*
+*     (1-1) Factor V1 and U.
+
+      CALL DLAORHR_COL_GETRFNP( N, N, A, LDA, D, IINFO )
+*
+*     (1-2) Solve for V2.
+*
+      IF( M.GT.N ) THEN
+         CALL DTRSM( 'R', 'U', 'N', 'N', M-N, N, ONE, A, LDA,
+     $               A( N+1, 1 ), LDA )
+      END IF
+*
+*     (2) Reconstruct the block reflector T stored in T(1:NB, 1:N)
+*     as a sequence of upper-triangular blocks with NB-size column
+*     blocking.
+*
+*     Loop over the column blocks of size NB of the array A(1:M,1:N)
+*     and the array T(1:NB,1:N), JB is the column index of a column
+*     block, JNB is the column block size at each step JB.
+*
+      NPLUSONE = N + 1
+      DO JB = 1, N, NB
+*
+*        (2-0) Determine the column block size JNB.
+*
+         JNB = MIN( NPLUSONE-JB, NB )
+*
+*        (2-1) Copy the upper-triangular part of the current JNB-by-JNB
+*        diagonal block U(JB) (of the N-by-N matrix U) stored
+*        in A(JB:JB+JNB-1,JB:JB+JNB-1) into the upper-triangular part
+*        of the current JNB-by-JNB block T(1:JNB,JB:JB+JNB-1)
+*        column-by-column, total JNB*(JNB+1)/2 elements.
+*
+         JBTEMP1 = JB - 1
+         DO J = JB, JB+JNB-1
+            CALL DCOPY( J-JBTEMP1, A( JB, J ), 1, T( 1, J ), 1 )
+         END DO
+*
+*        (2-2) Perform on the upper-triangular part of the current
+*        JNB-by-JNB diagonal block U(JB) (of the N-by-N matrix U) stored
+*        in T(1:JNB,JB:JB+JNB-1) the following operation in place:
+*        (-1)*U(JB)*S(JB), i.e the result will be stored in the upper-
+*        triangular part of T(1:JNB,JB:JB+JNB-1). This multiplication
+*        of the JNB-by-JNB diagonal block U(JB) by the JNB-by-JNB
+*        diagonal block S(JB) of the N-by-N sign matrix S from the
+*        right means changing the sign of each J-th column of the block
+*        U(JB) according to the sign of the diagonal element of the block
+*        S(JB), i.e. S(J,J) that is stored in the array element D(J).
+*
+         DO J = JB, JB+JNB-1
+            IF( D( J ).EQ.ONE ) THEN
+               CALL DSCAL( J-JBTEMP1, -ONE, T( 1, J ), 1 )
+            END IF
+         END DO
+*
+*        (2-3) Perform the triangular solve for the current block
+*        matrix X(JB):
+*
+*               X(JB) * (A(JB)**T) = B(JB), where:
+*
+*               A(JB)**T  is a JNB-by-JNB unit upper-triangular
+*                         coefficient block, and A(JB)=V1(JB), which
+*                         is a JNB-by-JNB unit lower-triangular block
+*                         stored in A(JB:JB+JNB-1,JB:JB+JNB-1).
+*                         The N-by-N matrix V1 is the upper part
+*                         of the M-by-N lower-trapezoidal matrix V
+*                         stored in A(1:M,1:N);
+*
+*               B(JB)     is a JNB-by-JNB  upper-triangular right-hand
+*                         side block, B(JB) = (-1)*U(JB)*S(JB), and
+*                         B(JB) is stored in T(1:JNB,JB:JB+JNB-1);
+*
+*               X(JB)     is a JNB-by-JNB upper-triangular solution
+*                         block, X(JB) is the upper-triangular block
+*                         reflector T(JB), and X(JB) is stored
+*                         in T(1:JNB,JB:JB+JNB-1).
+*
+*             In other words, we perform the triangular solve for the
+*             upper-triangular block T(JB):
+*
+*               T(JB) * (V1(JB)**T) = (-1)*U(JB)*S(JB).
+*
+*             Even though the blocks X(JB) and B(JB) are upper-
+*             triangular, the routine DTRSM will access all JNB**2
+*             elements of the square T(1:JNB,JB:JB+JNB-1). Therefore,
+*             we need to set to zero the elements of the block
+*             T(1:JNB,JB:JB+JNB-1) below the diagonal before the call
+*             to DTRSM.
+*
+*        (2-3a) Set the elements to zero.
+*
+         JBTEMP2 = JB - 2
+         DO J = JB, JB+JNB-2
+            DO I = J-JBTEMP2, NB
+               T( I, J ) = ZERO
+            END DO
+         END DO
+*
+*        (2-3b) Perform the triangular solve.
+*
+         CALL DTRSM( 'R', 'L', 'T', 'U', JNB, JNB, ONE,
+     $               A( JB, JB ), LDA, T( 1, JB ), LDT )
+*
+      END DO
+*
+      RETURN
+*
+*     End of DORHR_COL
+*
+      END
\ No newline at end of file
diff --git a/lapack-netlib/SRC/dporfsx.f b/lapack-netlib/SRC/dporfsx.f
index 53724925e..67cca9ccf 100644
--- a/lapack-netlib/SRC/dporfsx.f
+++ b/lapack-netlib/SRC/dporfsx.f
@@ -135,7 +135,7 @@
 *> \param[in,out] S
 *> \verbatim
 *>          S is DOUBLE PRECISION array, dimension (N)
-*>     The row scale factors for A.  If EQUED = 'Y', A is multiplied on
+*>     The scale factors for A.  If EQUED = 'Y', A is multiplied on
 *>     the left and right by diag(S).  S is an input argument if FACT =
 *>     'F'; otherwise, S is an output argument.  If FACT = 'F' and EQUED
 *>     = 'Y', each element of S must be positive.  If S is output, each
@@ -263,7 +263,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -299,14 +299,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension (NPARAMS)
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -314,9 +314,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/dposvxx.f b/lapack-netlib/SRC/dposvxx.f
index 488e0b15a..b0de44910 100644
--- a/lapack-netlib/SRC/dposvxx.f
+++ b/lapack-netlib/SRC/dposvxx.f
@@ -366,7 +366,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -402,14 +402,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -417,9 +417,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the extra-precise refinement algorithm.
+*>            = 1.0:  Use the extra-precise refinement algorithm.
 *>              (other values are reserved for future use)
 *>
 *>       PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual
diff --git a/lapack-netlib/SRC/dsb2st_kernels.f b/lapack-netlib/SRC/dsb2st_kernels.f
index 3bf126d5b..a9dc6b5ca 100644
--- a/lapack-netlib/SRC/dsb2st_kernels.f
+++ b/lapack-netlib/SRC/dsb2st_kernels.f
@@ -1,26 +1,26 @@
 *> \brief \b DSB2ST_KERNELS
 *
 *  @generated from zhb2st_kernels.f, fortran z -> d, Wed Dec  7 08:22:39 2016
-*      
+*
 *  =========== DOCUMENTATION ===========
 *
-* Online html documentation available at 
-*            http://www.netlib.org/lapack/explore-html/ 
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
-*> Download DSB2ST_KERNELS + dependencies 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsb2st_kernels.f"> 
-*> [TGZ]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsb2st_kernels.f"> 
-*> [ZIP]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsb2st_kernels.f"> 
+*> Download DSB2ST_KERNELS + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/dsb2st_kernels.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/dsb2st_kernels.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/dsb2st_kernels.f">
 *> [TXT]</a>
-*> \endhtmlonly 
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE  DSB2ST_KERNELS( UPLO, WANTZ, TTYPE, 
+*       SUBROUTINE  DSB2ST_KERNELS( UPLO, WANTZ, TTYPE,
 *                                   ST, ED, SWEEP, N, NB, IB,
 *                                   A, LDA, V, TAU, LDVT, WORK)
 *
@@ -32,9 +32,9 @@
 *       INTEGER            TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
 *       ..
 *       .. Array Arguments ..
-*       DOUBLE PRECISION   A( LDA, * ), V( * ), 
+*       DOUBLE PRECISION   A( LDA, * ), V( * ),
 *                          TAU( * ), WORK( * )
-*  
+*
 *> \par Purpose:
 *  =============
 *>
@@ -124,7 +124,7 @@
 *>          LDVT is INTEGER.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION array. Workspace of size nb.
 *> \endverbatim
@@ -147,7 +147,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -155,16 +155,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE  DSB2ST_KERNELS( UPLO, WANTZ, TTYPE, 
+      SUBROUTINE  DSB2ST_KERNELS( UPLO, WANTZ, TTYPE,
      $                            ST, ED, SWEEP, N, NB, IB,
      $                            A, LDA, V, TAU, LDVT, WORK)
 *
@@ -181,7 +181,7 @@
       INTEGER            TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
 *     ..
 *     .. Array Arguments ..
-      DOUBLE PRECISION   A( LDA, * ), V( * ), 
+      DOUBLE PRECISION   A( LDA, * ), V( * ),
      $                   TAU( * ), WORK( * )
 *     ..
 *
@@ -195,8 +195,8 @@
 *     .. Local Scalars ..
       LOGICAL            UPPER
       INTEGER            I, J1, J2, LM, LN, VPOS, TAUPOS,
-     $                   DPOS, OFDPOS, AJETER 
-      DOUBLE PRECISION   CTMP 
+     $                   DPOS, OFDPOS, AJETER
+      DOUBLE PRECISION   CTMP
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           DLARFG, DLARFX, DLARFY
@@ -209,7 +209,7 @@
 *     ..
 *     ..
 *     .. Executable Statements ..
-*      
+*
       AJETER = IB + LDVT
       UPPER = LSAME( UPLO, 'U' )
 
@@ -240,10 +240,10 @@
               V( VPOS ) = ONE
               DO 10 I = 1, LM-1
                   V( VPOS+I )         = ( A( OFDPOS-I, ST+I ) )
-                  A( OFDPOS-I, ST+I ) = ZERO  
+                  A( OFDPOS-I, ST+I ) = ZERO
    10         CONTINUE
               CTMP = ( A( OFDPOS, ST ) )
-              CALL DLARFG( LM, CTMP, V( VPOS+1 ), 1, 
+              CALL DLARFG( LM, CTMP, V( VPOS+1 ), 1,
      $                                       TAU( TAUPOS ) )
               A( OFDPOS, ST ) = CTMP
 *
@@ -281,14 +281,14 @@
 *
                   V( VPOS ) = ONE
                   DO 30 I = 1, LM-1
-                      V( VPOS+I )          = 
+                      V( VPOS+I )          =
      $                                    ( A( DPOS-NB-I, J1+I ) )
                       A( DPOS-NB-I, J1+I ) = ZERO
    30             CONTINUE
                   CTMP = ( A( DPOS-NB, J1 ) )
                   CALL DLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) )
                   A( DPOS-NB, J1 ) = CTMP
-*                 
+*
                   CALL DLARFX( 'Right', LN-1, LM, V( VPOS ),
      $                         TAU( TAUPOS ),
      $                         A( DPOS-NB+1, J1 ), LDA-1, WORK)
@@ -296,9 +296,9 @@
           ENDIF
 *
 *     Lower case
-*  
+*
       ELSE
-*      
+*
           IF( WANTZ ) THEN
               VPOS   = MOD( SWEEP-1, 2 ) * N + ST
               TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
@@ -313,9 +313,9 @@
               V( VPOS ) = ONE
               DO 20 I = 1, LM-1
                   V( VPOS+I )         = A( OFDPOS+I, ST-1 )
-                  A( OFDPOS+I, ST-1 ) = ZERO  
+                  A( OFDPOS+I, ST-1 ) = ZERO
    20         CONTINUE
-              CALL DLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, 
+              CALL DLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1,
      $                                       TAU( TAUPOS ) )
 *
               LM = ED - ST + 1
@@ -342,7 +342,7 @@
               LM = J2-J1+1
 *
               IF( LM.GT.0) THEN
-                  CALL DLARFX( 'Right', LM, LN, V( VPOS ), 
+                  CALL DLARFX( 'Right', LM, LN, V( VPOS ),
      $                         TAU( TAUPOS ), A( DPOS+NB, ST ),
      $                         LDA-1, WORK)
 *
@@ -359,13 +359,13 @@
                       V( VPOS+I )        = A( DPOS+NB+I, ST )
                       A( DPOS+NB+I, ST ) = ZERO
    40             CONTINUE
-                  CALL DLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, 
+                  CALL DLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1,
      $                                        TAU( TAUPOS ) )
 *
-                  CALL DLARFX( 'Left', LM, LN-1, V( VPOS ), 
+                  CALL DLARFX( 'Left', LM, LN-1, V( VPOS ),
      $                         ( TAU( TAUPOS ) ),
      $                         A( DPOS+NB-1, ST+1 ), LDA-1, WORK)
-             
+
               ENDIF
           ENDIF
       ENDIF
@@ -374,4 +374,4 @@
 *
 *     END OF DSB2ST_KERNELS
 *
-      END      
+      END
diff --git a/lapack-netlib/SRC/dsbgvx.f b/lapack-netlib/SRC/dsbgvx.f
index eab5ebcbb..6de1eb89b 100644
--- a/lapack-netlib/SRC/dsbgvx.f
+++ b/lapack-netlib/SRC/dsbgvx.f
@@ -261,11 +261,11 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0 : successful exit
-*>          < 0 : if INFO = -i, the i-th argument had an illegal value
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          <= N: if INFO = i, then i eigenvectors failed to converge.
 *>                  Their indices are stored in IFAIL.
-*>          > N : DPBSTF returned an error code; i.e.,
+*>          > N:  DPBSTF returned an error code; i.e.,
 *>                if INFO = N + i, for 1 <= i <= N, then the leading
 *>                minor of order i of B is not positive definite.
 *>                The factorization of B could not be completed and
diff --git a/lapack-netlib/SRC/dsgesv.f b/lapack-netlib/SRC/dsgesv.f
index f47327d00..edbb87e7a 100644
--- a/lapack-netlib/SRC/dsgesv.f
+++ b/lapack-netlib/SRC/dsgesv.f
@@ -92,9 +92,9 @@
 *>          dimension (LDA,N)
 *>          On entry, the N-by-N coefficient matrix A.
 *>          On exit, if iterative refinement has been successfully used
-*>          (INFO.EQ.0 and ITER.GE.0, see description below), then A is
+*>          (INFO = 0 and ITER >= 0, see description below), then A is
 *>          unchanged, if double precision factorization has been used
-*>          (INFO.EQ.0 and ITER.LT.0, see description below), then the
+*>          (INFO = 0 and ITER < 0, see description below), then the
 *>          array A contains the factors L and U from the factorization
 *>          A = P*L*U; the unit diagonal elements of L are not stored.
 *> \endverbatim
@@ -111,8 +111,8 @@
 *>          The pivot indices that define the permutation matrix P;
 *>          row i of the matrix was interchanged with row IPIV(i).
 *>          Corresponds either to the single precision factorization
-*>          (if INFO.EQ.0 and ITER.GE.0) or the double precision
-*>          factorization (if INFO.EQ.0 and ITER.LT.0).
+*>          (if INFO = 0 and ITER >= 0) or the double precision
+*>          factorization (if INFO = 0 and ITER < 0).
 *> \endverbatim
 *>
 *> \param[in] B
@@ -406,7 +406,7 @@
    30 CONTINUE
 *
 *     If we are at this place of the code, this is because we have
-*     performed ITER=ITERMAX iterations and never satisified the
+*     performed ITER=ITERMAX iterations and never satisfied the
 *     stopping criterion, set up the ITER flag accordingly and follow up
 *     on double precision routine.
 *
diff --git a/lapack-netlib/SRC/dsposv.f b/lapack-netlib/SRC/dsposv.f
index 4a8575241..6c8baa56b 100644
--- a/lapack-netlib/SRC/dsposv.f
+++ b/lapack-netlib/SRC/dsposv.f
@@ -106,9 +106,9 @@
 *>          triangular part of the matrix A, and the strictly upper
 *>          triangular part of A is not referenced.
 *>          On exit, if iterative refinement has been successfully used
-*>          (INFO.EQ.0 and ITER.GE.0, see description below), then A is
+*>          (INFO = 0 and ITER >= 0, see description below), then A is
 *>          unchanged, if double precision factorization has been used
-*>          (INFO.EQ.0 and ITER.LT.0, see description below), then the
+*>          (INFO = 0 and ITER < 0, see description below), then the
 *>          array A contains the factor U or L from the Cholesky
 *>          factorization A = U**T*U or A = L*L**T.
 *> \endverbatim
@@ -413,7 +413,7 @@
    30 CONTINUE
 *
 *     If we are at this place of the code, this is because we have
-*     performed ITER=ITERMAX iterations and never satisified the
+*     performed ITER=ITERMAX iterations and never satisfied the
 *     stopping criterion, set up the ITER flag accordingly and follow
 *     up on double precision routine.
 *
diff --git a/lapack-netlib/SRC/dstemr.f b/lapack-netlib/SRC/dstemr.f
index a1a8e3433..16c9d970d 100644
--- a/lapack-netlib/SRC/dstemr.f
+++ b/lapack-netlib/SRC/dstemr.f
@@ -233,13 +233,13 @@
 *> \param[in,out] TRYRAC
 *> \verbatim
 *>          TRYRAC is LOGICAL
-*>          If TRYRAC.EQ..TRUE., indicates that the code should check whether
+*>          If TRYRAC = .TRUE., indicates that the code should check whether
 *>          the tridiagonal matrix defines its eigenvalues to high relative
 *>          accuracy.  If so, the code uses relative-accuracy preserving
 *>          algorithms that might be (a bit) slower depending on the matrix.
 *>          If the matrix does not define its eigenvalues to high relative
 *>          accuracy, the code can uses possibly faster algorithms.
-*>          If TRYRAC.EQ..FALSE., the code is not required to guarantee
+*>          If TRYRAC = .FALSE., the code is not required to guarantee
 *>          relatively accurate eigenvalues and can use the fastest possible
 *>          techniques.
 *>          On exit, a .TRUE. TRYRAC will be set to .FALSE. if the matrix
diff --git a/lapack-netlib/SRC/dsyconvf.f b/lapack-netlib/SRC/dsyconvf.f
index 37c8157ba..60cfd1e65 100644
--- a/lapack-netlib/SRC/dsyconvf.f
+++ b/lapack-netlib/SRC/dsyconvf.f
@@ -291,7 +291,7 @@
 *
 *           Convert PERMUTATIONS and IPIV
 *
-*           Apply permutaions to submatrices of upper part of A
+*           Apply permutations to submatrices of upper part of A
 *           in factorization order where i decreases from N to 1
 *
             I = N
@@ -344,7 +344,7 @@
 *
 *           Revert PERMUTATIONS and IPIV
 *
-*           Apply permutaions to submatrices of upper part of A
+*           Apply permutations to submatrices of upper part of A
 *           in reverse factorization order where i increases from 1 to N
 *
             I = 1
@@ -435,7 +435,7 @@
 *
 *           Convert PERMUTATIONS and IPIV
 *
-*           Apply permutaions to submatrices of lower part of A
+*           Apply permutations to submatrices of lower part of A
 *           in factorization order where k increases from 1 to N
 *
             I = 1
@@ -488,7 +488,7 @@
 *
 *           Revert PERMUTATIONS and IPIV
 *
-*           Apply permutaions to submatrices of lower part of A
+*           Apply permutations to submatrices of lower part of A
 *           in reverse factorization order where i decreases from N to 1
 *
             I = N
diff --git a/lapack-netlib/SRC/dsyconvf_rook.f b/lapack-netlib/SRC/dsyconvf_rook.f
index 5c774906e..bd683a087 100644
--- a/lapack-netlib/SRC/dsyconvf_rook.f
+++ b/lapack-netlib/SRC/dsyconvf_rook.f
@@ -282,7 +282,7 @@
 *
 *           Convert PERMUTATIONS
 *
-*           Apply permutaions to submatrices of upper part of A
+*           Apply permutations to submatrices of upper part of A
 *           in factorization order where i decreases from N to 1
 *
             I = N
@@ -333,7 +333,7 @@
 *
 *           Revert PERMUTATIONS
 *
-*           Apply permutaions to submatrices of upper part of A
+*           Apply permutations to submatrices of upper part of A
 *           in reverse factorization order where i increases from 1 to N
 *
             I = 1
@@ -423,7 +423,7 @@
 *
 *           Convert PERMUTATIONS
 *
-*           Apply permutaions to submatrices of lower part of A
+*           Apply permutations to submatrices of lower part of A
 *           in factorization order where i increases from 1 to N
 *
             I = 1
@@ -474,7 +474,7 @@
 *
 *           Revert PERMUTATIONS
 *
-*           Apply permutaions to submatrices of lower part of A
+*           Apply permutations to submatrices of lower part of A
 *           in reverse factorization order where i decreases from N to 1
 *
             I = N
diff --git a/lapack-netlib/SRC/dsyev_2stage.f b/lapack-netlib/SRC/dsyev_2stage.f
index fff0dedbc..9d802905c 100644
--- a/lapack-netlib/SRC/dsyev_2stage.f
+++ b/lapack-netlib/SRC/dsyev_2stage.f
@@ -317,7 +317,7 @@
       IF( .NOT.WANTZ ) THEN
          CALL DSTERF( N, W, WORK( INDE ), INFO )
       ELSE
-*        Not available in this release, and agrument checking should not
+*        Not available in this release, and argument checking should not
 *        let it getting here
          RETURN
          CALL DORGTR( UPLO, N, A, LDA, WORK( INDTAU ), WORK( INDWRK ),
diff --git a/lapack-netlib/SRC/dsyevd_2stage.f b/lapack-netlib/SRC/dsyevd_2stage.f
index 75a6da436..ff8e08d71 100644
--- a/lapack-netlib/SRC/dsyevd_2stage.f
+++ b/lapack-netlib/SRC/dsyevd_2stage.f
@@ -385,7 +385,7 @@
       IF( .NOT.WANTZ ) THEN
          CALL DSTERF( N, W, WORK( INDE ), INFO )
       ELSE
-*        Not available in this release, and agrument checking should not
+*        Not available in this release, and argument checking should not
 *        let it getting here
          RETURN
          CALL DSTEDC( 'I', N, W, WORK( INDE ), WORK( INDWRK ), N,
diff --git a/lapack-netlib/SRC/dsyrfsx.f b/lapack-netlib/SRC/dsyrfsx.f
index e128cd4e0..eb091e720 100644
--- a/lapack-netlib/SRC/dsyrfsx.f
+++ b/lapack-netlib/SRC/dsyrfsx.f
@@ -271,7 +271,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -307,14 +307,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension (NPARAMS)
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -322,9 +322,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/dsysv_aa.f b/lapack-netlib/SRC/dsysv_aa.f
index 7192928c6..4ee474448 100644
--- a/lapack-netlib/SRC/dsysv_aa.f
+++ b/lapack-netlib/SRC/dsysv_aa.f
@@ -42,7 +42,7 @@
 *> matrices.
 *>
 *> Aasen's algorithm is used to factor A as
-*>    A = U * T * U**T,  if UPLO = 'U', or
+*>    A = U**T * T * U,  if UPLO = 'U', or
 *>    A = L * T * L**T,  if UPLO = 'L',
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is symmetric tridiagonal. The factored
@@ -86,7 +86,7 @@
 *>
 *>          On exit, if INFO = 0, the tridiagonal matrix T and the
 *>          multipliers used to obtain the factor U or L from the
-*>          factorization A = U*T*U**T or A = L*T*L**T as computed by
+*>          factorization A = U**T*T*U or A = L*T*L**T as computed by
 *>          DSYTRF.
 *> \endverbatim
 *>
@@ -230,7 +230,7 @@
          RETURN
       END IF
 *
-*     Compute the factorization A = U*T*U**T or A = L*T*L**T.
+*     Compute the factorization A = U**T*T*U or A = L*T*L**T.
 *
       CALL DSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
       IF( INFO.EQ.0 ) THEN
diff --git a/lapack-netlib/SRC/dsysv_aa_2stage.f b/lapack-netlib/SRC/dsysv_aa_2stage.f
index 05e538f0b..ef593bc7e 100644
--- a/lapack-netlib/SRC/dsysv_aa_2stage.f
+++ b/lapack-netlib/SRC/dsysv_aa_2stage.f
@@ -45,7 +45,7 @@
 *> matrices.
 *>
 *> Aasen's 2-stage algorithm is used to factor A as
-*>    A = U * T * U**T,  if UPLO = 'U', or
+*>    A = U**T * T * U,  if UPLO = 'U', or
 *>    A = L * T * L**T,  if UPLO = 'L',
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is symmetric and band. The matrix T is
@@ -259,7 +259,7 @@
       END IF
 *
 *
-*     Compute the factorization A = U*T*U**T or A = L*T*L**T.
+*     Compute the factorization A = U**T*T*U or A = L*T*L**T.
 *
       CALL DSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2,
      $                       WORK, LWORK, INFO )
diff --git a/lapack-netlib/SRC/dsysvxx.f b/lapack-netlib/SRC/dsysvxx.f
index 6e167d81e..0be50bcd1 100644
--- a/lapack-netlib/SRC/dsysvxx.f
+++ b/lapack-netlib/SRC/dsysvxx.f
@@ -377,7 +377,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -413,14 +413,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension (NPARAMS)
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -428,9 +428,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the extra-precise refinement algorithm.
+*>            = 1.0:  Use the extra-precise refinement algorithm.
 *>              (other values are reserved for future use)
 *>
 *>       PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual
diff --git a/lapack-netlib/SRC/dsytf2_rk.f b/lapack-netlib/SRC/dsytf2_rk.f
index 45cf62ab9..cc9e4616e 100644
--- a/lapack-netlib/SRC/dsytf2_rk.f
+++ b/lapack-netlib/SRC/dsytf2_rk.f
@@ -312,7 +312,7 @@
 *
 *        Factorize A as U*D*U**T using the upper triangle of A
 *
-*        Initilize the first entry of array E, where superdiagonal
+*        Initialize the first entry of array E, where superdiagonal
 *        elements of D are stored
 *
          E( 1 ) = ZERO
@@ -623,7 +623,7 @@
 *
 *        Factorize A as L*D*L**T using the lower triangle of A
 *
-*        Initilize the unused last entry of the subdiagonal array E.
+*        Initialize the unused last entry of the subdiagonal array E.
 *
          E( N ) = ZERO
 *
diff --git a/lapack-netlib/SRC/dsytrd_2stage.f b/lapack-netlib/SRC/dsytrd_2stage.f
index 522602bb2..fc4b92908 100644
--- a/lapack-netlib/SRC/dsytrd_2stage.f
+++ b/lapack-netlib/SRC/dsytrd_2stage.f
@@ -123,23 +123,22 @@
 *>
 *> \param[out] HOUS2
 *> \verbatim
-*>          HOUS2 is DOUBLE PRECISION array, dimension LHOUS2, that
-*>          store the Householder representation of the stage2
+*>          HOUS2 is DOUBLE PRECISION array, dimension (LHOUS2)
+*>          Stores the Householder representation of the stage2
 *>          band to tridiagonal.
 *> \endverbatim
 *>
 *> \param[in] LHOUS2
 *> \verbatim
 *>          LHOUS2 is INTEGER
-*>          The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS2=-1,
+*>          The dimension of the array HOUS2.
+*>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS2 array, returns
 *>          this value as the first entry of the HOUS2 array, and no error
 *>          message related to LHOUS2 is issued by XERBLA.
-*>          LHOUS2 = MAX(1, dimension) where
-*>          dimension = 4*N if VECT='N'
-*>          not available now if VECT='H'
+*>          If VECT='N', LHOUS2 = max(1, 4*n);
+*>          if VECT='V', option not yet available.
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/dsytrd_sb2st.F b/lapack-netlib/SRC/dsytrd_sb2st.F
index 4d81fe226..0c0dbf125 100644
--- a/lapack-netlib/SRC/dsytrd_sb2st.F
+++ b/lapack-netlib/SRC/dsytrd_sb2st.F
@@ -50,9 +50,9 @@
 *  Arguments:
 *  ==========
 *
-*> \param[in] STAGE
+*> \param[in] STAGE1
 *> \verbatim
-*>          STAGE is CHARACTER*1
+*>          STAGE1 is CHARACTER*1
 *>          = 'N':  "No": to mention that the stage 1 of the reduction  
 *>                  from dense to band using the dsytrd_sy2sb routine
 *>                  was not called before this routine to reproduce AB. 
diff --git a/lapack-netlib/SRC/dsytrd_sy2sb.f b/lapack-netlib/SRC/dsytrd_sy2sb.f
index e0a5debc5..7f30817b0 100644
--- a/lapack-netlib/SRC/dsytrd_sy2sb.f
+++ b/lapack-netlib/SRC/dsytrd_sy2sb.f
@@ -363,7 +363,7 @@
 *
 *
 *     Set the workspace of the triangular matrix T to zero once such a
-*     way everytime T is generated the upper/lower portion will be always zero  
+*     way every time T is generated the upper/lower portion will be always zero
 *   
       CALL DLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT )
 *
diff --git a/lapack-netlib/SRC/dsytrf.f b/lapack-netlib/SRC/dsytrf.f
index d8da4f122..a3bd30a2f 100644
--- a/lapack-netlib/SRC/dsytrf.f
+++ b/lapack-netlib/SRC/dsytrf.f
@@ -39,7 +39,7 @@
 *> the Bunch-Kaufman diagonal pivoting method.  The form of the
 *> factorization is
 *>
-*>    A = U*D*U**T  or  A = L*D*L**T
+*>    A = U**T*D*U  or  A = L*D*L**T
 *>
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and D is symmetric and block diagonal with
@@ -144,7 +144,7 @@
 *>
 *> \verbatim
 *>
-*>  If UPLO = 'U', then A = U*D*U**T, where
+*>  If UPLO = 'U', then A = U**T*D*U, where
 *>     U = P(n)*U(n)* ... *P(k)U(k)* ...,
 *>  i.e., U is a product of terms P(k)*U(k), where k decreases from n to
 *>  1 in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1
@@ -262,7 +262,7 @@
 *
       IF( UPPER ) THEN
 *
-*        Factorize A as U*D*U**T using the upper triangle of A
+*        Factorize A as U**T*D*U using the upper triangle of A
 *
 *        K is the main loop index, decreasing from N to 1 in steps of
 *        KB, where KB is the number of columns factorized by DLASYF;
diff --git a/lapack-netlib/SRC/dsytrf_aa.f b/lapack-netlib/SRC/dsytrf_aa.f
index 24b3f393b..6df0da2cd 100644
--- a/lapack-netlib/SRC/dsytrf_aa.f
+++ b/lapack-netlib/SRC/dsytrf_aa.f
@@ -37,7 +37,7 @@
 *> DSYTRF_AA computes the factorization of a real symmetric matrix A
 *> using the Aasen's algorithm.  The form of the factorization is
 *>
-*>    A = U*T*U**T  or  A = L*T*L**T
+*>    A = U**T*T*U  or  A = L*T*L**T
 *>
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is a symmetric tridiagonal matrix.
@@ -223,7 +223,7 @@
       IF( UPPER ) THEN
 *
 *        .....................................................
-*        Factorize A as L*D*L**T using the upper triangle of A
+*        Factorize A as U**T*D*U using the upper triangle of A
 *        .....................................................
 *
 *        Copy first row A(1, 1:N) into H(1:n) (stored in WORK(1:N))
@@ -256,7 +256,7 @@
      $                      A( MAX(1, J), J+1 ), LDA,
      $                      IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) )
 *
-*        Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot)
+*        Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot)
 *
          DO J2 = J+2, MIN(N, J+JB+1)
             IPIV( J2 ) = IPIV( J2 ) + J
@@ -375,7 +375,7 @@
      $                      A( J+1, MAX(1, J) ), LDA,
      $                      IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) )
 *
-*        Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot)
+*        Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot)
 *
          DO J2 = J+2, MIN(N, J+JB+1)
             IPIV( J2 ) = IPIV( J2 ) + J
diff --git a/lapack-netlib/SRC/dsytrf_aa_2stage.f b/lapack-netlib/SRC/dsytrf_aa_2stage.f
index 25fc1a2eb..a37be5bdd 100644
--- a/lapack-netlib/SRC/dsytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/dsytrf_aa_2stage.f
@@ -38,7 +38,7 @@
 *> DSYTRF_AA_2STAGE computes the factorization of a real symmetric matrix A
 *> using the Aasen's algorithm.  The form of the factorization is
 *>
-*>    A = U*T*U**T  or  A = L*T*L**T
+*>    A = U**T*T*U  or  A = L*T*L**T
 *>
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is a symmetric band matrix with the
@@ -103,6 +103,22 @@
 *>          no error message related to LTB is issued by XERBLA.
 *> \endverbatim
 *>
+*> \param[out] IPIV
+*> \verbatim
+*>          IPIV is INTEGER array, dimension (N)
+*>          On exit, it contains the details of the interchanges, i.e.,
+*>          the row and column k of A were interchanged with the
+*>          row and column IPIV(k).
+*> \endverbatim
+*>
+*> \param[out] IPIV2
+*> \verbatim
+*>          IPIV2 is INTEGER array, dimension (N)
+*>          On exit, it contains the details of the interchanges, i.e.,
+*>          the row and column k of T were interchanged with the
+*>          row and column IPIV2(k).
+*> \endverbatim
+*>
 *> \param[out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION workspace of size LWORK
@@ -120,22 +136,6 @@
 *>          no error message related to LWORK is issued by XERBLA.
 *> \endverbatim
 *>
-*> \param[out] IPIV
-*> \verbatim
-*>          IPIV is INTEGER array, dimension (N)
-*>          On exit, it contains the details of the interchanges, i.e.,
-*>          the row and column k of A were interchanged with the
-*>          row and column IPIV(k).
-*> \endverbatim
-*>
-*> \param[out] IPIV2
-*> \verbatim
-*>          IPIV2 is INTEGER array, dimension (N)
-*>          On exit, it contains the details of the interchanges, i.e.,
-*>          the row and column k of T were interchanged with the
-*>          row and column IPIV2(k).
-*> \endverbatim
-*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
@@ -275,7 +275,7 @@
       IF( UPPER ) THEN
 *
 *        .....................................................
-*        Factorize A as L*D*L**T using the upper triangle of A
+*        Factorize A as U**T*D*U using the upper triangle of A
 *        .....................................................
 *
          DO J = 0, NT-1
@@ -442,12 +442,14 @@ c               END IF
 *                    > Apply pivots to previous columns of L
                      CALL DSWAP( K-1, A( (J+1)*NB+1, I1 ), 1, 
      $                                A( (J+1)*NB+1, I2 ), 1 )
-*                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)               
-                     CALL DSWAP( I2-I1-1, A( I1, I1+1 ), LDA,
-     $                                    A( I1+1, I2 ), 1 )
+*                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)
+                     IF( I2.GT.(I1+1) )
+     $                  CALL DSWAP( I2-I1-1, A( I1, I1+1 ), LDA,
+     $                                       A( I1+1, I2 ), 1 )
 *                    > Swap A(I2+1:M, I1) with A(I2+1:M, I2)
-                     CALL DSWAP( N-I2, A( I1, I2+1 ), LDA,
-     $                                 A( I2, I2+1 ), LDA ) 
+                     IF( I2.LT.N )
+     $                  CALL DSWAP( N-I2, A( I1, I2+1 ), LDA,
+     $                                    A( I2, I2+1 ), LDA ) 
 *                    > Swap A(I1, I1) with A(I2, I2)
                      PIV = A( I1, I1 )
                      A( I1, I1 ) = A( I2, I2 )
@@ -616,11 +618,13 @@ c               END IF
                      CALL DSWAP( K-1, A( I1, (J+1)*NB+1 ), LDA, 
      $                                A( I2, (J+1)*NB+1 ), LDA )
 *                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)               
-                     CALL DSWAP( I2-I1-1, A( I1+1, I1 ), 1,
-     $                                    A( I2, I1+1 ), LDA )
+                     IF( I2.GT.(I1+1) )
+     $                  CALL DSWAP( I2-I1-1, A( I1+1, I1 ), 1,
+     $                                       A( I2, I1+1 ), LDA )
 *                    > Swap A(I2+1:M, I1) with A(I2+1:M, I2)
-                     CALL DSWAP( N-I2, A( I2+1, I1 ), 1,
-     $                                 A( I2+1, I2 ), 1 ) 
+                     IF( I2.LT.N )
+     $                  CALL DSWAP( N-I2, A( I2+1, I1 ), 1,
+     $                                    A( I2+1, I2 ), 1 ) 
 *                    > Swap A(I1, I1) with A(I2, I2)
                      PIV = A( I1, I1 )
                      A( I1, I1 ) = A( I2, I2 )
diff --git a/lapack-netlib/SRC/dsytri2.f b/lapack-netlib/SRC/dsytri2.f
index 23f8b9fa2..5c3a5ec76 100644
--- a/lapack-netlib/SRC/dsytri2.f
+++ b/lapack-netlib/SRC/dsytri2.f
@@ -62,7 +62,7 @@
 *> \param[in,out] A
 *> \verbatim
 *>          A is DOUBLE PRECISION array, dimension (LDA,N)
-*>          On entry, the NB diagonal matrix D and the multipliers
+*>          On entry, the block diagonal matrix D and the multipliers
 *>          used to obtain the factor U or L as computed by DSYTRF.
 *>
 *>          On exit, if INFO = 0, the (symmetric) inverse of the original
@@ -82,7 +82,7 @@
 *> \param[in] IPIV
 *> \verbatim
 *>          IPIV is INTEGER array, dimension (N)
-*>          Details of the interchanges and the NB structure of D
+*>          Details of the interchanges and the block structure of D
 *>          as determined by DSYTRF.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/dsytrs_aa.f b/lapack-netlib/SRC/dsytrs_aa.f
index 05ef31ff3..d9dc0a6d1 100644
--- a/lapack-netlib/SRC/dsytrs_aa.f
+++ b/lapack-netlib/SRC/dsytrs_aa.f
@@ -37,7 +37,7 @@
 *> \verbatim
 *>
 *> DSYTRS_AA solves a system of linear equations A*X = B with a real
-*> symmetric matrix A using the factorization A = U*T*U**T or
+*> symmetric matrix A using the factorization A = U**T*T*U or
 *> A = L*T*L**T computed by DSYTRF_AA.
 *> \endverbatim
 *
@@ -49,7 +49,7 @@
 *>          UPLO is CHARACTER*1
 *>          Specifies whether the details of the factorization are stored
 *>          as an upper or lower triangular matrix.
-*>          = 'U':  Upper triangular, form is A = U*T*U**T;
+*>          = 'U':  Upper triangular, form is A = U**T*T*U;
 *>          = 'L':  Lower triangular, form is A = L*T*L**T.
 *> \endverbatim
 *>
@@ -97,14 +97,16 @@
 *>          The leading dimension of the array B.  LDB >= max(1,N).
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE array, dimension (MAX(1,LWORK))
+*>          WORK is DOUBLE PRECISION array, dimension (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
-*>          LWORK is INTEGER, LWORK >= MAX(1,3*N-2).
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK. LWORK >= max(1,3*N-2).
+*> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
@@ -198,22 +200,29 @@
 *
       IF( UPPER ) THEN
 *
-*        Solve A*X = B, where A = U*T*U**T.
+*        Solve A*X = B, where A = U**T*T*U.
+*
+*        1) Forward substitution with U**T
+*
+         IF( N.GT.1 ) THEN
+*
+*           Pivot, P**T * B -> B
 *
-*        Pivot, P**T * B
+            DO K = 1, N
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $             CALL DSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+            END DO
 *
-         DO K = 1, N
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $          CALL DSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-         END DO
+*           Compute U**T \ B -> B    [ (U**T \P**T * B) ]
 *
-*        Compute (U \P**T * B) -> B    [ (U \P**T * B) ]
+            CALL DTRSM('L', 'U', 'T', 'U', N-1, NRHS, ONE, A( 1, 2 ),
+     $                  LDA, B( 2, 1 ), LDB)
+         END IF
 *
-         CALL DTRSM('L', 'U', 'T', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA,
-     $               B( 2, 1 ), LDB)
+*        2) Solve with triangular matrix T
 *
-*        Compute T \ B -> B   [ T \ (U \P**T * B) ]
+*        Compute T \ B -> B   [ T \ (U**T \P**T * B) ]
 *
          CALL DLACPY( 'F', 1, N, A( 1, 1 ), LDA+1, WORK( N ), 1)
          IF( N.GT.1 ) THEN
@@ -223,35 +232,47 @@
          CALL DGTSV( N, NRHS, WORK( 1 ), WORK( N ), WORK( 2*N ), B, LDB,
      $               INFO )
 *
-*        Compute (U**T \ B) -> B   [ U**T \ (T \ (U \P**T * B) ) ]
+*        3) Backward substitution with U
+*
+         IF( N.GT.1 ) THEN
 *
-         CALL DTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA,
-     $               B( 2, 1 ), LDB)
+*           Compute U \ B -> B   [ U \ (T \ (U**T \P**T * B) ) ]
 *
-*        Pivot, P * B  [ P * (U**T \ (T \ (U \P**T * B) )) ]
+            CALL DTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ),
+     $                  LDA, B( 2, 1 ), LDB)
 *
-         DO K = N, 1, -1
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL DSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-         END DO
+*           Pivot, P * B -> B  [ P * (U \ (T \ (U**T \P**T * B) )) ]
+*
+            DO K = N, 1, -1
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL DSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+            END DO
+         END IF
 *
       ELSE
 *
 *        Solve A*X = B, where A = L*T*L**T.
 *
-*        Pivot, P**T * B
+*        1) Forward substitution with L
 *
-         DO K = 1, N
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL DSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-         END DO
+         IF( N.GT.1 ) THEN
+*
+*           Pivot, P**T * B -> B
+*
+            DO K = 1, N
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL DSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+            END DO
 *
-*        Compute (L \P**T * B) -> B    [ (L \P**T * B) ]
+*           Compute L \ B -> B    [ (L \P**T * B) ]
+*
+            CALL DTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1 ),
+     $                  LDA, B( 2, 1 ), LDB)
+         END IF
 *
-         CALL DTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA,
-     $               B( 2, 1 ), LDB)
+*        2) Solve with triangular matrix T
 *
 *        Compute T \ B -> B   [ T \ (L \P**T * B) ]
 *
@@ -263,18 +284,23 @@
          CALL DGTSV( N, NRHS, WORK( 1 ), WORK(N), WORK( 2*N ), B, LDB,
      $               INFO)
 *
-*        Compute (L**T \ B) -> B   [ L**T \ (T \ (L \P**T * B) ) ]
+*        3) Backward substitution with L**T
 *
-         CALL DTRSM( 'L', 'L', 'T', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA,
-     $              B( 2, 1 ), LDB)
+         IF( N.GT.1 ) THEN
+*
+*           Compute (L**T \ B) -> B   [ L**T \ (T \ (L \P**T * B) ) ]
 *
-*        Pivot, P * B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
+            CALL DTRSM( 'L', 'L', 'T', 'U', N-1, NRHS, ONE, A( 2, 1 ),
+     $                  LDA, B( 2, 1 ), LDB)
 *
-         DO K = N, 1, -1
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL DSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-         END DO
+*           Pivot, P * B -> B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
+*
+            DO K = N, 1, -1
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL DSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+            END DO
+         END IF
 *
       END IF
 *
diff --git a/lapack-netlib/SRC/dsytrs_aa_2stage.f b/lapack-netlib/SRC/dsytrs_aa_2stage.f
index bb283cb95..69c702f8a 100644
--- a/lapack-netlib/SRC/dsytrs_aa_2stage.f
+++ b/lapack-netlib/SRC/dsytrs_aa_2stage.f
@@ -36,7 +36,7 @@
 *> \verbatim
 *>
 *> DSYTRS_AA_2STAGE solves a system of linear equations A*X = B with a real
-*> symmetric matrix A using the factorization A = U*T*U**T or
+*> symmetric matrix A using the factorization A = U**T*T*U or
 *> A = L*T*L**T computed by DSYTRF_AA_2STAGE.
 *> \endverbatim
 *
@@ -48,7 +48,7 @@
 *>          UPLO is CHARACTER*1
 *>          Specifies whether the details of the factorization are stored
 *>          as an upper or lower triangular matrix.
-*>          = 'U':  Upper triangular, form is A = U*T*U**T;
+*>          = 'U':  Upper triangular, form is A = U**T*T*U;
 *>          = 'L':  Lower triangular, form is A = L*T*L**T.
 *> \endverbatim
 *>
@@ -208,15 +208,15 @@
 *
       IF( UPPER ) THEN
 *
-*        Solve A*X = B, where A = U*T*U**T.
+*        Solve A*X = B, where A = U**T*T*U.
 *
          IF( N.GT.NB ) THEN
 *
-*           Pivot, P**T * B
+*           Pivot, P**T * B -> B
 *
             CALL DLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 )
 *
-*           Compute (U**T \P**T * B) -> B    [ (U**T \P**T * B) ]
+*           Compute (U**T \ B) -> B    [ (U**T \P**T * B) ]
 *
             CALL DTRSM( 'L', 'U', 'T', 'U', N-NB, NRHS, ONE, A(1, NB+1),
      $                 LDA, B(NB+1, 1), LDB)
@@ -234,7 +234,7 @@
             CALL DTRSM( 'L', 'U', 'N', 'U', N-NB, NRHS, ONE, A(1, NB+1),
      $                  LDA, B(NB+1, 1), LDB)
 *
-*           Pivot, P * B  [ P * (U \ (T \ (U**T \P**T * B) )) ]
+*           Pivot, P * B -> B  [ P * (U \ (T \ (U**T \P**T * B) )) ]
 *
             CALL DLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 )
 *
@@ -246,11 +246,11 @@
 *
          IF( N.GT.NB ) THEN
 *
-*           Pivot, P**T * B
+*           Pivot, P**T * B -> B
 *
             CALL DLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 )
 *
-*           Compute (L \P**T * B) -> B    [ (L \P**T * B) ]
+*           Compute (L \ B) -> B    [ (L \P**T * B) ]
 *
             CALL DTRSM( 'L', 'L', 'N', 'U', N-NB, NRHS, ONE, A(NB+1, 1),
      $                 LDA, B(NB+1, 1), LDB)
@@ -268,7 +268,7 @@
             CALL DTRSM( 'L', 'L', 'T', 'U', N-NB, NRHS, ONE, A(NB+1, 1),
      $                  LDA, B(NB+1, 1), LDB)
 *
-*           Pivot, P * B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
+*           Pivot, P * B -> B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
 *
             CALL DLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 )
 *
diff --git a/lapack-netlib/SRC/dtgsy2.f b/lapack-netlib/SRC/dtgsy2.f
index 1c687b15e..e8c9b4001 100644
--- a/lapack-netlib/SRC/dtgsy2.f
+++ b/lapack-netlib/SRC/dtgsy2.f
@@ -71,7 +71,7 @@
 *>             R  * B**T + L  * E**T  = scale * -F
 *>
 *> This case is used to compute an estimate of Dif[(A, D), (B, E)] =
-*> sigma_min(Z) using reverse communicaton with DLACON.
+*> sigma_min(Z) using reverse communication with DLACON.
 *>
 *> DTGSY2 also (IJOB >= 1) contributes to the computation in DTGSYL
 *> of an upper bound on the separation between to matrix pairs. Then
@@ -85,7 +85,7 @@
 *> \param[in] TRANS
 *> \verbatim
 *>          TRANS is CHARACTER*1
-*>          = 'N', solve the generalized Sylvester equation (1).
+*>          = 'N': solve the generalized Sylvester equation (1).
 *>          = 'T': solve the 'transposed' system (3).
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/dtgsyl.f b/lapack-netlib/SRC/dtgsyl.f
index 1cc3a1bf8..bb0751794 100644
--- a/lapack-netlib/SRC/dtgsyl.f
+++ b/lapack-netlib/SRC/dtgsyl.f
@@ -88,20 +88,20 @@
 *> \param[in] TRANS
 *> \verbatim
 *>          TRANS is CHARACTER*1
-*>          = 'N', solve the generalized Sylvester equation (1).
-*>          = 'T', solve the 'transposed' system (3).
+*>          = 'N': solve the generalized Sylvester equation (1).
+*>          = 'T': solve the 'transposed' system (3).
 *> \endverbatim
 *>
 *> \param[in] IJOB
 *> \verbatim
 *>          IJOB is INTEGER
 *>          Specifies what kind of functionality to be performed.
-*>           =0: solve (1) only.
-*>           =1: The functionality of 0 and 3.
-*>           =2: The functionality of 0 and 4.
-*>           =3: Only an estimate of Dif[(A,D), (B,E)] is computed.
+*>          = 0: solve (1) only.
+*>          = 1: The functionality of 0 and 3.
+*>          = 2: The functionality of 0 and 4.
+*>          = 3: Only an estimate of Dif[(A,D), (B,E)] is computed.
 *>               (look ahead strategy IJOB  = 1 is used).
-*>           =4: Only an estimate of Dif[(A,D), (B,E)] is computed.
+*>          = 4: Only an estimate of Dif[(A,D), (B,E)] is computed.
 *>               ( DGECON on sub-systems is used ).
 *>          Not referenced if TRANS = 'T'.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/dtpmlqt.f b/lapack-netlib/SRC/dtpmlqt.f
index 3782d0c71..975ebdc27 100644
--- a/lapack-netlib/SRC/dtpmlqt.f
+++ b/lapack-netlib/SRC/dtpmlqt.f
@@ -94,7 +94,7 @@
 *>
 *> \param[in] V
 *> \verbatim
-*>          V is DOUBLE PRECISION array, dimension (LDA,K)
+*>          V is DOUBLE PRECISION array, dimension (LDV,K)
 *>          The i-th row must contain the vector which defines the
 *>          elementary reflector H(i), for i = 1,2,...,k, as returned by
 *>          DTPLQT in B.  See Further Details.
diff --git a/lapack-netlib/SRC/dtpmqrt.f b/lapack-netlib/SRC/dtpmqrt.f
index 44985a80d..a7888e192 100644
--- a/lapack-netlib/SRC/dtpmqrt.f
+++ b/lapack-netlib/SRC/dtpmqrt.f
@@ -94,7 +94,7 @@
 *>
 *> \param[in] V
 *> \verbatim
-*>          V is DOUBLE PRECISION array, dimension (LDA,K)
+*>          V is DOUBLE PRECISION array, dimension (LDV,K)
 *>          The i-th column must contain the vector which defines the
 *>          elementary reflector H(i), for i = 1,2,...,k, as returned by
 *>          CTPQRT in B.  See Further Details.
diff --git a/lapack-netlib/SRC/dtprfb.f b/lapack-netlib/SRC/dtprfb.f
index 6ae8fad8c..6d3b4dae1 100644
--- a/lapack-netlib/SRC/dtprfb.f
+++ b/lapack-netlib/SRC/dtprfb.f
@@ -152,8 +152,8 @@
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.
-*>          If SIDE = 'L', LDC >= max(1,K);
-*>          If SIDE = 'R', LDC >= max(1,M).
+*>          If SIDE = 'L', LDA >= max(1,K);
+*>          If SIDE = 'R', LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[in,out] B
diff --git a/lapack-netlib/SRC/ilaenv.f b/lapack-netlib/SRC/ilaenv.f
index a438ada38..7f68c383d 100644
--- a/lapack-netlib/SRC/ilaenv.f
+++ b/lapack-netlib/SRC/ilaenv.f
@@ -132,7 +132,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date November 2017
+*> \date November 2019
 *
 *> \ingroup OTHERauxiliary
 *
@@ -162,10 +162,10 @@
 *  =====================================================================
       INTEGER FUNCTION ILAENV( ISPEC, NAME, OPTS, N1, N2, N3, N4 )
 *
-*  -- LAPACK auxiliary routine (version 3.8.0) --
+*  -- LAPACK auxiliary routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     November 2017
+*     November 2019
 *
 *     .. Scalar Arguments ..
       CHARACTER*( * )    NAME, OPTS
@@ -271,7 +271,16 @@
 *
       NB = 1
 *
-      IF( C2.EQ.'GE' ) THEN
+      IF( SUBNAM(2:6).EQ.'LAORH' ) THEN
+*
+*        This is for *LAORHR_GETRFNP routine
+*
+         IF( SNAME ) THEN
+             NB = 32
+         ELSE
+             NB = 32
+         END IF
+      ELSE IF( C2.EQ.'GE' ) THEN
          IF( C3.EQ.'TRF' ) THEN
             IF( SNAME ) THEN
                NB = 64
diff --git a/lapack-netlib/SRC/ilaenv2stage.f b/lapack-netlib/SRC/ilaenv2stage.f
index 3c0d34a12..db30a1b4d 100644
--- a/lapack-netlib/SRC/ilaenv2stage.f
+++ b/lapack-netlib/SRC/ilaenv2stage.f
@@ -39,9 +39,9 @@
 *>
 *> ILAENV2STAGE returns an INTEGER
 *> if ILAENV2STAGE >= 0: ILAENV2STAGE returns the value of the parameter
-*                        specified by ISPEC
+*>                       specified by ISPEC
 *> if ILAENV2STAGE < 0:  if ILAENV2STAGE = -k, the k-th argument had an
-*                        illegal value.
+*>                       illegal value.
 *>
 *> This version provides a set of parameters which should give good,
 *> but not optimal, performance on many of the currently available
diff --git a/lapack-netlib/SRC/iparam2stage.F b/lapack-netlib/SRC/iparam2stage.F
index 836e20eed..1a37300a7 100644
--- a/lapack-netlib/SRC/iparam2stage.F
+++ b/lapack-netlib/SRC/iparam2stage.F
@@ -35,7 +35,7 @@
 *> \verbatim
 *>
 *>      This program sets problem and machine dependent parameters
-*>      useful for xHETRD_2STAGE, xHETRD_H@2HB, xHETRD_HB2ST,
+*>      useful for xHETRD_2STAGE, xHETRD_HE2HB, xHETRD_HB2ST,
 *>      xGEBRD_2STAGE, xGEBRD_GE2GB, xGEBRD_GB2BD 
 *>      and related subroutines for eigenvalue problems. 
 *>      It is called whenever ILAENV is called with 17 <= ISPEC <= 21.
@@ -53,7 +53,7 @@
 *>              return.
 *>
 *>              ISPEC=17: the optimal blocksize nb for the reduction to
-*                         BAND
+*>                        BAND
 *>
 *>              ISPEC=18: the optimal blocksize ib for the eigenvectors
 *>                        singular vectors update routine
@@ -90,14 +90,14 @@
 *> \param[in] NBI
 *> \verbatim
 *>          NBI is INTEGER which is the used in the reduciton, 
-*           (e.g., the size of the band), needed to compute workspace
-*           and LHOUS2.
+*>          (e.g., the size of the band), needed to compute workspace
+*>          and LHOUS2.
 *> \endverbatim
 *>
 *> \param[in] IBI
 *> \verbatim
 *>          IBI is INTEGER which represent the IB of the reduciton,
-*           needed to compute workspace and LHOUS2.
+*>          needed to compute workspace and LHOUS2.
 *> \endverbatim
 *>
 *> \param[in] NXI
diff --git a/lapack-netlib/SRC/iparmq.f b/lapack-netlib/SRC/iparmq.f
index a9212b3e0..bb711243d 100644
--- a/lapack-netlib/SRC/iparmq.f
+++ b/lapack-netlib/SRC/iparmq.f
@@ -60,7 +60,7 @@
 *>                        invest in an (expensive) multi-shift QR sweep.
 *>                        If the aggressive early deflation subroutine
 *>                        finds LD converged eigenvalues from an order
-*>                        NW deflation window and LD.GT.(NW*NIBBLE)/100,
+*>                        NW deflation window and LD > (NW*NIBBLE)/100,
 *>                        then the next QR sweep is skipped and early
 *>                        deflation is applied immediately to the
 *>                        remaining active diagonal block.  Setting
@@ -184,8 +184,8 @@
 *>                        This depends on ILO, IHI and NS, the
 *>                        number of simultaneous shifts returned
 *>                        by IPARMQ(ISPEC=15).  The default for
-*>                        (IHI-ILO+1).LE.500 is NS.  The default
-*>                        for (IHI-ILO+1).GT.500 is 3*NS/2.
+*>                        (IHI-ILO+1) <= 500 is NS.  The default
+*>                        for (IHI-ILO+1) > 500 is 3*NS/2.
 *>
 *>       IPARMQ(ISPEC=14) Nibble crossover point.  Default: 14.
 *>
diff --git a/lapack-netlib/SRC/meson.build b/lapack-netlib/SRC/meson.build
new file mode 100644
index 000000000..bad682401
--- /dev/null
+++ b/lapack-netlib/SRC/meson.build
@@ -0,0 +1,11 @@
+ALLAUX = files('ilaenv.f', 'ilaenv2stage.f', 'ieeeck.f', 'lsamen.f', 'xerbla.f', 'xerbla_array.f', 'iparmq.f', 'iparam2stage.F', 'ilaprec.f', 'ilatrans.f', 'ilauplo.f', 'iladiag.f', 'chla_transtype.f', '../INSTALL/ilaver.f', '../INSTALL/lsame.f', '../INSTALL/slamch.f')
+
+
+SCLAUX = files('sbdsdc.f', 'sbdsqr.f', 'sdisna.f', 'slabad.f', 'slacpy.f', 'sladiv.f', 'slae2.f', 'slaebz.f', 'slaed0.f', 'slaed1.f', 'slaed2.f', 'slaed3.f', 'slaed4.f', 'slaed5.f', 'slaed6.f', 'slaed7.f', 'slaed8.f', 'slaed9.f', 'slaeda.f', 'slaev2.f', 'slagtf.f', 'slagts.f', 'slamrg.f', 'slanst.f', 'slapy2.f', 'slapy3.f', 'slarnv.f', 'slarra.f', 'slarrb.f', 'slarrc.f', 'slarrd.f', 'slarre.f', 'slarrf.f', 'slarrj.f', 'slarrk.f', 'slarrr.f', 'slaneg.f', 'slartg.f', 'slaruv.f', 'slas2.f', 'slascl.f', 'slasd0.f', 'slasd1.f', 'slasd2.f', 'slasd3.f', 'slasd4.f', 'slasd5.f', 'slasd6.f', 'slasd7.f', 'slasd8.f', 'slasda.f', 'slasdq.f', 'slasdt.f', 'slaset.f', 'slasq1.f', 'slasq2.f', 'slasq3.f', 'slasq4.f', 'slasq5.f', 'slasq6.f', 'slasr.f', 'slasrt.f', 'slassq.f', 'slasv2.f', 'spttrf.f', 'sstebz.f', 'sstedc.f', 'ssteqr.f', 'ssterf.f', 'slaisnan.f', 'sisnan.f', 'slartgp.f', 'slartgs.f', '../INSTALL/second_INT_CPU_TIME.f')
+
+DZLAUX = files('dbdsdc.f', 'dbdsqr.f', 'ddisna.f', 'dlabad.f', 'dlacpy.f', 'dladiv.f', 'dlae2.f', 'dlaebz.f', 'dlaed0.f', 'dlaed1.f', 'dlaed2.f', 'dlaed3.f', 'dlaed4.f', 'dlaed5.f', 'dlaed6.f', 'dlaed7.f', 'dlaed8.f', 'dlaed9.f', 'dlaeda.f', 'dlaev2.f', 'dlagtf.f', 'dlagts.f', 'dlamrg.f', 'dlanst.f', 'dlapy2.f', 'dlapy3.f', 'dlarnv.f', 'dlarra.f', 'dlarrb.f', 'dlarrc.f', 'dlarrd.f', 'dlarre.f', 'dlarrf.f', 'dlarrj.f', 'dlarrk.f', 'dlarrr.f', 'dlaneg.f', 'dlartg.f', 'dlaruv.f', 'dlas2.f', 'dlascl.f', 'dlasd0.f', 'dlasd1.f', 'dlasd2.f', 'dlasd3.f', 'dlasd4.f', 'dlasd5.f', 'dlasd6.f', 'dlasd7.f', 'dlasd8.f', 'dlasda.f', 'dlasdq.f', 'dlasdt.f', 'dlaset.f', 'dlasq1.f', 'dlasq2.f', 'dlasq3.f', 'dlasq4.f', 'dlasq5.f', 'dlasq6.f', 'dlasr.f', 'dlasrt.f', 'dlassq.f', 'dlasv2.f', 'dpttrf.f', 'dstebz.f', 'dstedc.f', 'dsteqr.f', 'dsterf.f', 'dlaisnan.f', 'disnan.f', 'dlartgp.f', 'dlartgs.f', '../INSTALL/dlamch.f', '../INSTALL/dsecnd_INT_CPU_TIME.f')
+
+
+SLASRC = files('sbdsvdx.f', 'spotrf2.f', 'sgetrf2.f', 'sgbbrd.f', 'sgbcon.f', 'sgbequ.f', 'sgbrfs.f', 'sgbsv.f', 'sgbsvx.f', 'sgbtf2.f', 'sgbtrf.f', 'sgbtrs.f', 'sgebak.f', 'sgebal.f', 'sgebd2.f', 'sgebrd.f', 'sgecon.f', 'sgeequ.f', 'sgees.f', 'sgeesx.f', 'sgeev.f', 'sgeevx.f', 'sgehd2.f', 'sgehrd.f', 'sgelq2.f', 'sgelqf.f', 'sgels.f', 'sgelsd.f', 'sgelss.f', 'sgelsy.f', 'sgeql2.f', 'sgeqlf.f', 'sgeqp3.f', 'sgeqr2.f', 'sgeqr2p.f', 'sgeqrf.f', 'sgeqrfp.f', 'sgerfs.f', 'sgerq2.f', 'sgerqf.f', 'sgesc2.f', 'sgesdd.f', 'sgesv.f', 'sgesvd.f', 'sgesvdx.f', 'sgesvx.f', 'sgetc2.f', 'sgetf2.f', 'sgetri.f', 'sggbak.f', 'sggbal.f', 'sgges.f', 'sgges3.f', 'sggesx.f', 'sggev.f', 'sggev3.f', 'sggevx.f', 'sggglm.f', 'sgghrd.f', 'sgghd3.f', 'sgglse.f', 'sggqrf.f', 'sggrqf.f', 'sggsvd3.f', 'sggsvp3.f', 'sgtcon.f', 'sgtrfs.f', 'sgtsv.f', 'sgtsvx.f', 'sgttrf.f', 'sgttrs.f', 'sgtts2.f', 'shgeqz.f', 'shsein.f', 'shseqr.f', 'slabrd.f', 'slacon.f', 'slacn2.f', 'slaein.f', 'slaexc.f', 'slag2.f', 'slags2.f', 'slagtm.f', 'slagv2.f', 'slahqr.f', 'slahr2.f', 'slaic1.f', 'slaln2.f', 'slals0.f', 'slalsa.f', 'slalsd.f', 'slangb.f', 'slange.f', 'slangt.f', 'slanhs.f', 'slansb.f', 'slansp.f', 'slansy.f', 'slantb.f', 'slantp.f', 'slantr.f', 'slanv2.f', 'slapll.f', 'slapmt.f', 'slaqgb.f', 'slaqge.f', 'slaqp2.f', 'slaqps.f', 'slaqsb.f', 'slaqsp.f', 'slaqsy.f', 'slaqr0.f', 'slaqr1.f', 'slaqr2.f', 'slaqr3.f', 'slaqr4.f', 'slaqr5.f', 'slaqtr.f', 'slar1v.f', 'slar2v.f', 'ilaslr.f', 'ilaslc.f', 'slarf.f', 'slarfb.f', 'slarfg.f', 'slarfgp.f', 'slarft.f', 'slarfx.f', 'slarfy.f', 'slargv.f', 'slarrv.f', 'slartv.f', 'slarz.f', 'slarzb.f', 'slarzt.f', 'slaswp.f', 'slasy2.f', 'slasyf.f', 'slasyf_rook.f', 'slasyf_rk.f', 'slatbs.f', 'slatdf.f', 'slatps.f', 'slatrd.f', 'slatrs.f', 'slatrz.f', 'slauu2.f', 'slauum.f', 'sopgtr.f', 'sopmtr.f', 'sorg2l.f', 'sorg2r.f', 'sorgbr.f', 'sorghr.f', 'sorgl2.f', 'sorglq.f', 'sorgql.f', 'sorgqr.f', 'sorgr2.f', 'sorgrq.f', 'sorgtr.f', 'sorm2l.f', 'sorm2r.f', 'sorm22.f', 'sormbr.f', 'sormhr.f', 'sorml2.f', 'sormlq.f', 'sormql.f', 'sormqr.f', 'sormr2.f', 'sormr3.f', 'sormrq.f', 'sormrz.f', 'sormtr.f', 'spbcon.f', 'spbequ.f', 'spbrfs.f', 'spbstf.f', 'spbsv.f', 'spbsvx.f', 'spbtf2.f', 'spbtrf.f', 'spbtrs.f', 'spocon.f', 'spoequ.f', 'sporfs.f', 'sposv.f', 'sposvx.f', 'spotf2.f', 'spotri.f', 'spstrf.f', 'spstf2.f', 'sppcon.f', 'sppequ.f', 'spprfs.f', 'sppsv.f', 'sppsvx.f', 'spptrf.f', 'spptri.f', 'spptrs.f', 'sptcon.f', 'spteqr.f', 'sptrfs.f', 'sptsv.f', 'sptsvx.f', 'spttrs.f', 'sptts2.f', 'srscl.f', 'ssbev.f', 'ssbevd.f', 'ssbevx.f', 'ssbgst.f', 'ssbgv.f', 'ssbgvd.f', 'ssbgvx.f', 'ssbtrd.f', 'sspcon.f', 'sspev.f', 'sspevd.f', 'sspevx.f', 'sspgst.f', 'sspgv.f', 'sspgvd.f', 'sspgvx.f', 'ssprfs.f', 'sspsv.f', 'sspsvx.f', 'ssptrd.f', 'ssptrf.f', 'ssptri.f', 'ssptrs.f', 'sstegr.f', 'sstein.f', 'sstev.f', 'sstevd.f', 'sstevr.f', 'sstevx.f', 'ssycon.f', 'ssyev.f', 'ssyevd.f', 'ssyevr.f', 'ssyevx.f', 'ssygs2.f', 'ssygst.f', 'ssygv.f', 'ssygvd.f', 'ssygvx.f', 'ssyrfs.f', 'ssysv.f', 'ssysvx.f', 'ssytd2.f', 'ssytf2.f', 'ssytrd.f', 'ssytrf.f', 'ssytri.f', 'ssytri2.f', 'ssytri2x.f', 'ssyswapr.f', 'ssytrs.f', 'ssytrs2.f', 'ssyconv.f', 'ssyconvf.f', 'ssyconvf_rook.f', 'ssytf2_rook.f', 'ssytrf_rook.f', 'ssytrs_rook.f', 'ssytri_rook.f', 'ssycon_rook.f', 'ssysv_rook.f', 'ssytf2_rk.f', 'ssytrf_rk.f', 'ssytrs_3.f', 'ssytri_3.f', 'ssytri_3x.f', 'ssycon_3.f', 'ssysv_rk.f', 'slasyf_aa.f', 'ssysv_aa.f', 'ssytrf_aa.f', 'ssytrs_aa.f', 'ssysv_aa_2stage.f', 'ssytrf_aa_2stage.f', 'ssytrs_aa_2stage.f', 'stbcon.f', 'stbrfs.f', 'stbtrs.f', 'stgevc.f', 'stgex2.f', 'stgexc.f', 'stgsen.f', 'stgsja.f', 'stgsna.f', 'stgsy2.f', 'stgsyl.f', 'stpcon.f', 'stprfs.f', 'stptri.f', 'stptrs.f', 'strcon.f', 'strevc.f', 'strevc3.f', 'strexc.f', 'strrfs.f', 'strsen.f', 'strsna.f', 'strsyl.f', 'strti2.f', 'strtri.f', 'strtrs.f', 'stzrzf.f', 'sstemr.f', 'slansf.f', 'spftrf.f', 'spftri.f', 'spftrs.f', 'ssfrk.f', 'stfsm.f', 'stftri.f', 'stfttp.f', 'stfttr.f', 'stpttf.f', 'stpttr.f', 'strttf.f', 'strttp.f', 'sgejsv.f', 'sgesvj.f', 'sgsvj0.f', 'sgsvj1.f', 'sgeequb.f', 'ssyequb.f', 'spoequb.f', 'sgbequb.f', 'sbbcsd.f', 'slapmr.f', 'sorbdb.f', 'sorbdb1.f', 'sorbdb2.f', 'sorbdb3.f', 'sorbdb4.f', 'sorbdb5.f', 'sorbdb6.f', 'sorcsd.f', 'sorcsd2by1.f', 'sgeqrt.f', 'sgeqrt2.f', 'sgeqrt3.f', 'sgemqrt.f', 'stpqrt.f', 'stpqrt2.f', 'stpmqrt.f', 'stprfb.f', 'sgelqt.f', 'sgelqt3.f', 'sgemlqt.f', 'sgetsls.f', 'sgeqr.f', 'slatsqr.f', 'slamtsqr.f', 'sgemqr.f', 'sgelq.f', 'slaswlq.f', 'slamswlq.f', 'sgemlq.f', 'stplqt.f', 'stplqt2.f', 'stpmlqt.f', 'ssytrd_2stage.f', 'ssytrd_sy2sb.f', 'ssytrd_sb2st.F', 'ssb2st_kernels.f', 'ssyevd_2stage.f', 'ssyev_2stage.f', 'ssyevx_2stage.f', 'ssyevr_2stage.f', 'ssbev_2stage.f', 'ssbevx_2stage.f', 'ssbevd_2stage.f', 'ssygv_2stage.f', 'sgesvdq.f', 'scombssq.f')
+
+DSLASRC = files('spotrs.f', 'sgetrs.f', 'spotrf.f', 'sgetrf.f')
diff --git a/lapack-netlib/SRC/sbdsvdx.f b/lapack-netlib/SRC/sbdsvdx.f
index a4b1887b2..c46674c47 100644
--- a/lapack-netlib/SRC/sbdsvdx.f
+++ b/lapack-netlib/SRC/sbdsvdx.f
@@ -165,7 +165,7 @@
 *>
 *> \param[out] Z
 *> \verbatim
-*>          Z is REAL array, dimension (2*N,K) )
+*>          Z is REAL array, dimension (2*N,K)
 *>          If JOBZ = 'V', then if INFO = 0 the first NS columns of Z
 *>          contain the singular vectors of the matrix B corresponding to
 *>          the selected singular values, with U in rows 1 to N and V
diff --git a/lapack-netlib/SRC/scombssq.f b/lapack-netlib/SRC/scombssq.f
new file mode 100644
index 000000000..76bc0e320
--- /dev/null
+++ b/lapack-netlib/SRC/scombssq.f
@@ -0,0 +1,92 @@
+*> \brief \b SCOMBSSQ adds two scaled sum of squares quantities
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SCOMBSSQ( V1, V2 )
+*
+*       .. Array Arguments ..
+*       REAL               V1( 2 ), V2( 2 )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SCOMBSSQ adds two scaled sum of squares quantities, V1 := V1 + V2.
+*> That is,
+*>
+*>    V1_scale**2 * V1_sumsq := V1_scale**2 * V1_sumsq
+*>                            + V2_scale**2 * V2_sumsq
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in,out] V1
+*> \verbatim
+*>          V1 is REAL array, dimension (2).
+*>          The first scaled sum.
+*>          V1(1) = V1_scale, V1(2) = V1_sumsq.
+*> \endverbatim
+*>
+*> \param[in] V2
+*> \verbatim
+*>          V2 is REAL array, dimension (2).
+*>          The second scaled sum.
+*>          V2(1) = V2_scale, V2(2) = V2_sumsq.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2018
+*
+*> \ingroup OTHERauxiliary
+*
+*  =====================================================================
+      SUBROUTINE SCOMBSSQ( V1, V2 )
+*
+*  -- LAPACK auxiliary routine (version 3.7.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2018
+*
+*     .. Array Arguments ..
+      REAL               V1( 2 ), V2( 2 )
+*     ..
+*
+* =====================================================================
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER          ( ZERO = 0.0D+0 )
+*     ..
+*     .. Executable Statements ..
+*
+      IF( V1( 1 ).GE.V2( 1 ) ) THEN
+         IF( V1( 1 ).NE.ZERO ) THEN
+            V1( 2 ) = V1( 2 ) + ( V2( 1 ) / V1( 1 ) )**2 * V2( 2 )
+         END IF
+      ELSE
+         V1( 2 ) = V2( 2 ) + ( V1( 1 ) / V2( 1 ) )**2 * V1( 2 )
+         V1( 1 ) = V2( 1 )
+      END IF
+      RETURN
+*
+*     End of SCOMBSSQ
+*
+      END
diff --git a/lapack-netlib/SRC/sgbrfsx.f b/lapack-netlib/SRC/sgbrfsx.f
index 032b78b80..78ae584e1 100644
--- a/lapack-netlib/SRC/sgbrfsx.f
+++ b/lapack-netlib/SRC/sgbrfsx.f
@@ -308,7 +308,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -344,14 +344,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -359,9 +359,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/sgbsvxx.f b/lapack-netlib/SRC/sgbsvxx.f
index b2132325e..3c3d737b3 100644
--- a/lapack-netlib/SRC/sgbsvxx.f
+++ b/lapack-netlib/SRC/sgbsvxx.f
@@ -431,7 +431,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -467,14 +467,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -482,9 +482,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/sgebak.f b/lapack-netlib/SRC/sgebak.f
index ec58bf335..5c64c8b97 100644
--- a/lapack-netlib/SRC/sgebak.f
+++ b/lapack-netlib/SRC/sgebak.f
@@ -47,10 +47,10 @@
 *> \verbatim
 *>          JOB is CHARACTER*1
 *>          Specifies the type of backward transformation required:
-*>          = 'N', do nothing, return immediately;
-*>          = 'P', do backward transformation for permutation only;
-*>          = 'S', do backward transformation for scaling only;
-*>          = 'B', do backward transformations for both permutation and
+*>          = 'N': do nothing, return immediately;
+*>          = 'P': do backward transformation for permutation only;
+*>          = 'S': do backward transformation for scaling only;
+*>          = 'B': do backward transformations for both permutation and
 *>                 scaling.
 *>          JOB must be the same as the argument JOB supplied to SGEBAL.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sgeesx.f b/lapack-netlib/SRC/sgeesx.f
index c90de9b81..5ffa3bc37 100644
--- a/lapack-netlib/SRC/sgeesx.f
+++ b/lapack-netlib/SRC/sgeesx.f
@@ -583,7 +583,9 @@
                      IF( N.GT.I+1 )
      $                  CALL SSWAP( N-I-1, A( I, I+2 ), LDA,
      $                              A( I+1, I+2 ), LDA )
-                     CALL SSWAP( N, VS( 1, I ), 1, VS( 1, I+1 ), 1 )
+                     IF( WANTVS ) THEN
+                       CALL SSWAP( N, VS( 1, I ), 1, VS( 1, I+1 ), 1 )
+                     END IF
                      A( I, I+1 ) = A( I+1, I )
                      A( I+1, I ) = ZERO
                   END IF
diff --git a/lapack-netlib/SRC/sgejsv.f b/lapack-netlib/SRC/sgejsv.f
index e4cbe8d0e..4ad316d99 100644
--- a/lapack-netlib/SRC/sgejsv.f
+++ b/lapack-netlib/SRC/sgejsv.f
@@ -82,7 +82,7 @@
 *>              desirable, then this option is advisable. The input matrix A
 *>              is preprocessed with QR factorization with FULL (row and
 *>              column) pivoting.
-*>       = 'G'  Computation as with 'F' with an additional estimate of the
+*>       = 'G': Computation as with 'F' with an additional estimate of the
 *>              condition number of B, where A=D*B. If A has heavily weighted
 *>              rows, then using this condition number gives too pessimistic
 *>              error bound.
@@ -133,7 +133,7 @@
 *>         specified range. If A .NE. 0 is scaled so that the largest singular
 *>         value of c*A is around SQRT(BIG), BIG=SLAMCH('O'), then JOBR issues
 *>         the licence to kill columns of A whose norm in c*A is less than
-*>         SQRT(SFMIN) (for JOBR.EQ.'R'), or less than SMALL=SFMIN/EPSLN,
+*>         SQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN,
 *>         where SFMIN=SLAMCH('S'), EPSLN=SLAMCH('E').
 *>       = 'N': Do not kill small columns of c*A. This option assumes that
 *>              BLAS and QR factorizations and triangular solvers are
@@ -230,7 +230,7 @@
 *>          If JOBU = 'F', then U contains on exit the M-by-M matrix of
 *>                         the left singular vectors, including an ONB
 *>                         of the orthogonal complement of the Range(A).
-*>          If JOBU = 'W'  .AND. (JOBV.EQ.'V' .AND. JOBT.EQ.'T' .AND. M.EQ.N),
+*>          If JOBU = 'W'  .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N),
 *>                         then U is used as workspace if the procedure
 *>                         replaces A with A^t. In that case, [V] is computed
 *>                         in U as left singular vectors of A^t and then
@@ -252,7 +252,7 @@
 *>          V is REAL array, dimension ( LDV, N )
 *>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
 *>                         the right singular vectors;
-*>          If JOBV = 'W', AND (JOBU.EQ.'U' AND JOBT.EQ.'T' AND M.EQ.N),
+*>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
 *>                         then V is used as workspace if the pprocedure
 *>                         replaces A with A^t. In that case, [U] is computed
 *>                         in V as right singular vectors of A^t and then
@@ -278,7 +278,7 @@
 *>                    of A. (See the description of SVA().)
 *>          WORK(2) = See the description of WORK(1).
 *>          WORK(3) = SCONDA is an estimate for the condition number of
-*>                    column equilibrated A. (If JOBA .EQ. 'E' or 'G')
+*>                    column equilibrated A. (If JOBA = 'E' or 'G')
 *>                    SCONDA is an estimate of SQRT(||(R^t * R)^(-1)||_1).
 *>                    It is computed using SPOCON. It holds
 *>                    N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
@@ -297,7 +297,7 @@
 *>                    triangular factor in the first QR factorization.
 *>          WORK(5) = an estimate of the scaled condition number of the
 *>                    triangular factor in the second QR factorization.
-*>          The following two parameters are computed if JOBT .EQ. 'T'.
+*>          The following two parameters are computed if JOBT = 'T'.
 *>          They are provided for a developer/implementer who is familiar
 *>          with the details of the method.
 *>
@@ -313,8 +313,8 @@
 *>          Length of WORK to confirm proper allocation of work space.
 *>          LWORK depends on the job:
 *>
-*>          If only SIGMA is needed ( JOBU.EQ.'N', JOBV.EQ.'N' ) and
-*>            -> .. no scaled condition estimate required (JOBE.EQ.'N'):
+*>          If only SIGMA is needed ( JOBU = 'N', JOBV = 'N' ) and
+*>            -> .. no scaled condition estimate required (JOBE = 'N'):
 *>               LWORK >= max(2*M+N,4*N+1,7). This is the minimal requirement.
 *>               ->> For optimal performance (blocked code) the optimal value
 *>               is LWORK >= max(2*M+N,3*N+(N+1)*NB,7). Here NB is the optimal
@@ -330,7 +330,7 @@
 *>               LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DGEQRF),
 *>                                                     N+N*N+LWORK(DPOCON),7).
 *>
-*>          If SIGMA and the right singular vectors are needed (JOBV.EQ.'V'),
+*>          If SIGMA and the right singular vectors are needed (JOBV = 'V'),
 *>            -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7).
 *>            -> For optimal performance, LWORK >= max(2*M+N,3*N+(N+1)*NB,7),
 *>               where NB is the optimal block size for DGEQP3, DGEQRF, DGELQ,
@@ -341,19 +341,19 @@
 *>          If SIGMA and the left singular vectors are needed
 *>            -> the minimal requirement is LWORK >= max(2*M+N,4*N+1,7).
 *>            -> For optimal performance:
-*>               if JOBU.EQ.'U' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,7),
-*>               if JOBU.EQ.'F' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,N+M*NB,7),
+*>               if JOBU = 'U' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,7),
+*>               if JOBU = 'F' :: LWORK >= max(2*M+N,3*N+(N+1)*NB,N+M*NB,7),
 *>               where NB is the optimal block size for DGEQP3, DGEQRF, DORMQR.
 *>               In general, the optimal length LWORK is computed as
 *>               LWORK >= max(2*M+N,N+LWORK(DGEQP3),N+LWORK(DPOCON),
 *>                        2*N+LWORK(DGEQRF), N+LWORK(DORMQR)).
-*>               Here LWORK(DORMQR) equals N*NB (for JOBU.EQ.'U') or
-*>               M*NB (for JOBU.EQ.'F').
+*>               Here LWORK(DORMQR) equals N*NB (for JOBU = 'U') or
+*>               M*NB (for JOBU = 'F').
 *>
-*>          If the full SVD is needed: (JOBU.EQ.'U' or JOBU.EQ.'F') and
-*>            -> if JOBV.EQ.'V'
+*>          If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and
+*>            -> if JOBV = 'V'
 *>               the minimal requirement is LWORK >= max(2*M+N,6*N+2*N*N).
-*>            -> if JOBV.EQ.'J' the minimal requirement is
+*>            -> if JOBV = 'J' the minimal requirement is
 *>               LWORK >= max(2*M+N, 4*N+N*N,2*N+N*N+6).
 *>            -> For optimal performance, LWORK should be additionally
 *>               larger than N+M*NB, where NB is the optimal block size
@@ -369,7 +369,7 @@
 *>                     of JOBA and JOBR.
 *>          IWORK(2) = the number of the computed nonzero singular values
 *>          IWORK(3) = if nonzero, a warning message:
-*>                     If IWORK(3).EQ.1 then some of the column norms of A
+*>                     If IWORK(3) = 1 then some of the column norms of A
 *>                     were denormalized floats. The requested high accuracy
 *>                     is not warranted by the data.
 *> \endverbatim
@@ -377,10 +377,10 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>           < 0  : if INFO = -i, then the i-th argument had an illegal value.
-*>           = 0 :  successful exit;
-*>           > 0 :  SGEJSV  did not converge in the maximal allowed number
-*>                  of sweeps. The computed values may be inaccurate.
+*>           < 0:  if INFO = -i, then the i-th argument had an illegal value.
+*>           = 0:  successful exit;
+*>           > 0:  SGEJSV  did not converge in the maximal allowed number
+*>                 of sweeps. The computed values may be inaccurate.
 *> \endverbatim
 *
 *  Authors:
@@ -953,7 +953,7 @@
       IF ( L2ABER ) THEN
 *        Standard absolute error bound suffices. All sigma_i with
 *        sigma_i < N*EPSLN*||A|| are flushed to zero. This is an
-*        agressive enforcement of lower numerical rank by introducing a
+*        aggressive enforcement of lower numerical rank by introducing a
 *        backward error of the order of N*EPSLN*||A||.
          TEMP1 = SQRT(FLOAT(N))*EPSLN
          DO 3001 p = 2, N
@@ -965,7 +965,7 @@
  3001    CONTINUE
  3002    CONTINUE
       ELSE IF ( L2RANK ) THEN
-*        .. similarly as above, only slightly more gentle (less agressive).
+*        .. similarly as above, only slightly more gentle (less aggressive).
 *        Sudden drop on the diagonal of R1 is used as the criterion for
 *        close-to-rank-deficient.
          TEMP1 = SQRT(SFMIN)
@@ -1294,7 +1294,7 @@
             CALL SPOCON('Lower',NR,WORK(2*N+1),NR,ONE,TEMP1,
      $                   WORK(2*N+NR*NR+1),IWORK(M+2*N+1),IERR)
             CONDR1 = ONE / SQRT(TEMP1)
-*           .. here need a second oppinion on the condition number
+*           .. here need a second opinion on the condition number
 *           .. then assume worst case scenario
 *           R1 is OK for inverse <=> CONDR1 .LT. FLOAT(N)
 *           more conservative    <=> CONDR1 .LT. SQRT(FLOAT(N))
@@ -1335,7 +1335,7 @@
             ELSE
 *
 *              .. ill-conditioned case: second QRF with pivoting
-*              Note that windowed pivoting would be equaly good
+*              Note that windowed pivoting would be equally good
 *              numerically, and more run-time efficient. So, in
 *              an optimal implementation, the next call to SGEQP3
 *              should be replaced with eg. CALL SGEQPX (ACM TOMS #782)
@@ -1388,7 +1388,7 @@
 *
                IF ( CONDR2 .GE. COND_OK ) THEN
 *                 .. save the Householder vectors used for Q3
-*                 (this overwrittes the copy of R2, as it will not be
+*                 (this overwrites the copy of R2, as it will not be
 *                 needed in this branch, but it does not overwritte the
 *                 Huseholder vectors of Q2.).
                   CALL SLACPY( 'U', NR, NR, V, LDV, WORK(2*N+1), N )
@@ -1638,7 +1638,7 @@
 *
 *        This branch deploys a preconditioned Jacobi SVD with explicitly
 *        accumulated rotations. It is included as optional, mainly for
-*        experimental purposes. It does perfom well, and can also be used.
+*        experimental purposes. It does perform well, and can also be used.
 *        In this implementation, this branch will be automatically activated
 *        if the  condition number sigma_max(A) / sigma_min(A) is predicted
 *        to be greater than the overflow threshold. This is because the
diff --git a/lapack-netlib/SRC/sgelq.f b/lapack-netlib/SRC/sgelq.f
index 4fe4d191d..96c4097e8 100644
--- a/lapack-netlib/SRC/sgelq.f
+++ b/lapack-netlib/SRC/sgelq.f
@@ -1,3 +1,4 @@
+*> \brief \b SGELQ
 *
 *  Definition:
 *  ===========
@@ -17,7 +18,17 @@
 *  =============
 *>
 *> \verbatim
-*> SGELQ computes a LQ factorization of an M-by-N matrix A.
+*>
+*> SGELQ computes an LQ factorization of a real M-by-N matrix A:
+*>
+*>    A = ( L 0 ) *  Q
+*>
+*> where:
+*>
+*>    Q is a N-by-N orthogonal matrix;
+*>    L is an lower-triangular M-by-M matrix;
+*>    0 is a M-by-(N-M) zero matrix, if M < N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -138,7 +149,7 @@
 *> \verbatim
 *>
 *> These details are particular for this LAPACK implementation. Users should not 
-*> take them for granted. These details may change in the future, and are unlikely not
+*> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
 *>
@@ -159,10 +170,10 @@
       SUBROUTINE SGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N, TSIZE, LWORK
diff --git a/lapack-netlib/SRC/sgelq2.f b/lapack-netlib/SRC/sgelq2.f
index 5b1ad215a..df8128d53 100644
--- a/lapack-netlib/SRC/sgelq2.f
+++ b/lapack-netlib/SRC/sgelq2.f
@@ -33,8 +33,16 @@
 *>
 *> \verbatim
 *>
-*> SGELQ2 computes an LQ factorization of a real m by n matrix A:
-*> A = L * Q.
+*> SGELQ2 computes an LQ factorization of a real m-by-n matrix A:
+*>
+*>    A = ( L 0 ) *  Q
+*>
+*> where:
+*>
+*>    Q is a n-by-n orthogonal matrix;
+*>    L is an lower-triangular m-by-m matrix;
+*>    0 is a m-by-(n-m) zero matrix, if m < n.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -96,7 +104,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup realGEcomputational
 *
@@ -121,10 +129,10 @@
 *  =====================================================================
       SUBROUTINE SGELQ2( M, N, A, LDA, TAU, WORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N
diff --git a/lapack-netlib/SRC/sgelqf.f b/lapack-netlib/SRC/sgelqf.f
index 99c03c0a3..90357e623 100644
--- a/lapack-netlib/SRC/sgelqf.f
+++ b/lapack-netlib/SRC/sgelqf.f
@@ -34,7 +34,15 @@
 *> \verbatim
 *>
 *> SGELQF computes an LQ factorization of a real M-by-N matrix A:
-*> A = L * Q.
+*>
+*>    A = ( L 0 ) *  Q
+*>
+*> where:
+*>
+*>    Q is a N-by-N orthogonal matrix;
+*>    L is an lower-triangular M-by-M matrix;
+*>    0 is a M-by-(N-M) zero matrix, if M < N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -110,7 +118,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup realGEcomputational
 *
@@ -135,10 +143,10 @@
 *  =====================================================================
       SUBROUTINE SGELQF( M, N, A, LDA, TAU, WORK, LWORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, LWORK, M, N
diff --git a/lapack-netlib/SRC/sgelqt.f b/lapack-netlib/SRC/sgelqt.f
index 9a93af332..64d46025c 100644
--- a/lapack-netlib/SRC/sgelqt.f
+++ b/lapack-netlib/SRC/sgelqt.f
@@ -1,3 +1,5 @@
+*> \brief \b SGELQT
+*
 *  Definition:
 *  ===========
 *
diff --git a/lapack-netlib/SRC/sgelqt3.f b/lapack-netlib/SRC/sgelqt3.f
index 292ae88a3..edf5d6d30 100644
--- a/lapack-netlib/SRC/sgelqt3.f
+++ b/lapack-netlib/SRC/sgelqt3.f
@@ -1,3 +1,5 @@
+*> \brief \b SGELQT3
+*
 *  Definition:
 *  ===========
 *
diff --git a/lapack-netlib/SRC/sgemlq.f b/lapack-netlib/SRC/sgemlq.f
index dedbe7752..5f2e02a8e 100644
--- a/lapack-netlib/SRC/sgemlq.f
+++ b/lapack-netlib/SRC/sgemlq.f
@@ -1,3 +1,4 @@
+*> \brief \b SGEMLQ
 *
 *  Definition:
 *  ===========
@@ -143,7 +144,7 @@
 *> \verbatim
 *>
 *> These details are particular for this LAPACK implementation. Users should not 
-*> take them for granted. These details may change in the future, and are unlikely not
+*> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
 *>
diff --git a/lapack-netlib/SRC/sgemlqt.f b/lapack-netlib/SRC/sgemlqt.f
index a8f022bdc..37850fdf5 100644
--- a/lapack-netlib/SRC/sgemlqt.f
+++ b/lapack-netlib/SRC/sgemlqt.f
@@ -1,3 +1,5 @@
+*> \brief \b SGEMLQT
+*
 *  Definition:
 *  ===========
 *
diff --git a/lapack-netlib/SRC/sgemqr.f b/lapack-netlib/SRC/sgemqr.f
index 307fc8ca9..66c5117c9 100644
--- a/lapack-netlib/SRC/sgemqr.f
+++ b/lapack-netlib/SRC/sgemqr.f
@@ -1,3 +1,4 @@
+*> \brief \b SGEMQR
 *
 *  Definition:
 *  ===========
@@ -144,7 +145,7 @@
 *> \verbatim
 *>
 *> These details are particular for this LAPACK implementation. Users should not 
-*> take them for granted. These details may change in the future, and are unlikely not
+*> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
 *>
diff --git a/lapack-netlib/SRC/sgeqr.f b/lapack-netlib/SRC/sgeqr.f
index f939abd9d..4a6bb9ea5 100644
--- a/lapack-netlib/SRC/sgeqr.f
+++ b/lapack-netlib/SRC/sgeqr.f
@@ -1,3 +1,4 @@
+*> \brief \b SGEQR
 *
 *  Definition:
 *  ===========
@@ -17,7 +18,18 @@
 *  =============
 *>
 *> \verbatim
-*> SGEQR computes a QR factorization of an M-by-N matrix A.
+*>
+*> SGEQR computes a QR factorization of a real M-by-N matrix A:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a M-by-M orthogonal matrix;
+*>    R is an upper-triangular N-by-N matrix;
+*>    0 is a (M-N)-by-N zero matrix, if M > N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -138,7 +150,7 @@
 *> \verbatim
 *>
 *> These details are particular for this LAPACK implementation. Users should not 
-*> take them for granted. These details may change in the future, and are unlikely not
+*> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
 *>
@@ -160,10 +172,10 @@
       SUBROUTINE SGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N, TSIZE, LWORK
diff --git a/lapack-netlib/SRC/sgeqr2.f b/lapack-netlib/SRC/sgeqr2.f
index 3b990f825..0a1ff304f 100644
--- a/lapack-netlib/SRC/sgeqr2.f
+++ b/lapack-netlib/SRC/sgeqr2.f
@@ -33,8 +33,17 @@
 *>
 *> \verbatim
 *>
-*> SGEQR2 computes a QR factorization of a real m by n matrix A:
-*> A = Q * R.
+*> SGEQR2 computes a QR factorization of a real m-by-n matrix A:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a m-by-m orthogonal matrix;
+*>    R is an upper-triangular n-by-n matrix;
+*>    0 is a (m-n)-by-n zero matrix, if m > n.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -96,7 +105,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup realGEcomputational
 *
@@ -121,10 +130,10 @@
 *  =====================================================================
       SUBROUTINE SGEQR2( M, N, A, LDA, TAU, WORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N
diff --git a/lapack-netlib/SRC/sgeqr2p.f b/lapack-netlib/SRC/sgeqr2p.f
index f48af9d2d..08d124797 100644
--- a/lapack-netlib/SRC/sgeqr2p.f
+++ b/lapack-netlib/SRC/sgeqr2p.f
@@ -33,8 +33,18 @@
 *>
 *> \verbatim
 *>
-*> SGEQR2P computes a QR factorization of a real m by n matrix A:
-*> A = Q * R. The diagonal entries of R are nonnegative.
+*> SGEQR2P computes a QR factorization of a real m-by-n matrix A:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a m-by-m orthogonal matrix;
+*>    R is an upper-triangular n-by-n matrix with nonnegative diagonal
+*>    entries;
+*>    0 is a (m-n)-by-n zero matrix, if m > n.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -97,7 +107,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup realGEcomputational
 *
@@ -124,10 +134,10 @@
 *  =====================================================================
       SUBROUTINE SGEQR2P( M, N, A, LDA, TAU, WORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N
diff --git a/lapack-netlib/SRC/sgeqrf.f b/lapack-netlib/SRC/sgeqrf.f
index 0f79c2ca5..7df495e04 100644
--- a/lapack-netlib/SRC/sgeqrf.f
+++ b/lapack-netlib/SRC/sgeqrf.f
@@ -34,7 +34,16 @@
 *> \verbatim
 *>
 *> SGEQRF computes a QR factorization of a real M-by-N matrix A:
-*> A = Q * R.
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a M-by-M orthogonal matrix;
+*>    R is an upper-triangular N-by-N matrix;
+*>    0 is a (M-N)-by-N zero matrix, if M > N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -111,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup realGEcomputational
 *
@@ -136,10 +145,10 @@
 *  =====================================================================
       SUBROUTINE SGEQRF( M, N, A, LDA, TAU, WORK, LWORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, LWORK, M, N
diff --git a/lapack-netlib/SRC/sgeqrfp.f b/lapack-netlib/SRC/sgeqrfp.f
index 654c0a13a..7f6741570 100644
--- a/lapack-netlib/SRC/sgeqrfp.f
+++ b/lapack-netlib/SRC/sgeqrfp.f
@@ -33,8 +33,18 @@
 *>
 *> \verbatim
 *>
-*> SGEQRFP computes a QR factorization of a real M-by-N matrix A:
-*> A = Q * R. The diagonal entries of R are nonnegative.
+*> SGEQR2P computes a QR factorization of a real M-by-N matrix A:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a M-by-M orthogonal matrix;
+*>    R is an upper-triangular N-by-N matrix with nonnegative diagonal
+*>    entries;
+*>    0 is a (M-N)-by-N zero matrix, if M > N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -112,7 +122,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup realGEcomputational
 *
@@ -139,10 +149,10 @@
 *  =====================================================================
       SUBROUTINE SGEQRFP( M, N, A, LDA, TAU, WORK, LWORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, LWORK, M, N
diff --git a/lapack-netlib/SRC/sgerfsx.f b/lapack-netlib/SRC/sgerfsx.f
index 3f518899e..b1a1eb13d 100644
--- a/lapack-netlib/SRC/sgerfsx.f
+++ b/lapack-netlib/SRC/sgerfsx.f
@@ -283,7 +283,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -319,14 +319,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -334,9 +334,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/sgesc2.f b/lapack-netlib/SRC/sgesc2.f
index c78daa334..3a6f34584 100644
--- a/lapack-netlib/SRC/sgesc2.f
+++ b/lapack-netlib/SRC/sgesc2.f
@@ -90,7 +90,7 @@
 *> \verbatim
 *>          SCALE is REAL
 *>           On exit, SCALE contains the scale factor. SCALE is chosen
-*>           0 <= SCALE <= 1 to prevent owerflow in the solution.
+*>           0 <= SCALE <= 1 to prevent overflow in the solution.
 *> \endverbatim
 *
 *  Authors:
@@ -151,7 +151,7 @@
 *     ..
 *     .. Executable Statements ..
 *
-*      Set constant to control owerflow
+*      Set constant to control overflow
 *
       EPS = SLAMCH( 'P' )
       SMLNUM = SLAMCH( 'S' ) / EPS
diff --git a/lapack-netlib/SRC/sgesdd.f b/lapack-netlib/SRC/sgesdd.f
index 0ba2a78c7..689494dd1 100644
--- a/lapack-netlib/SRC/sgesdd.f
+++ b/lapack-netlib/SRC/sgesdd.f
@@ -322,7 +322,7 @@
 *
             IF( WNTQN ) THEN
 *              sbdsdc needs only 4*N (or 6*N for uplo=L for LAPACK <= 3.6)
-*              keep 7*N for backwards compatability.
+*              keep 7*N for backwards compatibility.
                BDSPAC = 7*N
             ELSE
                BDSPAC = 3*N*N + 4*N
@@ -448,7 +448,7 @@
 *
             IF( WNTQN ) THEN
 *              sbdsdc needs only 4*N (or 6*N for uplo=L for LAPACK <= 3.6)
-*              keep 7*N for backwards compatability.
+*              keep 7*N for backwards compatibility.
                BDSPAC = 7*M
             ELSE
                BDSPAC = 3*M*M + 4*M
diff --git a/lapack-netlib/SRC/sgesvdq.f b/lapack-netlib/SRC/sgesvdq.f
new file mode 100644
index 000000000..73e34862f
--- /dev/null
+++ b/lapack-netlib/SRC/sgesvdq.f
@@ -0,0 +1,1388 @@
+*> \brief <b> SGESVDQ computes the singular value decomposition (SVD) with a QR-Preconditioned QR SVD Method for GE matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SGESVDQ + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sgesvdq.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sgesvdq.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sgesvdq.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE SGESVDQ( JOBA, JOBP, JOBR, JOBU, JOBV, M, N, A, LDA,
+*                          S, U, LDU, V, LDV, NUMRANK, IWORK, LIWORK,
+*                          WORK, LWORK, RWORK, LRWORK, INFO )
+*
+*     .. Scalar Arguments ..
+*      IMPLICIT    NONE
+*      CHARACTER   JOBA, JOBP, JOBR, JOBU, JOBV
+*      INTEGER     M, N, LDA, LDU, LDV, NUMRANK, LIWORK, LWORK, LRWORK,
+*                  INFO
+*     ..
+*     .. Array Arguments ..
+*      REAL        A( LDA, * ), U( LDU, * ), V( LDV, * ), WORK( * )
+*      REAL        S( * ), RWORK( * )
+*      INTEGER     IWORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SGESVDQ computes the singular value decomposition (SVD) of a real
+*> M-by-N matrix A, where M >= N. The SVD of A is written as
+*>                                    [++]   [xx]   [x0]   [xx]
+*>              A = U * SIGMA * V^*,  [++] = [xx] * [ox] * [xx]
+*>                                    [++]   [xx]
+*> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+*> matrix, and V is an N-by-N orthogonal matrix. The diagonal elements
+*> of SIGMA are the singular values of A. The columns of U and V are the
+*> left and the right singular vectors of A, respectively.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] JOBA
+*> \verbatim
+*>  JOBA is CHARACTER*1
+*>  Specifies the level of accuracy in the computed SVD
+*>  = 'A' The requested accuracy corresponds to having the backward
+*>        error bounded by || delta A ||_F <= f(m,n) * EPS * || A ||_F,
+*>        where EPS = SLAMCH('Epsilon'). This authorises CGESVDQ to
+*>        truncate the computed triangular factor in a rank revealing
+*>        QR factorization whenever the truncated part is below the
+*>        threshold of the order of EPS * ||A||_F. This is aggressive
+*>        truncation level.
+*>  = 'M' Similarly as with 'A', but the truncation is more gentle: it
+*>        is allowed only when there is a drop on the diagonal of the
+*>        triangular factor in the QR factorization. This is medium
+*>        truncation level.
+*>  = 'H' High accuracy requested. No numerical rank determination based
+*>        on the rank revealing QR factorization is attempted.
+*>  = 'E' Same as 'H', and in addition the condition number of column
+*>        scaled A is estimated and returned in  RWORK(1).
+*>        N^(-1/4)*RWORK(1) <= ||pinv(A_scaled)||_2 <= N^(1/4)*RWORK(1)
+*> \endverbatim
+*>
+*> \param[in] JOBP
+*> \verbatim
+*>  JOBP is CHARACTER*1
+*>  = 'P' The rows of A are ordered in decreasing order with respect to
+*>        ||A(i,:)||_\infty. This enhances numerical accuracy at the cost
+*>        of extra data movement. Recommended for numerical robustness.
+*>  = 'N' No row pivoting.
+*> \endverbatim
+*>
+*> \param[in] JOBR
+*> \verbatim
+*>          JOBR is CHARACTER*1
+*>          = 'T' After the initial pivoted QR factorization, SGESVD is applied to
+*>          the transposed R**T of the computed triangular factor R. This involves
+*>          some extra data movement (matrix transpositions). Useful for
+*>          experiments, research and development.
+*>          = 'N' The triangular factor R is given as input to SGESVD. This may be
+*>          preferred as it involves less data movement.
+*> \endverbatim
+*>
+*> \param[in] JOBU
+*> \verbatim
+*>          JOBU is CHARACTER*1
+*>          = 'A' All M left singular vectors are computed and returned in the
+*>          matrix U. See the description of U.
+*>          = 'S' or 'U' N = min(M,N) left singular vectors are computed and returned
+*>          in the matrix U. See the description of U.
+*>          = 'R' Numerical rank NUMRANK is determined and only NUMRANK left singular
+*>          vectors are computed and returned in the matrix U.
+*>          = 'F' The N left singular vectors are returned in factored form as the
+*>          product of the Q factor from the initial QR factorization and the
+*>          N left singular vectors of (R**T , 0)**T. If row pivoting is used,
+*>          then the necessary information on the row pivoting is stored in
+*>          IWORK(N+1:N+M-1).
+*>          = 'N' The left singular vectors are not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBV
+*> \verbatim
+*>          JOBV is CHARACTER*1
+*>          = 'A', 'V' All N right singular vectors are computed and returned in
+*>          the matrix V.
+*>          = 'R' Numerical rank NUMRANK is determined and only NUMRANK right singular
+*>          vectors are computed and returned in the matrix V. This option is
+*>          allowed only if JOBU = 'R' or JOBU = 'N'; otherwise it is illegal.
+*>          = 'N' The right singular vectors are not computed.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the input matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the input matrix A.  M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array of dimensions LDA x N
+*>          On entry, the input matrix A.
+*>          On exit, if JOBU .NE. 'N' or JOBV .NE. 'N', the lower triangle of A contains
+*>          the Householder vectors as stored by SGEQP3. If JOBU = 'F', these Householder
+*>          vectors together with WORK(1:N) can be used to restore the Q factors from
+*>          the initial pivoted QR factorization of A. See the description of U.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER.
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] S
+*> \verbatim
+*>          S is REAL array of dimension N.
+*>          The singular values of A, ordered so that S(i) >= S(i+1).
+*> \endverbatim
+*>
+*> \param[out] U
+*> \verbatim
+*>          U is REAL array, dimension
+*>          LDU x M if JOBU = 'A'; see the description of LDU. In this case,
+*>          on exit, U contains the M left singular vectors.
+*>          LDU x N if JOBU = 'S', 'U', 'R' ; see the description of LDU. In this
+*>          case, U contains the leading N or the leading NUMRANK left singular vectors.
+*>          LDU x N if JOBU = 'F' ; see the description of LDU. In this case U
+*>          contains N x N orthogonal matrix that can be used to form the left
+*>          singular vectors.
+*>          If JOBU = 'N', U is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDU
+*> \verbatim
+*>          LDU is INTEGER.
+*>          The leading dimension of the array U.
+*>          If JOBU = 'A', 'S', 'U', 'R',  LDU >= max(1,M).
+*>          If JOBU = 'F',                 LDU >= max(1,N).
+*>          Otherwise,                     LDU >= 1.
+*> \endverbatim
+*>
+*> \param[out] V
+*> \verbatim
+*>          V is REAL array, dimension
+*>          LDV x N if JOBV = 'A', 'V', 'R' or if JOBA = 'E' .
+*>          If JOBV = 'A', or 'V',  V contains the N-by-N orthogonal matrix  V**T;
+*>          If JOBV = 'R', V contains the first NUMRANK rows of V**T (the right
+*>          singular vectors, stored rowwise, of the NUMRANK largest singular values).
+*>          If JOBV = 'N' and JOBA = 'E', V is used as a workspace.
+*>          If JOBV = 'N', and JOBA.NE.'E', V is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If JOBV = 'A', 'V', 'R',  or JOBA = 'E', LDV >= max(1,N).
+*>          Otherwise,                               LDV >= 1.
+*> \endverbatim
+*>
+*> \param[out] NUMRANK
+*> \verbatim
+*>          NUMRANK is INTEGER
+*>          NUMRANK is the numerical rank first determined after the rank
+*>          revealing QR factorization, following the strategy specified by the
+*>          value of JOBA. If JOBV = 'R' and JOBU = 'R', only NUMRANK
+*>          leading singular values and vectors are then requested in the call
+*>          of SGESVD. The final value of NUMRANK might be further reduced if
+*>          some singular values are computed as zeros.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (max(1, LIWORK)).
+*>          On exit, IWORK(1:N) contains column pivoting permutation of the
+*>          rank revealing QR factorization.
+*>          If JOBP = 'P', IWORK(N+1:N+M-1) contains the indices of the sequence
+*>          of row swaps used in row pivoting. These can be used to restore the
+*>          left singular vectors in the case JOBU = 'F'.
+*>
+*>          If LIWORK, LWORK, or LRWORK = -1, then on exit, if INFO = 0,
+*>          LIWORK(1) returns the minimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*>          LIWORK is INTEGER
+*>          The dimension of the array IWORK.
+*>          LIWORK >= N + M - 1,     if JOBP = 'P' and JOBA .NE. 'E';
+*>          LIWORK >= N              if JOBP = 'N' and JOBA .NE. 'E';
+*>          LIWORK >= N + M - 1 + N, if JOBP = 'P' and JOBA = 'E';
+*>          LIWORK >= N + N          if JOBP = 'N' and JOBA = 'E'.
+*
+*>          If LIWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates and returns the optimal and minimal sizes
+*>          for the WORK, IWORK, and RWORK arrays, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is REAL array, dimension (max(2, LWORK)), used as a workspace.
+*>          On exit, if, on entry, LWORK.NE.-1, WORK(1:N) contains parameters
+*>          needed to recover the Q factor from the QR factorization computed by
+*>          SGEQP3.
+*>
+*>          If LIWORK, LWORK, or LRWORK = -1, then on exit, if INFO = 0,
+*>          WORK(1) returns the optimal LWORK, and
+*>          WORK(2) returns the minimal LWORK.
+*> \endverbatim
+*>
+*> \param[in,out] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK. It is determined as follows:
+*>          Let  LWQP3 = 3*N+1,  LWCON = 3*N, and let
+*>          LWORQ = { MAX( N, 1 ),  if JOBU = 'R', 'S', or 'U'
+*>                  { MAX( M, 1 ),  if JOBU = 'A'
+*>          LWSVD = MAX( 5*N, 1 )
+*>          LWLQF = MAX( N/2, 1 ), LWSVD2 = MAX( 5*(N/2), 1 ), LWORLQ = MAX( N, 1 ),
+*>          LWQRF = MAX( N/2, 1 ), LWORQ2 = MAX( N, 1 )
+*>          Then the minimal value of LWORK is:
+*>          = MAX( N + LWQP3, LWSVD )        if only the singular values are needed;
+*>          = MAX( N + LWQP3, LWCON, LWSVD ) if only the singular values are needed,
+*>                                   and a scaled condition estimate requested;
+*>
+*>          = N + MAX( LWQP3, LWSVD, LWORQ ) if the singular values and the left
+*>                                   singular vectors are requested;
+*>          = N + MAX( LWQP3, LWCON, LWSVD, LWORQ ) if the singular values and the left
+*>                                   singular vectors are requested, and also
+*>                                   a scaled condition estimate requested;
+*>
+*>          = N + MAX( LWQP3, LWSVD )        if the singular values and the right
+*>                                   singular vectors are requested;
+*>          = N + MAX( LWQP3, LWCON, LWSVD ) if the singular values and the right
+*>                                   singular vectors are requested, and also
+*>                                   a scaled condition etimate requested;
+*>
+*>          = N + MAX( LWQP3, LWSVD, LWORQ ) if the full SVD is requested with JOBV = 'R';
+*>                                   independent of JOBR;
+*>          = N + MAX( LWQP3, LWCON, LWSVD, LWORQ ) if the full SVD is requested,
+*>                                   JOBV = 'R' and, also a scaled condition
+*>                                   estimate requested; independent of JOBR;
+*>          = MAX( N + MAX( LWQP3, LWSVD, LWORQ ),
+*>         N + MAX( LWQP3, N/2+LWLQF, N/2+LWSVD2, N/2+LWORLQ, LWORQ) ) if the
+*>                         full SVD is requested with JOBV = 'A' or 'V', and
+*>                         JOBR ='N'
+*>          = MAX( N + MAX( LWQP3, LWCON, LWSVD, LWORQ ),
+*>         N + MAX( LWQP3, LWCON, N/2+LWLQF, N/2+LWSVD2, N/2+LWORLQ, LWORQ ) )
+*>                         if the full SVD is requested with JOBV = 'A' or 'V', and
+*>                         JOBR ='N', and also a scaled condition number estimate
+*>                         requested.
+*>          = MAX( N + MAX( LWQP3, LWSVD, LWORQ ),
+*>         N + MAX( LWQP3, N/2+LWQRF, N/2+LWSVD2, N/2+LWORQ2, LWORQ ) ) if the
+*>                         full SVD is requested with JOBV = 'A', 'V', and JOBR ='T'
+*>          = MAX( N + MAX( LWQP3, LWCON, LWSVD, LWORQ ),
+*>         N + MAX( LWQP3, LWCON, N/2+LWQRF, N/2+LWSVD2, N/2+LWORQ2, LWORQ ) )
+*>                         if the full SVD is requested with JOBV = 'A' or 'V', and
+*>                         JOBR ='T', and also a scaled condition number estimate
+*>                         requested.
+*>          Finally, LWORK must be at least two: LWORK = MAX( 2, LWORK ).
+*>
+*>          If LWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates and returns the optimal and minimal sizes
+*>          for the WORK, IWORK, and RWORK arrays, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is REAL array, dimension (max(1, LRWORK)).
+*>          On exit,
+*>          1. If JOBA = 'E', RWORK(1) contains an estimate of the condition
+*>          number of column scaled A. If A = C * D where D is diagonal and C
+*>          has unit columns in the Euclidean norm, then, assuming full column rank,
+*>          N^(-1/4) * RWORK(1) <= ||pinv(C)||_2 <= N^(1/4) * RWORK(1).
+*>          Otherwise, RWORK(1) = -1.
+*>          2. RWORK(2) contains the number of singular values computed as
+*>          exact zeros in SGESVD applied to the upper triangular or trapeziodal
+*>          R (from the initial QR factorization). In case of early exit (no call to
+*>          SGESVD, such as in the case of zero matrix) RWORK(2) = -1.
+*>
+*>          If LIWORK, LWORK, or LRWORK = -1, then on exit, if INFO = 0,
+*>          RWORK(1) returns the minimal LRWORK.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*>          LRWORK is INTEGER.
+*>          The dimension of the array RWORK.
+*>          If JOBP ='P', then LRWORK >= MAX(2, M).
+*>          Otherwise, LRWORK >= 2
+*
+*>          If LRWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates and returns the optimal and minimal sizes
+*>          for the WORK, IWORK, and RWORK arrays, and no error
+*>          message related to LWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>          > 0:  if SBDSQR did not converge, INFO specifies how many superdiagonals
+*>          of an intermediate bidiagonal form B (computed in SGESVD) did not
+*>          converge to zero.
+*> \endverbatim
+*
+*> \par Further Details:
+*  ========================
+*>
+*> \verbatim
+*>
+*>   1. The data movement (matrix transpose) is coded using simple nested
+*>   DO-loops because BLAS and LAPACK do not provide corresponding subroutines.
+*>   Those DO-loops are easily identified in this source code - by the CONTINUE
+*>   statements labeled with 11**. In an optimized version of this code, the
+*>   nested DO loops should be replaced with calls to an optimized subroutine.
+*>   2. This code scales A by 1/SQRT(M) if the largest ABS(A(i,j)) could cause
+*>   column norm overflow. This is the minial precaution and it is left to the
+*>   SVD routine (CGESVD) to do its own preemptive scaling if potential over-
+*>   or underflows are detected. To avoid repeated scanning of the array A,
+*>   an optimal implementation would do all necessary scaling before calling
+*>   CGESVD and the scaling in CGESVD can be switched off.
+*>   3. Other comments related to code optimization are given in comments in the
+*>   code, enlosed in [[double brackets]].
+*> \endverbatim
+*
+*> \par Bugs, examples and comments
+*  ===========================
+*
+*> \verbatim
+*>  Please report all bugs and send interesting examples and/or comments to
+*>  drmac@math.hr. Thank you.
+*> \endverbatim
+*
+*> \par References
+*  ===============
+*
+*> \verbatim
+*>  [1] Zlatko Drmac, Algorithm 977: A QR-Preconditioned QR SVD Method for
+*>      Computing the SVD with High Accuracy. ACM Trans. Math. Softw.
+*>      44(1): 11:1-11:30 (2017)
+*>
+*>  SIGMA library, xGESVDQ section updated February 2016.
+*>  Developed and coded by Zlatko Drmac, Department of Mathematics
+*>  University of Zagreb, Croatia, drmac@math.hr
+*> \endverbatim
+*
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*> Developed and coded by Zlatko Drmac, Department of Mathematics
+*>  University of Zagreb, Croatia, drmac@math.hr
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2018
+*
+*> \ingroup realGEsing
+*
+*  =====================================================================
+      SUBROUTINE SGESVDQ( JOBA, JOBP, JOBR, JOBU, JOBV, M, N, A, LDA,
+     $                    S, U, LDU, V, LDV, NUMRANK, IWORK, LIWORK,
+     $                    WORK, LWORK, RWORK, LRWORK, INFO )
+*     .. Scalar Arguments ..
+      IMPLICIT    NONE
+      CHARACTER   JOBA, JOBP, JOBR, JOBU, JOBV
+      INTEGER     M, N, LDA, LDU, LDV, NUMRANK, LIWORK, LWORK, LRWORK,
+     $            INFO
+*     ..
+*     .. Array Arguments ..
+      REAL        A( LDA, * ), U( LDU, * ), V( LDV, * ), WORK( * )
+      REAL        S( * ), RWORK( * )
+      INTEGER     IWORK( * )
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL        ZERO,         ONE
+      PARAMETER ( ZERO = 0.0E0, ONE = 1.0E0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER     IERR, IWOFF, NR, N1, OPTRATIO, p, q
+      INTEGER     LWCON, LWQP3, LWRK_SGELQF, LWRK_SGESVD, LWRK_SGESVD2,
+     $            LWRK_SGEQP3,  LWRK_SGEQRF, LWRK_SORMLQ, LWRK_SORMQR,
+     $            LWRK_SORMQR2, LWLQF, LWQRF, LWSVD, LWSVD2, LWORQ,
+     $            LWORQ2, LWUNLQ, MINWRK, MINWRK2, OPTWRK, OPTWRK2,
+     $            IMINWRK, RMINWRK
+      LOGICAL     ACCLA,  ACCLM, ACCLH, ASCALED, CONDA, DNTWU,  DNTWV,
+     $            LQUERY, LSVC0, LSVEC, ROWPRM,  RSVEC, RTRANS, WNTUA,
+     $            WNTUF,  WNTUR, WNTUS, WNTVA,   WNTVR
+      REAL        BIG, EPSLN, RTMP, SCONDA, SFMIN
+*     ..
+*     .. Local Arrays
+      REAL        RDUMMY(1)
+*     ..
+*     .. External Subroutines (BLAS, LAPACK)
+      EXTERNAL    SGELQF, SGEQP3, SGEQRF, SGESVD, SLACPY, SLAPMT,
+     $            SLASCL, SLASET, SLASWP, SSCAL,  SPOCON, SORMLQ,
+     $            SORMQR, XERBLA
+*     ..
+*     .. External Functions (BLAS, LAPACK)
+      LOGICAL    LSAME
+      INTEGER    ISAMAX
+      REAL        SLANGE, SNRM2, SLAMCH
+      EXTERNAL    SLANGE, LSAME, ISAMAX, SNRM2, SLAMCH
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC   ABS, MAX, MIN, REAL, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input arguments
+*
+      WNTUS  = LSAME( JOBU, 'S' ) .OR. LSAME( JOBU, 'U' )
+      WNTUR  = LSAME( JOBU, 'R' )
+      WNTUA  = LSAME( JOBU, 'A' )
+      WNTUF  = LSAME( JOBU, 'F' )
+      LSVC0  = WNTUS .OR. WNTUR .OR. WNTUA
+      LSVEC  = LSVC0 .OR. WNTUF
+      DNTWU  = LSAME( JOBU, 'N' )
+*
+      WNTVR  = LSAME( JOBV, 'R' )
+      WNTVA  = LSAME( JOBV, 'A' ) .OR. LSAME( JOBV, 'V' )
+      RSVEC  = WNTVR .OR. WNTVA
+      DNTWV  = LSAME( JOBV, 'N' )
+*
+      ACCLA  = LSAME( JOBA, 'A' )
+      ACCLM  = LSAME( JOBA, 'M' )
+      CONDA  = LSAME( JOBA, 'E' )
+      ACCLH  = LSAME( JOBA, 'H' ) .OR. CONDA
+*
+      ROWPRM = LSAME( JOBP, 'P' )
+      RTRANS = LSAME( JOBR, 'T' )
+*
+      IF ( ROWPRM ) THEN
+         IF ( CONDA ) THEN
+            IMINWRK = MAX( 1, N + M - 1 + N )
+         ELSE
+            IMINWRK = MAX( 1, N + M - 1 )
+         END IF
+         RMINWRK = MAX( 2, M )
+      ELSE
+         IF ( CONDA ) THEN
+            IMINWRK = MAX( 1, N + N )
+         ELSE
+            IMINWRK = MAX( 1, N )
+         END IF
+         RMINWRK = 2
+      END IF
+      LQUERY = (LIWORK .EQ. -1 .OR. LWORK .EQ. -1 .OR. LRWORK .EQ. -1)
+      INFO  = 0
+      IF ( .NOT. ( ACCLA .OR. ACCLM .OR. ACCLH ) ) THEN
+         INFO = -1
+      ELSE IF ( .NOT.( ROWPRM .OR. LSAME( JOBP, 'N' ) ) ) THEN
+          INFO = -2
+      ELSE IF ( .NOT.( RTRANS .OR. LSAME( JOBR, 'N' ) ) ) THEN
+          INFO = -3
+      ELSE IF ( .NOT.( LSVEC .OR. DNTWU ) ) THEN
+         INFO = -4
+      ELSE IF ( WNTUR .AND. WNTVA ) THEN
+         INFO = -5
+      ELSE IF ( .NOT.( RSVEC .OR. DNTWV )) THEN
+         INFO = -5
+      ELSE IF ( M.LT.0 ) THEN
+         INFO = -6
+      ELSE IF ( ( N.LT.0 ) .OR. ( N.GT.M ) ) THEN
+         INFO = -7
+      ELSE IF ( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -9
+      ELSE IF ( LDU.LT.1 .OR. ( LSVC0 .AND. LDU.LT.M ) .OR.
+     $       ( WNTUF .AND. LDU.LT.N ) ) THEN
+         INFO = -12
+      ELSE IF ( LDV.LT.1 .OR. ( RSVEC .AND. LDV.LT.N ) .OR.
+     $          ( CONDA .AND. LDV.LT.N ) ) THEN
+         INFO = -14
+      ELSE IF ( LIWORK .LT. IMINWRK .AND. .NOT. LQUERY ) THEN
+         INFO = -17
+      END IF
+*
+*
+      IF ( INFO .EQ. 0 ) THEN
+*        .. compute the minimal and the optimal workspace lengths
+*        [[The expressions for computing the minimal and the optimal
+*        values of LWORK are written with a lot of redundancy and
+*        can be simplified. However, this detailed form is easier for
+*        maintenance and modifications of the code.]]
+*
+*        .. minimal workspace length for SGEQP3 of an M x N matrix
+         LWQP3 = 3 * N + 1
+*        .. minimal workspace length for SORMQR to build left singular vectors
+         IF ( WNTUS .OR. WNTUR ) THEN
+             LWORQ  = MAX( N  , 1 )
+         ELSE IF ( WNTUA ) THEN
+             LWORQ = MAX( M , 1 )
+         END IF
+*        .. minimal workspace length for SPOCON of an N x N matrix
+         LWCON = 3 * N
+*        .. SGESVD of an N x N matrix
+         LWSVD = MAX( 5 * N, 1 )
+         IF ( LQUERY ) THEN
+             CALL SGEQP3( M, N, A, LDA, IWORK, RDUMMY, RDUMMY, -1,
+     $           IERR )
+             LWRK_SGEQP3 = INT( RDUMMY(1) )
+             IF ( WNTUS .OR. WNTUR ) THEN
+                 CALL SORMQR( 'L', 'N', M, N, N, A, LDA, RDUMMY, U,
+     $                LDU, RDUMMY, -1, IERR )
+                 LWRK_SORMQR = INT( RDUMMY(1) )
+             ELSE IF ( WNTUA ) THEN
+                 CALL SORMQR( 'L', 'N', M, M, N, A, LDA, RDUMMY, U,
+     $                LDU, RDUMMY, -1, IERR )
+                 LWRK_SORMQR = INT( RDUMMY(1) )
+             ELSE
+                 LWRK_SORMQR = 0
+             END IF
+         END IF
+         MINWRK = 2
+         OPTWRK = 2
+         IF ( .NOT. (LSVEC .OR. RSVEC )) THEN
+*            .. minimal and optimal sizes of the workspace if
+*            only the singular values are requested
+             IF ( CONDA ) THEN
+                MINWRK = MAX( N+LWQP3, LWCON, LWSVD )
+             ELSE
+                MINWRK = MAX( N+LWQP3, LWSVD )
+             END IF
+             IF ( LQUERY ) THEN
+                 CALL SGESVD( 'N', 'N', N, N, A, LDA, S, U, LDU,
+     $                V, LDV, RDUMMY, -1, IERR )
+                 LWRK_SGESVD = INT( RDUMMY(1) )
+                 IF ( CONDA ) THEN
+                    OPTWRK = MAX( N+LWRK_SGEQP3, N+LWCON, LWRK_SGESVD )
+                 ELSE
+                    OPTWRK = MAX( N+LWRK_SGEQP3, LWRK_SGESVD )
+                 END IF
+             END IF
+         ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN
+*            .. minimal and optimal sizes of the workspace if the
+*            singular values and the left singular vectors are requested
+             IF ( CONDA ) THEN
+                 MINWRK = N + MAX( LWQP3, LWCON, LWSVD, LWORQ )
+             ELSE
+                 MINWRK = N + MAX( LWQP3, LWSVD, LWORQ )
+             END IF
+             IF ( LQUERY ) THEN
+                IF ( RTRANS ) THEN
+                   CALL SGESVD( 'N', 'O', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, RDUMMY, -1, IERR )
+                ELSE
+                   CALL SGESVD( 'O', 'N', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, RDUMMY, -1, IERR )
+                END IF
+                LWRK_SGESVD = INT( RDUMMY(1) )
+                IF ( CONDA ) THEN
+                    OPTWRK = N + MAX( LWRK_SGEQP3, LWCON, LWRK_SGESVD,
+     $                               LWRK_SORMQR )
+                ELSE
+                    OPTWRK = N + MAX( LWRK_SGEQP3, LWRK_SGESVD,
+     $                               LWRK_SORMQR )
+                END IF
+             END IF
+         ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN
+*            .. minimal and optimal sizes of the workspace if the
+*            singular values and the right singular vectors are requested
+             IF ( CONDA ) THEN
+                 MINWRK = N + MAX( LWQP3, LWCON, LWSVD )
+             ELSE
+                 MINWRK = N + MAX( LWQP3, LWSVD )
+             END IF
+             IF ( LQUERY ) THEN
+                 IF ( RTRANS ) THEN
+                     CALL SGESVD( 'O', 'N', N, N, A, LDA, S, U, LDU,
+     $                    V, LDV, RDUMMY, -1, IERR )
+                 ELSE
+                     CALL SGESVD( 'N', 'O', N, N, A, LDA, S, U, LDU,
+     $                    V, LDV, RDUMMY, -1, IERR )
+                 END IF
+                 LWRK_SGESVD = INT( RDUMMY(1) )
+                 IF ( CONDA ) THEN
+                     OPTWRK = N + MAX( LWRK_SGEQP3, LWCON, LWRK_SGESVD )
+                 ELSE
+                     OPTWRK = N + MAX( LWRK_SGEQP3, LWRK_SGESVD )
+                 END IF
+             END IF
+         ELSE
+*            .. minimal and optimal sizes of the workspace if the
+*            full SVD is requested
+             IF ( RTRANS ) THEN
+                 MINWRK = MAX( LWQP3, LWSVD, LWORQ )
+                 IF ( CONDA ) MINWRK = MAX( MINWRK, LWCON )
+                 MINWRK = MINWRK + N
+                 IF ( WNTVA ) THEN
+*                   .. minimal workspace length for N x N/2 SGEQRF
+                    LWQRF  = MAX( N/2, 1 )
+*                   .. minimal workspace lengt for N/2 x N/2 SGESVD
+                    LWSVD2 = MAX( 5 * (N/2), 1 )
+                    LWORQ2 = MAX( N, 1 )
+                    MINWRK2 = MAX( LWQP3, N/2+LWQRF, N/2+LWSVD2,
+     $                        N/2+LWORQ2, LWORQ )
+                    IF ( CONDA ) MINWRK2 = MAX( MINWRK2, LWCON )
+                    MINWRK2 = N + MINWRK2
+                    MINWRK = MAX( MINWRK, MINWRK2 )
+                 END IF
+             ELSE
+                 MINWRK = MAX( LWQP3, LWSVD, LWORQ )
+                 IF ( CONDA ) MINWRK = MAX( MINWRK, LWCON )
+                 MINWRK = MINWRK + N
+                 IF ( WNTVA ) THEN
+*                   .. minimal workspace length for N/2 x N SGELQF
+                    LWLQF  = MAX( N/2, 1 )
+                    LWSVD2 = MAX( 5 * (N/2), 1 )
+                    LWUNLQ = MAX( N , 1 )
+                    MINWRK2 = MAX( LWQP3, N/2+LWLQF, N/2+LWSVD2,
+     $                        N/2+LWUNLQ, LWORQ )
+                    IF ( CONDA ) MINWRK2 = MAX( MINWRK2, LWCON )
+                    MINWRK2 = N + MINWRK2
+                    MINWRK = MAX( MINWRK, MINWRK2 )
+                 END IF
+             END IF
+             IF ( LQUERY ) THEN
+                IF ( RTRANS ) THEN
+                   CALL SGESVD( 'O', 'A', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, RDUMMY, -1, IERR )
+                   LWRK_SGESVD = INT( RDUMMY(1) )
+                   OPTWRK = MAX(LWRK_SGEQP3,LWRK_SGESVD,LWRK_SORMQR)
+                   IF ( CONDA ) OPTWRK = MAX( OPTWRK, LWCON )
+                   OPTWRK = N + OPTWRK
+                   IF ( WNTVA ) THEN
+                       CALL SGEQRF(N,N/2,U,LDU,RDUMMY,RDUMMY,-1,IERR)
+                       LWRK_SGEQRF = INT( RDUMMY(1) )
+                       CALL SGESVD( 'S', 'O', N/2,N/2, V,LDV, S, U,LDU,
+     $                      V, LDV, RDUMMY, -1, IERR )
+                       LWRK_SGESVD2 = INT( RDUMMY(1) )
+                       CALL SORMQR( 'R', 'C', N, N, N/2, U, LDU, RDUMMY,
+     $                      V, LDV, RDUMMY, -1, IERR )
+                       LWRK_SORMQR2 = INT( RDUMMY(1) )
+                       OPTWRK2 = MAX( LWRK_SGEQP3, N/2+LWRK_SGEQRF,
+     $                           N/2+LWRK_SGESVD2, N/2+LWRK_SORMQR2 )
+                       IF ( CONDA ) OPTWRK2 = MAX( OPTWRK2, LWCON )
+                       OPTWRK2 = N + OPTWRK2
+                       OPTWRK = MAX( OPTWRK, OPTWRK2 )
+                   END IF
+                ELSE
+                   CALL SGESVD( 'S', 'O', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, RDUMMY, -1, IERR )
+                   LWRK_SGESVD = INT( RDUMMY(1) )
+                   OPTWRK = MAX(LWRK_SGEQP3,LWRK_SGESVD,LWRK_SORMQR)
+                   IF ( CONDA ) OPTWRK = MAX( OPTWRK, LWCON )
+                   OPTWRK = N + OPTWRK
+                   IF ( WNTVA ) THEN
+                      CALL SGELQF(N/2,N,U,LDU,RDUMMY,RDUMMY,-1,IERR)
+                      LWRK_SGELQF = INT( RDUMMY(1) )
+                      CALL SGESVD( 'S','O', N/2,N/2, V, LDV, S, U, LDU,
+     $                     V, LDV, RDUMMY, -1, IERR )
+                      LWRK_SGESVD2 = INT( RDUMMY(1) )
+                      CALL SORMLQ( 'R', 'N', N, N, N/2, U, LDU, RDUMMY,
+     $                     V, LDV, RDUMMY,-1,IERR )
+                      LWRK_SORMLQ = INT( RDUMMY(1) )
+                      OPTWRK2 = MAX( LWRK_SGEQP3, N/2+LWRK_SGELQF,
+     $                           N/2+LWRK_SGESVD2, N/2+LWRK_SORMLQ )
+                       IF ( CONDA ) OPTWRK2 = MAX( OPTWRK2, LWCON )
+                       OPTWRK2 = N + OPTWRK2
+                       OPTWRK = MAX( OPTWRK, OPTWRK2 )
+                   END IF
+                END IF
+             END IF
+         END IF
+*
+         MINWRK = MAX( 2, MINWRK )
+         OPTWRK = MAX( 2, OPTWRK )
+         IF ( LWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = -19
+*
+      END IF
+*
+      IF (INFO .EQ. 0 .AND. LRWORK .LT. RMINWRK .AND. .NOT. LQUERY) THEN
+         INFO = -21
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SGESVDQ', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+*
+*     Return optimal workspace
+*
+          IWORK(1) = IMINWRK
+          WORK(1) = OPTWRK
+          WORK(2) = MINWRK
+          RWORK(1) = RMINWRK
+          RETURN
+      END IF
+*
+*     Quick return if the matrix is void.
+*
+      IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) ) THEN
+*     .. all output is void.
+         RETURN
+      END IF
+*
+      BIG = SLAMCH('O')
+      ASCALED = .FALSE.
+      IWOFF = 1
+      IF ( ROWPRM ) THEN
+            IWOFF = M
+*           .. reordering the rows in decreasing sequence in the
+*           ell-infinity norm - this enhances numerical robustness in
+*           the case of differently scaled rows.
+            DO 1904 p = 1, M
+*               RWORK(p) = ABS( A(p,ICAMAX(N,A(p,1),LDA)) )
+*               [[SLANGE will return NaN if an entry of the p-th row is Nan]]
+                RWORK(p) = SLANGE( 'M', 1, N, A(p,1), LDA, RDUMMY )
+*               .. check for NaN's and Inf's
+                IF ( ( RWORK(p) .NE. RWORK(p) ) .OR.
+     $               ( (RWORK(p)*ZERO) .NE. ZERO ) ) THEN
+                    INFO = -8
+                    CALL XERBLA( 'SGESVDQ', -INFO )
+                    RETURN
+                END IF
+ 1904       CONTINUE
+            DO 1952 p = 1, M - 1
+            q = ISAMAX( M-p+1, RWORK(p), 1 ) + p - 1
+            IWORK(N+p) = q
+            IF ( p .NE. q ) THEN
+               RTMP     = RWORK(p)
+               RWORK(p) = RWORK(q)
+               RWORK(q) = RTMP
+            END IF
+ 1952       CONTINUE
+*
+            IF ( RWORK(1) .EQ. ZERO ) THEN
+*              Quick return: A is the M x N zero matrix.
+               NUMRANK = 0
+               CALL SLASET( 'G', N, 1, ZERO, ZERO, S, N )
+               IF ( WNTUS ) CALL SLASET('G', M, N, ZERO, ONE, U, LDU)
+               IF ( WNTUA ) CALL SLASET('G', M, M, ZERO, ONE, U, LDU)
+               IF ( WNTVA ) CALL SLASET('G', N, N, ZERO, ONE, V, LDV)
+               IF ( WNTUF ) THEN
+                   CALL SLASET( 'G', N, 1, ZERO, ZERO, WORK, N )
+                   CALL SLASET( 'G', M, N, ZERO,  ONE, U, LDU )
+               END IF
+               DO 5001 p = 1, N
+                   IWORK(p) = p
+ 5001          CONTINUE
+               IF ( ROWPRM ) THEN
+                   DO 5002 p = N + 1, N + M - 1
+                       IWORK(p) = p - N
+ 5002              CONTINUE
+               END IF
+               IF ( CONDA ) RWORK(1) = -1
+               RWORK(2) = -1
+               RETURN
+            END IF
+*
+            IF ( RWORK(1) .GT. BIG / SQRT(REAL(M)) ) THEN
+*               .. to prevent overflow in the QR factorization, scale the
+*               matrix by 1/sqrt(M) if too large entry detected
+                CALL SLASCL('G',0,0,SQRT(REAL(M)),ONE, M,N, A,LDA, IERR)
+                ASCALED = .TRUE.
+            END IF
+            CALL SLASWP( N, A, LDA, 1, M-1, IWORK(N+1), 1 )
+      END IF
+*
+*    .. At this stage, preemptive scaling is done only to avoid column
+*    norms overflows during the QR factorization. The SVD procedure should
+*    have its own scaling to save the singular values from overflows and
+*    underflows. That depends on the SVD procedure.
+*
+      IF ( .NOT.ROWPRM ) THEN
+          RTMP = SLANGE( 'M', M, N, A, LDA, RDUMMY )
+          IF ( ( RTMP .NE. RTMP ) .OR.
+     $         ( (RTMP*ZERO) .NE. ZERO ) ) THEN
+               INFO = -8
+               CALL XERBLA( 'SGESVDQ', -INFO )
+               RETURN
+          END IF
+          IF ( RTMP .GT. BIG / SQRT(REAL(M)) ) THEN
+*             .. to prevent overflow in the QR factorization, scale the
+*             matrix by 1/sqrt(M) if too large entry detected
+              CALL SLASCL('G',0,0, SQRT(REAL(M)),ONE, M,N, A,LDA, IERR)
+              ASCALED = .TRUE.
+          END IF
+      END IF
+*
+*     .. QR factorization with column pivoting
+*
+*     A * P = Q * [ R ]
+*                 [ 0 ]
+*
+      DO 1963 p = 1, N
+*        .. all columns are free columns
+         IWORK(p) = 0
+ 1963 CONTINUE
+      CALL SGEQP3( M, N, A, LDA, IWORK, WORK, WORK(N+1), LWORK-N,
+     $      IERR )
+*
+*    If the user requested accuracy level allows truncation in the
+*    computed upper triangular factor, the matrix R is examined and,
+*    if possible, replaced with its leading upper trapezoidal part.
+*
+      EPSLN = SLAMCH('E')
+      SFMIN = SLAMCH('S')
+*     SMALL = SFMIN / EPSLN
+      NR = N
+*
+      IF ( ACCLA ) THEN
+*
+*        Standard absolute error bound suffices. All sigma_i with
+*        sigma_i < N*EPS*||A||_F are flushed to zero. This is an
+*        aggressive enforcement of lower numerical rank by introducing a
+*        backward error of the order of N*EPS*||A||_F.
+         NR = 1
+         RTMP = SQRT(REAL(N))*EPSLN
+         DO 3001 p = 2, N
+            IF ( ABS(A(p,p)) .LT. (RTMP*ABS(A(1,1))) ) GO TO 3002
+               NR = NR + 1
+ 3001    CONTINUE
+ 3002    CONTINUE
+*
+      ELSEIF ( ACCLM ) THEN
+*        .. similarly as above, only slightly more gentle (less aggressive).
+*        Sudden drop on the diagonal of R is used as the criterion for being
+*        close-to-rank-deficient. The threshold is set to EPSLN=SLAMCH('E').
+*        [[This can be made more flexible by replacing this hard-coded value
+*        with a user specified threshold.]] Also, the values that underflow
+*        will be truncated.
+         NR = 1
+         DO 3401 p = 2, N
+            IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR.
+     $           ( ABS(A(p,p)) .LT. SFMIN ) ) GO TO 3402
+            NR = NR + 1
+ 3401    CONTINUE
+ 3402    CONTINUE
+*
+      ELSE
+*        .. RRQR not authorized to determine numerical rank except in the
+*        obvious case of zero pivots.
+*        .. inspect R for exact zeros on the diagonal;
+*        R(i,i)=0 => R(i:N,i:N)=0.
+         NR = 1
+         DO 3501 p = 2, N
+            IF ( ABS(A(p,p)) .EQ. ZERO ) GO TO 3502
+            NR = NR + 1
+ 3501    CONTINUE
+ 3502    CONTINUE
+*
+         IF ( CONDA ) THEN
+*           Estimate the scaled condition number of A. Use the fact that it is
+*           the same as the scaled condition number of R.
+*              .. V is used as workspace
+               CALL SLACPY( 'U', N, N, A, LDA, V, LDV )
+*              Only the leading NR x NR submatrix of the triangular factor
+*              is considered. Only if NR=N will this give a reliable error
+*              bound. However, even for NR < N, this can be used on an
+*              expert level and obtain useful information in the sense of
+*              perturbation theory.
+               DO 3053 p = 1, NR
+                  RTMP = SNRM2( p, V(1,p), 1 )
+                  CALL SSCAL( p, ONE/RTMP, V(1,p), 1 )
+ 3053          CONTINUE
+               IF ( .NOT. ( LSVEC .OR. RSVEC ) ) THEN
+                   CALL SPOCON( 'U', NR, V, LDV, ONE, RTMP,
+     $                  WORK, IWORK(N+IWOFF), IERR )
+               ELSE
+                   CALL SPOCON( 'U', NR, V, LDV, ONE, RTMP,
+     $                  WORK(N+1), IWORK(N+IWOFF), IERR )
+               END IF
+               SCONDA = ONE / SQRT(RTMP)
+*           For NR=N, SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1),
+*           N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+*           See the reference [1] for more details.
+         END IF
+*
+      ENDIF
+*
+      IF ( WNTUR ) THEN
+          N1 = NR
+      ELSE IF ( WNTUS .OR. WNTUF) THEN
+          N1 = N
+      ELSE IF ( WNTUA ) THEN
+          N1 = M
+      END IF
+*
+      IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN
+*.......................................................................
+*        .. only the singular values are requested
+*.......................................................................
+         IF ( RTRANS ) THEN
+*
+*         .. compute the singular values of R**T = [A](1:NR,1:N)**T
+*           .. set the lower triangle of [A] to [A](1:NR,1:N)**T and
+*           the upper triangle of [A] to zero.
+            DO 1146 p = 1, MIN( N, NR )
+               DO 1147 q = p + 1, N
+                  A(q,p) = A(p,q)
+                  IF ( q .LE. NR ) A(p,q) = ZERO
+ 1147          CONTINUE
+ 1146       CONTINUE
+*
+            CALL SGESVD( 'N', 'N', N, NR, A, LDA, S, U, LDU,
+     $           V, LDV, WORK, LWORK, INFO )
+*
+         ELSE
+*
+*           .. compute the singular values of R = [A](1:NR,1:N)
+*
+            IF ( NR .GT. 1 )
+     $          CALL SLASET( 'L', NR-1,NR-1, ZERO,ZERO, A(2,1), LDA )
+            CALL SGESVD( 'N', 'N', NR, N, A, LDA, S, U, LDU,
+     $           V, LDV, WORK, LWORK, INFO )
+*
+         END IF
+*
+      ELSE IF ( LSVEC .AND. ( .NOT. RSVEC) ) THEN
+*.......................................................................
+*       .. the singular values and the left singular vectors requested
+*.......................................................................""""""""
+         IF ( RTRANS ) THEN
+*            .. apply SGESVD to R**T
+*            .. copy R**T into [U] and overwrite [U] with the right singular
+*            vectors of R
+            DO 1192 p = 1, NR
+               DO 1193 q = p, N
+                  U(q,p) = A(p,q)
+ 1193          CONTINUE
+ 1192       CONTINUE
+            IF ( NR .GT. 1 )
+     $          CALL SLASET( 'U', NR-1,NR-1, ZERO,ZERO, U(1,2), LDU )
+*           .. the left singular vectors not computed, the NR right singular
+*           vectors overwrite [U](1:NR,1:NR) as transposed. These
+*           will be pre-multiplied by Q to build the left singular vectors of A.
+               CALL SGESVD( 'N', 'O', N, NR, U, LDU, S, U, LDU,
+     $              U, LDU, WORK(N+1), LWORK-N, INFO )
+*
+               DO 1119 p = 1, NR
+                   DO 1120 q = p + 1, NR
+                      RTMP   = U(q,p)
+                      U(q,p) = U(p,q)
+                      U(p,q) = RTMP
+ 1120              CONTINUE
+ 1119          CONTINUE
+*
+         ELSE
+*            .. apply SGESVD to R
+*            .. copy R into [U] and overwrite [U] with the left singular vectors
+             CALL SLACPY( 'U', NR, N, A, LDA, U, LDU )
+             IF ( NR .GT. 1 )
+     $         CALL SLASET( 'L', NR-1, NR-1, ZERO, ZERO, U(2,1), LDU )
+*            .. the right singular vectors not computed, the NR left singular
+*            vectors overwrite [U](1:NR,1:NR)
+                CALL SGESVD( 'O', 'N', NR, N, U, LDU, S, U, LDU,
+     $               V, LDV, WORK(N+1), LWORK-N, INFO )
+*               .. now [U](1:NR,1:NR) contains the NR left singular vectors of
+*               R. These will be pre-multiplied by Q to build the left singular
+*               vectors of A.
+         END IF
+*
+*           .. assemble the left singular vector matrix U of dimensions
+*              (M x NR) or (M x N) or (M x M).
+         IF ( ( NR .LT. M ) .AND. ( .NOT.WNTUF ) ) THEN
+             CALL SLASET('A', M-NR, NR, ZERO, ZERO, U(NR+1,1), LDU)
+             IF ( NR .LT. N1 ) THEN
+                CALL SLASET( 'A',NR,N1-NR,ZERO,ZERO,U(1,NR+1), LDU )
+                CALL SLASET( 'A',M-NR,N1-NR,ZERO,ONE,
+     $               U(NR+1,NR+1), LDU )
+             END IF
+         END IF
+*
+*           The Q matrix from the first QRF is built into the left singular
+*           vectors matrix U.
+*
+         IF ( .NOT.WNTUF )
+     $       CALL SORMQR( 'L', 'N', M, N1, N, A, LDA, WORK, U,
+     $            LDU, WORK(N+1), LWORK-N, IERR )
+         IF ( ROWPRM .AND. .NOT.WNTUF )
+     $          CALL SLASWP( N1, U, LDU, 1, M-1, IWORK(N+1), -1 )
+*
+      ELSE IF ( RSVEC .AND. ( .NOT. LSVEC ) ) THEN
+*.......................................................................
+*       .. the singular values and the right singular vectors requested
+*.......................................................................
+          IF ( RTRANS ) THEN
+*            .. apply SGESVD to R**T
+*            .. copy R**T into V and overwrite V with the left singular vectors
+            DO 1165 p = 1, NR
+               DO 1166 q = p, N
+                  V(q,p) = (A(p,q))
+ 1166          CONTINUE
+ 1165       CONTINUE
+            IF ( NR .GT. 1 )
+     $          CALL SLASET( 'U', NR-1,NR-1, ZERO,ZERO, V(1,2), LDV )
+*           .. the left singular vectors of R**T overwrite V, the right singular
+*           vectors not computed
+            IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+               CALL SGESVD( 'O', 'N', N, NR, V, LDV, S, U, LDU,
+     $              U, LDU, WORK(N+1), LWORK-N, INFO )
+*
+               DO 1121 p = 1, NR
+                   DO 1122 q = p + 1, NR
+                      RTMP   = V(q,p)
+                      V(q,p) = V(p,q)
+                      V(p,q) = RTMP
+ 1122              CONTINUE
+ 1121          CONTINUE
+*
+               IF ( NR .LT. N ) THEN
+                   DO 1103 p = 1, NR
+                      DO 1104 q = NR + 1, N
+                          V(p,q) = V(q,p)
+ 1104                 CONTINUE
+ 1103              CONTINUE
+               END IF
+               CALL SLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+            ELSE
+*               .. need all N right singular vectors and NR < N
+*               [!] This is simple implementation that augments [V](1:N,1:NR)
+*               by padding a zero block. In the case NR << N, a more efficient
+*               way is to first use the QR factorization. For more details
+*               how to implement this, see the " FULL SVD " branch.
+                CALL SLASET('G', N, N-NR, ZERO, ZERO, V(1,NR+1), LDV)
+                CALL SGESVD( 'O', 'N', N, N, V, LDV, S, U, LDU,
+     $               U, LDU, WORK(N+1), LWORK-N, INFO )
+*
+                DO 1123 p = 1, N
+                   DO 1124 q = p + 1, N
+                      RTMP   = V(q,p)
+                      V(q,p) = V(p,q)
+                      V(p,q) = RTMP
+ 1124              CONTINUE
+ 1123           CONTINUE
+                CALL SLAPMT( .FALSE., N, N, V, LDV, IWORK )
+            END IF
+*
+          ELSE
+*            .. aply SGESVD to R
+*            .. copy R into V and overwrite V with the right singular vectors
+             CALL SLACPY( 'U', NR, N, A, LDA, V, LDV )
+             IF ( NR .GT. 1 )
+     $         CALL SLASET( 'L', NR-1, NR-1, ZERO, ZERO, V(2,1), LDV )
+*            .. the right singular vectors overwrite V, the NR left singular
+*            vectors stored in U(1:NR,1:NR)
+             IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+                CALL SGESVD( 'N', 'O', NR, N, V, LDV, S, U, LDU,
+     $               V, LDV, WORK(N+1), LWORK-N, INFO )
+                CALL SLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+*               .. now [V](1:NR,1:N) contains V(1:N,1:NR)**T
+             ELSE
+*               .. need all N right singular vectors and NR < N
+*               [!] This is simple implementation that augments [V](1:NR,1:N)
+*               by padding a zero block. In the case NR << N, a more efficient
+*               way is to first use the LQ factorization. For more details
+*               how to implement this, see the " FULL SVD " branch.
+                 CALL SLASET('G', N-NR, N, ZERO,ZERO, V(NR+1,1), LDV)
+                 CALL SGESVD( 'N', 'O', N, N, V, LDV, S, U, LDU,
+     $                V, LDV, WORK(N+1), LWORK-N, INFO )
+                 CALL SLAPMT( .FALSE., N, N, V, LDV, IWORK )
+             END IF
+*            .. now [V] contains the transposed matrix of the right singular
+*            vectors of A.
+          END IF
+*
+      ELSE
+*.......................................................................
+*       .. FULL SVD requested
+*.......................................................................
+         IF ( RTRANS ) THEN
+*
+*            .. apply SGESVD to R**T [[this option is left for R&D&T]]
+*
+            IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+*            .. copy R**T into [V] and overwrite [V] with the left singular
+*            vectors of R**T
+            DO 1168 p = 1, NR
+               DO 1169 q = p, N
+                  V(q,p) = A(p,q)
+ 1169          CONTINUE
+ 1168       CONTINUE
+            IF ( NR .GT. 1 )
+     $          CALL SLASET( 'U', NR-1,NR-1, ZERO,ZERO, V(1,2), LDV )
+*
+*           .. the left singular vectors of R**T overwrite [V], the NR right
+*           singular vectors of R**T stored in [U](1:NR,1:NR) as transposed
+               CALL SGESVD( 'O', 'A', N, NR, V, LDV, S, V, LDV,
+     $              U, LDU, WORK(N+1), LWORK-N, INFO )
+*              .. assemble V
+               DO 1115 p = 1, NR
+                  DO 1116 q = p + 1, NR
+                     RTMP   = V(q,p)
+                     V(q,p) = V(p,q)
+                     V(p,q) = RTMP
+ 1116             CONTINUE
+ 1115          CONTINUE
+               IF ( NR .LT. N ) THEN
+                   DO 1101 p = 1, NR
+                      DO 1102 q = NR+1, N
+                         V(p,q) = V(q,p)
+ 1102                 CONTINUE
+ 1101              CONTINUE
+               END IF
+               CALL SLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+*
+                DO 1117 p = 1, NR
+                   DO 1118 q = p + 1, NR
+                      RTMP   = U(q,p)
+                      U(q,p) = U(p,q)
+                      U(p,q) = RTMP
+ 1118              CONTINUE
+ 1117           CONTINUE
+*
+                IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                  CALL SLASET('A', M-NR,NR, ZERO,ZERO, U(NR+1,1), LDU)
+                  IF ( NR .LT. N1 ) THEN
+                     CALL SLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU)
+                     CALL SLASET( 'A',M-NR,N1-NR,ZERO,ONE,
+     $                    U(NR+1,NR+1), LDU )
+                  END IF
+               END IF
+*
+            ELSE
+*               .. need all N right singular vectors and NR < N
+*            .. copy R**T into [V] and overwrite [V] with the left singular
+*            vectors of R**T
+*               [[The optimal ratio N/NR for using QRF instead of padding
+*                 with zeros. Here hard coded to 2; it must be at least
+*                 two due to work space constraints.]]
+*               OPTRATIO = ILAENV(6, 'SGESVD', 'S' // 'O', NR,N,0,0)
+*               OPTRATIO = MAX( OPTRATIO, 2 )
+                OPTRATIO = 2
+                IF ( OPTRATIO*NR .GT. N ) THEN
+                   DO 1198 p = 1, NR
+                      DO 1199 q = p, N
+                         V(q,p) = A(p,q)
+ 1199                 CONTINUE
+ 1198              CONTINUE
+                   IF ( NR .GT. 1 )
+     $             CALL SLASET('U',NR-1,NR-1, ZERO,ZERO, V(1,2),LDV)
+*
+                   CALL SLASET('A',N,N-NR,ZERO,ZERO,V(1,NR+1),LDV)
+                   CALL SGESVD( 'O', 'A', N, N, V, LDV, S, V, LDV,
+     $                  U, LDU, WORK(N+1), LWORK-N, INFO )
+*
+                   DO 1113 p = 1, N
+                      DO 1114 q = p + 1, N
+                         RTMP   = V(q,p)
+                         V(q,p) = V(p,q)
+                         V(p,q) = RTMP
+ 1114                 CONTINUE
+ 1113              CONTINUE
+                   CALL SLAPMT( .FALSE., N, N, V, LDV, IWORK )
+*              .. assemble the left singular vector matrix U of dimensions
+*              (M x N1), i.e. (M x N) or (M x M).
+*
+                   DO 1111 p = 1, N
+                      DO 1112 q = p + 1, N
+                         RTMP   = U(q,p)
+                         U(q,p) = U(p,q)
+                         U(p,q) = RTMP
+ 1112                 CONTINUE
+ 1111              CONTINUE
+*
+                   IF ( ( N .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                      CALL SLASET('A',M-N,N,ZERO,ZERO,U(N+1,1),LDU)
+                      IF ( N .LT. N1 ) THEN
+                        CALL SLASET('A',N,N1-N,ZERO,ZERO,U(1,N+1),LDU)
+                        CALL SLASET('A',M-N,N1-N,ZERO,ONE,
+     $                       U(N+1,N+1), LDU )
+                      END IF
+                   END IF
+                ELSE
+*                  .. copy R**T into [U] and overwrite [U] with the right
+*                  singular vectors of R
+                   DO 1196 p = 1, NR
+                      DO 1197 q = p, N
+                         U(q,NR+p) = A(p,q)
+ 1197                 CONTINUE
+ 1196              CONTINUE
+                   IF ( NR .GT. 1 )
+     $             CALL SLASET('U',NR-1,NR-1,ZERO,ZERO,U(1,NR+2),LDU)
+                   CALL SGEQRF( N, NR, U(1,NR+1), LDU, WORK(N+1),
+     $                  WORK(N+NR+1), LWORK-N-NR, IERR )
+                   DO 1143 p = 1, NR
+                       DO 1144 q = 1, N
+                           V(q,p) = U(p,NR+q)
+ 1144                  CONTINUE
+ 1143              CONTINUE
+                  CALL SLASET('U',NR-1,NR-1,ZERO,ZERO,V(1,2),LDV)
+                  CALL SGESVD( 'S', 'O', NR, NR, V, LDV, S, U, LDU,
+     $                 V,LDV, WORK(N+NR+1),LWORK-N-NR, INFO )
+                  CALL SLASET('A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV)
+                  CALL SLASET('A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV)
+                  CALL SLASET('A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV)
+                  CALL SORMQR('R','C', N, N, NR, U(1,NR+1), LDU,
+     $                 WORK(N+1),V,LDV,WORK(N+NR+1),LWORK-N-NR,IERR)
+                  CALL SLAPMT( .FALSE., N, N, V, LDV, IWORK )
+*                 .. assemble the left singular vector matrix U of dimensions
+*                 (M x NR) or (M x N) or (M x M).
+                  IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                     CALL SLASET('A',M-NR,NR,ZERO,ZERO,U(NR+1,1),LDU)
+                     IF ( NR .LT. N1 ) THEN
+                     CALL SLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU)
+                     CALL SLASET( 'A',M-NR,N1-NR,ZERO,ONE,
+     $                    U(NR+1,NR+1),LDU)
+                     END IF
+                  END IF
+                END IF
+            END IF
+*
+         ELSE
+*
+*            .. apply SGESVD to R [[this is the recommended option]]
+*
+             IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+*                .. copy R into [V] and overwrite V with the right singular vectors
+                 CALL SLACPY( 'U', NR, N, A, LDA, V, LDV )
+                IF ( NR .GT. 1 )
+     $          CALL SLASET( 'L', NR-1,NR-1, ZERO,ZERO, V(2,1), LDV )
+*               .. the right singular vectors of R overwrite [V], the NR left
+*               singular vectors of R stored in [U](1:NR,1:NR)
+                CALL SGESVD( 'S', 'O', NR, N, V, LDV, S, U, LDU,
+     $               V, LDV, WORK(N+1), LWORK-N, INFO )
+                CALL SLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+*               .. now [V](1:NR,1:N) contains V(1:N,1:NR)**T
+*               .. assemble the left singular vector matrix U of dimensions
+*              (M x NR) or (M x N) or (M x M).
+               IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                  CALL SLASET('A', M-NR,NR, ZERO,ZERO, U(NR+1,1), LDU)
+                  IF ( NR .LT. N1 ) THEN
+                     CALL SLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU)
+                     CALL SLASET( 'A',M-NR,N1-NR,ZERO,ONE,
+     $                    U(NR+1,NR+1), LDU )
+                  END IF
+               END IF
+*
+             ELSE
+*              .. need all N right singular vectors and NR < N
+*              .. the requested number of the left singular vectors
+*               is then N1 (N or M)
+*               [[The optimal ratio N/NR for using LQ instead of padding
+*                 with zeros. Here hard coded to 2; it must be at least
+*                 two due to work space constraints.]]
+*               OPTRATIO = ILAENV(6, 'SGESVD', 'S' // 'O', NR,N,0,0)
+*               OPTRATIO = MAX( OPTRATIO, 2 )
+               OPTRATIO = 2
+               IF ( OPTRATIO * NR .GT. N ) THEN
+                  CALL SLACPY( 'U', NR, N, A, LDA, V, LDV )
+                  IF ( NR .GT. 1 )
+     $            CALL SLASET('L', NR-1,NR-1, ZERO,ZERO, V(2,1),LDV)
+*              .. the right singular vectors of R overwrite [V], the NR left
+*                 singular vectors of R stored in [U](1:NR,1:NR)
+                  CALL SLASET('A', N-NR,N, ZERO,ZERO, V(NR+1,1),LDV)
+                  CALL SGESVD( 'S', 'O', N, N, V, LDV, S, U, LDU,
+     $                 V, LDV, WORK(N+1), LWORK-N, INFO )
+                  CALL SLAPMT( .FALSE., N, N, V, LDV, IWORK )
+*                 .. now [V] contains the transposed matrix of the right
+*                 singular vectors of A. The leading N left singular vectors
+*                 are in [U](1:N,1:N)
+*                 .. assemble the left singular vector matrix U of dimensions
+*                 (M x N1), i.e. (M x N) or (M x M).
+                  IF ( ( N .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                      CALL SLASET('A',M-N,N,ZERO,ZERO,U(N+1,1),LDU)
+                      IF ( N .LT. N1 ) THEN
+                        CALL SLASET('A',N,N1-N,ZERO,ZERO,U(1,N+1),LDU)
+                        CALL SLASET( 'A',M-N,N1-N,ZERO,ONE,
+     $                       U(N+1,N+1), LDU )
+                      END IF
+                  END IF
+               ELSE
+                  CALL SLACPY( 'U', NR, N, A, LDA, U(NR+1,1), LDU )
+                  IF ( NR .GT. 1 )
+     $            CALL SLASET('L',NR-1,NR-1,ZERO,ZERO,U(NR+2,1),LDU)
+                  CALL SGELQF( NR, N, U(NR+1,1), LDU, WORK(N+1),
+     $                 WORK(N+NR+1), LWORK-N-NR, IERR )
+                  CALL SLACPY('L',NR,NR,U(NR+1,1),LDU,V,LDV)
+                  IF ( NR .GT. 1 )
+     $            CALL SLASET('U',NR-1,NR-1,ZERO,ZERO,V(1,2),LDV)
+                  CALL SGESVD( 'S', 'O', NR, NR, V, LDV, S, U, LDU,
+     $                 V, LDV, WORK(N+NR+1), LWORK-N-NR, INFO )
+                  CALL SLASET('A',N-NR,NR,ZERO,ZERO,V(NR+1,1),LDV)
+                  CALL SLASET('A',NR,N-NR,ZERO,ZERO,V(1,NR+1),LDV)
+                  CALL SLASET('A',N-NR,N-NR,ZERO,ONE,V(NR+1,NR+1),LDV)
+                  CALL SORMLQ('R','N',N,N,NR,U(NR+1,1),LDU,WORK(N+1),
+     $                 V, LDV, WORK(N+NR+1),LWORK-N-NR,IERR)
+                  CALL SLAPMT( .FALSE., N, N, V, LDV, IWORK )
+*               .. assemble the left singular vector matrix U of dimensions
+*              (M x NR) or (M x N) or (M x M).
+                  IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                     CALL SLASET('A',M-NR,NR,ZERO,ZERO,U(NR+1,1),LDU)
+                     IF ( NR .LT. N1 ) THEN
+                     CALL SLASET('A',NR,N1-NR,ZERO,ZERO,U(1,NR+1),LDU)
+                     CALL SLASET( 'A',M-NR,N1-NR,ZERO,ONE,
+     $                    U(NR+1,NR+1), LDU )
+                     END IF
+                  END IF
+               END IF
+             END IF
+*        .. end of the "R**T or R" branch
+         END IF
+*
+*           The Q matrix from the first QRF is built into the left singular
+*           vectors matrix U.
+*
+         IF ( .NOT. WNTUF )
+     $       CALL SORMQR( 'L', 'N', M, N1, N, A, LDA, WORK, U,
+     $            LDU, WORK(N+1), LWORK-N, IERR )
+         IF ( ROWPRM .AND. .NOT.WNTUF )
+     $          CALL SLASWP( N1, U, LDU, 1, M-1, IWORK(N+1), -1 )
+*
+*     ... end of the "full SVD" branch
+      END IF
+*
+*     Check whether some singular values are returned as zeros, e.g.
+*     due to underflow, and update the numerical rank.
+      p = NR
+      DO 4001 q = p, 1, -1
+          IF ( S(q) .GT. ZERO ) GO TO 4002
+          NR = NR - 1
+ 4001 CONTINUE
+ 4002 CONTINUE
+*
+*     .. if numerical rank deficiency is detected, the truncated
+*     singular values are set to zero.
+      IF ( NR .LT. N ) CALL SLASET( 'G', N-NR,1, ZERO,ZERO, S(NR+1), N )
+*     .. undo scaling; this may cause overflow in the largest singular
+*     values.
+      IF ( ASCALED )
+     $   CALL SLASCL( 'G',0,0, ONE,SQRT(REAL(M)), NR,1, S, N, IERR )
+      IF ( CONDA ) RWORK(1) = SCONDA
+      RWORK(2) = p - NR
+*     .. p-NR is the number of singular values that are computed as
+*     exact zeros in SGESVD() applied to the (possibly truncated)
+*     full row rank triangular (trapezoidal) factor of A.
+      NUMRANK = NR
+*
+      RETURN
+*
+*     End of SGESVDQ
+*
+      END
diff --git a/lapack-netlib/SRC/sgesvj.f b/lapack-netlib/SRC/sgesvj.f
index 7a7901135..fee5aba4a 100644
--- a/lapack-netlib/SRC/sgesvj.f
+++ b/lapack-netlib/SRC/sgesvj.f
@@ -90,13 +90,13 @@
 *>          JOBV is CHARACTER*1
 *>          Specifies whether to compute the right singular vectors, that
 *>          is, the matrix V:
-*>          = 'V' : the matrix V is computed and returned in the array V
-*>          = 'A' : the Jacobi rotations are applied to the MV-by-N
+*>          = 'V':  the matrix V is computed and returned in the array V
+*>          = 'A':  the Jacobi rotations are applied to the MV-by-N
 *>                  array V. In other words, the right singular vector
 *>                  matrix V is not computed explicitly; instead it is
 *>                  applied to an MV-by-N matrix initially stored in the
 *>                  first MV rows of V.
-*>          = 'N' : the matrix V is not computed and the array V is not
+*>          = 'N':  the matrix V is not computed and the array V is not
 *>                  referenced
 *> \endverbatim
 *>
@@ -118,8 +118,8 @@
 *>          A is REAL array, dimension (LDA,N)
 *>          On entry, the M-by-N matrix A.
 *>          On exit,
-*>          If JOBU .EQ. 'U' .OR. JOBU .EQ. 'C':
-*>                 If INFO .EQ. 0 :
+*>          If JOBU = 'U' .OR. JOBU = 'C':
+*>                 If INFO = 0:
 *>                 RANKA orthonormal columns of U are returned in the
 *>                 leading RANKA columns of the array A. Here RANKA <= N
 *>                 is the number of computed singular values of A that are
@@ -129,9 +129,9 @@
 *>                 in the array WORK as RANKA=NINT(WORK(2)). Also see the
 *>                 descriptions of SVA and WORK. The computed columns of U
 *>                 are mutually numerically orthogonal up to approximately
-*>                 TOL=SQRT(M)*EPS (default); or TOL=CTOL*EPS (JOBU.EQ.'C'),
+*>                 TOL=SQRT(M)*EPS (default); or TOL=CTOL*EPS (JOBU = 'C'),
 *>                 see the description of JOBU.
-*>                 If INFO .GT. 0,
+*>                 If INFO > 0,
 *>                 the procedure SGESVJ did not converge in the given number
 *>                 of iterations (sweeps). In that case, the computed
 *>                 columns of U may not be orthogonal up to TOL. The output
@@ -139,8 +139,8 @@
 *>                 values in SVA(1:N)) and V is still a decomposition of the
 *>                 input matrix A in the sense that the residual
 *>                 ||A-SCALE*U*SIGMA*V^T||_2 / ||A||_2 is small.
-*>          If JOBU .EQ. 'N':
-*>                 If INFO .EQ. 0 :
+*>          If JOBU = 'N':
+*>                 If INFO = 0:
 *>                 Note that the left singular vectors are 'for free' in the
 *>                 one-sided Jacobi SVD algorithm. However, if only the
 *>                 singular values are needed, the level of numerical
@@ -149,7 +149,7 @@
 *>                 numerically orthogonal up to approximately M*EPS. Thus,
 *>                 on exit, A contains the columns of U scaled with the
 *>                 corresponding singular values.
-*>                 If INFO .GT. 0 :
+*>                 If INFO > 0:
 *>                 the procedure SGESVJ did not converge in the given number
 *>                 of iterations (sweeps).
 *> \endverbatim
@@ -164,9 +164,9 @@
 *> \verbatim
 *>          SVA is REAL array, dimension (N)
 *>          On exit,
-*>          If INFO .EQ. 0 :
+*>          If INFO = 0 :
 *>          depending on the value SCALE = WORK(1), we have:
-*>                 If SCALE .EQ. ONE:
+*>                 If SCALE = ONE:
 *>                 SVA(1:N) contains the computed singular values of A.
 *>                 During the computation SVA contains the Euclidean column
 *>                 norms of the iterated matrices in the array A.
@@ -175,7 +175,7 @@
 *>                 factored representation is due to the fact that some of the
 *>                 singular values of A might underflow or overflow.
 *>
-*>          If INFO .GT. 0 :
+*>          If INFO > 0 :
 *>          the procedure SGESVJ did not converge in the given number of
 *>          iterations (sweeps) and SCALE*SVA(1:N) may not be accurate.
 *> \endverbatim
@@ -183,7 +183,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV .EQ. 'A', then the product of Jacobi rotations in SGESVJ
+*>          If JOBV = 'A', then the product of Jacobi rotations in SGESVJ
 *>          is applied to the first MV rows of V. See the description of JOBV.
 *> \endverbatim
 *>
@@ -201,16 +201,16 @@
 *> \param[in] LDV
 *> \verbatim
 *>          LDV is INTEGER
-*>          The leading dimension of the array V, LDV .GE. 1.
-*>          If JOBV .EQ. 'V', then LDV .GE. max(1,N).
-*>          If JOBV .EQ. 'A', then LDV .GE. max(1,MV) .
+*>          The leading dimension of the array V, LDV >= 1.
+*>          If JOBV = 'V', then LDV >= max(1,N).
+*>          If JOBV = 'A', then LDV >= max(1,MV) .
 *> \endverbatim
 *>
 *> \param[in,out] WORK
 *> \verbatim
 *>          WORK is REAL array, dimension (LWORK)
 *>          On entry,
-*>          If JOBU .EQ. 'C' :
+*>          If JOBU = 'C' :
 *>          WORK(1) = CTOL, where CTOL defines the threshold for convergence.
 *>                    The process stops if all columns of A are mutually
 *>                    orthogonal up to CTOL*EPS, EPS=SLAMCH('E').
@@ -230,7 +230,7 @@
 *>          WORK(5) = max_{i.NE.j} |COS(A(:,i),A(:,j))| in the last sweep.
 *>                    This is useful information in cases when SGESVJ did
 *>                    not converge, as it can be used to estimate whether
-*>                    the output is stil useful and for post festum analysis.
+*>                    the output is still useful and for post festum analysis.
 *>          WORK(6) = the largest absolute value over all sines of the
 *>                    Jacobi rotation angles in the last sweep. It can be
 *>                    useful for a post festum analysis.
@@ -245,9 +245,9 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0 : successful exit.
-*>          < 0 : if INFO = -i, then the i-th argument had an illegal value
-*>          > 0 : SGESVJ did not converge in the maximal allowed number (30)
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, then the i-th argument had an illegal value
+*>          > 0:  SGESVJ did not converge in the maximal allowed number (30)
 *>                of sweeps. The output may still be useful. See the
 *>                description of WORK.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sgesvxx.f b/lapack-netlib/SRC/sgesvxx.f
index 281f198d5..7cb29d5ab 100644
--- a/lapack-netlib/SRC/sgesvxx.f
+++ b/lapack-netlib/SRC/sgesvxx.f
@@ -411,7 +411,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -447,14 +447,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -462,9 +462,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/sgetc2.f b/lapack-netlib/SRC/sgetc2.f
index b0301b953..6bf0a93c6 100644
--- a/lapack-netlib/SRC/sgetc2.f
+++ b/lapack-netlib/SRC/sgetc2.f
@@ -85,7 +85,7 @@
 *> \verbatim
 *>          INFO is INTEGER
 *>           = 0: successful exit
-*>           > 0: if INFO = k, U(k, k) is likely to produce owerflow if
+*>           > 0: if INFO = k, U(k, k) is likely to produce overflow if
 *>                we try to solve for x in Ax = b. So U is perturbed to
 *>                avoid the overflow.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sgetsls.f b/lapack-netlib/SRC/sgetsls.f
index 35af66c19..53d2f9431 100644
--- a/lapack-netlib/SRC/sgetsls.f
+++ b/lapack-netlib/SRC/sgetsls.f
@@ -1,3 +1,5 @@
+*> \brief \b SGETSLS
+*
 *  Definition:
 *  ===========
 *
@@ -154,7 +156,7 @@
 *
 *> \date June 2017
 *
-*> \ingroup doubleGEsolve
+*> \ingroup realGEsolve
 *
 *  =====================================================================
       SUBROUTINE SGETSLS( TRANS, M, N, NRHS, A, LDA, B, LDB,
diff --git a/lapack-netlib/SRC/sggesx.f b/lapack-netlib/SRC/sggesx.f
index 3c6273dcf..25691d164 100644
--- a/lapack-netlib/SRC/sggesx.f
+++ b/lapack-netlib/SRC/sggesx.f
@@ -131,10 +131,10 @@
 *> \verbatim
 *>          SENSE is CHARACTER*1
 *>          Determines which reciprocal condition numbers are computed.
-*>          = 'N' : None are computed;
-*>          = 'E' : Computed for average of selected eigenvalues only;
-*>          = 'V' : Computed for selected deflating subspaces only;
-*>          = 'B' : Computed for both.
+*>          = 'N':  None are computed;
+*>          = 'E':  Computed for average of selected eigenvalues only;
+*>          = 'V':  Computed for selected deflating subspaces only;
+*>          = 'B':  Computed for both.
 *>          If SENSE = 'E', 'V', or 'B', SORT must equal 'S'.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/sgsvj0.f b/lapack-netlib/SRC/sgsvj0.f
index e580efc30..d9177d818 100644
--- a/lapack-netlib/SRC/sgsvj0.f
+++ b/lapack-netlib/SRC/sgsvj0.f
@@ -117,7 +117,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV .EQ. 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -125,9 +125,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is REAL array, dimension (LDV,N)
-*>          If JOBV .EQ. 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV .EQ. 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
@@ -136,8 +136,8 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of the array V,  LDV >= 1.
-*>          If JOBV = 'V', LDV .GE. N.
-*>          If JOBV = 'A', LDV .GE. MV.
+*>          If JOBV = 'V', LDV >= N.
+*>          If JOBV = 'A', LDV >= MV.
 *> \endverbatim
 *>
 *> \param[in] EPS
@@ -157,7 +157,7 @@
 *>          TOL is REAL
 *>          TOL is the threshold for Jacobi rotations. For a pair
 *>          A(:,p), A(:,q) of pivot columns, the Jacobi rotation is
-*>          applied only if ABS(COS(angle(A(:,p),A(:,q)))) .GT. TOL.
+*>          applied only if ABS(COS(angle(A(:,p),A(:,q)))) > TOL.
 *> \endverbatim
 *>
 *> \param[in] NSWEEP
@@ -175,14 +175,14 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          LWORK is the dimension of WORK. LWORK .GE. M.
+*>          LWORK is the dimension of WORK. LWORK >= M.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0 : successful exit.
-*>          < 0 : if INFO = -i, then the i-th argument had an illegal value
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, then the i-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
@@ -1045,7 +1045,7 @@
 
  1993 CONTINUE
 *     end i=1:NSWEEP loop
-* #:) Reaching this point means that the procedure has comleted the given
+* #:) Reaching this point means that the procedure has completed the given
 *     number of iterations.
       INFO = NSWEEP - 1
       GO TO 1995
diff --git a/lapack-netlib/SRC/sgsvj1.f b/lapack-netlib/SRC/sgsvj1.f
index 49b81cf4f..ea4ba2e0e 100644
--- a/lapack-netlib/SRC/sgsvj1.f
+++ b/lapack-netlib/SRC/sgsvj1.f
@@ -61,7 +61,7 @@
 *> In terms of the columns of A, the first N1 columns are rotated 'against'
 *> the remaining N-N1 columns, trying to increase the angle between the
 *> corresponding subspaces. The off-diagonal block is N1-by(N-N1) and it is
-*> tiled using quadratic tiles of side KBL. Here, KBL is a tunning parmeter.
+*> tiled using quadratic tiles of side KBL. Here, KBL is a tunning parameter.
 *> The number of sweeps is given in NSWEEP and the orthogonality threshold
 *> is given in TOL.
 *> \endverbatim
@@ -147,7 +147,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV .EQ. 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -155,9 +155,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is REAL array, dimension (LDV,N)
-*>          If JOBV .EQ. 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV .EQ. 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
@@ -166,8 +166,8 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of the array V,  LDV >= 1.
-*>          If JOBV = 'V', LDV .GE. N.
-*>          If JOBV = 'A', LDV .GE. MV.
+*>          If JOBV = 'V', LDV >= N.
+*>          If JOBV = 'A', LDV >= MV.
 *> \endverbatim
 *>
 *> \param[in] EPS
@@ -187,7 +187,7 @@
 *>          TOL is REAL
 *>          TOL is the threshold for Jacobi rotations. For a pair
 *>          A(:,p), A(:,q) of pivot columns, the Jacobi rotation is
-*>          applied only if ABS(COS(angle(A(:,p),A(:,q)))) .GT. TOL.
+*>          applied only if ABS(COS(angle(A(:,p),A(:,q)))) > TOL.
 *> \endverbatim
 *>
 *> \param[in] NSWEEP
@@ -205,14 +205,14 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          LWORK is the dimension of WORK. LWORK .GE. M.
+*>          LWORK is the dimension of WORK. LWORK >= M.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0 : successful exit.
-*>          < 0 : if INFO = -i, then the i-th argument had an illegal value
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, then the i-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/shseqr.f b/lapack-netlib/SRC/shseqr.f
index 5654a4682..b5707f2c3 100644
--- a/lapack-netlib/SRC/shseqr.f
+++ b/lapack-netlib/SRC/shseqr.f
@@ -70,7 +70,7 @@
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
-*>           The order of the matrix H.  N .GE. 0.
+*>           The order of the matrix H.  N >= 0.
 *> \endverbatim
 *>
 *> \param[in] ILO
@@ -87,7 +87,7 @@
 *>           set by a previous call to SGEBAL, and then passed to ZGEHRD
 *>           when the matrix output by SGEBAL is reduced to Hessenberg
 *>           form. Otherwise ILO and IHI should be set to 1 and N
-*>           respectively.  If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+*>           respectively.  If N > 0, then 1 <= ILO <= IHI <= N.
 *>           If N = 0, then ILO = 1 and IHI = 0.
 *> \endverbatim
 *>
@@ -100,20 +100,20 @@
 *>           (the Schur form); 2-by-2 diagonal blocks (corresponding to
 *>           complex conjugate pairs of eigenvalues) are returned in
 *>           standard form, with H(i,i) = H(i+1,i+1) and
-*>           H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and JOB = 'E', the
+*>           H(i+1,i)*H(i,i+1) < 0. If INFO = 0 and JOB = 'E', the
 *>           contents of H are unspecified on exit.  (The output value of
-*>           H when INFO.GT.0 is given under the description of INFO
+*>           H when INFO > 0 is given under the description of INFO
 *>           below.)
 *>
 *>           Unlike earlier versions of SHSEQR, this subroutine may
-*>           explicitly H(i,j) = 0 for i.GT.j and j = 1, 2, ... ILO-1
+*>           explicitly H(i,j) = 0 for i > j and j = 1, 2, ... ILO-1
 *>           or j = IHI+1, IHI+2, ... N.
 *> \endverbatim
 *>
 *> \param[in] LDH
 *> \verbatim
 *>          LDH is INTEGER
-*>           The leading dimension of the array H. LDH .GE. max(1,N).
+*>           The leading dimension of the array H. LDH >= max(1,N).
 *> \endverbatim
 *>
 *> \param[out] WR
@@ -128,8 +128,8 @@
 *>           The real and imaginary parts, respectively, of the computed
 *>           eigenvalues. If two eigenvalues are computed as a complex
 *>           conjugate pair, they are stored in consecutive elements of
-*>           WR and WI, say the i-th and (i+1)th, with WI(i) .GT. 0 and
-*>           WI(i+1) .LT. 0. If JOB = 'S', the eigenvalues are stored in
+*>           WR and WI, say the i-th and (i+1)th, with WI(i) > 0 and
+*>           WI(i+1) < 0. If JOB = 'S', the eigenvalues are stored in
 *>           the same order as on the diagonal of the Schur form returned
 *>           in H, with WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2
 *>           diagonal block, WI(i) = sqrt(-H(i+1,i)*H(i,i+1)) and
@@ -148,7 +148,7 @@
 *>           if INFO = 0, Z contains Q*Z.
 *>           Normally Q is the orthogonal matrix generated by SORGHR
 *>           after the call to SGEHRD which formed the Hessenberg matrix
-*>           H. (The output value of Z when INFO.GT.0 is given under
+*>           H. (The output value of Z when INFO > 0 is given under
 *>           the description of INFO below.)
 *> \endverbatim
 *>
@@ -156,7 +156,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>           The leading dimension of the array Z.  if COMPZ = 'I' or
-*>           COMPZ = 'V', then LDZ.GE.MAX(1,N).  Otherwize, LDZ.GE.1.
+*>           COMPZ = 'V', then LDZ >= MAX(1,N).  Otherwise, LDZ >= 1.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -169,7 +169,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>           The dimension of the array WORK.  LWORK .GE. max(1,N)
+*>           The dimension of the array WORK.  LWORK >= max(1,N)
 *>           is sufficient and delivers very good and sometimes
 *>           optimal performance.  However, LWORK as large as 11*N
 *>           may be required for optimal performance.  A workspace
@@ -187,21 +187,21 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>             =  0:  successful exit
-*>           .LT. 0:  if INFO = -i, the i-th argument had an illegal
+*>             = 0:  successful exit
+*>             < 0:  if INFO = -i, the i-th argument had an illegal
 *>                    value
-*>           .GT. 0:  if INFO = i, SHSEQR failed to compute all of
+*>             > 0:  if INFO = i, SHSEQR failed to compute all of
 *>                the eigenvalues.  Elements 1:ilo-1 and i+1:n of WR
 *>                and WI contain those eigenvalues which have been
 *>                successfully computed.  (Failures are rare.)
 *>
-*>                If INFO .GT. 0 and JOB = 'E', then on exit, the
+*>                If INFO > 0 and JOB = 'E', then on exit, the
 *>                remaining unconverged eigenvalues are the eigen-
 *>                values of the upper Hessenberg matrix rows and
 *>                columns ILO through INFO of the final, output
 *>                value of H.
 *>
-*>                If INFO .GT. 0 and JOB   = 'S', then on exit
+*>                If INFO > 0 and JOB   = 'S', then on exit
 *>
 *>           (*)  (initial value of H)*U  = U*(final value of H)
 *>
@@ -209,19 +209,19 @@
 *>                value of H is upper Hessenberg and quasi-triangular
 *>                in rows and columns INFO+1 through IHI.
 *>
-*>                If INFO .GT. 0 and COMPZ = 'V', then on exit
+*>                If INFO > 0 and COMPZ = 'V', then on exit
 *>
 *>                  (final value of Z)  =  (initial value of Z)*U
 *>
 *>                where U is the orthogonal matrix in (*) (regard-
 *>                less of the value of JOB.)
 *>
-*>                If INFO .GT. 0 and COMPZ = 'I', then on exit
+*>                If INFO > 0 and COMPZ = 'I', then on exit
 *>                      (final value of Z)  = U
 *>                where U is the orthogonal matrix in (*) (regard-
 *>                less of the value of JOB.)
 *>
-*>                If INFO .GT. 0 and COMPZ = 'N', then Z is not
+*>                If INFO > 0 and COMPZ = 'N', then Z is not
 *>                accessed.
 *> \endverbatim
 *
@@ -261,8 +261,8 @@
 *>                      This depends on ILO, IHI and NS.  NS is the
 *>                      number of simultaneous shifts returned
 *>                      by ILAENV(ISPEC=15).  (See ISPEC=15 below.)
-*>                      The default for (IHI-ILO+1).LE.500 is NS.
-*>                      The default for (IHI-ILO+1).GT.500 is 3*NS/2.
+*>                      The default for (IHI-ILO+1) <= 500 is NS.
+*>                      The default for (IHI-ILO+1) >  500 is 3*NS/2.
 *>
 *>            ISPEC=14: Nibble crossover point. (See IPARMQ for
 *>                      details.)  Default: 14% of deflation window
@@ -341,8 +341,8 @@
       PARAMETER          ( NTINY = 11 )
 *
 *     ==== NL allocates some local workspace to help small matrices
-*     .    through a rare SLAHQR failure.  NL .GT. NTINY = 11 is
-*     .    required and NL .LE. NMIN = ILAENV(ISPEC=12,...) is recom-
+*     .    through a rare SLAHQR failure.  NL > NTINY = 11 is
+*     .    required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom-
 *     .    mended.  (The default value of NMIN is 75.)  Using NL = 49
 *     .    allows up to six simultaneous shifts and a 16-by-16
 *     .    deflation window.  ====
diff --git a/lapack-netlib/SRC/sla_gbrcond.f b/lapack-netlib/SRC/sla_gbrcond.f
index 36aa93dc9..7f2c4062e 100644
--- a/lapack-netlib/SRC/sla_gbrcond.f
+++ b/lapack-netlib/SRC/sla_gbrcond.f
@@ -140,13 +140,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is REAL array, dimension (5*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] IWORK
+*> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/sla_gbrfsx_extended.f b/lapack-netlib/SRC/sla_gbrfsx_extended.f
index a81feb45e..0fd1fd350 100644
--- a/lapack-netlib/SRC/sla_gbrfsx_extended.f
+++ b/lapack-netlib/SRC/sla_gbrfsx_extended.f
@@ -65,19 +65,19 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] TRANS_TYPE
 *> \verbatim
 *>          TRANS_TYPE is INTEGER
 *>     Specifies the transposition operation on A.
-*>     The value is defined by ILATRANS(T) where T is a CHARACTER and
-*>     T    = 'N':  No transpose
+*>     The value is defined by ILATRANS(T) where T is a CHARACTER and T
+*>          = 'N':  No transpose
 *>          = 'T':  Transpose
 *>          = 'C':  Conjugate transpose
 *> \endverbatim
@@ -269,7 +269,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/sla_gercond.f b/lapack-netlib/SRC/sla_gercond.f
index 349a1b5be..e54e0d7b4 100644
--- a/lapack-netlib/SRC/sla_gercond.f
+++ b/lapack-netlib/SRC/sla_gercond.f
@@ -122,13 +122,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is REAL array, dimension (3*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] IWORK
+*> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (N).
 *>     Workspace.2
diff --git a/lapack-netlib/SRC/sla_gerfsx_extended.f b/lapack-netlib/SRC/sla_gerfsx_extended.f
index 1795ea975..84d1ae31b 100644
--- a/lapack-netlib/SRC/sla_gerfsx_extended.f
+++ b/lapack-netlib/SRC/sla_gerfsx_extended.f
@@ -65,19 +65,19 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] TRANS_TYPE
 *> \verbatim
 *>          TRANS_TYPE is INTEGER
 *>     Specifies the transposition operation on A.
-*>     The value is defined by ILATRANS(T) where T is a CHARACTER and
-*>     T    = 'N':  No transpose
+*>     The value is defined by ILATRANS(T) where T is a CHARACTER and T
+*>          = 'N':  No transpose
 *>          = 'T':  Transpose
 *>          = 'C':  Conjugate transpose
 *> \endverbatim
@@ -257,7 +257,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERRS_C is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERRS_C is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERRS_C(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/sla_porcond.f b/lapack-netlib/SRC/sla_porcond.f
index 9dd7c587b..729581f46 100644
--- a/lapack-netlib/SRC/sla_porcond.f
+++ b/lapack-netlib/SRC/sla_porcond.f
@@ -112,13 +112,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is REAL array, dimension (3*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] IWORK
+*> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/sla_porfsx_extended.f b/lapack-netlib/SRC/sla_porfsx_extended.f
index 27baa20d1..abbfebb83 100644
--- a/lapack-netlib/SRC/sla_porfsx_extended.f
+++ b/lapack-netlib/SRC/sla_porfsx_extended.f
@@ -65,11 +65,11 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -246,7 +246,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/sla_syrcond.f b/lapack-netlib/SRC/sla_syrcond.f
index c4b204cc6..0c9e2b361 100644
--- a/lapack-netlib/SRC/sla_syrcond.f
+++ b/lapack-netlib/SRC/sla_syrcond.f
@@ -118,13 +118,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is REAL array, dimension (3*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] IWORK
+*> \param[out] IWORK
 *> \verbatim
 *>          IWORK is INTEGER array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/sla_syrfsx_extended.f b/lapack-netlib/SRC/sla_syrfsx_extended.f
index f7b909ac0..a83a9db98 100644
--- a/lapack-netlib/SRC/sla_syrfsx_extended.f
+++ b/lapack-netlib/SRC/sla_syrfsx_extended.f
@@ -67,11 +67,11 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -255,7 +255,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/sla_syrpvgrw.f b/lapack-netlib/SRC/sla_syrpvgrw.f
index f5eb81b1f..a0a235ee3 100644
--- a/lapack-netlib/SRC/sla_syrpvgrw.f
+++ b/lapack-netlib/SRC/sla_syrpvgrw.f
@@ -101,7 +101,7 @@
 *>     as determined by SSYTRF.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is REAL array, dimension (2*N)
 *> \endverbatim
diff --git a/lapack-netlib/SRC/sla_wwaddw.f b/lapack-netlib/SRC/sla_wwaddw.f
index 96a7d3542..e390c9fab 100644
--- a/lapack-netlib/SRC/sla_wwaddw.f
+++ b/lapack-netlib/SRC/sla_wwaddw.f
@@ -36,7 +36,7 @@
 *>    SLA_WWADDW adds a vector W into a doubled-single vector (X, Y).
 *>
 *>    This works for all extant IBM's hex and binary floating point
-*>    arithmetics, but not for decimal.
+*>    arithmetic, but not for decimal.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/slaed4.f b/lapack-netlib/SRC/slaed4.f
index c65cba75a..64260843f 100644
--- a/lapack-netlib/SRC/slaed4.f
+++ b/lapack-netlib/SRC/slaed4.f
@@ -82,7 +82,7 @@
 *> \param[out] DELTA
 *> \verbatim
 *>          DELTA is REAL array, dimension (N)
-*>         If N .GT. 2, DELTA contains (D(j) - lambda_I) in its  j-th
+*>         If N > 2, DELTA contains (D(j) - lambda_I) in its  j-th
 *>         component.  If N = 1, then DELTA(1) = 1. If N = 2, see SLAED5
 *>         for detail. The vector DELTA contains the information necessary
 *>         to construct the eigenvectors by SLAED3 and SLAED9.
diff --git a/lapack-netlib/SRC/slaed8.f b/lapack-netlib/SRC/slaed8.f
index 5ec117cb5..2e3f6f51f 100644
--- a/lapack-netlib/SRC/slaed8.f
+++ b/lapack-netlib/SRC/slaed8.f
@@ -353,7 +353,7 @@
          Z( I ) = W( INDX( I ) )
    40 CONTINUE
 *
-*     Calculate the allowable deflation tolerence
+*     Calculate the allowable deflation tolerance
 *
       IMAX = ISAMAX( N, Z, 1 )
       JMAX = ISAMAX( N, D, 1 )
diff --git a/lapack-netlib/SRC/slagtf.f b/lapack-netlib/SRC/slagtf.f
index d3f0b6813..59ef097a7 100644
--- a/lapack-netlib/SRC/slagtf.f
+++ b/lapack-netlib/SRC/slagtf.f
@@ -125,7 +125,7 @@
 *>          then IN(k) = 1, otherwise IN(k) = 0. The element IN(n)
 *>          returns the smallest positive integer j such that
 *>
-*>             abs( u(j,j) ).le. norm( (T - lambda*I)(j) )*TOL,
+*>             abs( u(j,j) ) <= norm( (T - lambda*I)(j) )*TOL,
 *>
 *>          where norm( A(j) ) denotes the sum of the absolute values of
 *>          the jth row of the matrix A. If no such j exists then IN(n)
@@ -137,8 +137,8 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0   : successful exit
-*>          .lt. 0: if INFO = -k, the kth argument had an illegal value
+*>          = 0: successful exit
+*>          < 0: if INFO = -k, the kth argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/slagts.f b/lapack-netlib/SRC/slagts.f
index 0c3c5239f..e0c8892d7 100644
--- a/lapack-netlib/SRC/slagts.f
+++ b/lapack-netlib/SRC/slagts.f
@@ -122,12 +122,12 @@
 *> \param[in,out] TOL
 *> \verbatim
 *>          TOL is REAL
-*>          On entry, with  JOB .lt. 0, TOL should be the minimum
+*>          On entry, with  JOB < 0, TOL should be the minimum
 *>          perturbation to be made to very small diagonal elements of U.
 *>          TOL should normally be chosen as about eps*norm(U), where eps
 *>          is the relative machine precision, but if TOL is supplied as
 *>          non-positive, then it is reset to eps*max( abs( u(i,j) ) ).
-*>          If  JOB .gt. 0  then TOL is not referenced.
+*>          If  JOB > 0  then TOL is not referenced.
 *>
 *>          On exit, TOL is changed as described above, only if TOL is
 *>          non-positive on entry. Otherwise TOL is unchanged.
@@ -136,14 +136,14 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0   : successful exit
-*>          .lt. 0: if INFO = -i, the i-th argument had an illegal value
-*>          .gt. 0: overflow would occur when computing the INFO(th)
-*>                  element of the solution vector x. This can only occur
-*>                  when JOB is supplied as positive and either means
-*>                  that a diagonal element of U is very small, or that
-*>                  the elements of the right-hand side vector y are very
-*>                  large.
+*>          = 0: successful exit
+*>          < 0: if INFO = -i, the i-th argument had an illegal value
+*>          > 0: overflow would occur when computing the INFO(th)
+*>               element of the solution vector x. This can only occur
+*>               when JOB is supplied as positive and either means
+*>               that a diagonal element of U is very small, or that
+*>               the elements of the right-hand side vector y are very
+*>               large.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/slahqr.f b/lapack-netlib/SRC/slahqr.f
index d91826e61..e5642d2bf 100644
--- a/lapack-netlib/SRC/slahqr.f
+++ b/lapack-netlib/SRC/slahqr.f
@@ -150,26 +150,26 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>           =   0: successful exit
-*>          .GT. 0: If INFO = i, SLAHQR failed to compute all the
+*>           = 0:   successful exit
+*>           > 0:   If INFO = i, SLAHQR failed to compute all the
 *>                  eigenvalues ILO to IHI in a total of 30 iterations
 *>                  per eigenvalue; elements i+1:ihi of WR and WI
 *>                  contain those eigenvalues which have been
 *>                  successfully computed.
 *>
-*>                  If INFO .GT. 0 and WANTT is .FALSE., then on exit,
+*>                  If INFO > 0 and WANTT is .FALSE., then on exit,
 *>                  the remaining unconverged eigenvalues are the
 *>                  eigenvalues of the upper Hessenberg matrix rows
-*>                  and columns ILO thorugh INFO of the final, output
+*>                  and columns ILO through INFO of the final, output
 *>                  value of H.
 *>
-*>                  If INFO .GT. 0 and WANTT is .TRUE., then on exit
+*>                  If INFO > 0 and WANTT is .TRUE., then on exit
 *>          (*)       (initial value of H)*U  = U*(final value of H)
-*>                  where U is an orthognal matrix.    The final
+*>                  where U is an orthogonal matrix.    The final
 *>                  value of H is upper Hessenberg and triangular in
 *>                  rows and columns INFO+1 through IHI.
 *>
-*>                  If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+*>                  If INFO > 0 and WANTZ is .TRUE., then on exit
 *>                      (final value of Z)  = (initial value of Z)*U
 *>                  where U is the orthogonal matrix in (*)
 *>                  (regardless of the value of WANTT.)
diff --git a/lapack-netlib/SRC/slaln2.f b/lapack-netlib/SRC/slaln2.f
index f9ceee7b7..4c6a55ec7 100644
--- a/lapack-netlib/SRC/slaln2.f
+++ b/lapack-netlib/SRC/slaln2.f
@@ -49,7 +49,7 @@
 *> the first column of each being the real part and the second
 *> being the imaginary part.
 *>
-*> "s" is a scaling factor (.LE. 1), computed by SLALN2, which is
+*> "s" is a scaling factor (<= 1), computed by SLALN2, which is
 *> so chosen that X can be computed without overflow.  X is further
 *> scaled if necessary to assure that norm(ca A - w D)*norm(X) is less
 *> than overflow.
diff --git a/lapack-netlib/SRC/slamswlq.f b/lapack-netlib/SRC/slamswlq.f
index b13d02b6c..59ab1a6ee 100644
--- a/lapack-netlib/SRC/slamswlq.f
+++ b/lapack-netlib/SRC/slamswlq.f
@@ -1,3 +1,4 @@
+*> \brief \b SLAMSWLQ
 *
 *  Definition:
 *  ===========
diff --git a/lapack-netlib/SRC/slamtsqr.f b/lapack-netlib/SRC/slamtsqr.f
index 84ac86ee2..58905ab46 100644
--- a/lapack-netlib/SRC/slamtsqr.f
+++ b/lapack-netlib/SRC/slamtsqr.f
@@ -1,3 +1,4 @@
+*> \brief \b SLAMTSQR
 *
 *  Definition:
 *  ===========
diff --git a/lapack-netlib/SRC/slangb.f b/lapack-netlib/SRC/slangb.f
index fd538b1b7..706e07501 100644
--- a/lapack-netlib/SRC/slangb.f
+++ b/lapack-netlib/SRC/slangb.f
@@ -129,6 +129,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM
       INTEGER            KL, KU, LDAB, N
@@ -139,22 +140,24 @@
 *
 * =====================================================================
 *
-*
 *     .. Parameters ..
       REAL               ONE, ZERO
       PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0 )
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J, K, L
-      REAL               SCALE, SUM, VALUE, TEMP
+      REAL               SUM, VALUE, TEMP
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           SLASSQ
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLASSQ, SCOMBSSQ
+*     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN, SQRT
 *     ..
@@ -206,15 +209,22 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
-         SCALE = ZERO
-         SUM = ONE
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
          DO 90 J = 1, N
             L = MAX( 1, J-KU )
             K = KU + 1 - J + L
-            CALL SLASSQ( MIN( N, J+KL )-L+1, AB( K, J ), 1, SCALE, SUM )
+            COLSSQ( 1 ) = ZERO
+            COLSSQ( 2 ) = ONE
+            CALL SLASSQ( MIN( N, J+KL )-L+1, AB( K, J ), 1,
+     $                   COLSSQ( 1 ), COLSSQ( 2 ) )
+            CALL SCOMBSSQ( SSQ, COLSSQ )
    90    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       SLANGB = VALUE
diff --git a/lapack-netlib/SRC/slange.f b/lapack-netlib/SRC/slange.f
index 2eb8d7d14..0c80f1d40 100644
--- a/lapack-netlib/SRC/slange.f
+++ b/lapack-netlib/SRC/slange.f
@@ -119,6 +119,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM
       INTEGER            LDA, M, N
@@ -135,10 +136,13 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J
-      REAL               SCALE, SUM, VALUE, TEMP
+      REAL               SUM, VALUE, TEMP
+*     ..
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           SLASSQ
+      EXTERNAL           SLASSQ, SCOMBSSQ
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
@@ -194,13 +198,19 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
-         SCALE = ZERO
-         SUM = ONE
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
          DO 90 J = 1, N
-            CALL SLASSQ( M, A( 1, J ), 1, SCALE, SUM )
+            COLSSQ( 1 ) = ZERO
+            COLSSQ( 2 ) = ONE
+            CALL SLASSQ( M, A( 1, J ), 1, COLSSQ( 1 ), COLSSQ( 2 ) )
+            CALL SCOMBSSQ( SSQ, COLSSQ )
    90    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       SLANGE = VALUE
diff --git a/lapack-netlib/SRC/slanhs.f b/lapack-netlib/SRC/slanhs.f
index c5a077fbf..8913031a2 100644
--- a/lapack-netlib/SRC/slanhs.f
+++ b/lapack-netlib/SRC/slanhs.f
@@ -113,6 +113,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM
       INTEGER            LDA, N
@@ -129,15 +130,18 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J
-      REAL               SCALE, SUM, VALUE
+      REAL               SUM, VALUE
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           SLASSQ
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLASSQ, SCOMBSSQ
+*     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MIN, SQRT
 *     ..
@@ -188,13 +192,20 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
-         SCALE = ZERO
-         SUM = ONE
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
          DO 90 J = 1, N
-            CALL SLASSQ( MIN( N, J+1 ), A( 1, J ), 1, SCALE, SUM )
+            COLSSQ( 1 ) = ZERO
+            COLSSQ( 2 ) = ONE
+            CALL SLASSQ( MIN( N, J+1 ), A( 1, J ), 1,
+     $                   COLSSQ( 1 ), COLSSQ( 2 ) )
+            CALL SCOMBSSQ( SSQ, COLSSQ )
    90    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       SLANHS = VALUE
diff --git a/lapack-netlib/SRC/slansb.f b/lapack-netlib/SRC/slansb.f
index 8f3fe1eb9..23519025d 100644
--- a/lapack-netlib/SRC/slansb.f
+++ b/lapack-netlib/SRC/slansb.f
@@ -134,6 +134,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            K, LDAB, N
@@ -150,15 +151,18 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J, L
-      REAL               ABSA, SCALE, SUM, VALUE
+      REAL               ABSA, SUM, VALUE
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           SLASSQ
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLASSQ, SCOMBSSQ
+*     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN, SQRT
 *     ..
@@ -225,29 +229,47 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          IF( K.GT.0 ) THEN
             IF( LSAME( UPLO, 'U' ) ) THEN
                DO 110 J = 2, N
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
                   CALL SLASSQ( MIN( J-1, K ), AB( MAX( K+2-J, 1 ), J ),
-     $                         1, SCALE, SUM )
+     $                         1, COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   110          CONTINUE
                L = K + 1
             ELSE
                DO 120 J = 1, N - 1
-                  CALL SLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE,
-     $                         SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL SLASSQ( MIN( N-J, K ), AB( 2, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   120          CONTINUE
                L = 1
             END IF
-            SUM = 2*SUM
+            SSQ( 2 ) = 2*SSQ( 2 )
          ELSE
             L = 1
          END IF
-         CALL SLASSQ( N, AB( L, 1 ), LDAB, SCALE, SUM )
-         VALUE = SCALE*SQRT( SUM )
+*
+*        Sum diagonal
+*
+         COLSSQ( 1 ) = ZERO
+         COLSSQ( 2 ) = ONE
+         CALL SLASSQ( N, AB( L, 1 ), LDAB, COLSSQ( 1 ), COLSSQ( 2 ) )
+         CALL SCOMBSSQ( SSQ, COLSSQ )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       SLANSB = VALUE
diff --git a/lapack-netlib/SRC/slansp.f b/lapack-netlib/SRC/slansp.f
index 35390cd1c..7e29d778b 100644
--- a/lapack-netlib/SRC/slansp.f
+++ b/lapack-netlib/SRC/slansp.f
@@ -119,6 +119,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            N
@@ -135,15 +136,18 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J, K
-      REAL               ABSA, SCALE, SUM, VALUE
+      REAL               ABSA, SUM, VALUE
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           SLASSQ
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLASSQ, SCOMBSSQ
+*     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, SQRT
 *     ..
@@ -217,31 +221,48 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          K = 2
          IF( LSAME( UPLO, 'U' ) ) THEN
             DO 110 J = 2, N
-               CALL SLASSQ( J-1, AP( K ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL SLASSQ( J-1, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) )
+               CALL SCOMBSSQ( SSQ, COLSSQ )
                K = K + J
   110       CONTINUE
          ELSE
             DO 120 J = 1, N - 1
-               CALL SLASSQ( N-J, AP( K ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL SLASSQ( N-J, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) )
+               CALL SCOMBSSQ( SSQ, COLSSQ )
                K = K + N - J + 1
   120       CONTINUE
          END IF
-         SUM = 2*SUM
+         SSQ( 2 ) = 2*SSQ( 2 )
+*
+*        Sum diagonal
+*
          K = 1
+         COLSSQ( 1 ) = ZERO
+         COLSSQ( 2 ) = ONE
          DO 130 I = 1, N
             IF( AP( K ).NE.ZERO ) THEN
                ABSA = ABS( AP( K ) )
-               IF( SCALE.LT.ABSA ) THEN
-                  SUM = ONE + SUM*( SCALE / ABSA )**2
-                  SCALE = ABSA
+               IF( COLSSQ( 1 ).LT.ABSA ) THEN
+                  COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2
+                  COLSSQ( 1 ) = ABSA
                ELSE
-                  SUM = SUM + ( ABSA / SCALE )**2
+                  COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2
                END IF
             END IF
             IF( LSAME( UPLO, 'U' ) ) THEN
@@ -250,7 +271,8 @@
                K = K + N - I + 1
             END IF
   130    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         CALL SCOMBSSQ( SSQ, COLSSQ )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       SLANSP = VALUE
diff --git a/lapack-netlib/SRC/slansy.f b/lapack-netlib/SRC/slansy.f
index c8400e530..66ff1c5c7 100644
--- a/lapack-netlib/SRC/slansy.f
+++ b/lapack-netlib/SRC/slansy.f
@@ -127,6 +127,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            LDA, N
@@ -143,15 +144,18 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J
-      REAL               ABSA, SCALE, SUM, VALUE
+      REAL               ABSA, SUM, VALUE
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           SLASSQ
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLASSQ, SCOMBSSQ
+*     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, SQRT
 *     ..
@@ -216,21 +220,39 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          IF( LSAME( UPLO, 'U' ) ) THEN
             DO 110 J = 2, N
-               CALL SLASSQ( J-1, A( 1, J ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL SLASSQ( J-1, A( 1, J ), 1, COLSSQ(1), COLSSQ(2) )
+               CALL SCOMBSSQ( SSQ, COLSSQ )
   110       CONTINUE
          ELSE
             DO 120 J = 1, N - 1
-               CALL SLASSQ( N-J, A( J+1, J ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL SLASSQ( N-J, A( J+1, J ), 1, COLSSQ(1), COLSSQ(2) )
+               CALL SCOMBSSQ( SSQ, COLSSQ )
   120       CONTINUE
          END IF
-         SUM = 2*SUM
-         CALL SLASSQ( N, A, LDA+1, SCALE, SUM )
-         VALUE = SCALE*SQRT( SUM )
+         SSQ( 2 ) = 2*SSQ( 2 )
+*
+*        Sum diagonal
+*
+         COLSSQ( 1 ) = ZERO
+         COLSSQ( 2 ) = ONE
+         CALL SLASSQ( N, A, LDA+1, COLSSQ( 1 ), COLSSQ( 2 ) )
+         CALL SCOMBSSQ( SSQ, COLSSQ )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       SLANSY = VALUE
diff --git a/lapack-netlib/SRC/slantb.f b/lapack-netlib/SRC/slantb.f
index 3588779cb..5b94618e1 100644
--- a/lapack-netlib/SRC/slantb.f
+++ b/lapack-netlib/SRC/slantb.f
@@ -145,6 +145,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          DIAG, NORM, UPLO
       INTEGER            K, LDAB, N
@@ -162,15 +163,18 @@
 *     .. Local Scalars ..
       LOGICAL            UDIAG
       INTEGER            I, J, L
-      REAL               SCALE, SUM, VALUE
+      REAL               SUM, VALUE
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           SLASSQ
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLASSQ, SCOMBSSQ
+*     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN, SQRT
 *     ..
@@ -311,46 +315,61 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
          IF( LSAME( UPLO, 'U' ) ) THEN
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = N
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = N
                IF( K.GT.0 ) THEN
                   DO 280 J = 2, N
+                     COLSSQ( 1 ) = ZERO
+                     COLSSQ( 2 ) = ONE
                      CALL SLASSQ( MIN( J-1, K ),
-     $                            AB( MAX( K+2-J, 1 ), J ), 1, SCALE,
-     $                            SUM )
+     $                            AB( MAX( K+2-J, 1 ), J ), 1,
+     $                            COLSSQ( 1 ), COLSSQ( 2 ) )
+                     CALL SCOMBSSQ( SSQ, COLSSQ )
   280             CONTINUE
                END IF
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                DO 290 J = 1, N
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
                   CALL SLASSQ( MIN( J, K+1 ), AB( MAX( K+2-J, 1 ), J ),
-     $                         1, SCALE, SUM )
+     $                         1, COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   290          CONTINUE
             END IF
          ELSE
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = N
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = N
                IF( K.GT.0 ) THEN
                   DO 300 J = 1, N - 1
-                     CALL SLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE,
-     $                            SUM )
+                     COLSSQ( 1 ) = ZERO
+                     COLSSQ( 2 ) = ONE
+                     CALL SLASSQ( MIN( N-J, K ), AB( 2, J ), 1,
+     $                            COLSSQ( 1 ), COLSSQ( 2 ) )
+                     CALL SCOMBSSQ( SSQ, COLSSQ )
   300             CONTINUE
                END IF
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                DO 310 J = 1, N
-                  CALL SLASSQ( MIN( N-J+1, K+1 ), AB( 1, J ), 1, SCALE,
-     $                         SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL SLASSQ( MIN( N-J+1, K+1 ), AB( 1, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   310          CONTINUE
             END IF
          END IF
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       SLANTB = VALUE
diff --git a/lapack-netlib/SRC/slantp.f b/lapack-netlib/SRC/slantp.f
index 1423f5ca3..ab781deac 100644
--- a/lapack-netlib/SRC/slantp.f
+++ b/lapack-netlib/SRC/slantp.f
@@ -129,6 +129,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          DIAG, NORM, UPLO
       INTEGER            N
@@ -146,15 +147,18 @@
 *     .. Local Scalars ..
       LOGICAL            UDIAG
       INTEGER            I, J, K
-      REAL               SCALE, SUM, VALUE
+      REAL               SUM, VALUE
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           SLASSQ
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLASSQ, SCOMBSSQ
+*     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, SQRT
 *     ..
@@ -306,45 +310,64 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
          IF( LSAME( UPLO, 'U' ) ) THEN
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = N
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = N
                K = 2
                DO 280 J = 2, N
-                  CALL SLASSQ( J-1, AP( K ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL SLASSQ( J-1, AP( K ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
                   K = K + J
   280          CONTINUE
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                K = 1
                DO 290 J = 1, N
-                  CALL SLASSQ( J, AP( K ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL SLASSQ( J, AP( K ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
                   K = K + J
   290          CONTINUE
             END IF
          ELSE
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = N
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = N
                K = 2
                DO 300 J = 1, N - 1
-                  CALL SLASSQ( N-J, AP( K ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL SLASSQ( N-J, AP( K ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
                   K = K + N - J + 1
   300          CONTINUE
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                K = 1
                DO 310 J = 1, N
-                  CALL SLASSQ( N-J+1, AP( K ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL SLASSQ( N-J+1, AP( K ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
                   K = K + N - J + 1
   310          CONTINUE
             END IF
          END IF
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       SLANTP = VALUE
diff --git a/lapack-netlib/SRC/slantr.f b/lapack-netlib/SRC/slantr.f
index 63b855892..04d29f537 100644
--- a/lapack-netlib/SRC/slantr.f
+++ b/lapack-netlib/SRC/slantr.f
@@ -146,6 +146,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          DIAG, NORM, UPLO
       INTEGER            LDA, M, N
@@ -163,15 +164,18 @@
 *     .. Local Scalars ..
       LOGICAL            UDIAG
       INTEGER            I, J
-      REAL               SCALE, SUM, VALUE
+      REAL               SUM, VALUE
 *     ..
-*     .. External Subroutines ..
-      EXTERNAL           SLASSQ
+*     .. Local Arrays ..
+      REAL               SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, SISNAN
       EXTERNAL           LSAME, SISNAN
 *     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLASSQ, SCOMBSSQ
+*     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MIN, SQRT
 *     ..
@@ -281,7 +285,7 @@
             END IF
          ELSE
             IF( LSAME( DIAG, 'U' ) ) THEN
-               DO 210 I = 1, N
+               DO 210 I = 1, MIN( M, N )
                   WORK( I ) = ONE
   210          CONTINUE
                DO 220 I = N + 1, M
@@ -311,38 +315,56 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
          IF( LSAME( UPLO, 'U' ) ) THEN
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = MIN( M, N )
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = MIN( M, N )
                DO 290 J = 2, N
-                  CALL SLASSQ( MIN( M, J-1 ), A( 1, J ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL SLASSQ( MIN( M, J-1 ), A( 1, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   290          CONTINUE
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                DO 300 J = 1, N
-                  CALL SLASSQ( MIN( M, J ), A( 1, J ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL SLASSQ( MIN( M, J ), A( 1, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   300          CONTINUE
             END IF
          ELSE
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = MIN( M, N )
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = MIN( M, N )
                DO 310 J = 1, N
-                  CALL SLASSQ( M-J, A( MIN( M, J+1 ), J ), 1, SCALE,
-     $                         SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL SLASSQ( M-J, A( MIN( M, J+1 ), J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   310          CONTINUE
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                DO 320 J = 1, N
-                  CALL SLASSQ( M-J+1, A( J, J ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL SLASSQ( M-J+1, A( J, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL SCOMBSSQ( SSQ, COLSSQ )
   320          CONTINUE
             END IF
          END IF
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       SLANTR = VALUE
diff --git a/lapack-netlib/SRC/slanv2.f b/lapack-netlib/SRC/slanv2.f
index e73e5455c..1163446fa 100644
--- a/lapack-netlib/SRC/slanv2.f
+++ b/lapack-netlib/SRC/slanv2.f
@@ -161,7 +161,6 @@
       IF( C.EQ.ZERO ) THEN
          CS = ONE
          SN = ZERO
-         GO TO 10
 *
       ELSE IF( B.EQ.ZERO ) THEN
 *
@@ -174,12 +173,12 @@
          A = TEMP
          B = -C
          C = ZERO
-         GO TO 10
+*
       ELSE IF( (A-D).EQ.ZERO .AND. SIGN( ONE, B ).NE.
      $   SIGN( ONE, C ) ) THEN
          CS = ONE
          SN = ZERO
-         GO TO 10
+*
       ELSE
 *
          TEMP = A - D
@@ -207,6 +206,7 @@
             SN = C / TAU
             B = B - C
             C = ZERO
+*
          ELSE
 *
 *           Complex eigenvalues, or real (almost) equal eigenvalues.
@@ -268,8 +268,6 @@
          END IF
 *
       END IF
-*
-   10 CONTINUE
 *
 *     Store eigenvalues in (RT1R,RT1I) and (RT2R,RT2I).
 *
diff --git a/lapack-netlib/SRC/slaorhr_col_getrfnp.f b/lapack-netlib/SRC/slaorhr_col_getrfnp.f
new file mode 100644
index 000000000..6cc59e538
--- /dev/null
+++ b/lapack-netlib/SRC/slaorhr_col_getrfnp.f
@@ -0,0 +1,248 @@
+*> \brief \b SLAORHR_COL_GETRFNP
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SLAORHR_COL_GETRFNP + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/slaorhr_col_getrfnp.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/slaorhr_col_getrfnp.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/slaorhr_col_getrfnp.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SLAORHR_COL_GETRFNP( M, N, A, LDA, D, INFO )
+*
+*       .. Scalar Arguments ..
+*       INTEGER            INFO, LDA, M, N
+*       ..
+*       .. Array Arguments ..
+*       REAL               A( LDA, * ), D( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLAORHR_COL_GETRFNP computes the modified LU factorization without
+*> pivoting of a real general M-by-N matrix A. The factorization has
+*> the form:
+*>
+*>     A - S = L * U,
+*>
+*> where:
+*>    S is a m-by-n diagonal sign matrix with the diagonal D, so that
+*>    D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+*>    as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+*>    i-1 steps of Gaussian elimination. This means that the diagonal
+*>    element at each step of "modified" Gaussian elimination is
+*>    at least one in absolute value (so that division-by-zero not
+*>    not possible during the division by the diagonal element);
+*>
+*>    L is a M-by-N lower triangular matrix with unit diagonal elements
+*>    (lower trapezoidal if M > N);
+*>
+*>    and U is a M-by-N upper triangular matrix
+*>    (upper trapezoidal if M < N).
+*>
+*> This routine is an auxiliary routine used in the Householder
+*> reconstruction routine SORHR_COL. In SORHR_COL, this routine is
+*> applied to an M-by-N matrix A with orthonormal columns, where each
+*> element is bounded by one in absolute value. With the choice of
+*> the matrix S above, one can show that the diagonal element at each
+*> step of Gaussian elimination is the largest (in absolute value) in
+*> the column on or below the diagonal, so that no pivoting is required
+*> for numerical stability [1].
+*>
+*> For more details on the Householder reconstruction algorithm,
+*> including the modified LU factorization, see [1].
+*>
+*> This is the blocked right-looking version of the algorithm,
+*> calling Level 3 BLAS to update the submatrix. To factorize a block,
+*> this routine calls the recursive routine SLAORHR_COL_GETRFNP2.
+*>
+*> [1] "Reconstructing Householder vectors from tall-skinny QR",
+*>     G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+*>     E. Solomonik, J. Parallel Distrib. Comput.,
+*>     vol. 85, pp. 3-31, 2015.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix to be factored.
+*>          On exit, the factors L and U from the factorization
+*>          A-S=L*U; the unit diagonal elements of L are not stored.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is REAL array, dimension min(M,N)
+*>          The diagonal elements of the diagonal M-by-N sign matrix S,
+*>          D(i) = S(i,i), where 1 <= i <= min(M,N). The elements can
+*>          be only plus or minus one.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup realGEcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2019, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SLAORHR_COL_GETRFNP( M, N, A, LDA, D, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, LDA, M, N
+*     ..
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), D( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ONE
+      PARAMETER          ( ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            IINFO, J, JB, NB
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SGEMM, SLAORHR_COL_GETRFNP2, STRSM, XERBLA
+*     ..
+*     .. External Functions ..
+      INTEGER            ILAENV
+      EXTERNAL           ILAENV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -4
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SLAORHR_COL_GETRFNP', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 )
+     $   RETURN
+*
+*     Determine the block size for this environment.
+*
+
+      NB = ILAENV( 1, 'SLAORHR_COL_GETRFNP', ' ', M, N, -1, -1 )
+
+      IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN
+*
+*        Use unblocked code.
+*
+         CALL SLAORHR_COL_GETRFNP2( M, N, A, LDA, D, INFO )
+      ELSE
+*
+*        Use blocked code.
+*
+         DO J = 1, MIN( M, N ), NB
+            JB = MIN( MIN( M, N )-J+1, NB )
+*
+*           Factor diagonal and subdiagonal blocks.
+*
+            CALL SLAORHR_COL_GETRFNP2( M-J+1, JB, A( J, J ), LDA,
+     $                                 D( J ), IINFO )
+*
+            IF( J+JB.LE.N ) THEN
+*
+*              Compute block row of U.
+*
+               CALL STRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB,
+     $                     N-J-JB+1, ONE, A( J, J ), LDA, A( J, J+JB ),
+     $                     LDA )
+               IF( J+JB.LE.M ) THEN
+*
+*                 Update trailing submatrix.
+*
+                  CALL SGEMM( 'No transpose', 'No transpose', M-J-JB+1,
+     $                        N-J-JB+1, JB, -ONE, A( J+JB, J ), LDA,
+     $                        A( J, J+JB ), LDA, ONE, A( J+JB, J+JB ),
+     $                        LDA )
+               END IF
+            END IF
+         END DO
+      END IF
+      RETURN
+*
+*     End of SLAORHR_COL_GETRFNP
+*
+      END
\ No newline at end of file
diff --git a/lapack-netlib/SRC/slaorhr_col_getrfnp2.f b/lapack-netlib/SRC/slaorhr_col_getrfnp2.f
new file mode 100644
index 000000000..de604602f
--- /dev/null
+++ b/lapack-netlib/SRC/slaorhr_col_getrfnp2.f
@@ -0,0 +1,305 @@
+*> \brief \b SLAORHR_COL_GETRFNP2
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DLAORHR_GETRF2NP + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/slaorhr_col_getrfnp2.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/slaorhr_col_getrfnp2.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/slaorhr_col_getrfnp2.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       RECURSIVE SUBROUTINE SLAORHR_COL_GETRFNP2( M, N, A, LDA, D, INFO )
+*
+*       .. Scalar Arguments ..
+*       INTEGER            INFO, LDA, M, N
+*       ..
+*       .. Array Arguments ..
+*       REAL               A( LDA, * ), D( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SLAORHR_COL_GETRFNP2 computes the modified LU factorization without
+*> pivoting of a real general M-by-N matrix A. The factorization has
+*> the form:
+*>
+*>     A - S = L * U,
+*>
+*> where:
+*>    S is a m-by-n diagonal sign matrix with the diagonal D, so that
+*>    D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+*>    as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+*>    i-1 steps of Gaussian elimination. This means that the diagonal
+*>    element at each step of "modified" Gaussian elimination is at
+*>    least one in absolute value (so that division-by-zero not
+*>    possible during the division by the diagonal element);
+*>
+*>    L is a M-by-N lower triangular matrix with unit diagonal elements
+*>    (lower trapezoidal if M > N);
+*>
+*>    and U is a M-by-N upper triangular matrix
+*>    (upper trapezoidal if M < N).
+*>
+*> This routine is an auxiliary routine used in the Householder
+*> reconstruction routine SORHR_COL. In SORHR_COL, this routine is
+*> applied to an M-by-N matrix A with orthonormal columns, where each
+*> element is bounded by one in absolute value. With the choice of
+*> the matrix S above, one can show that the diagonal element at each
+*> step of Gaussian elimination is the largest (in absolute value) in
+*> the column on or below the diagonal, so that no pivoting is required
+*> for numerical stability [1].
+*>
+*> For more details on the Householder reconstruction algorithm,
+*> including the modified LU factorization, see [1].
+*>
+*> This is the recursive version of the LU factorization algorithm.
+*> Denote A - S by B. The algorithm divides the matrix B into four
+*> submatrices:
+*>
+*>        [  B11 | B12  ]  where B11 is n1 by n1,
+*>    B = [ -----|----- ]        B21 is (m-n1) by n1,
+*>        [  B21 | B22  ]        B12 is n1 by n2,
+*>                               B22 is (m-n1) by n2,
+*>                               with n1 = min(m,n)/2, n2 = n-n1.
+*>
+*>
+*> The subroutine calls itself to factor B11, solves for B21,
+*> solves for B12, updates B22, then calls itself to factor B22.
+*>
+*> For more details on the recursive LU algorithm, see [2].
+*>
+*> SLAORHR_COL_GETRFNP2 is called to factorize a block by the blocked
+*> routine SLAORHR_COL_GETRFNP, which uses blocked code calling
+*. Level 3 BLAS to update the submatrix. However, SLAORHR_COL_GETRFNP2
+*> is self-sufficient and can be used without SLAORHR_COL_GETRFNP.
+*>
+*> [1] "Reconstructing Householder vectors from tall-skinny QR",
+*>     G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+*>     E. Solomonik, J. Parallel Distrib. Comput.,
+*>     vol. 85, pp. 3-31, 2015.
+*>
+*> [2] "Recursion leads to automatic variable blocking for dense linear
+*>     algebra algorithms", F. Gustavson, IBM J. of Res. and Dev.,
+*>     vol. 41, no. 6, pp. 737-755, 1997.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix to be factored.
+*>          On exit, the factors L and U from the factorization
+*>          A-S=L*U; the unit diagonal elements of L are not stored.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is REAL array, dimension min(M,N)
+*>          The diagonal elements of the diagonal M-by-N sign matrix S,
+*>          D(i) = S(i,i), where 1 <= i <= min(M,N). The elements can
+*>          be only plus or minus one.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup realGEcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2019, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      RECURSIVE SUBROUTINE SLAORHR_COL_GETRFNP2( M, N, A, LDA, D, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, LDA, M, N
+*     ..
+*     .. Array Arguments ..
+      REAL               A( LDA, * ), D( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ONE
+      PARAMETER          ( ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      REAL               SFMIN
+      INTEGER            I, IINFO, N1, N2
+*     ..
+*     .. External Functions ..
+      REAL               SLAMCH
+      EXTERNAL           SLAMCH
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SGEMM, SSCAL, STRSM, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, SIGN, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters
+*
+      INFO = 0
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -4
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SLAORHR_COL_GETRFNP2', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 )
+     $   RETURN
+
+      IF ( M.EQ.1 ) THEN
+*
+*        One row case, (also recursion termination case),
+*        use unblocked code
+*
+*        Transfer the sign
+*
+         D( 1 ) = -SIGN( ONE, A( 1, 1 ) )
+*
+*        Construct the row of U
+*
+         A( 1, 1 ) = A( 1, 1 ) - D( 1 )
+*
+      ELSE IF( N.EQ.1 ) THEN
+*
+*        One column case, (also recursion termination case),
+*        use unblocked code
+*
+*        Transfer the sign
+*
+         D( 1 ) = -SIGN( ONE, A( 1, 1 ) )
+*
+*        Construct the row of U
+*
+         A( 1, 1 ) = A( 1, 1 ) - D( 1 )
+*
+*        Scale the elements 2:M of the column
+*
+*        Determine machine safe minimum
+*
+         SFMIN = SLAMCH('S')
+*
+*        Construct the subdiagonal elements of L
+*
+         IF( ABS( A( 1, 1 ) ) .GE. SFMIN ) THEN
+            CALL SSCAL( M-1, ONE / A( 1, 1 ), A( 2, 1 ), 1 )
+         ELSE
+            DO I = 2, M
+               A( I, 1 ) = A( I, 1 ) / A( 1, 1 )
+            END DO
+         END IF
+*
+      ELSE
+*
+*        Divide the matrix B into four submatrices
+*
+         N1 = MIN( M, N ) / 2
+         N2 = N-N1
+
+*
+*        Factor B11, recursive call
+*
+         CALL SLAORHR_COL_GETRFNP2( N1, N1, A, LDA, D, IINFO )
+*
+*        Solve for B21
+*
+         CALL STRSM( 'R', 'U', 'N', 'N', M-N1, N1, ONE, A, LDA,
+     $               A( N1+1, 1 ), LDA )
+*
+*        Solve for B12
+*
+         CALL STRSM( 'L', 'L', 'N', 'U', N1, N2, ONE, A, LDA,
+     $               A( 1, N1+1 ), LDA )
+*
+*        Update B22, i.e. compute the Schur complement
+*        B22 := B22 - B21*B12
+*
+         CALL SGEMM( 'N', 'N', M-N1, N2, N1, -ONE, A( N1+1, 1 ), LDA,
+     $               A( 1, N1+1 ), LDA, ONE, A( N1+1, N1+1 ), LDA )
+*
+*        Factor B22, recursive call
+*
+         CALL SLAORHR_COL_GETRFNP2( M-N1, N2, A( N1+1, N1+1 ), LDA,
+     $                          D( N1+1 ), IINFO )
+*
+      END IF
+      RETURN
+*
+*     End of SLAORHR_COL_GETRFNP2
+*
+      END
diff --git a/lapack-netlib/SRC/slaqps.f b/lapack-netlib/SRC/slaqps.f
index 9c62ec8b6..3f8af304f 100644
--- a/lapack-netlib/SRC/slaqps.f
+++ b/lapack-netlib/SRC/slaqps.f
@@ -127,7 +127,7 @@
 *> \param[in,out] AUXV
 *> \verbatim
 *>          AUXV is REAL array, dimension (NB)
-*>          Auxiliar vector.
+*>          Auxiliary vector.
 *> \endverbatim
 *>
 *> \param[in,out] F
diff --git a/lapack-netlib/SRC/slaqr0.f b/lapack-netlib/SRC/slaqr0.f
index 1dcd3d176..318b46943 100644
--- a/lapack-netlib/SRC/slaqr0.f
+++ b/lapack-netlib/SRC/slaqr0.f
@@ -67,7 +67,7 @@
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
-*>           The order of the matrix H.  N .GE. 0.
+*>           The order of the matrix H.  N >= 0.
 *> \endverbatim
 *>
 *> \param[in] ILO
@@ -79,12 +79,12 @@
 *> \verbatim
 *>          IHI is INTEGER
 *>           It is assumed that H is already upper triangular in rows
-*>           and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1,
+*>           and columns 1:ILO-1 and IHI+1:N and, if ILO > 1,
 *>           H(ILO,ILO-1) is zero. ILO and IHI are normally set by a
 *>           previous call to SGEBAL, and then passed to SGEHRD when the
 *>           matrix output by SGEBAL is reduced to Hessenberg form.
 *>           Otherwise, ILO and IHI should be set to 1 and N,
-*>           respectively.  If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+*>           respectively.  If N > 0, then 1 <= ILO <= IHI <= N.
 *>           If N = 0, then ILO = 1 and IHI = 0.
 *> \endverbatim
 *>
@@ -97,19 +97,19 @@
 *>           decomposition (the Schur form); 2-by-2 diagonal blocks
 *>           (corresponding to complex conjugate pairs of eigenvalues)
 *>           are returned in standard form, with H(i,i) = H(i+1,i+1)
-*>           and H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and WANTT is
+*>           and H(i+1,i)*H(i,i+1) < 0. If INFO = 0 and WANTT is
 *>           .FALSE., then the contents of H are unspecified on exit.
-*>           (The output value of H when INFO.GT.0 is given under the
+*>           (The output value of H when INFO > 0 is given under the
 *>           description of INFO below.)
 *>
-*>           This subroutine may explicitly set H(i,j) = 0 for i.GT.j and
+*>           This subroutine may explicitly set H(i,j) = 0 for i > j and
 *>           j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N.
 *> \endverbatim
 *>
 *> \param[in] LDH
 *> \verbatim
 *>          LDH is INTEGER
-*>           The leading dimension of the array H. LDH .GE. max(1,N).
+*>           The leading dimension of the array H. LDH >= max(1,N).
 *> \endverbatim
 *>
 *> \param[out] WR
@@ -125,7 +125,7 @@
 *>           and WI(ILO:IHI). If two eigenvalues are computed as a
 *>           complex conjugate pair, they are stored in consecutive
 *>           elements of WR and WI, say the i-th and (i+1)th, with
-*>           WI(i) .GT. 0 and WI(i+1) .LT. 0. If WANTT is .TRUE., then
+*>           WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., then
 *>           the eigenvalues are stored in the same order as on the
 *>           diagonal of the Schur form returned in H, with
 *>           WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2 diagonal
@@ -143,7 +143,7 @@
 *>          IHIZ is INTEGER
 *>           Specify the rows of Z to which transformations must be
 *>           applied if WANTZ is .TRUE..
-*>           1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N.
+*>           1 <= ILOZ <= ILO; IHI <= IHIZ <= N.
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -153,7 +153,7 @@
 *>           If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is
 *>           replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the
 *>           orthogonal Schur factor of H(ILO:IHI,ILO:IHI).
-*>           (The output value of Z when INFO.GT.0 is given under
+*>           (The output value of Z when INFO > 0 is given under
 *>           the description of INFO below.)
 *> \endverbatim
 *>
@@ -161,7 +161,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>           The leading dimension of the array Z.  if WANTZ is .TRUE.
-*>           then LDZ.GE.MAX(1,IHIZ).  Otherwize, LDZ.GE.1.
+*>           then LDZ >= MAX(1,IHIZ).  Otherwise, LDZ >= 1.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -174,7 +174,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>           The dimension of the array WORK.  LWORK .GE. max(1,N)
+*>           The dimension of the array WORK.  LWORK >= max(1,N)
 *>           is sufficient, but LWORK typically as large as 6*N may
 *>           be required for optimal performance.  A workspace query
 *>           to determine the optimal workspace size is recommended.
@@ -190,19 +190,19 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>             =  0:  successful exit
-*>           .GT. 0:  if INFO = i, SLAQR0 failed to compute all of
+*>             = 0:  successful exit
+*>             > 0:  if INFO = i, SLAQR0 failed to compute all of
 *>                the eigenvalues.  Elements 1:ilo-1 and i+1:n of WR
 *>                and WI contain those eigenvalues which have been
 *>                successfully computed.  (Failures are rare.)
 *>
-*>                If INFO .GT. 0 and WANT is .FALSE., then on exit,
+*>                If INFO > 0 and WANT is .FALSE., then on exit,
 *>                the remaining unconverged eigenvalues are the eigen-
 *>                values of the upper Hessenberg matrix rows and
 *>                columns ILO through INFO of the final, output
 *>                value of H.
 *>
-*>                If INFO .GT. 0 and WANTT is .TRUE., then on exit
+*>                If INFO > 0 and WANTT is .TRUE., then on exit
 *>
 *>           (*)  (initial value of H)*U  = U*(final value of H)
 *>
@@ -210,7 +210,7 @@
 *>                value of H is upper Hessenberg and quasi-triangular
 *>                in rows and columns INFO+1 through IHI.
 *>
-*>                If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+*>                If INFO > 0 and WANTZ is .TRUE., then on exit
 *>
 *>                  (final value of Z(ILO:IHI,ILOZ:IHIZ)
 *>                   =  (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U
@@ -218,7 +218,7 @@
 *>                where U is the orthogonal matrix in (*) (regard-
 *>                less of the value of WANTT.)
 *>
-*>                If INFO .GT. 0 and WANTZ is .FALSE., then Z is not
+*>                If INFO > 0 and WANTZ is .FALSE., then Z is not
 *>                accessed.
 *> \endverbatim
 *
@@ -677,7 +677,7 @@
                   END IF
                END IF
 *
-*              ==== Use up to NS of the the smallest magnatiude
+*              ==== Use up to NS of the the smallest magnitude
 *              .    shifts.  If there aren't NS shifts available,
 *              .    then use them all, possibly dropping one to
 *              .    make the number of shifts even. ====
diff --git a/lapack-netlib/SRC/slaqr1.f b/lapack-netlib/SRC/slaqr1.f
index 2de33849d..6bb88c794 100644
--- a/lapack-netlib/SRC/slaqr1.f
+++ b/lapack-netlib/SRC/slaqr1.f
@@ -69,7 +69,7 @@
 *> \verbatim
 *>          LDH is INTEGER
 *>              The leading dimension of H as declared in
-*>              the calling procedure.  LDH.GE.N
+*>              the calling procedure.  LDH >= N
 *> \endverbatim
 *>
 *> \param[in] SR1
diff --git a/lapack-netlib/SRC/slaqr2.f b/lapack-netlib/SRC/slaqr2.f
index 8e1f34910..f4f8ca7f2 100644
--- a/lapack-netlib/SRC/slaqr2.f
+++ b/lapack-netlib/SRC/slaqr2.f
@@ -103,7 +103,7 @@
 *> \param[in] NW
 *> \verbatim
 *>          NW is INTEGER
-*>          Deflation window size.  1 .LE. NW .LE. (KBOT-KTOP+1).
+*>          Deflation window size.  1 <= NW <= (KBOT-KTOP+1).
 *> \endverbatim
 *>
 *> \param[in,out] H
@@ -121,7 +121,7 @@
 *> \verbatim
 *>          LDH is INTEGER
 *>          Leading dimension of H just as declared in the calling
-*>          subroutine.  N .LE. LDH
+*>          subroutine.  N <= LDH
 *> \endverbatim
 *>
 *> \param[in] ILOZ
@@ -133,7 +133,7 @@
 *> \verbatim
 *>          IHIZ is INTEGER
 *>          Specify the rows of Z to which transformations must be
-*>          applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N.
+*>          applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N.
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -149,7 +149,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>          The leading dimension of Z just as declared in the
-*>          calling subroutine.  1 .LE. LDZ.
+*>          calling subroutine.  1 <= LDZ.
 *> \endverbatim
 *>
 *> \param[out] NS
@@ -194,13 +194,13 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of V just as declared in the
-*>          calling subroutine.  NW .LE. LDV
+*>          calling subroutine.  NW <= LDV
 *> \endverbatim
 *>
 *> \param[in] NH
 *> \verbatim
 *>          NH is INTEGER
-*>          The number of columns of T.  NH.GE.NW.
+*>          The number of columns of T.  NH >= NW.
 *> \endverbatim
 *>
 *> \param[out] T
@@ -212,14 +212,14 @@
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of T just as declared in the
-*>          calling subroutine.  NW .LE. LDT
+*>          calling subroutine.  NW <= LDT
 *> \endverbatim
 *>
 *> \param[in] NV
 *> \verbatim
 *>          NV is INTEGER
 *>          The number of rows of work array WV available for
-*>          workspace.  NV.GE.NW.
+*>          workspace.  NV >= NW.
 *> \endverbatim
 *>
 *> \param[out] WV
@@ -231,7 +231,7 @@
 *> \verbatim
 *>          LDWV is INTEGER
 *>          The leading dimension of W just as declared in the
-*>          calling subroutine.  NW .LE. LDV
+*>          calling subroutine.  NW <= LDV
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/slaqr3.f b/lapack-netlib/SRC/slaqr3.f
index 534e2c489..ccad338b9 100644
--- a/lapack-netlib/SRC/slaqr3.f
+++ b/lapack-netlib/SRC/slaqr3.f
@@ -100,7 +100,7 @@
 *> \param[in] NW
 *> \verbatim
 *>          NW is INTEGER
-*>          Deflation window size.  1 .LE. NW .LE. (KBOT-KTOP+1).
+*>          Deflation window size.  1 <= NW <= (KBOT-KTOP+1).
 *> \endverbatim
 *>
 *> \param[in,out] H
@@ -118,7 +118,7 @@
 *> \verbatim
 *>          LDH is INTEGER
 *>          Leading dimension of H just as declared in the calling
-*>          subroutine.  N .LE. LDH
+*>          subroutine.  N <= LDH
 *> \endverbatim
 *>
 *> \param[in] ILOZ
@@ -130,7 +130,7 @@
 *> \verbatim
 *>          IHIZ is INTEGER
 *>          Specify the rows of Z to which transformations must be
-*>          applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N.
+*>          applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N.
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -146,7 +146,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>          The leading dimension of Z just as declared in the
-*>          calling subroutine.  1 .LE. LDZ.
+*>          calling subroutine.  1 <= LDZ.
 *> \endverbatim
 *>
 *> \param[out] NS
@@ -191,13 +191,13 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of V just as declared in the
-*>          calling subroutine.  NW .LE. LDV
+*>          calling subroutine.  NW <= LDV
 *> \endverbatim
 *>
 *> \param[in] NH
 *> \verbatim
 *>          NH is INTEGER
-*>          The number of columns of T.  NH.GE.NW.
+*>          The number of columns of T.  NH >= NW.
 *> \endverbatim
 *>
 *> \param[out] T
@@ -209,14 +209,14 @@
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of T just as declared in the
-*>          calling subroutine.  NW .LE. LDT
+*>          calling subroutine.  NW <= LDT
 *> \endverbatim
 *>
 *> \param[in] NV
 *> \verbatim
 *>          NV is INTEGER
 *>          The number of rows of work array WV available for
-*>          workspace.  NV.GE.NW.
+*>          workspace.  NV >= NW.
 *> \endverbatim
 *>
 *> \param[out] WV
@@ -228,7 +228,7 @@
 *> \verbatim
 *>          LDWV is INTEGER
 *>          The leading dimension of W just as declared in the
-*>          calling subroutine.  NW .LE. LDV
+*>          calling subroutine.  NW <= LDV
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/slaqr4.f b/lapack-netlib/SRC/slaqr4.f
index 12b6b2fb1..cd642e07f 100644
--- a/lapack-netlib/SRC/slaqr4.f
+++ b/lapack-netlib/SRC/slaqr4.f
@@ -74,7 +74,7 @@
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
-*>           The order of the matrix H.  N .GE. 0.
+*>           The order of the matrix H.  N >= 0.
 *> \endverbatim
 *>
 *> \param[in] ILO
@@ -86,12 +86,12 @@
 *> \verbatim
 *>          IHI is INTEGER
 *>           It is assumed that H is already upper triangular in rows
-*>           and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1,
+*>           and columns 1:ILO-1 and IHI+1:N and, if ILO > 1,
 *>           H(ILO,ILO-1) is zero. ILO and IHI are normally set by a
 *>           previous call to SGEBAL, and then passed to SGEHRD when the
 *>           matrix output by SGEBAL is reduced to Hessenberg form.
 *>           Otherwise, ILO and IHI should be set to 1 and N,
-*>           respectively.  If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+*>           respectively.  If N > 0, then 1 <= ILO <= IHI <= N.
 *>           If N = 0, then ILO = 1 and IHI = 0.
 *> \endverbatim
 *>
@@ -104,19 +104,19 @@
 *>           decomposition (the Schur form); 2-by-2 diagonal blocks
 *>           (corresponding to complex conjugate pairs of eigenvalues)
 *>           are returned in standard form, with H(i,i) = H(i+1,i+1)
-*>           and H(i+1,i)*H(i,i+1).LT.0. If INFO = 0 and WANTT is
+*>           and H(i+1,i)*H(i,i+1) < 0. If INFO = 0 and WANTT is
 *>           .FALSE., then the contents of H are unspecified on exit.
-*>           (The output value of H when INFO.GT.0 is given under the
+*>           (The output value of H when INFO > 0 is given under the
 *>           description of INFO below.)
 *>
-*>           This subroutine may explicitly set H(i,j) = 0 for i.GT.j and
+*>           This subroutine may explicitly set H(i,j) = 0 for i > j and
 *>           j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N.
 *> \endverbatim
 *>
 *> \param[in] LDH
 *> \verbatim
 *>          LDH is INTEGER
-*>           The leading dimension of the array H. LDH .GE. max(1,N).
+*>           The leading dimension of the array H. LDH >= max(1,N).
 *> \endverbatim
 *>
 *> \param[out] WR
@@ -132,7 +132,7 @@
 *>           and WI(ILO:IHI). If two eigenvalues are computed as a
 *>           complex conjugate pair, they are stored in consecutive
 *>           elements of WR and WI, say the i-th and (i+1)th, with
-*>           WI(i) .GT. 0 and WI(i+1) .LT. 0. If WANTT is .TRUE., then
+*>           WI(i) > 0 and WI(i+1) < 0. If WANTT is .TRUE., then
 *>           the eigenvalues are stored in the same order as on the
 *>           diagonal of the Schur form returned in H, with
 *>           WR(i) = H(i,i) and, if H(i:i+1,i:i+1) is a 2-by-2 diagonal
@@ -150,7 +150,7 @@
 *>          IHIZ is INTEGER
 *>           Specify the rows of Z to which transformations must be
 *>           applied if WANTZ is .TRUE..
-*>           1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N.
+*>           1 <= ILOZ <= ILO; IHI <= IHIZ <= N.
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -160,7 +160,7 @@
 *>           If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is
 *>           replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the
 *>           orthogonal Schur factor of H(ILO:IHI,ILO:IHI).
-*>           (The output value of Z when INFO.GT.0 is given under
+*>           (The output value of Z when INFO > 0 is given under
 *>           the description of INFO below.)
 *> \endverbatim
 *>
@@ -168,7 +168,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>           The leading dimension of the array Z.  if WANTZ is .TRUE.
-*>           then LDZ.GE.MAX(1,IHIZ).  Otherwize, LDZ.GE.1.
+*>           then LDZ >= MAX(1,IHIZ).  Otherwise, LDZ >= 1.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -181,7 +181,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>           The dimension of the array WORK.  LWORK .GE. max(1,N)
+*>           The dimension of the array WORK.  LWORK >= max(1,N)
 *>           is sufficient, but LWORK typically as large as 6*N may
 *>           be required for optimal performance.  A workspace query
 *>           to determine the optimal workspace size is recommended.
@@ -199,19 +199,19 @@
 *>          INFO is INTEGER
 *> \verbatim
 *>          INFO is INTEGER
-*>             =  0:  successful exit
-*>           .GT. 0:  if INFO = i, SLAQR4 failed to compute all of
+*>             = 0:  successful exit
+*>             > 0:  if INFO = i, SLAQR4 failed to compute all of
 *>                the eigenvalues.  Elements 1:ilo-1 and i+1:n of WR
 *>                and WI contain those eigenvalues which have been
 *>                successfully computed.  (Failures are rare.)
 *>
-*>                If INFO .GT. 0 and WANT is .FALSE., then on exit,
+*>                If INFO > 0 and WANT is .FALSE., then on exit,
 *>                the remaining unconverged eigenvalues are the eigen-
 *>                values of the upper Hessenberg matrix rows and
 *>                columns ILO through INFO of the final, output
 *>                value of H.
 *>
-*>                If INFO .GT. 0 and WANTT is .TRUE., then on exit
+*>                If INFO > 0 and WANTT is .TRUE., then on exit
 *>
 *>           (*)  (initial value of H)*U  = U*(final value of H)
 *>
@@ -219,7 +219,7 @@
 *>                value of  H is upper Hessenberg and triangular in
 *>                rows and columns INFO+1 through IHI.
 *>
-*>                If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+*>                If INFO > 0 and WANTZ is .TRUE., then on exit
 *>
 *>                  (final value of Z(ILO:IHI,ILOZ:IHIZ)
 *>                   =  (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U
@@ -227,7 +227,7 @@
 *>                where U is the orthogonal matrix in (*) (regard-
 *>                less of the value of WANTT.)
 *>
-*>                If INFO .GT. 0 and WANTZ is .FALSE., then Z is not
+*>                If INFO > 0 and WANTZ is .FALSE., then Z is not
 *>                accessed.
 *> \endverbatim
 *
@@ -680,7 +680,7 @@
                   END IF
                END IF
 *
-*              ==== Use up to NS of the the smallest magnatiude
+*              ==== Use up to NS of the the smallest magnitude
 *              .    shifts.  If there aren't NS shifts available,
 *              .    then use them all, possibly dropping one to
 *              .    make the number of shifts even. ====
diff --git a/lapack-netlib/SRC/slaqr5.f b/lapack-netlib/SRC/slaqr5.f
index 65278e355..f04ee577e 100644
--- a/lapack-netlib/SRC/slaqr5.f
+++ b/lapack-netlib/SRC/slaqr5.f
@@ -133,7 +133,7 @@
 *> \verbatim
 *>          LDH is INTEGER
 *>             LDH is the leading dimension of H just as declared in the
-*>             calling procedure.  LDH.GE.MAX(1,N).
+*>             calling procedure.  LDH >= MAX(1,N).
 *> \endverbatim
 *>
 *> \param[in] ILOZ
@@ -145,7 +145,7 @@
 *> \verbatim
 *>          IHIZ is INTEGER
 *>             Specify the rows of Z to which transformations must be
-*>             applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N
+*>             applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -161,7 +161,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>             LDA is the leading dimension of Z just as declared in
-*>             the calling procedure. LDZ.GE.N.
+*>             the calling procedure. LDZ >= N.
 *> \endverbatim
 *>
 *> \param[out] V
@@ -173,7 +173,7 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>             LDV is the leading dimension of V as declared in the
-*>             calling procedure.  LDV.GE.3.
+*>             calling procedure.  LDV >= 3.
 *> \endverbatim
 *>
 *> \param[out] U
@@ -185,33 +185,14 @@
 *> \verbatim
 *>          LDU is INTEGER
 *>             LDU is the leading dimension of U just as declared in the
-*>             in the calling subroutine.  LDU.GE.3*NSHFTS-3.
-*> \endverbatim
-*>
-*> \param[in] NH
-*> \verbatim
-*>          NH is INTEGER
-*>             NH is the number of columns in array WH available for
-*>             workspace. NH.GE.1.
-*> \endverbatim
-*>
-*> \param[out] WH
-*> \verbatim
-*>          WH is REAL array, dimension (LDWH,NH)
-*> \endverbatim
-*>
-*> \param[in] LDWH
-*> \verbatim
-*>          LDWH is INTEGER
-*>             Leading dimension of WH just as declared in the
-*>             calling procedure.  LDWH.GE.3*NSHFTS-3.
+*>             in the calling subroutine.  LDU >= 3*NSHFTS-3.
 *> \endverbatim
 *>
 *> \param[in] NV
 *> \verbatim
 *>          NV is INTEGER
 *>             NV is the number of rows in WV agailable for workspace.
-*>             NV.GE.1.
+*>             NV >= 1.
 *> \endverbatim
 *>
 *> \param[out] WV
@@ -223,9 +204,28 @@
 *> \verbatim
 *>          LDWV is INTEGER
 *>             LDWV is the leading dimension of WV as declared in the
-*>             in the calling subroutine.  LDWV.GE.NV.
+*>             in the calling subroutine.  LDWV >= NV.
 *> \endverbatim
 *
+*> \param[in] NH
+*> \verbatim
+*>          NH is INTEGER
+*>             NH is the number of columns in array WH available for
+*>             workspace. NH >= 1.
+*> \endverbatim
+*>
+*> \param[out] WH
+*> \verbatim
+*>          WH is REAL array, dimension (LDWH,NH)
+*> \endverbatim
+*>
+*> \param[in] LDWH
+*> \verbatim
+*>          LDWH is INTEGER
+*>             Leading dimension of WH just as declared in the
+*>             calling procedure.  LDWH >= 3*NSHFTS-3.
+*> \endverbatim
+*>
 *  Authors:
 *  ========
 *
diff --git a/lapack-netlib/SRC/slarfb.f b/lapack-netlib/SRC/slarfb.f
index c51f69534..d853a54ec 100644
--- a/lapack-netlib/SRC/slarfb.f
+++ b/lapack-netlib/SRC/slarfb.f
@@ -92,6 +92,8 @@
 *>          K is INTEGER
 *>          The order of the matrix T (= the number of elementary
 *>          reflectors whose product defines the block reflector).
+*>          If SIDE = 'L', M >= K >= 0;
+*>          if SIDE = 'R', N >= K >= 0.
 *> \endverbatim
 *>
 *> \param[in] V
diff --git a/lapack-netlib/SRC/slarfx.f b/lapack-netlib/SRC/slarfx.f
index 590e99e70..3175068b8 100644
--- a/lapack-netlib/SRC/slarfx.f
+++ b/lapack-netlib/SRC/slarfx.f
@@ -94,7 +94,7 @@
 *> \param[in] LDC
 *> \verbatim
 *>          LDC is INTEGER
-*>          The leading dimension of the array C. LDA >= (1,M).
+*>          The leading dimension of the array C. LDC >= (1,M).
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/slarfy.f b/lapack-netlib/SRC/slarfy.f
index 340c54413..f9ba011a2 100644
--- a/lapack-netlib/SRC/slarfy.f
+++ b/lapack-netlib/SRC/slarfy.f
@@ -103,7 +103,7 @@
 *
 *> \date December 2016
 *
-*> \ingroup single_eig
+*> \ingroup realOTHERauxiliary
 *
 *  =====================================================================
       SUBROUTINE SLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
diff --git a/lapack-netlib/SRC/slarrb.f b/lapack-netlib/SRC/slarrb.f
index 988e25ff0..ac9d7bc8c 100644
--- a/lapack-netlib/SRC/slarrb.f
+++ b/lapack-netlib/SRC/slarrb.f
@@ -91,7 +91,7 @@
 *>          RTOL2 is REAL
 *>          Tolerance for the convergence of the bisection intervals.
 *>          An interval [LEFT,RIGHT] has converged if
-*>          RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
+*>          RIGHT-LEFT < MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
 *>          where GAP is the (estimated) distance to the nearest
 *>          eigenvalue.
 *> \endverbatim
@@ -117,7 +117,7 @@
 *>          WGAP is REAL array, dimension (N-1)
 *>          On input, the (estimated) gaps between consecutive
 *>          eigenvalues of L D L^T, i.e., WGAP(I-OFFSET) is the gap between
-*>          eigenvalues I and I+1. Note that if IFIRST.EQ.ILAST
+*>          eigenvalues I and I+1. Note that if IFIRST = ILAST
 *>          then WGAP(IFIRST-OFFSET) must be set to ZERO.
 *>          On output, these gaps are refined.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/slarre.f b/lapack-netlib/SRC/slarre.f
index ea9b8fcbc..6636235d0 100644
--- a/lapack-netlib/SRC/slarre.f
+++ b/lapack-netlib/SRC/slarre.f
@@ -150,7 +150,7 @@
 *>          RTOL2 is REAL
 *>           Parameters for bisection.
 *>           An interval [LEFT,RIGHT] has converged if
-*>           RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
+*>           RIGHT-LEFT < MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
 *> \endverbatim
 *>
 *> \param[in] SPLTOL
diff --git a/lapack-netlib/SRC/slarrj.f b/lapack-netlib/SRC/slarrj.f
index a721d0751..fb867595c 100644
--- a/lapack-netlib/SRC/slarrj.f
+++ b/lapack-netlib/SRC/slarrj.f
@@ -85,7 +85,7 @@
 *>          RTOL is REAL
 *>          Tolerance for the convergence of the bisection intervals.
 *>          An interval [LEFT,RIGHT] has converged if
-*>          RIGHT-LEFT.LT.RTOL*MAX(|LEFT|,|RIGHT|).
+*>          RIGHT-LEFT < RTOL*MAX(|LEFT|,|RIGHT|).
 *> \endverbatim
 *>
 *> \param[in] OFFSET
diff --git a/lapack-netlib/SRC/slarrv.f b/lapack-netlib/SRC/slarrv.f
index f9e3cf2b9..04519fde8 100644
--- a/lapack-netlib/SRC/slarrv.f
+++ b/lapack-netlib/SRC/slarrv.f
@@ -149,7 +149,7 @@
 *>          RTOL2 is REAL
 *>           Parameters for bisection.
 *>           An interval [LEFT,RIGHT] has converged if
-*>           RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
+*>           RIGHT-LEFT < MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
 *> \endverbatim
 *>
 *> \param[in,out] W
diff --git a/lapack-netlib/SRC/slasd7.f b/lapack-netlib/SRC/slasd7.f
index 2adaa5ee7..d8775b6c6 100644
--- a/lapack-netlib/SRC/slasd7.f
+++ b/lapack-netlib/SRC/slasd7.f
@@ -400,7 +400,7 @@
          VL( I ) = VLW( IDXI )
    50 CONTINUE
 *
-*     Calculate the allowable deflation tolerence
+*     Calculate the allowable deflation tolerance
 *
       EPS = SLAMCH( 'Epsilon' )
       TOL = MAX( ABS( ALPHA ), ABS( BETA ) )
diff --git a/lapack-netlib/SRC/slassq.f b/lapack-netlib/SRC/slassq.f
index 35b40f07f..d9930a597 100644
--- a/lapack-netlib/SRC/slassq.f
+++ b/lapack-netlib/SRC/slassq.f
@@ -60,7 +60,7 @@
 *>
 *> \param[in] X
 *> \verbatim
-*>          X is REAL array, dimension (N)
+*>          X is REAL array, dimension (1+(N-1)*INCX)
 *>          The vector for which a scaled sum of squares is computed.
 *>             x( i )  = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/slaswlq.f b/lapack-netlib/SRC/slaswlq.f
index 27b5b8067..5eb9cc5f9 100644
--- a/lapack-netlib/SRC/slaswlq.f
+++ b/lapack-netlib/SRC/slaswlq.f
@@ -1,3 +1,4 @@
+*> \brief \b SLASWLQ
 *
 *  Definition:
 *  ===========
@@ -18,9 +19,20 @@
 *>
 *> \verbatim
 *>
-*>          SLASWLQ computes a blocked Short-Wide LQ factorization of a
-*>          M-by-N matrix A, where N >= M:
-*>          A = L * Q
+*> SLASWLQ computes a blocked Tall-Skinny LQ factorization of
+*> a real M-by-N matrix A for M <= N:
+*>
+*>    A = ( L 0 ) *  Q,
+*>
+*> where:
+*>
+*>    Q is a n-by-N orthogonal matrix, stored on exit in an implicit
+*>    form in the elements above the digonal of the array A and in
+*>    the elemenst of the array T;
+*>    L is an lower-triangular M-by-M matrix stored on exit in
+*>    the elements on and below the diagonal of the array A.
+*>    0 is a M-by-(N-M) zero matrix, if M < N, and is not stored.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -150,10 +162,10 @@
       SUBROUTINE SLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
      $                  INFO)
 *
-*  -- LAPACK computational routine (version 3.8.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     November 2017
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER           INFO, LDA, M, N, MB, NB, LWORK, LDT
diff --git a/lapack-netlib/SRC/slasyf_aa.f b/lapack-netlib/SRC/slasyf_aa.f
index ed4ef6291..76f632602 100644
--- a/lapack-netlib/SRC/slasyf_aa.f
+++ b/lapack-netlib/SRC/slasyf_aa.f
@@ -284,8 +284,9 @@
 *
 *              Swap A(I1, I2+1:M) with A(I2, I2+1:M)
 *
-               CALL SSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA,
-     $                           A( J1+I2-1, I2+1 ), LDA )
+               IF( I2.LT.M )
+     $            CALL SSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA,
+     $                              A( J1+I2-1, I2+1 ), LDA )
 *
 *              Swap A(I1, I1) with A(I2,I2)
 *
@@ -325,13 +326,15 @@
 *           Compute L(J+2, J+1) = WORK( 3:M ) / T(J, J+1),
 *            where A(J, J+1) = T(J, J+1) and A(J+2:M, J) = L(J+2:M, J+1)
 *
-            IF( A( K, J+1 ).NE.ZERO ) THEN
-               ALPHA = ONE / A( K, J+1 )
-               CALL SCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA )
-               CALL SSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA )
-            ELSE
-               CALL SLASET( 'Full', 1, M-J-1, ZERO, ZERO,
-     $                      A( K, J+2 ), LDA)
+            IF( J.LT.(M-1) ) THEN
+               IF( A( K, J+1 ).NE.ZERO ) THEN
+                  ALPHA = ONE / A( K, J+1 )
+                  CALL SCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA )
+                  CALL SSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA )
+               ELSE
+                  CALL SLASET( 'Full', 1, M-J-1, ZERO, ZERO,
+     $                         A( K, J+2 ), LDA)
+               END IF
             END IF
          END IF
          J = J + 1
@@ -432,8 +435,9 @@
 *
 *              Swap A(I2+1:M, I1) with A(I2+1:M, I2)
 *
-               CALL SSWAP( M-I2, A( I2+1, J1+I1-1 ), 1,
-     $                           A( I2+1, J1+I2-1 ), 1 )
+               IF( I2.LT.M )
+     $            CALL SSWAP( M-I2, A( I2+1, J1+I1-1 ), 1,
+     $                              A( I2+1, J1+I2-1 ), 1 )
 *
 *              Swap A(I1, I1) with A(I2, I2)
 *
@@ -473,13 +477,15 @@
 *           Compute L(J+2, J+1) = WORK( 3:M ) / T(J, J+1),
 *            where A(J, J+1) = T(J, J+1) and A(J+2:M, J) = L(J+2:M, J+1)
 *
-            IF( A( J+1, K ).NE.ZERO ) THEN
-               ALPHA = ONE / A( J+1, K )
-               CALL SCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 )
-               CALL SSCAL( M-J-1, ALPHA, A( J+2, K ), 1 )
-            ELSE
-               CALL SLASET( 'Full', M-J-1, 1, ZERO, ZERO,
-     $                      A( J+2, K ), LDA )
+            IF( J.LT.(M-1) ) THEN
+               IF( A( J+1, K ).NE.ZERO ) THEN
+                  ALPHA = ONE / A( J+1, K )
+                  CALL SCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 )
+                  CALL SSCAL( M-J-1, ALPHA, A( J+2, K ), 1 )
+               ELSE
+                  CALL SLASET( 'Full', M-J-1, 1, ZERO, ZERO,
+     $                         A( J+2, K ), LDA )
+               END IF
             END IF
          END IF
          J = J + 1
diff --git a/lapack-netlib/SRC/slasyf_rk.f b/lapack-netlib/SRC/slasyf_rk.f
index b1b37177f..c16708365 100644
--- a/lapack-netlib/SRC/slasyf_rk.f
+++ b/lapack-netlib/SRC/slasyf_rk.f
@@ -321,7 +321,7 @@
 *        of A and working backwards, and compute the matrix W = U12*D
 *        for use in updating A11
 *
-*        Initilize the first entry of array E, where superdiagonal
+*        Initialize the first entry of array E, where superdiagonal
 *        elements of D are stored
 *
          E( 1 ) = ZERO
@@ -649,7 +649,7 @@
 *        of A and working forwards, and compute the matrix W = L21*D
 *        for use in updating A22
 *
-*        Initilize the unused last entry of the subdiagonal array E.
+*        Initialize the unused last entry of the subdiagonal array E.
 *
          E( N ) = ZERO
 *
diff --git a/lapack-netlib/SRC/slatdf.f b/lapack-netlib/SRC/slatdf.f
index 5496f9db4..495d32502 100644
--- a/lapack-netlib/SRC/slatdf.f
+++ b/lapack-netlib/SRC/slatdf.f
@@ -85,7 +85,7 @@
 *>          RHS is REAL array, dimension N.
 *>          On entry, RHS contains contributions from other subsystems.
 *>          On exit, RHS contains the solution of the subsystem with
-*>          entries acoording to the value of IJOB (see above).
+*>          entries according to the value of IJOB (see above).
 *> \endverbatim
 *>
 *> \param[in,out] RDSUM
@@ -260,7 +260,7 @@
 *
 *        Solve for U-part, look-ahead for RHS(N) = +-1. This is not done
 *        in BSOLVE and will hopefully give us a better estimate because
-*        any ill-conditioning of the original matrix is transfered to U
+*        any ill-conditioning of the original matrix is transferred to U
 *        and not to L. U(N, N) is an approximation to sigma_min(LU).
 *
          CALL SCOPY( N-1, RHS, 1, XP, 1 )
diff --git a/lapack-netlib/SRC/slatsqr.f b/lapack-netlib/SRC/slatsqr.f
index d6d682799..b56b0d41e 100644
--- a/lapack-netlib/SRC/slatsqr.f
+++ b/lapack-netlib/SRC/slatsqr.f
@@ -1,3 +1,4 @@
+*> \brief \b SLATSQR
 *
 *  Definition:
 *  ===========
@@ -19,8 +20,22 @@
 *> \verbatim
 *>
 *> SLATSQR computes a blocked Tall-Skinny QR factorization of
-*> an M-by-N matrix A, where M >= N:
-*> A = Q * R .
+*> a real M-by-N matrix A for M >= N:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a M-by-M orthogonal matrix, stored on exit in an implicit
+*>    form in the elements below the digonal of the array A and in
+*>    the elemenst of the array T;
+*>
+*>    R is an upper-triangular N-by-N matrix, stored on exit in
+*>    the elements on and above the diagonal of the array A.
+*>
+*>    0 is a (M-N)-by-N zero matrix, and is not stored.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -149,10 +164,10 @@
       SUBROUTINE SLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK,
      $                    LWORK, INFO)
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER           INFO, LDA, M, N, MB, NB, LDT, LWORK
diff --git a/lapack-netlib/SRC/sorgtsqr.f b/lapack-netlib/SRC/sorgtsqr.f
new file mode 100644
index 000000000..748760d63
--- /dev/null
+++ b/lapack-netlib/SRC/sorgtsqr.f
@@ -0,0 +1,306 @@
+*> \brief \b SORGTSQR
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SORGTSQR + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sorgtsqr.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sorgtsqr.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sorgtsqr.f">
+*> [TXT]</a>
+*>
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SORGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
+*      $                     INFO )
+*
+*       .. Scalar Arguments ..
+*       INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
+*       ..
+*       .. Array Arguments ..
+*       REAL              A( LDA, * ), T( LDT, * ), WORK( * )
+*       ..
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SORGTSQR generates an M-by-N real matrix Q_out with orthonormal columns,
+*> which are the first N columns of a product of real orthogonal
+*> matrices of order M which are returned by SLATSQR
+*>
+*>      Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+*>
+*> See the documentation for SLATSQR.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB
+*> \verbatim
+*>          MB is INTEGER
+*>          The row block size used by SLATSQR to return
+*>          arrays A and T. MB > N.
+*>          (Note that if MB > M, then M is used instead of MB
+*>          as the row block size).
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          The column block size used by SLATSQR to return
+*>          arrays A and T. NB >= 1.
+*>          (Note that if NB > N, then N is used instead of NB
+*>          as the column block size).
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N)
+*>
+*>          On entry:
+*>
+*>             The elements on and above the diagonal are not accessed.
+*>             The elements below the diagonal represent the unit
+*>             lower-trapezoidal blocked matrix V computed by SLATSQR
+*>             that defines the input matrices Q_in(k) (ones on the
+*>             diagonal are not stored) (same format as the output A
+*>             below the diagonal in SLATSQR).
+*>
+*>          On exit:
+*>
+*>             The array A contains an M-by-N orthonormal matrix Q_out,
+*>             i.e the columns of A are orthogonal unit vectors.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is REAL array,
+*>          dimension (LDT, N * NIRB)
+*>          where NIRB = Number_of_input_row_blocks
+*>                     = MAX( 1, CEIL((M-N)/(MB-N)) )
+*>          Let NICB = Number_of_input_col_blocks
+*>                   = CEIL(N/NB)
+*>
+*>          The upper-triangular block reflectors used to define the
+*>          input matrices Q_in(k), k=(1:NIRB*NICB). The block
+*>          reflectors are stored in compact form in NIRB block
+*>          reflector sequences. Each of NIRB block reflector sequences
+*>          is stored in a larger NB-by-N column block of T and consists
+*>          of NICB smaller NB-by-NB upper-triangular column blocks.
+*>          (same format as the output T in SLATSQR).
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T.
+*>          LDT >= max(1,min(NB1,N)).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          (workspace) REAL array, dimension (MAX(2,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          The dimension of the array WORK.  LWORK >= (M+NB)*N.
+*>          If LWORK = -1, then a workspace query is assumed.
+*>          The routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup singleOTHERcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2019, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SORGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
+*     ..
+*     .. Array Arguments ..
+      REAL              A( LDA, * ), T( LDT, * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ONE, ZERO
+      PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY
+      INTEGER            IINFO, LDC, LWORKOPT, LC, LW, NBLOCAL, J
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SCOPY, SLAMTSQR, SLASET, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          REAL, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters
+*
+      LQUERY  = LWORK.EQ.-1
+      INFO = 0
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
+         INFO = -2
+      ELSE IF( MB.LE.N ) THEN
+         INFO = -3
+      ELSE IF( NB.LT.1 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -6
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
+         INFO = -8
+      ELSE
+*
+*        Test the input LWORK for the dimension of the array WORK.
+*        This workspace is used to store array C(LDC, N) and WORK(LWORK)
+*        in the call to DLAMTSQR. See the documentation for DLAMTSQR.
+*
+         IF( LWORK.LT.2 .AND. (.NOT.LQUERY) ) THEN
+            INFO = -10
+         ELSE
+*
+*           Set block size for column blocks
+*
+            NBLOCAL = MIN( NB, N )
+*
+*           LWORK = -1, then set the size for the array C(LDC,N)
+*           in DLAMTSQR call and set the optimal size of the work array
+*           WORK(LWORK) in DLAMTSQR call.
+*
+            LDC = M
+            LC = LDC*N
+            LW = N * NBLOCAL
+*
+            LWORKOPT = LC+LW
+*
+            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
+               INFO = -10
+            END IF
+         END IF
+*
+      END IF
+*
+*     Handle error in the input parameters and return workspace query.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SORGTSQR', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+         WORK( 1 ) = REAL( LWORKOPT )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 ) THEN
+         WORK( 1 ) = REAL( LWORKOPT )
+         RETURN
+      END IF
+*
+*     (1) Form explicitly the tall-skinny M-by-N left submatrix Q1_in
+*     of M-by-M orthogonal matrix Q_in, which is implicitly stored in
+*     the subdiagonal part of input array A and in the input array T.
+*     Perform by the following operation using the routine DLAMTSQR.
+*
+*         Q1_in = Q_in * ( I ), where I is a N-by-N identity matrix,
+*                        ( 0 )        0 is a (M-N)-by-N zero matrix.
+*
+*     (1a) Form M-by-N matrix in the array WORK(1:LDC*N) with ones
+*     on the diagonal and zeros elsewhere.
+*
+      CALL SLASET( 'F', M, N, ZERO, ONE, WORK, LDC )
+*
+*     (1b)  On input, WORK(1:LDC*N) stores ( I );
+*                                          ( 0 )
+*
+*           On output, WORK(1:LDC*N) stores Q1_in.
+*
+      CALL SLAMTSQR( 'L', 'N', M, N, N, MB, NBLOCAL, A, LDA, T, LDT,
+     $               WORK, LDC, WORK( LC+1 ), LW, IINFO )
+*
+*     (2) Copy the result from the part of the work array (1:M,1:N)
+*     with the leading dimension LDC that starts at WORK(1) into
+*     the output array A(1:M,1:N) column-by-column.
+*
+      DO J = 1, N
+         CALL SCOPY( M, WORK( (J-1)*LDC + 1 ), 1, A( 1, J ), 1 )
+      END DO
+*
+      WORK( 1 ) = REAL( LWORKOPT )
+      RETURN
+*
+*     End of SORGTSQR
+*
+      END
\ No newline at end of file
diff --git a/lapack-netlib/SRC/sorhr_col.f b/lapack-netlib/SRC/sorhr_col.f
new file mode 100644
index 000000000..38976245c
--- /dev/null
+++ b/lapack-netlib/SRC/sorhr_col.f
@@ -0,0 +1,439 @@
+*> \brief \b SORHR_COL
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SORHR_COL + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/sorhr_col.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/sorhr_col.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/sorhr_col.f">
+*> [TXT]</a>
+*>
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SORHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO )
+*
+*       .. Scalar Arguments ..
+*       INTEGER           INFO, LDA, LDT, M, N, NB
+*       ..
+*       .. Array Arguments ..
+*       REAL              A( LDA, * ), D( * ), T( LDT, * )
+*       ..
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*>  SORHR_COL takes an M-by-N real matrix Q_in with orthonormal columns
+*>  as input, stored in A, and performs Householder Reconstruction (HR),
+*>  i.e. reconstructs Householder vectors V(i) implicitly representing
+*>  another M-by-N matrix Q_out, with the property that Q_in = Q_out*S,
+*>  where S is an N-by-N diagonal matrix with diagonal entries
+*>  equal to +1 or -1. The Householder vectors (columns V(i) of V) are
+*>  stored in A on output, and the diagonal entries of S are stored in D.
+*>  Block reflectors are also returned in T
+*>  (same output format as SGEQRT).
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          The column block size to be used in the reconstruction
+*>          of Householder column vector blocks in the array A and
+*>          corresponding block reflectors in the array T. NB >= 1.
+*>          (Note that if NB > N, then N is used instead of NB
+*>          as the column block size.)
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N)
+*>
+*>          On entry:
+*>
+*>             The array A contains an M-by-N orthonormal matrix Q_in,
+*>             i.e the columns of A are orthogonal unit vectors.
+*>
+*>          On exit:
+*>
+*>             The elements below the diagonal of A represent the unit
+*>             lower-trapezoidal matrix V of Householder column vectors
+*>             V(i). The unit diagonal entries of V are not stored
+*>             (same format as the output below the diagonal in A from
+*>             SGEQRT). The matrix T and the matrix V stored on output
+*>             in A implicitly define Q_out.
+*>
+*>             The elements above the diagonal contain the factor U
+*>             of the "modified" LU-decomposition:
+*>                Q_in - ( S ) = V * U
+*>                       ( 0 )
+*>             where 0 is a (M-N)-by-(M-N) zero matrix.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] T
+*> \verbatim
+*>          T is REAL array,
+*>          dimension (LDT, N)
+*>
+*>          Let NOCB = Number_of_output_col_blocks
+*>                   = CEIL(N/NB)
+*>
+*>          On exit, T(1:NB, 1:N) contains NOCB upper-triangular
+*>          block reflectors used to define Q_out stored in compact
+*>          form as a sequence of upper-triangular NB-by-NB column
+*>          blocks (same format as the output T in SGEQRT).
+*>          The matrix T and the matrix V stored on output in A
+*>          implicitly define Q_out. NOTE: The lower triangles
+*>          below the upper-triangular blcoks will be filled with
+*>          zeros. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T.
+*>          LDT >= max(1,min(NB,N)).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is REAL array, dimension min(M,N).
+*>          The elements can be only plus or minus one.
+*>
+*>          D(i) is constructed as D(i) = -SIGN(Q_in_i(i,i)), where
+*>          1 <= i <= min(M,N), and Q_in_i is Q_in after performing
+*>          i-1 steps of “modified” Gaussian elimination.
+*>          See Further Details.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*> The computed M-by-M orthogonal factor Q_out is defined implicitly as
+*> a product of orthogonal matrices Q_out(i). Each Q_out(i) is stored in
+*> the compact WY-representation format in the corresponding blocks of
+*> matrices V (stored in A) and T.
+*>
+*> The M-by-N unit lower-trapezoidal matrix V stored in the M-by-N
+*> matrix A contains the column vectors V(i) in NB-size column
+*> blocks VB(j). For example, VB(1) contains the columns
+*> V(1), V(2), ... V(NB). NOTE: The unit entries on
+*> the diagonal of Y are not stored in A.
+*>
+*> The number of column blocks is
+*>
+*>     NOCB = Number_of_output_col_blocks = CEIL(N/NB)
+*>
+*> where each block is of order NB except for the last block, which
+*> is of order LAST_NB = N - (NOCB-1)*NB.
+*>
+*> For example, if M=6,  N=5 and NB=2, the matrix V is
+*>
+*>
+*>     V = (    VB(1),   VB(2), VB(3) ) =
+*>
+*>       = (   1                      )
+*>         ( v21    1                 )
+*>         ( v31  v32    1            )
+*>         ( v41  v42  v43   1        )
+*>         ( v51  v52  v53  v54    1  )
+*>         ( v61  v62  v63  v54   v65 )
+*>
+*>
+*> For each of the column blocks VB(i), an upper-triangular block
+*> reflector TB(i) is computed. These blocks are stored as
+*> a sequence of upper-triangular column blocks in the NB-by-N
+*> matrix T. The size of each TB(i) block is NB-by-NB, except
+*> for the last block, whose size is LAST_NB-by-LAST_NB.
+*>
+*> For example, if M=6,  N=5 and NB=2, the matrix T is
+*>
+*>     T  = (    TB(1),    TB(2), TB(3) ) =
+*>
+*>        = ( t11  t12  t13  t14   t15  )
+*>          (      t22       t24        )
+*>
+*>
+*> The M-by-M factor Q_out is given as a product of NOCB
+*> orthogonal M-by-M matrices Q_out(i).
+*>
+*>     Q_out = Q_out(1) * Q_out(2) * ... * Q_out(NOCB),
+*>
+*> where each matrix Q_out(i) is given by the WY-representation
+*> using corresponding blocks from the matrices V and T:
+*>
+*>     Q_out(i) = I - VB(i) * TB(i) * (VB(i))**T,
+*>
+*> where I is the identity matrix. Here is the formula with matrix
+*> dimensions:
+*>
+*>  Q(i){M-by-M} = I{M-by-M} -
+*>    VB(i){M-by-INB} * TB(i){INB-by-INB} * (VB(i))**T {INB-by-M},
+*>
+*> where INB = NB, except for the last block NOCB
+*> for which INB=LAST_NB.
+*>
+*> =====
+*> NOTE:
+*> =====
+*>
+*> If Q_in is the result of doing a QR factorization
+*> B = Q_in * R_in, then:
+*>
+*> B = (Q_out*S) * R_in = Q_out * (S * R_in) = O_out * R_out.
+*>
+*> So if one wants to interpret Q_out as the result
+*> of the QR factorization of B, then corresponding R_out
+*> should be obtained by R_out = S * R_in, i.e. some rows of R_in
+*> should be multiplied by -1.
+*>
+*> For the details of the algorithm, see [1].
+*>
+*> [1] "Reconstructing Householder vectors from tall-skinny QR",
+*>     G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+*>     E. Solomonik, J. Parallel Distrib. Comput.,
+*>     vol. 85, pp. 3-31, 2015.
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup singleOTHERcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November   2019, Igor Kozachenko,
+*>            Computer Science Division,
+*>            University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE SORHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER           INFO, LDA, LDT, M, N, NB
+*     ..
+*     .. Array Arguments ..
+      REAL              A( LDA, * ), D( * ), T( LDT, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      REAL               ONE, ZERO
+      PARAMETER          ( ONE = 1.0E+0, ZERO = 0.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, IINFO, J, JB, JBTEMP1, JBTEMP2, JNB,
+     $                   NPLUSONE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SCOPY, SLAORHR_COL_GETRFNP, SSCAL, STRSM, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters
+*
+      INFO = 0
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 .OR. N.GT.M ) THEN
+         INFO = -2
+      ELSE IF( NB.LT.1 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -5
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
+         INFO = -7
+      END IF
+*
+*     Handle error in the input parameters.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'SORHR_COL', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 ) THEN
+         RETURN
+      END IF
+*
+*     On input, the M-by-N matrix A contains the orthogonal
+*     M-by-N matrix Q_in.
+*
+*     (1) Compute the unit lower-trapezoidal V (ones on the diagonal
+*     are not stored) by performing the "modified" LU-decomposition.
+*
+*     Q_in - ( S ) = V * U = ( V1 ) * U,
+*            ( 0 )           ( V2 )
+*
+*     where 0 is an (M-N)-by-N zero matrix.
+*
+*     (1-1) Factor V1 and U.
+
+      CALL SLAORHR_COL_GETRFNP( N, N, A, LDA, D, IINFO )
+*
+*     (1-2) Solve for V2.
+*
+      IF( M.GT.N ) THEN
+         CALL STRSM( 'R', 'U', 'N', 'N', M-N, N, ONE, A, LDA,
+     $               A( N+1, 1 ), LDA )
+      END IF
+*
+*     (2) Reconstruct the block reflector T stored in T(1:NB, 1:N)
+*     as a sequence of upper-triangular blocks with NB-size column
+*     blocking.
+*
+*     Loop over the column blocks of size NB of the array A(1:M,1:N)
+*     and the array T(1:NB,1:N), JB is the column index of a column
+*     block, JNB is the column block size at each step JB.
+*
+      NPLUSONE = N + 1
+      DO JB = 1, N, NB
+*
+*        (2-0) Determine the column block size JNB.
+*
+         JNB = MIN( NPLUSONE-JB, NB )
+*
+*        (2-1) Copy the upper-triangular part of the current JNB-by-JNB
+*        diagonal block U(JB) (of the N-by-N matrix U) stored
+*        in A(JB:JB+JNB-1,JB:JB+JNB-1) into the upper-triangular part
+*        of the current JNB-by-JNB block T(1:JNB,JB:JB+JNB-1)
+*        column-by-column, total JNB*(JNB+1)/2 elements.
+*
+         JBTEMP1 = JB - 1
+         DO J = JB, JB+JNB-1
+            CALL SCOPY( J-JBTEMP1, A( JB, J ), 1, T( 1, J ), 1 )
+         END DO
+*
+*        (2-2) Perform on the upper-triangular part of the current
+*        JNB-by-JNB diagonal block U(JB) (of the N-by-N matrix U) stored
+*        in T(1:JNB,JB:JB+JNB-1) the following operation in place:
+*        (-1)*U(JB)*S(JB), i.e the result will be stored in the upper-
+*        triangular part of T(1:JNB,JB:JB+JNB-1). This multiplication
+*        of the JNB-by-JNB diagonal block U(JB) by the JNB-by-JNB
+*        diagonal block S(JB) of the N-by-N sign matrix S from the
+*        right means changing the sign of each J-th column of the block
+*        U(JB) according to the sign of the diagonal element of the block
+*        S(JB), i.e. S(J,J) that is stored in the array element D(J).
+*
+         DO J = JB, JB+JNB-1
+            IF( D( J ).EQ.ONE ) THEN
+               CALL SSCAL( J-JBTEMP1, -ONE, T( 1, J ), 1 )
+            END IF
+         END DO
+*
+*        (2-3) Perform the triangular solve for the current block
+*        matrix X(JB):
+*
+*               X(JB) * (A(JB)**T) = B(JB), where:
+*
+*               A(JB)**T  is a JNB-by-JNB unit upper-triangular
+*                         coefficient block, and A(JB)=V1(JB), which
+*                         is a JNB-by-JNB unit lower-triangular block
+*                         stored in A(JB:JB+JNB-1,JB:JB+JNB-1).
+*                         The N-by-N matrix V1 is the upper part
+*                         of the M-by-N lower-trapezoidal matrix V
+*                         stored in A(1:M,1:N);
+*
+*               B(JB)     is a JNB-by-JNB  upper-triangular right-hand
+*                         side block, B(JB) = (-1)*U(JB)*S(JB), and
+*                         B(JB) is stored in T(1:JNB,JB:JB+JNB-1);
+*
+*               X(JB)     is a JNB-by-JNB upper-triangular solution
+*                         block, X(JB) is the upper-triangular block
+*                         reflector T(JB), and X(JB) is stored
+*                         in T(1:JNB,JB:JB+JNB-1).
+*
+*             In other words, we perform the triangular solve for the
+*             upper-triangular block T(JB):
+*
+*               T(JB) * (V1(JB)**T) = (-1)*U(JB)*S(JB).
+*
+*             Even though the blocks X(JB) and B(JB) are upper-
+*             triangular, the routine STRSM will access all JNB**2
+*             elements of the square T(1:JNB,JB:JB+JNB-1). Therefore,
+*             we need to set to zero the elements of the block
+*             T(1:JNB,JB:JB+JNB-1) below the diagonal before the call
+*             to STRSM.
+*
+*        (2-3a) Set the elements to zero.
+*
+         JBTEMP2 = JB - 2
+         DO J = JB, JB+JNB-2
+            DO I = J-JBTEMP2, NB
+               T( I, J ) = ZERO
+            END DO
+         END DO
+*
+*        (2-3b) Perform the triangular solve.
+*
+         CALL STRSM( 'R', 'L', 'T', 'U', JNB, JNB, ONE,
+     $               A( JB, JB ), LDA, T( 1, JB ), LDT )
+*
+      END DO
+*
+      RETURN
+*
+*     End of SORHR_COL
+*
+      END
\ No newline at end of file
diff --git a/lapack-netlib/SRC/sporfsx.f b/lapack-netlib/SRC/sporfsx.f
index 52fab6976..ce8c26569 100644
--- a/lapack-netlib/SRC/sporfsx.f
+++ b/lapack-netlib/SRC/sporfsx.f
@@ -135,7 +135,7 @@
 *> \param[in,out] S
 *> \verbatim
 *>          S is REAL array, dimension (N)
-*>     The row scale factors for A.  If EQUED = 'Y', A is multiplied on
+*>     The scale factors for A.  If EQUED = 'Y', A is multiplied on
 *>     the left and right by diag(S).  S is an input argument if FACT =
 *>     'F'; otherwise, S is an output argument.  If FACT = 'F' and EQUED
 *>     = 'Y', each element of S must be positive.  If S is output, each
@@ -263,7 +263,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -299,14 +299,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -314,9 +314,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/sposvxx.f b/lapack-netlib/SRC/sposvxx.f
index 3cdfa749c..fa2c0d3f3 100644
--- a/lapack-netlib/SRC/sposvxx.f
+++ b/lapack-netlib/SRC/sposvxx.f
@@ -366,7 +366,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -402,14 +402,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -417,9 +417,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/ssb2st_kernels.f b/lapack-netlib/SRC/ssb2st_kernels.f
index 54479f89e..08859169b 100644
--- a/lapack-netlib/SRC/ssb2st_kernels.f
+++ b/lapack-netlib/SRC/ssb2st_kernels.f
@@ -1,26 +1,26 @@
 *> \brief \b SSB2ST_KERNELS
 *
 *  @generated from zhb2st_kernels.f, fortran z -> s, Wed Dec  7 08:22:40 2016
-*      
+*
 *  =========== DOCUMENTATION ===========
 *
-* Online html documentation available at 
-*            http://www.netlib.org/lapack/explore-html/ 
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
-*> Download SSB2ST_KERNELS + dependencies 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssb2st_kernels.f"> 
-*> [TGZ]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssb2st_kernels.f"> 
-*> [ZIP]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssb2st_kernels.f"> 
+*> Download SSB2ST_KERNELS + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/ssb2st_kernels.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/ssb2st_kernels.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/ssb2st_kernels.f">
 *> [TXT]</a>
-*> \endhtmlonly 
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE  SSB2ST_KERNELS( UPLO, WANTZ, TTYPE, 
+*       SUBROUTINE  SSB2ST_KERNELS( UPLO, WANTZ, TTYPE,
 *                                   ST, ED, SWEEP, N, NB, IB,
 *                                   A, LDA, V, TAU, LDVT, WORK)
 *
@@ -32,9 +32,9 @@
 *       INTEGER            TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
 *       ..
 *       .. Array Arguments ..
-*       REAL               A( LDA, * ), V( * ), 
+*       REAL               A( LDA, * ), V( * ),
 *                          TAU( * ), WORK( * )
-*  
+*
 *> \par Purpose:
 *  =============
 *>
@@ -124,7 +124,7 @@
 *>          LDVT is INTEGER.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is REAL array. Workspace of size nb.
 *> \endverbatim
@@ -150,7 +150,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -158,16 +158,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE  SSB2ST_KERNELS( UPLO, WANTZ, TTYPE, 
+      SUBROUTINE  SSB2ST_KERNELS( UPLO, WANTZ, TTYPE,
      $                            ST, ED, SWEEP, N, NB, IB,
      $                            A, LDA, V, TAU, LDVT, WORK)
 *
@@ -184,7 +184,7 @@
       INTEGER            TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
 *     ..
 *     .. Array Arguments ..
-      REAL               A( LDA, * ), V( * ), 
+      REAL               A( LDA, * ), V( * ),
      $                   TAU( * ), WORK( * )
 *     ..
 *
@@ -198,8 +198,8 @@
 *     .. Local Scalars ..
       LOGICAL            UPPER
       INTEGER            I, J1, J2, LM, LN, VPOS, TAUPOS,
-     $                   DPOS, OFDPOS, AJETER 
-      REAL               CTMP 
+     $                   DPOS, OFDPOS, AJETER
+      REAL               CTMP
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           SLARFG, SLARFX, SLARFY
@@ -212,7 +212,7 @@
 *     ..
 *     ..
 *     .. Executable Statements ..
-*      
+*
       AJETER = IB + LDVT
       UPPER = LSAME( UPLO, 'U' )
 
@@ -243,10 +243,10 @@
               V( VPOS ) = ONE
               DO 10 I = 1, LM-1
                   V( VPOS+I )         = ( A( OFDPOS-I, ST+I ) )
-                  A( OFDPOS-I, ST+I ) = ZERO  
+                  A( OFDPOS-I, ST+I ) = ZERO
    10         CONTINUE
               CTMP = ( A( OFDPOS, ST ) )
-              CALL SLARFG( LM, CTMP, V( VPOS+1 ), 1, 
+              CALL SLARFG( LM, CTMP, V( VPOS+1 ), 1,
      $                                       TAU( TAUPOS ) )
               A( OFDPOS, ST ) = CTMP
 *
@@ -284,14 +284,14 @@
 *
                   V( VPOS ) = ONE
                   DO 30 I = 1, LM-1
-                      V( VPOS+I )          = 
+                      V( VPOS+I )          =
      $                                    ( A( DPOS-NB-I, J1+I ) )
                       A( DPOS-NB-I, J1+I ) = ZERO
    30             CONTINUE
                   CTMP = ( A( DPOS-NB, J1 ) )
                   CALL SLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) )
                   A( DPOS-NB, J1 ) = CTMP
-*                 
+*
                   CALL SLARFX( 'Right', LN-1, LM, V( VPOS ),
      $                         TAU( TAUPOS ),
      $                         A( DPOS-NB+1, J1 ), LDA-1, WORK)
@@ -299,9 +299,9 @@
           ENDIF
 *
 *     Lower case
-*  
+*
       ELSE
-*      
+*
           IF( WANTZ ) THEN
               VPOS   = MOD( SWEEP-1, 2 ) * N + ST
               TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
@@ -316,9 +316,9 @@
               V( VPOS ) = ONE
               DO 20 I = 1, LM-1
                   V( VPOS+I )         = A( OFDPOS+I, ST-1 )
-                  A( OFDPOS+I, ST-1 ) = ZERO  
+                  A( OFDPOS+I, ST-1 ) = ZERO
    20         CONTINUE
-              CALL SLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, 
+              CALL SLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1,
      $                                       TAU( TAUPOS ) )
 *
               LM = ED - ST + 1
@@ -345,7 +345,7 @@
               LM = J2-J1+1
 *
               IF( LM.GT.0) THEN
-                  CALL SLARFX( 'Right', LM, LN, V( VPOS ), 
+                  CALL SLARFX( 'Right', LM, LN, V( VPOS ),
      $                         TAU( TAUPOS ), A( DPOS+NB, ST ),
      $                         LDA-1, WORK)
 *
@@ -362,13 +362,13 @@
                       V( VPOS+I )        = A( DPOS+NB+I, ST )
                       A( DPOS+NB+I, ST ) = ZERO
    40             CONTINUE
-                  CALL SLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, 
+                  CALL SLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1,
      $                                        TAU( TAUPOS ) )
 *
-                  CALL SLARFX( 'Left', LM, LN-1, V( VPOS ), 
+                  CALL SLARFX( 'Left', LM, LN-1, V( VPOS ),
      $                         ( TAU( TAUPOS ) ),
      $                         A( DPOS+NB-1, ST+1 ), LDA-1, WORK)
-             
+
               ENDIF
           ENDIF
       ENDIF
@@ -377,4 +377,4 @@
 *
 *     END OF SSB2ST_KERNELS
 *
-      END      
+      END
diff --git a/lapack-netlib/SRC/ssbgvx.f b/lapack-netlib/SRC/ssbgvx.f
index 3408810bd..22f96729e 100644
--- a/lapack-netlib/SRC/ssbgvx.f
+++ b/lapack-netlib/SRC/ssbgvx.f
@@ -261,11 +261,11 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0 : successful exit
-*>          < 0 : if INFO = -i, the i-th argument had an illegal value
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
 *>          <= N: if INFO = i, then i eigenvectors failed to converge.
 *>                  Their indices are stored in IFAIL.
-*>          > N : SPBSTF returned an error code; i.e.,
+*>          > N:  SPBSTF returned an error code; i.e.,
 *>                if INFO = N + i, for 1 <= i <= N, then the leading
 *>                minor of order i of B is not positive definite.
 *>                The factorization of B could not be completed and
diff --git a/lapack-netlib/SRC/sstemr.f b/lapack-netlib/SRC/sstemr.f
index 228538161..d550f87e0 100644
--- a/lapack-netlib/SRC/sstemr.f
+++ b/lapack-netlib/SRC/sstemr.f
@@ -233,13 +233,13 @@
 *> \param[in,out] TRYRAC
 *> \verbatim
 *>          TRYRAC is LOGICAL
-*>          If TRYRAC.EQ..TRUE., indicates that the code should check whether
+*>          If TRYRAC = .TRUE., indicates that the code should check whether
 *>          the tridiagonal matrix defines its eigenvalues to high relative
 *>          accuracy.  If so, the code uses relative-accuracy preserving
 *>          algorithms that might be (a bit) slower depending on the matrix.
 *>          If the matrix does not define its eigenvalues to high relative
 *>          accuracy, the code can uses possibly faster algorithms.
-*>          If TRYRAC.EQ..FALSE., the code is not required to guarantee
+*>          If TRYRAC = .FALSE., the code is not required to guarantee
 *>          relatively accurate eigenvalues and can use the fastest possible
 *>          techniques.
 *>          On exit, a .TRUE. TRYRAC will be set to .FALSE. if the matrix
diff --git a/lapack-netlib/SRC/ssyconvf.f b/lapack-netlib/SRC/ssyconvf.f
index d43b9473f..c6f08428f 100644
--- a/lapack-netlib/SRC/ssyconvf.f
+++ b/lapack-netlib/SRC/ssyconvf.f
@@ -291,7 +291,7 @@
 *
 *           Convert PERMUTATIONS and IPIV
 *
-*           Apply permutaions to submatrices of upper part of A
+*           Apply permutations to submatrices of upper part of A
 *           in factorization order where i decreases from N to 1
 *
             I = N
@@ -344,7 +344,7 @@
 *
 *           Revert PERMUTATIONS and IPIV
 *
-*           Apply permutaions to submatrices of upper part of A
+*           Apply permutations to submatrices of upper part of A
 *           in reverse factorization order where i increases from 1 to N
 *
             I = 1
@@ -435,7 +435,7 @@
 *
 *           Convert PERMUTATIONS and IPIV
 *
-*           Apply permutaions to submatrices of lower part of A
+*           Apply permutations to submatrices of lower part of A
 *           in factorization order where k increases from 1 to N
 *
             I = 1
@@ -488,7 +488,7 @@
 *
 *           Revert PERMUTATIONS and IPIV
 *
-*           Apply permutaions to submatrices of lower part of A
+*           Apply permutations to submatrices of lower part of A
 *           in reverse factorization order where i decreases from N to 1
 *
             I = N
diff --git a/lapack-netlib/SRC/ssyconvf_rook.f b/lapack-netlib/SRC/ssyconvf_rook.f
index 833b9c632..a7e0d5258 100644
--- a/lapack-netlib/SRC/ssyconvf_rook.f
+++ b/lapack-netlib/SRC/ssyconvf_rook.f
@@ -282,7 +282,7 @@
 *
 *           Convert PERMUTATIONS
 *
-*           Apply permutaions to submatrices of upper part of A
+*           Apply permutations to submatrices of upper part of A
 *           in factorization order where i decreases from N to 1
 *
             I = N
@@ -333,7 +333,7 @@
 *
 *           Revert PERMUTATIONS
 *
-*           Apply permutaions to submatrices of upper part of A
+*           Apply permutations to submatrices of upper part of A
 *           in reverse factorization order where i increases from 1 to N
 *
             I = 1
@@ -423,7 +423,7 @@
 *
 *           Convert PERMUTATIONS
 *
-*           Apply permutaions to submatrices of lower part of A
+*           Apply permutations to submatrices of lower part of A
 *           in factorization order where i increases from 1 to N
 *
             I = 1
@@ -474,7 +474,7 @@
 *
 *           Revert PERMUTATIONS
 *
-*           Apply permutaions to submatrices of lower part of A
+*           Apply permutations to submatrices of lower part of A
 *           in reverse factorization order where i decreases from N to 1
 *
             I = N
diff --git a/lapack-netlib/SRC/ssyev_2stage.f b/lapack-netlib/SRC/ssyev_2stage.f
index 166766919..5d354c1b3 100644
--- a/lapack-netlib/SRC/ssyev_2stage.f
+++ b/lapack-netlib/SRC/ssyev_2stage.f
@@ -317,7 +317,7 @@
       IF( .NOT.WANTZ ) THEN
          CALL SSTERF( N, W, WORK( INDE ), INFO )
       ELSE
-*        Not available in this release, and agrument checking should not
+*        Not available in this release, and argument checking should not
 *        let it getting here
          RETURN
          CALL SORGTR( UPLO, N, A, LDA, WORK( INDTAU ), WORK( INDWRK ),
diff --git a/lapack-netlib/SRC/ssyevd_2stage.f b/lapack-netlib/SRC/ssyevd_2stage.f
index 8ab90b641..625713b85 100644
--- a/lapack-netlib/SRC/ssyevd_2stage.f
+++ b/lapack-netlib/SRC/ssyevd_2stage.f
@@ -385,7 +385,7 @@
       IF( .NOT.WANTZ ) THEN
          CALL SSTERF( N, W, WORK( INDE ), INFO )
       ELSE
-*        Not available in this release, and agrument checking should not
+*        Not available in this release, and argument checking should not
 *        let it getting here
          RETURN
          CALL SSTEDC( 'I', N, W, WORK( INDE ), WORK( INDWRK ), N,
diff --git a/lapack-netlib/SRC/ssyrfsx.f b/lapack-netlib/SRC/ssyrfsx.f
index b5dd0b2df..bfb7b6005 100644
--- a/lapack-netlib/SRC/ssyrfsx.f
+++ b/lapack-netlib/SRC/ssyrfsx.f
@@ -271,7 +271,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -307,14 +307,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -322,9 +322,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/ssysv_aa.f b/lapack-netlib/SRC/ssysv_aa.f
index e470f5883..7e58d1e75 100644
--- a/lapack-netlib/SRC/ssysv_aa.f
+++ b/lapack-netlib/SRC/ssysv_aa.f
@@ -42,7 +42,7 @@
 *> matrices.
 *>
 *> Aasen's algorithm is used to factor A as
-*>    A = U * T * U**T,  if UPLO = 'U', or
+*>    A = U**T * T * U,  if UPLO = 'U', or
 *>    A = L * T * L**T,  if UPLO = 'L',
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is symmetric tridiagonal. The factored
@@ -86,7 +86,7 @@
 *>
 *>          On exit, if INFO = 0, the tridiagonal matrix T and the
 *>          multipliers used to obtain the factor U or L from the
-*>          factorization A = U*T*U**T or A = L*T*L**T as computed by
+*>          factorization A = U**T*T*U or A = L*T*L**T as computed by
 *>          SSYTRF.
 *> \endverbatim
 *>
@@ -229,7 +229,7 @@
          RETURN
       END IF
 *
-*     Compute the factorization A = U*T*U**T or A = L*T*L**T.
+*     Compute the factorization A = U**T*T*U or A = L*T*L**T.
 *
       CALL SSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
       IF( INFO.EQ.0 ) THEN
diff --git a/lapack-netlib/SRC/ssysv_aa_2stage.f b/lapack-netlib/SRC/ssysv_aa_2stage.f
index 43d937141..5e2e0e340 100644
--- a/lapack-netlib/SRC/ssysv_aa_2stage.f
+++ b/lapack-netlib/SRC/ssysv_aa_2stage.f
@@ -44,7 +44,7 @@
 *> matrices.
 *>
 *> Aasen's 2-stage algorithm is used to factor A as
-*>    A = U * T * U**T,  if UPLO = 'U', or
+*>    A = U**T * T * U,  if UPLO = 'U', or
 *>    A = L * T * L**T,  if UPLO = 'L',
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is symmetric and band. The matrix T is
@@ -258,7 +258,7 @@
       END IF
 *
 *
-*     Compute the factorization A = U*T*U**T or A = L*T*L**T.
+*     Compute the factorization A = U**T*T*U or A = L*T*L**T.
 *
       CALL SSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2,
      $                       WORK, LWORK, INFO )
diff --git a/lapack-netlib/SRC/ssysvxx.f b/lapack-netlib/SRC/ssysvxx.f
index 4762748c0..e2be0128b 100644
--- a/lapack-netlib/SRC/ssysvxx.f
+++ b/lapack-netlib/SRC/ssysvxx.f
@@ -377,7 +377,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -413,14 +413,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is REAL array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -428,9 +428,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/ssytf2_rk.f b/lapack-netlib/SRC/ssytf2_rk.f
index bf113d1bd..400e48353 100644
--- a/lapack-netlib/SRC/ssytf2_rk.f
+++ b/lapack-netlib/SRC/ssytf2_rk.f
@@ -312,7 +312,7 @@
 *
 *        Factorize A as U*D*U**T using the upper triangle of A
 *
-*        Initilize the first entry of array E, where superdiagonal
+*        Initialize the first entry of array E, where superdiagonal
 *        elements of D are stored
 *
          E( 1 ) = ZERO
@@ -623,7 +623,7 @@
 *
 *        Factorize A as L*D*L**T using the lower triangle of A
 *
-*        Initilize the unused last entry of the subdiagonal array E.
+*        Initialize the unused last entry of the subdiagonal array E.
 *
          E( N ) = ZERO
 *
diff --git a/lapack-netlib/SRC/ssytrd_2stage.f b/lapack-netlib/SRC/ssytrd_2stage.f
index 7ddc0224e..d2502f483 100644
--- a/lapack-netlib/SRC/ssytrd_2stage.f
+++ b/lapack-netlib/SRC/ssytrd_2stage.f
@@ -123,23 +123,22 @@
 *>
 *> \param[out] HOUS2
 *> \verbatim
-*>          HOUS2 is REAL array, dimension LHOUS2, that
-*>          store the Householder representation of the stage2
+*>          HOUS2 is REAL array, dimension (LHOUS2)
+*>          Stores the Householder representation of the stage2
 *>          band to tridiagonal.
 *> \endverbatim
 *>
 *> \param[in] LHOUS2
 *> \verbatim
 *>          LHOUS2 is INTEGER
-*>          The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS2=-1,
+*>          The dimension of the array HOUS2.
+*>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS2 array, returns
 *>          this value as the first entry of the HOUS2 array, and no error
 *>          message related to LHOUS2 is issued by XERBLA.
-*>          LHOUS2 = MAX(1, dimension) where
-*>          dimension = 4*N if VECT='N'
-*>          not available now if VECT='H'
+*>          If VECT='N', LHOUS2 = max(1, 4*n);
+*>          if VECT='V', option not yet available.
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/ssytrd_sb2st.F b/lapack-netlib/SRC/ssytrd_sb2st.F
index 0df1173e4..1d8c9f5c5 100644
--- a/lapack-netlib/SRC/ssytrd_sb2st.F
+++ b/lapack-netlib/SRC/ssytrd_sb2st.F
@@ -50,9 +50,9 @@
 *  Arguments:
 *  ==========
 *
-*> \param[in] STAGE
+*> \param[in] STAGE1
 *> \verbatim
-*>          STAGE is CHARACTER*1
+*>          STAGE1 is CHARACTER*1
 *>          = 'N':  "No": to mention that the stage 1 of the reduction  
 *>                  from dense to band using the ssytrd_sy2sb routine
 *>                  was not called before this routine to reproduce AB. 
diff --git a/lapack-netlib/SRC/ssytrd_sy2sb.f b/lapack-netlib/SRC/ssytrd_sy2sb.f
index 272876700..98169dc00 100644
--- a/lapack-netlib/SRC/ssytrd_sy2sb.f
+++ b/lapack-netlib/SRC/ssytrd_sy2sb.f
@@ -363,7 +363,7 @@
 *
 *
 *     Set the workspace of the triangular matrix T to zero once such a
-*     way everytime T is generated the upper/lower portion will be always zero  
+*     way every time T is generated the upper/lower portion will be always zero
 *   
       CALL SLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT )
 *
diff --git a/lapack-netlib/SRC/ssytrf.f b/lapack-netlib/SRC/ssytrf.f
index 2c29475df..ae4550f28 100644
--- a/lapack-netlib/SRC/ssytrf.f
+++ b/lapack-netlib/SRC/ssytrf.f
@@ -39,7 +39,7 @@
 *> the Bunch-Kaufman diagonal pivoting method.  The form of the
 *> factorization is
 *>
-*>    A = U*D*U**T  or  A = L*D*L**T
+*>    A = U**T*D*U  or  A = L*D*L**T
 *>
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and D is symmetric and block diagonal with
@@ -144,7 +144,7 @@
 *>
 *> \verbatim
 *>
-*>  If UPLO = 'U', then A = U*D*U**T, where
+*>  If UPLO = 'U', then A = U**T*D*U, where
 *>     U = P(n)*U(n)* ... *P(k)U(k)* ...,
 *>  i.e., U is a product of terms P(k)*U(k), where k decreases from n to
 *>  1 in steps of 1 or 2, and D is a block diagonal matrix with 1-by-1
@@ -262,7 +262,7 @@
 *
       IF( UPPER ) THEN
 *
-*        Factorize A as U*D*U**T using the upper triangle of A
+*        Factorize A as U**T*D*U using the upper triangle of A
 *
 *        K is the main loop index, decreasing from N to 1 in steps of
 *        KB, where KB is the number of columns factorized by SLASYF;
diff --git a/lapack-netlib/SRC/ssytrf_aa.f b/lapack-netlib/SRC/ssytrf_aa.f
index 4aaa978ad..7f428561c 100644
--- a/lapack-netlib/SRC/ssytrf_aa.f
+++ b/lapack-netlib/SRC/ssytrf_aa.f
@@ -37,7 +37,7 @@
 *> SSYTRF_AA computes the factorization of a real symmetric matrix A
 *> using the Aasen's algorithm.  The form of the factorization is
 *>
-*>    A = U*T*U**T  or  A = L*T*L**T
+*>    A = U**T*T*U  or  A = L*T*L**T
 *>
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is a symmetric tridiagonal matrix.
@@ -223,7 +223,7 @@
       IF( UPPER ) THEN
 *
 *        .....................................................
-*        Factorize A as L*D*L**T using the upper triangle of A
+*        Factorize A as U**T*D*U using the upper triangle of A
 *        .....................................................
 *
 *        Copy first row A(1, 1:N) into H(1:n) (stored in WORK(1:N))
@@ -256,7 +256,7 @@
      $                      A( MAX(1, J), J+1 ), LDA,
      $                      IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) )
 *
-*        Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot)
+*        Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot)
 *
          DO J2 = J+2, MIN(N, J+JB+1)
             IPIV( J2 ) = IPIV( J2 ) + J
@@ -375,7 +375,7 @@
      $                      A( J+1, MAX(1, J) ), LDA,
      $                      IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) )
 *
-*        Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot)
+*        Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot)
 *
          DO J2 = J+2, MIN(N, J+JB+1)
             IPIV( J2 ) = IPIV( J2 ) + J
diff --git a/lapack-netlib/SRC/ssytrf_aa_2stage.f b/lapack-netlib/SRC/ssytrf_aa_2stage.f
index 0e0f6edb7..03690815b 100644
--- a/lapack-netlib/SRC/ssytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/ssytrf_aa_2stage.f
@@ -38,7 +38,7 @@
 *> SSYTRF_AA_2STAGE computes the factorization of a real symmetric matrix A
 *> using the Aasen's algorithm.  The form of the factorization is
 *>
-*>    A = U*T*U**T  or  A = L*T*L**T
+*>    A = U**T*T*U  or  A = L*T*L**T
 *>
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is a symmetric band matrix with the
@@ -275,7 +275,7 @@
       IF( UPPER ) THEN
 *
 *        .....................................................
-*        Factorize A as L*D*L**T using the upper triangle of A
+*        Factorize A as U**T*D*U using the upper triangle of A
 *        .....................................................
 *
          DO J = 0, NT-1
@@ -442,12 +442,14 @@ c               END IF
 *                    > Apply pivots to previous columns of L
                      CALL SSWAP( K-1, A( (J+1)*NB+1, I1 ), 1, 
      $                                A( (J+1)*NB+1, I2 ), 1 )
-*                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)               
-                     CALL SSWAP( I2-I1-1, A( I1, I1+1 ), LDA,
-     $                                    A( I1+1, I2 ), 1 )
+*                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)
+                     IF( I2.GT.(I1+1) )
+     $                  CALL SSWAP( I2-I1-1, A( I1, I1+1 ), LDA,
+     $                                       A( I1+1, I2 ), 1 )
 *                    > Swap A(I2+1:M, I1) with A(I2+1:M, I2)
-                     CALL SSWAP( N-I2, A( I1, I2+1 ), LDA,
-     $                                 A( I2, I2+1 ), LDA ) 
+                     IF( I2.LT.N )
+     $                  CALL SSWAP( N-I2, A( I1, I2+1 ), LDA,
+     $                                    A( I2, I2+1 ), LDA ) 
 *                    > Swap A(I1, I1) with A(I2, I2)
                      PIV = A( I1, I1 )
                      A( I1, I1 ) = A( I2, I2 )
@@ -616,11 +618,13 @@ c               END IF
                      CALL SSWAP( K-1, A( I1, (J+1)*NB+1 ), LDA, 
      $                                A( I2, (J+1)*NB+1 ), LDA )
 *                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)               
-                     CALL SSWAP( I2-I1-1, A( I1+1, I1 ), 1,
-     $                                    A( I2, I1+1 ), LDA )
+                     IF( I2.GT.(I1+1) )
+     $                  CALL SSWAP( I2-I1-1, A( I1+1, I1 ), 1,
+     $                                       A( I2, I1+1 ), LDA )
 *                    > Swap A(I2+1:M, I1) with A(I2+1:M, I2)
-                     CALL SSWAP( N-I2, A( I2+1, I1 ), 1,
-     $                                 A( I2+1, I2 ), 1 ) 
+                     IF( I2.LT.N )
+     $                  CALL SSWAP( N-I2, A( I2+1, I1 ), 1,
+     $                                    A( I2+1, I2 ), 1 ) 
 *                    > Swap A(I1, I1) with A(I2, I2)
                      PIV = A( I1, I1 )
                      A( I1, I1 ) = A( I2, I2 )
diff --git a/lapack-netlib/SRC/ssytri2.f b/lapack-netlib/SRC/ssytri2.f
index 4b9ea4e7b..897116c23 100644
--- a/lapack-netlib/SRC/ssytri2.f
+++ b/lapack-netlib/SRC/ssytri2.f
@@ -62,7 +62,7 @@
 *> \param[in,out] A
 *> \verbatim
 *>          A is REAL array, dimension (LDA,N)
-*>          On entry, the NB diagonal matrix D and the multipliers
+*>          On entry, the block diagonal matrix D and the multipliers
 *>          used to obtain the factor U or L as computed by SSYTRF.
 *>
 *>          On exit, if INFO = 0, the (symmetric) inverse of the original
@@ -82,7 +82,7 @@
 *> \param[in] IPIV
 *> \verbatim
 *>          IPIV is INTEGER array, dimension (N)
-*>          Details of the interchanges and the NB structure of D
+*>          Details of the interchanges and the block structure of D
 *>          as determined by SSYTRF.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/ssytrs_aa.f b/lapack-netlib/SRC/ssytrs_aa.f
index b05c9f7e6..ed4377ae7 100644
--- a/lapack-netlib/SRC/ssytrs_aa.f
+++ b/lapack-netlib/SRC/ssytrs_aa.f
@@ -37,7 +37,7 @@
 *> \verbatim
 *>
 *> SSYTRS_AA solves a system of linear equations A*X = B with a real
-*> symmetric matrix A using the factorization A = U*T*U**T or
+*> symmetric matrix A using the factorization A = U**T*T*U or
 *> A = L*T*L**T computed by SSYTRF_AA.
 *> \endverbatim
 *
@@ -49,7 +49,7 @@
 *>          UPLO is CHARACTER*1
 *>          Specifies whether the details of the factorization are stored
 *>          as an upper or lower triangular matrix.
-*>          = 'U':  Upper triangular, form is A = U*T*U**T;
+*>          = 'U':  Upper triangular, form is A = U**T*T*U;
 *>          = 'L':  Lower triangular, form is A = L*T*L**T.
 *> \endverbatim
 *>
@@ -97,14 +97,16 @@
 *>          The leading dimension of the array B.  LDB >= max(1,N).
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE array, dimension (MAX(1,LWORK))
+*>          WORK is REAL array, dimension (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
-*>          LWORK is INTEGER, LWORK >= MAX(1,3*N-2).
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK. LWORK >= max(1,3*N-2).
+*> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
@@ -198,24 +200,31 @@
 *
       IF( UPPER ) THEN
 *
-*        Solve A*X = B, where A = U*T*U**T.
+*        Solve A*X = B, where A = U**T*T*U.
+*
+*        1) Forward substitution with U**T
+*
+         IF( N.GT.1 ) THEN
+*
+*           Pivot, P**T * B -> B
 *
-*        Pivot, P**T * B
+            K = 1
+            DO WHILE ( K.LE.N )
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $             CALL SSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+               K = K + 1
+            END DO
 *
-         K = 1
-         DO WHILE ( K.LE.N )
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $          CALL SSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-            K = K + 1
-         END DO
+*           Compute U**T \ B -> B    [ (U**T \P**T * B) ]
 *
-*        Compute (U \P**T * B) -> B    [ (U \P**T * B) ]
+            CALL STRSM( 'L', 'U', 'T', 'U', N-1, NRHS, ONE, A( 1, 2 ),
+     $                  LDA, B( 2, 1 ), LDB)
+         END IF
 *
-         CALL STRSM('L', 'U', 'T', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA,
-     $               B( 2, 1 ), LDB)
+*        2) Solve with triangular matrix T
 *
-*        Compute T \ B -> B   [ T \ (U \P**T * B) ]
+*        Compute T \ B -> B   [ T \ (U**T \P**T * B) ]
 *
          CALL SLACPY( 'F', 1, N, A(1, 1), LDA+1, WORK(N), 1)
          IF( N.GT.1 ) THEN
@@ -224,41 +233,53 @@
          END IF
          CALL SGTSV(N, NRHS, WORK(1), WORK(N), WORK(2*N), B, LDB,
      $              INFO)
+*
+*        3) Backward substitution with U
+*
+         IF( N.GT.1 ) THEN
 *     
 *
-*        Compute (U**T \ B) -> B   [ U**T \ (T \ (U \P**T * B) ) ]
+*           Compute U \ B -> B   [ U \ (T \ (U**T \P**T * B) ) ]
 *
-         CALL STRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA,
-     $               B(2, 1), LDB)
+            CALL STRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ),
+     $                  LDA, B(2, 1), LDB)
 *
-*        Pivot, P * B  [ P * (U**T \ (T \ (U \P**T * B) )) ]
+*           Pivot, P * B -> B  [ P * (U \ (T \ (U**T \P**T * B) )) ]
 *
-         K = N
-         DO WHILE ( K.GE.1 )
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL SSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-            K = K - 1
-         END DO
+            K = N
+            DO WHILE ( K.GE.1 )
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL SSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+               K = K - 1
+            END DO
+         END IF
 *
       ELSE
 *
 *        Solve A*X = B, where A = L*T*L**T.
 *
-*        Pivot, P**T * B
+*        1) Forward substitution with L
 *
-         K = 1
-         DO WHILE ( K.LE.N )
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL SSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-            K = K + 1
-         END DO
+         IF( N.GT.1 ) THEN
+*
+*           Pivot, P**T * B -> B
+*
+            K = 1
+            DO WHILE ( K.LE.N )
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL SSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+               K = K + 1
+            END DO
 *
-*        Compute (L \P**T * B) -> B    [ (L \P**T * B) ]
+*           Compute L \ B -> B    [ (L \P**T * B) ]
+*
+            CALL STRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1),
+     $                  LDA, B(2, 1), LDB)
+         END IF
 *
-         CALL STRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1), LDA,
-     $               B(2, 1), LDB)
+*        2) Solve with triangular matrix T
 *
 *        Compute T \ B -> B   [ T \ (L \P**T * B) ]
 *
@@ -270,20 +291,25 @@
          CALL SGTSV(N, NRHS, WORK(1), WORK(N), WORK(2*N), B, LDB,
      $              INFO)
 *
-*        Compute (L**T \ B) -> B   [ L**T \ (T \ (L \P**T * B) ) ]
+*        3) Backward substitution with L**T
 *
-         CALL STRSM( 'L', 'L', 'T', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA,
-     $              B( 2, 1 ), LDB)
+         IF( N.GT.1 ) THEN
+*
+*           Compute L**T \ B -> B   [ L**T \ (T \ (L \P**T * B) ) ]
 *
-*        Pivot, P * B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
+            CALL STRSM( 'L', 'L', 'T', 'U', N-1, NRHS, ONE, A( 2, 1 ),
+     $                  LDA, B( 2, 1 ), LDB)
 *
-         K = N
-         DO WHILE ( K.GE.1 )
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL SSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-            K = K - 1
-         END DO
+*           Pivot, P * B -> B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
+*
+            K = N
+            DO WHILE ( K.GE.1 )
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL SSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+               K = K - 1
+            END DO
+         END IF
 *
       END IF
 *
diff --git a/lapack-netlib/SRC/ssytrs_aa_2stage.f b/lapack-netlib/SRC/ssytrs_aa_2stage.f
index d271b9481..cf2da529d 100644
--- a/lapack-netlib/SRC/ssytrs_aa_2stage.f
+++ b/lapack-netlib/SRC/ssytrs_aa_2stage.f
@@ -36,7 +36,7 @@
 *> \verbatim
 *>
 *> SSYTRS_AA_2STAGE solves a system of linear equations A*X = B with a real
-*> symmetric matrix A using the factorization A = U*T*U**T or
+*> symmetric matrix A using the factorization A = U**T*T*U or
 *> A = L*T*L**T computed by SSYTRF_AA_2STAGE.
 *> \endverbatim
 *
@@ -48,7 +48,7 @@
 *>          UPLO is CHARACTER*1
 *>          Specifies whether the details of the factorization are stored
 *>          as an upper or lower triangular matrix.
-*>          = 'U':  Upper triangular, form is A = U*T*U**T;
+*>          = 'U':  Upper triangular, form is A = U**T*T*U;
 *>          = 'L':  Lower triangular, form is A = L*T*L**T.
 *> \endverbatim
 *>
@@ -208,15 +208,15 @@
 *
       IF( UPPER ) THEN
 *
-*        Solve A*X = B, where A = U*T*U**T.
+*        Solve A*X = B, where A = U**T*T*U.
 *
          IF( N.GT.NB ) THEN
 *
-*           Pivot, P**T * B
+*           Pivot, P**T * B -> B
 *
             CALL SLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 )
 *
-*           Compute (U**T \P**T * B) -> B    [ (U**T \P**T * B) ]
+*           Compute (U**T \ B) -> B    [ (U**T \P**T * B) ]
 *
             CALL STRSM( 'L', 'U', 'T', 'U', N-NB, NRHS, ONE, A(1, NB+1),
      $                 LDA, B(NB+1, 1), LDB)
@@ -234,7 +234,7 @@
             CALL STRSM( 'L', 'U', 'N', 'U', N-NB, NRHS, ONE, A(1, NB+1),
      $                  LDA, B(NB+1, 1), LDB)
 *
-*           Pivot, P * B  [ P * (U \ (T \ (U**T \P**T * B) )) ]
+*           Pivot, P * B -> B  [ P * (U \ (T \ (U**T \P**T * B) )) ]
 *
             CALL SLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 )
 *
@@ -246,11 +246,11 @@
 *
          IF( N.GT.NB ) THEN
 *
-*           Pivot, P**T * B
+*           Pivot, P**T * B -> B
 *
             CALL SLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 )
 *
-*           Compute (L \P**T * B) -> B    [ (L \P**T * B) ]
+*           Compute (L \ B) -> B    [ (L \P**T * B) ]
 *
             CALL STRSM( 'L', 'L', 'N', 'U', N-NB, NRHS, ONE, A(NB+1, 1),
      $                 LDA, B(NB+1, 1), LDB)
@@ -268,7 +268,7 @@
             CALL STRSM( 'L', 'L', 'T', 'U', N-NB, NRHS, ONE, A(NB+1, 1),
      $                  LDA, B(NB+1, 1), LDB)
 *
-*           Pivot, P * B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
+*           Pivot, P * B -> B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
 *
             CALL SLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 )
 *
diff --git a/lapack-netlib/SRC/stgsy2.f b/lapack-netlib/SRC/stgsy2.f
index ca9946a7e..2814889fc 100644
--- a/lapack-netlib/SRC/stgsy2.f
+++ b/lapack-netlib/SRC/stgsy2.f
@@ -71,7 +71,7 @@
 *>             R  * B**T + L  * E**T  = scale * -F
 *>
 *> This case is used to compute an estimate of Dif[(A, D), (B, E)] =
-*> sigma_min(Z) using reverse communicaton with SLACON.
+*> sigma_min(Z) using reverse communication with SLACON.
 *>
 *> STGSY2 also (IJOB >= 1) contributes to the computation in STGSYL
 *> of an upper bound on the separation between to matrix pairs. Then
@@ -85,7 +85,7 @@
 *> \param[in] TRANS
 *> \verbatim
 *>          TRANS is CHARACTER*1
-*>          = 'N', solve the generalized Sylvester equation (1).
+*>          = 'N': solve the generalized Sylvester equation (1).
 *>          = 'T': solve the 'transposed' system (3).
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/stgsyl.f b/lapack-netlib/SRC/stgsyl.f
index cd597f37d..ff634b1de 100644
--- a/lapack-netlib/SRC/stgsyl.f
+++ b/lapack-netlib/SRC/stgsyl.f
@@ -88,20 +88,20 @@
 *> \param[in] TRANS
 *> \verbatim
 *>          TRANS is CHARACTER*1
-*>          = 'N', solve the generalized Sylvester equation (1).
-*>          = 'T', solve the 'transposed' system (3).
+*>          = 'N': solve the generalized Sylvester equation (1).
+*>          = 'T': solve the 'transposed' system (3).
 *> \endverbatim
 *>
 *> \param[in] IJOB
 *> \verbatim
 *>          IJOB is INTEGER
 *>          Specifies what kind of functionality to be performed.
-*>           =0: solve (1) only.
-*>           =1: The functionality of 0 and 3.
-*>           =2: The functionality of 0 and 4.
-*>           =3: Only an estimate of Dif[(A,D), (B,E)] is computed.
+*>          = 0: solve (1) only.
+*>          = 1: The functionality of 0 and 3.
+*>          = 2: The functionality of 0 and 4.
+*>          = 3: Only an estimate of Dif[(A,D), (B,E)] is computed.
 *>               (look ahead strategy IJOB  = 1 is used).
-*>           =4: Only an estimate of Dif[(A,D), (B,E)] is computed.
+*>          = 4: Only an estimate of Dif[(A,D), (B,E)] is computed.
 *>               ( SGECON on sub-systems is used ).
 *>          Not referenced if TRANS = 'T'.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/stpmlqt.f b/lapack-netlib/SRC/stpmlqt.f
index 565dadd0c..8fc7823c2 100644
--- a/lapack-netlib/SRC/stpmlqt.f
+++ b/lapack-netlib/SRC/stpmlqt.f
@@ -94,7 +94,7 @@
 *>
 *> \param[in] V
 *> \verbatim
-*>          V is REAL array, dimension (LDA,K)
+*>          V is REAL array, dimension (LDV,K)
 *>          The i-th row must contain the vector which defines the
 *>          elementary reflector H(i), for i = 1,2,...,k, as returned by
 *>          DTPLQT in B.  See Further Details.
diff --git a/lapack-netlib/SRC/stpmqrt.f b/lapack-netlib/SRC/stpmqrt.f
index b1813b7dd..6a5cbb981 100644
--- a/lapack-netlib/SRC/stpmqrt.f
+++ b/lapack-netlib/SRC/stpmqrt.f
@@ -94,7 +94,7 @@
 *>
 *> \param[in] V
 *> \verbatim
-*>          V is REAL array, dimension (LDA,K)
+*>          V is REAL array, dimension (LDV,K)
 *>          The i-th column must contain the vector which defines the
 *>          elementary reflector H(i), for i = 1,2,...,k, as returned by
 *>          CTPQRT in B.  See Further Details.
diff --git a/lapack-netlib/SRC/stprfb.f b/lapack-netlib/SRC/stprfb.f
index 66e67252f..fcd164183 100644
--- a/lapack-netlib/SRC/stprfb.f
+++ b/lapack-netlib/SRC/stprfb.f
@@ -152,8 +152,8 @@
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.
-*>          If SIDE = 'L', LDC >= max(1,K);
-*>          If SIDE = 'R', LDC >= max(1,M).
+*>          If SIDE = 'L', LDA >= max(1,K);
+*>          If SIDE = 'R', LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[in,out] B
diff --git a/lapack-netlib/SRC/zcgesv.f b/lapack-netlib/SRC/zcgesv.f
index bb12d4f3a..b71018638 100644
--- a/lapack-netlib/SRC/zcgesv.f
+++ b/lapack-netlib/SRC/zcgesv.f
@@ -93,9 +93,9 @@
 *>          dimension (LDA,N)
 *>          On entry, the N-by-N coefficient matrix A.
 *>          On exit, if iterative refinement has been successfully used
-*>          (INFO.EQ.0 and ITER.GE.0, see description below), then A is
+*>          (INFO = 0 and ITER >= 0, see description below), then A is
 *>          unchanged, if double precision factorization has been used
-*>          (INFO.EQ.0 and ITER.LT.0, see description below), then the
+*>          (INFO = 0 and ITER < 0, see description below), then the
 *>          array A contains the factors L and U from the factorization
 *>          A = P*L*U; the unit diagonal elements of L are not stored.
 *> \endverbatim
@@ -112,8 +112,8 @@
 *>          The pivot indices that define the permutation matrix P;
 *>          row i of the matrix was interchanged with row IPIV(i).
 *>          Corresponds either to the single precision factorization
-*>          (if INFO.EQ.0 and ITER.GE.0) or the double precision
-*>          factorization (if INFO.EQ.0 and ITER.LT.0).
+*>          (if INFO = 0 and ITER >= 0) or the double precision
+*>          factorization (if INFO = 0 and ITER < 0).
 *> \endverbatim
 *>
 *> \param[in] B
@@ -421,7 +421,7 @@
    30 CONTINUE
 *
 *     If we are at this place of the code, this is because we have
-*     performed ITER=ITERMAX iterations and never satisified the stopping
+*     performed ITER=ITERMAX iterations and never satisfied the stopping
 *     criterion, set up the ITER flag accordingly and follow up on double
 *     precision routine.
 *
diff --git a/lapack-netlib/SRC/zcposv.f b/lapack-netlib/SRC/zcposv.f
index eafcce623..101d25f5d 100644
--- a/lapack-netlib/SRC/zcposv.f
+++ b/lapack-netlib/SRC/zcposv.f
@@ -111,9 +111,9 @@
 *>          elements need not be set and are assumed to be zero.
 *>
 *>          On exit, if iterative refinement has been successfully used
-*>          (INFO.EQ.0 and ITER.GE.0, see description below), then A is
+*>          (INFO = 0 and ITER >= 0, see description below), then A is
 *>          unchanged, if double precision factorization has been used
-*>          (INFO.EQ.0 and ITER.LT.0, see description below), then the
+*>          (INFO = 0 and ITER < 0, see description below), then the
 *>          array A contains the factor U or L from the Cholesky
 *>          factorization A = U**H*U or A = L*L**H.
 *> \endverbatim
@@ -431,7 +431,7 @@
    30 CONTINUE
 *
 *     If we are at this place of the code, this is because we have
-*     performed ITER=ITERMAX iterations and never satisified the
+*     performed ITER=ITERMAX iterations and never satisfied the
 *     stopping criterion, set up the ITER flag accordingly and follow
 *     up on double precision routine.
 *
diff --git a/lapack-netlib/SRC/zgbrfsx.f b/lapack-netlib/SRC/zgbrfsx.f
index e40d7d23e..872709899 100644
--- a/lapack-netlib/SRC/zgbrfsx.f
+++ b/lapack-netlib/SRC/zgbrfsx.f
@@ -75,7 +75,7 @@
 *>     Specifies the form of the system of equations:
 *>       = 'N':  A * X = B     (No transpose)
 *>       = 'T':  A**T * X = B  (Transpose)
-*>       = 'C':  A**H * X = B  (Conjugate transpose = Transpose)
+*>       = 'C':  A**H * X = B  (Conjugate transpose)
 *> \endverbatim
 *>
 *> \param[in] EQUED
@@ -308,7 +308,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -344,14 +344,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -359,9 +359,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/zgbsvxx.f b/lapack-netlib/SRC/zgbsvxx.f
index 9ba9c2ee3..0d916fd62 100644
--- a/lapack-netlib/SRC/zgbsvxx.f
+++ b/lapack-netlib/SRC/zgbsvxx.f
@@ -431,7 +431,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -467,14 +467,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -482,9 +482,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the extra-precise refinement algorithm.
+*>            = 1.0:  Use the extra-precise refinement algorithm.
 *>              (other values are reserved for future use)
 *>
 *>       PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual
diff --git a/lapack-netlib/SRC/zgebak.f b/lapack-netlib/SRC/zgebak.f
index a9761fde2..70c265e05 100644
--- a/lapack-netlib/SRC/zgebak.f
+++ b/lapack-netlib/SRC/zgebak.f
@@ -48,10 +48,10 @@
 *> \verbatim
 *>          JOB is CHARACTER*1
 *>          Specifies the type of backward transformation required:
-*>          = 'N', do nothing, return immediately;
-*>          = 'P', do backward transformation for permutation only;
-*>          = 'S', do backward transformation for scaling only;
-*>          = 'B', do backward transformations for both permutation and
+*>          = 'N': do nothing, return immediately;
+*>          = 'P': do backward transformation for permutation only;
+*>          = 'S': do backward transformation for scaling only;
+*>          = 'B': do backward transformations for both permutation and
 *>                 scaling.
 *>          JOB must be the same as the argument JOB supplied to ZGEBAL.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zgeev.f b/lapack-netlib/SRC/zgeev.f
index 22b04469f..1ba542587 100644
--- a/lapack-netlib/SRC/zgeev.f
+++ b/lapack-netlib/SRC/zgeev.f
@@ -157,7 +157,7 @@
 *>          < 0:  if INFO = -i, the i-th argument had an illegal value.
 *>          > 0:  if INFO = i, the QR algorithm failed to compute all the
 *>                eigenvalues, and no eigenvectors have been computed;
-*>                elements and i+1:N of W contain eigenvalues which have
+*>                elements i+1:N of W contain eigenvalues which have
 *>                converged.
 *> \endverbatim
 *
diff --git a/lapack-netlib/SRC/zgejsv.f b/lapack-netlib/SRC/zgejsv.f
index d553da90b..91a20416e 100644
--- a/lapack-netlib/SRC/zgejsv.f
+++ b/lapack-netlib/SRC/zgejsv.f
@@ -80,13 +80,13 @@
 *>              desirable, then this option is advisable. The input matrix A
 *>              is preprocessed with QR factorization with FULL (row and
 *>              column) pivoting.
-*>       = 'G'  Computation as with 'F' with an additional estimate of the
+*>       = 'G': Computation as with 'F' with an additional estimate of the
 *>              condition number of B, where A=B*D. If A has heavily weighted
 *>              rows, then using this condition number gives too pessimistic
 *>              error bound.
 *>       = 'A': Small singular values are not well determined by the data 
 *>              and are considered as noisy; the matrix is treated as
-*>              numerically rank defficient. The error in the computed
+*>              numerically rank deficient. The error in the computed
 *>              singular values is bounded by f(m,n)*epsilon*||A||.
 *>              The computed SVD A = U * S * V^* restores A up to
 *>              f(m,n)*epsilon*||A||.
@@ -117,7 +117,7 @@
 *>       = 'V': N columns of V are returned in the array V; Jacobi rotations
 *>              are not explicitly accumulated.
 *>       = 'J': N columns of V are returned in the array V, but they are
-*>              computed as the product of Jacobi rotations, if JOBT .EQ. 'N'.
+*>              computed as the product of Jacobi rotations, if JOBT = 'N'.
 *>       = 'W': V may be used as workspace of length N*N. See the description
 *>              of V.
 *>       = 'N': V is not computed.
@@ -131,7 +131,7 @@
 *>         specified range. If A .NE. 0 is scaled so that the largest singular
 *>         value of c*A is around SQRT(BIG), BIG=DLAMCH('O'), then JOBR issues
 *>         the licence to kill columns of A whose norm in c*A is less than
-*>         SQRT(SFMIN) (for JOBR.EQ.'R'), or less than SMALL=SFMIN/EPSLN,
+*>         SQRT(SFMIN) (for JOBR = 'R'), or less than SMALL=SFMIN/EPSLN,
 *>         where SFMIN=DLAMCH('S'), EPSLN=DLAMCH('E').
 *>       = 'N': Do not kill small columns of c*A. This option assumes that
 *>              BLAS and QR factorizations and triangular solvers are
@@ -229,7 +229,7 @@
 *>          If JOBU = 'F', then U contains on exit the M-by-M matrix of
 *>                         the left singular vectors, including an ONB
 *>                         of the orthogonal complement of the Range(A).
-*>          If JOBU = 'W'  .AND. (JOBV.EQ.'V' .AND. JOBT.EQ.'T' .AND. M.EQ.N),
+*>          If JOBU = 'W'  .AND. (JOBV = 'V' .AND. JOBT = 'T' .AND. M = N),
 *>                         then U is used as workspace if the procedure
 *>                         replaces A with A^*. In that case, [V] is computed
 *>                         in U as left singular vectors of A^* and then
@@ -251,7 +251,7 @@
 *>          V is COMPLEX*16 array, dimension ( LDV, N )
 *>          If JOBV = 'V', 'J' then V contains on exit the N-by-N matrix of
 *>                         the right singular vectors;
-*>          If JOBV = 'W', AND (JOBU.EQ.'U' AND JOBT.EQ.'T' AND M.EQ.N),
+*>          If JOBV = 'W', AND (JOBU = 'U' AND JOBT = 'T' AND M = N),
 *>                         then V is used as workspace if the pprocedure
 *>                         replaces A with A^*. In that case, [U] is computed
 *>                         in V as right singular vectors of A^* and then
@@ -282,7 +282,7 @@
 *>          Length of CWORK to confirm proper allocation of workspace.
 *>          LWORK depends on the job:
 *>
-*>          1. If only SIGMA is needed ( JOBU.EQ.'N', JOBV.EQ.'N' ) and
+*>          1. If only SIGMA is needed ( JOBU = 'N', JOBV = 'N' ) and
 *>            1.1 .. no scaled condition estimate required (JOBA.NE.'E'.AND.JOBA.NE.'G'):
 *>               LWORK >= 2*N+1. This is the minimal requirement.
 *>               ->> For optimal performance (blocked code) the optimal value
@@ -298,9 +298,9 @@
 *>               In general, the optimal length LWORK is computed as
 *>               LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZGEQRF), LWORK(ZGESVJ),
 *>                            N*N+LWORK(ZPOCON)).
-*>          2. If SIGMA and the right singular vectors are needed (JOBV.EQ.'V'),
-*>             (JOBU.EQ.'N')
-*>            2.1   .. no scaled condition estimate requested (JOBE.EQ.'N'):    
+*>          2. If SIGMA and the right singular vectors are needed (JOBV = 'V'),
+*>             (JOBU = 'N')
+*>            2.1   .. no scaled condition estimate requested (JOBE = 'N'):    
 *>            -> the minimal requirement is LWORK >= 3*N.
 *>            -> For optimal performance, 
 *>               LWORK >= max(N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
@@ -318,10 +318,10 @@
 *>               LWORK >= max(N+LWORK(ZGEQP3), LWORK(ZPOCON), N+LWORK(ZGESVJ),
 *>                       N+LWORK(ZGELQF), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMLQ)).   
 *>          3. If SIGMA and the left singular vectors are needed
-*>            3.1  .. no scaled condition estimate requested (JOBE.EQ.'N'):
+*>            3.1  .. no scaled condition estimate requested (JOBE = 'N'):
 *>            -> the minimal requirement is LWORK >= 3*N.
 *>            -> For optimal performance:
-*>               if JOBU.EQ.'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
+*>               if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
 *>               where NB is the optimal block size for ZGEQP3, ZGEQRF, ZUNMQR.
 *>               In general, the optimal length LWORK is computed as
 *>               LWORK >= max(N+LWORK(ZGEQP3), 2*N+LWORK(ZGEQRF), N+LWORK(ZUNMQR)). 
@@ -329,15 +329,15 @@
 *>               required (JOBA='E', or 'G').
 *>            -> the minimal requirement is LWORK >= 3*N.
 *>            -> For optimal performance:
-*>               if JOBU.EQ.'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
+*>               if JOBU = 'U' :: LWORK >= max(3*N, N+(N+1)*NB, 2*N+N*NB)=2*N+N*NB,
 *>               where NB is the optimal block size for ZGEQP3, ZGEQRF, ZUNMQR.
 *>               In general, the optimal length LWORK is computed as
 *>               LWORK >= max(N+LWORK(ZGEQP3),N+LWORK(ZPOCON),
 *>                        2*N+LWORK(ZGEQRF), N+LWORK(ZUNMQR)).
-*>          4. If the full SVD is needed: (JOBU.EQ.'U' or JOBU.EQ.'F') and 
-*>            4.1. if JOBV.EQ.'V'  
+*>          4. If the full SVD is needed: (JOBU = 'U' or JOBU = 'F') and 
+*>            4.1. if JOBV = 'V'  
 *>               the minimal requirement is LWORK >= 5*N+2*N*N. 
-*>            4.2. if JOBV.EQ.'J' the minimal requirement is 
+*>            4.2. if JOBV = 'J' the minimal requirement is 
 *>               LWORK >= 4*N+N*N.
 *>            In both cases, the allocated CWORK can accommodate blocked runs
 *>            of ZGEQP3, ZGEQRF, ZGELQF, SUNMQR, ZUNMLQ.
@@ -356,7 +356,7 @@
 *>                    of A. (See the description of SVA().)
 *>          RWORK(2) = See the description of RWORK(1).
 *>          RWORK(3) = SCONDA is an estimate for the condition number of
-*>                    column equilibrated A. (If JOBA .EQ. 'E' or 'G')
+*>                    column equilibrated A. (If JOBA = 'E' or 'G')
 *>                    SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1).
 *>                    It is computed using SPOCON. It holds
 *>                    N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
@@ -375,7 +375,7 @@
 *>                    triangular factor in the first QR factorization.
 *>          RWORK(5) = an estimate of the scaled condition number of the
 *>                    triangular factor in the second QR factorization.
-*>          The following two parameters are computed if JOBT .EQ. 'T'.
+*>          The following two parameters are computed if JOBT = 'T'.
 *>          They are provided for a developer/implementer who is familiar
 *>          with the details of the method.
 *>          RWORK(6) = the entropy of A^* * A :: this is the Shannon entropy
@@ -456,23 +456,23 @@
 *>                     of JOBA and JOBR.
 *>          IWORK(2) = the number of the computed nonzero singular values
 *>          IWORK(3) = if nonzero, a warning message:
-*>                     If IWORK(3).EQ.1 then some of the column norms of A
+*>                     If IWORK(3) = 1 then some of the column norms of A
 *>                     were denormalized floats. The requested high accuracy
 *>                     is not warranted by the data.
-*>          IWORK(4) = 1 or -1. If IWORK(4) .EQ. 1, then the procedure used A^* to
+*>          IWORK(4) = 1 or -1. If IWORK(4) = 1, then the procedure used A^* to
 *>                     do the job as specified by the JOB parameters.
-*>          If the call to ZGEJSV is a workspace query (indicated by LWORK .EQ. -1 or
-*>          LRWORK .EQ. -1), then on exit IWORK(1) contains the required length of 
+*>          If the call to ZGEJSV is a workspace query (indicated by LWORK = -1 or
+*>          LRWORK = -1), then on exit IWORK(1) contains the required length of 
 *>          IWORK for the job parameters used in the call.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>           < 0  : if INFO = -i, then the i-th argument had an illegal value.
-*>           = 0 :  successful exit;
-*>           > 0 :  ZGEJSV  did not converge in the maximal allowed number
-*>                  of sweeps. The computed values may be inaccurate.
+*>           < 0:  if INFO = -i, then the i-th argument had an illegal value.
+*>           = 0:  successful exit;
+*>           > 0:  ZGEJSV  did not converge in the maximal allowed number
+*>                 of sweeps. The computed values may be inaccurate.
 *> \endverbatim
 *
 *  Authors:
@@ -1338,7 +1338,7 @@
       IF ( L2ABER ) THEN
 *        Standard absolute error bound suffices. All sigma_i with
 *        sigma_i < N*EPSLN*||A|| are flushed to zero. This is an
-*        agressive enforcement of lower numerical rank by introducing a
+*        aggressive enforcement of lower numerical rank by introducing a
 *        backward error of the order of N*EPSLN*||A||.
          TEMP1 = SQRT(DBLE(N))*EPSLN
          DO 3001 p = 2, N
@@ -1350,7 +1350,7 @@
  3001    CONTINUE
  3002    CONTINUE
       ELSE IF ( L2RANK ) THEN
-*        .. similarly as above, only slightly more gentle (less agressive).
+*        .. similarly as above, only slightly more gentle (less aggressive).
 *        Sudden drop on the diagonal of R1 is used as the criterion for
 *        close-to-rank-deficient.
          TEMP1 = SQRT(SFMIN)
@@ -1720,7 +1720,7 @@
             CALL ZPOCON('L',NR,CWORK(2*N+1),NR,ONE,TEMP1,
      $                   CWORK(2*N+NR*NR+1),RWORK,IERR)
             CONDR1 = ONE / SQRT(TEMP1)
-*           .. here need a second oppinion on the condition number
+*           .. here need a second opinion on the condition number
 *           .. then assume worst case scenario
 *           R1 is OK for inverse <=> CONDR1 .LT. DBLE(N)
 *           more conservative    <=> CONDR1 .LT. SQRT(DBLE(N))
@@ -1765,7 +1765,7 @@
             ELSE
 *
 *              .. ill-conditioned case: second QRF with pivoting
-*              Note that windowed pivoting would be equaly good
+*              Note that windowed pivoting would be equally good
 *              numerically, and more run-time efficient. So, in
 *              an optimal implementation, the next call to ZGEQP3
 *              should be replaced with eg. CALL ZGEQPX (ACM TOMS #782)
@@ -1823,7 +1823,7 @@
 *
                IF ( CONDR2 .GE. COND_OK ) THEN
 *                 .. save the Householder vectors used for Q3
-*                 (this overwrittes the copy of R2, as it will not be
+*                 (this overwrites the copy of R2, as it will not be
 *                 needed in this branch, but it does not overwritte the
 *                 Huseholder vectors of Q2.).
                   CALL ZLACPY( 'U', NR, NR, V, LDV, CWORK(2*N+1), N )
@@ -2079,7 +2079,7 @@
 *
 *        This branch deploys a preconditioned Jacobi SVD with explicitly
 *        accumulated rotations. It is included as optional, mainly for
-*        experimental purposes. It does perfom well, and can also be used.
+*        experimental purposes. It does perform well, and can also be used.
 *        In this implementation, this branch will be automatically activated
 *        if the  condition number sigma_max(A) / sigma_min(A) is predicted
 *        to be greater than the overflow threshold. This is because the
diff --git a/lapack-netlib/SRC/zgelq.f b/lapack-netlib/SRC/zgelq.f
index 656396536..4e7e7e38e 100644
--- a/lapack-netlib/SRC/zgelq.f
+++ b/lapack-netlib/SRC/zgelq.f
@@ -1,3 +1,4 @@
+*> \brief \b ZGELQ
 *
 *  Definition:
 *  ===========
@@ -17,7 +18,17 @@
 *  =============
 *>
 *> \verbatim
-*> ZGELQ computes a LQ factorization of an M-by-N matrix A.
+*>
+*> ZGELQ computes an LQ factorization of a complex M-by-N matrix A:
+*>
+*>    A = ( L 0 ) *  Q
+*>
+*> where:
+*>
+*>    Q is a N-by-N orthogonal matrix;
+*>    L is an lower-triangular M-by-M matrix;
+*>    0 is a M-by-(N-M) zero matrix, if M < N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -138,7 +149,7 @@
 *> \verbatim
 *>
 *> These details are particular for this LAPACK implementation. Users should not 
-*> take them for granted. These details may change in the future, and are unlikely not
+*> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
 *>
@@ -159,10 +170,10 @@
       SUBROUTINE ZGELQ( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N, TSIZE, LWORK
diff --git a/lapack-netlib/SRC/zgelq2.f b/lapack-netlib/SRC/zgelq2.f
index 188c8f8c8..a825ac17b 100644
--- a/lapack-netlib/SRC/zgelq2.f
+++ b/lapack-netlib/SRC/zgelq2.f
@@ -33,8 +33,16 @@
 *>
 *> \verbatim
 *>
-*> ZGELQ2 computes an LQ factorization of a complex m by n matrix A:
-*> A = L * Q.
+*> ZGELQ2 computes an LQ factorization of a complex m-by-n matrix A:
+*>
+*>    A = ( L 0 ) *  Q
+*>
+*> where:
+*>
+*>    Q is a n-by-n orthogonal matrix;
+*>    L is an lower-triangular m-by-m matrix;
+*>    0 is a m-by-(n-m) zero matrix, if m < n.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -96,7 +104,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup complex16GEcomputational
 *
@@ -121,10 +129,10 @@
 *  =====================================================================
       SUBROUTINE ZGELQ2( M, N, A, LDA, TAU, WORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N
diff --git a/lapack-netlib/SRC/zgelqf.f b/lapack-netlib/SRC/zgelqf.f
index 8d9341a61..3a5e5fd4a 100644
--- a/lapack-netlib/SRC/zgelqf.f
+++ b/lapack-netlib/SRC/zgelqf.f
@@ -34,7 +34,15 @@
 *> \verbatim
 *>
 *> ZGELQF computes an LQ factorization of a complex M-by-N matrix A:
-*> A = L * Q.
+*>
+*>    A = ( L 0 ) *  Q
+*>
+*> where:
+*>
+*>    Q is a N-by-N orthogonal matrix;
+*>    L is an lower-triangular M-by-M matrix;
+*>    0 is a M-by-(N-M) zero matrix, if M < N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -110,7 +118,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup complex16GEcomputational
 *
@@ -135,10 +143,10 @@
 *  =====================================================================
       SUBROUTINE ZGELQF( M, N, A, LDA, TAU, WORK, LWORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, LWORK, M, N
diff --git a/lapack-netlib/SRC/zgemlq.f b/lapack-netlib/SRC/zgemlq.f
index aa07e0feb..6fb2be3d8 100644
--- a/lapack-netlib/SRC/zgemlq.f
+++ b/lapack-netlib/SRC/zgemlq.f
@@ -1,3 +1,4 @@
+*> \brief \b ZGEMLQ
 *
 *  Definition:
 *  ===========
@@ -142,7 +143,7 @@
 *> \verbatim
 *>
 *> These details are particular for this LAPACK implementation. Users should not 
-*> take them for granted. These details may change in the future, and are unlikely not
+*> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
 *>
diff --git a/lapack-netlib/SRC/zgemqr.f b/lapack-netlib/SRC/zgemqr.f
index 32f1bf4d5..aec9321bb 100644
--- a/lapack-netlib/SRC/zgemqr.f
+++ b/lapack-netlib/SRC/zgemqr.f
@@ -1,3 +1,4 @@
+*> \brief \b ZGEMQR
 *
 *  Definition:
 *  ===========
@@ -144,7 +145,7 @@
 *> \verbatim
 *>
 *> These details are particular for this LAPACK implementation. Users should not 
-*> take them for granted. These details may change in the future, and are unlikely not
+*> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
 *>
diff --git a/lapack-netlib/SRC/zgeqr.f b/lapack-netlib/SRC/zgeqr.f
index 1aa457f56..cea686b98 100644
--- a/lapack-netlib/SRC/zgeqr.f
+++ b/lapack-netlib/SRC/zgeqr.f
@@ -1,3 +1,4 @@
+*> \brief \b ZGEQR
 *
 *  Definition:
 *  ===========
@@ -17,7 +18,18 @@
 *  =============
 *>
 *> \verbatim
-*> ZGEQR computes a QR factorization of an M-by-N matrix A.
+*>
+*> ZGEQR computes a QR factorization of a complex M-by-N matrix A:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a M-by-M orthogonal matrix;
+*>    R is an upper-triangular N-by-N matrix;
+*>    0 is a (M-N)-by-N zero matrix, if M > N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -138,7 +150,7 @@
 *> \verbatim
 *>
 *> These details are particular for this LAPACK implementation. Users should not 
-*> take them for granted. These details may change in the future, and are unlikely not
+*> take them for granted. These details may change in the future, and are not likely
 *> true for another LAPACK implementation. These details are relevant if one wants
 *> to try to understand the code. They are not part of the interface.
 *>
@@ -160,10 +172,10 @@
       SUBROUTINE ZGEQR( M, N, A, LDA, T, TSIZE, WORK, LWORK,
      $                  INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N, TSIZE, LWORK
diff --git a/lapack-netlib/SRC/zgeqr2.f b/lapack-netlib/SRC/zgeqr2.f
index d2774d788..0384c1d42 100644
--- a/lapack-netlib/SRC/zgeqr2.f
+++ b/lapack-netlib/SRC/zgeqr2.f
@@ -33,8 +33,17 @@
 *>
 *> \verbatim
 *>
-*> ZGEQR2 computes a QR factorization of a complex m by n matrix A:
-*> A = Q * R.
+*> ZGEQR2 computes a QR factorization of a complex m-by-n matrix A:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a m-by-m orthogonal matrix;
+*>    R is an upper-triangular n-by-n matrix;
+*>    0 is a (m-n)-by-n zero matrix, if m > n.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -96,7 +105,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup complex16GEcomputational
 *
@@ -121,10 +130,10 @@
 *  =====================================================================
       SUBROUTINE ZGEQR2( M, N, A, LDA, TAU, WORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N
diff --git a/lapack-netlib/SRC/zgeqr2p.f b/lapack-netlib/SRC/zgeqr2p.f
index 0e5e55486..7bbd81da9 100644
--- a/lapack-netlib/SRC/zgeqr2p.f
+++ b/lapack-netlib/SRC/zgeqr2p.f
@@ -33,8 +33,18 @@
 *>
 *> \verbatim
 *>
-*> ZGEQR2P computes a QR factorization of a complex m by n matrix A:
-*> A = Q * R. The diagonal entries of R are real and nonnegative.
+*> ZGEQR2P computes a QR factorization of a complex m-by-n matrix A:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a m-by-m orthogonal matrix;
+*>    R is an upper-triangular n-by-n matrix with nonnegative diagonal
+*>    entries;
+*>    0 is a (m-n)-by-n zero matrix, if m > n.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -97,7 +107,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup complex16GEcomputational
 *
@@ -124,10 +134,10 @@
 *  =====================================================================
       SUBROUTINE ZGEQR2P( M, N, A, LDA, TAU, WORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, M, N
diff --git a/lapack-netlib/SRC/zgeqrf.f b/lapack-netlib/SRC/zgeqrf.f
index 3ea1e71e1..2c03ebe73 100644
--- a/lapack-netlib/SRC/zgeqrf.f
+++ b/lapack-netlib/SRC/zgeqrf.f
@@ -34,7 +34,16 @@
 *> \verbatim
 *>
 *> ZGEQRF computes a QR factorization of a complex M-by-N matrix A:
-*> A = Q * R.
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a M-by-M orthogonal matrix;
+*>    R is an upper-triangular N-by-N matrix;
+*>    0 is a (M-N)-by-N zero matrix, if M > N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -111,7 +120,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup complex16GEcomputational
 *
@@ -136,10 +145,10 @@
 *  =====================================================================
       SUBROUTINE ZGEQRF( M, N, A, LDA, TAU, WORK, LWORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, LWORK, M, N
diff --git a/lapack-netlib/SRC/zgeqrfp.f b/lapack-netlib/SRC/zgeqrfp.f
index cdc4bfa94..80ead21ca 100644
--- a/lapack-netlib/SRC/zgeqrfp.f
+++ b/lapack-netlib/SRC/zgeqrfp.f
@@ -33,8 +33,18 @@
 *>
 *> \verbatim
 *>
-*> ZGEQRFP computes a QR factorization of a complex M-by-N matrix A:
-*> A = Q * R. The diagonal entries of R are real and nonnegative.
+*> ZGEQR2P computes a QR factorization of a complex M-by-N matrix A:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a M-by-M orthogonal matrix;
+*>    R is an upper-triangular N-by-N matrix with nonnegative diagonal
+*>    entries;
+*>    0 is a (M-N)-by-N zero matrix, if M > N.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -112,7 +122,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date December 2016
+*> \date November 2019
 *
 *> \ingroup complex16GEcomputational
 *
@@ -139,10 +149,10 @@
 *  =====================================================================
       SUBROUTINE ZGEQRFP( M, N, A, LDA, TAU, WORK, LWORK, INFO )
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, LWORK, M, N
diff --git a/lapack-netlib/SRC/zgerfsx.f b/lapack-netlib/SRC/zgerfsx.f
index 5aabe50ed..3af7f8b6b 100644
--- a/lapack-netlib/SRC/zgerfsx.f
+++ b/lapack-netlib/SRC/zgerfsx.f
@@ -74,7 +74,7 @@
 *>     Specifies the form of the system of equations:
 *>       = 'N':  A * X = B     (No transpose)
 *>       = 'T':  A**T * X = B  (Transpose)
-*>       = 'C':  A**H * X = B  (Conjugate transpose = Transpose)
+*>       = 'C':  A**H * X = B  (Conjugate transpose)
 *> \endverbatim
 *>
 *> \param[in] EQUED
@@ -283,7 +283,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -319,14 +319,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -334,9 +334,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/zgesc2.f b/lapack-netlib/SRC/zgesc2.f
index 72ef99dba..cdf15e4f4 100644
--- a/lapack-netlib/SRC/zgesc2.f
+++ b/lapack-netlib/SRC/zgesc2.f
@@ -91,7 +91,7 @@
 *> \verbatim
 *>          SCALE is DOUBLE PRECISION
 *>           On exit, SCALE contains the scale factor. SCALE is chosen
-*>           0 <= SCALE <= 1 to prevent owerflow in the solution.
+*>           0 <= SCALE <= 1 to prevent overflow in the solution.
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/zgesvdq.f b/lapack-netlib/SRC/zgesvdq.f
new file mode 100644
index 000000000..e0fb920bb
--- /dev/null
+++ b/lapack-netlib/SRC/zgesvdq.f
@@ -0,0 +1,1389 @@
+*> \brief <b> ZGESVDQ computes the singular value decomposition (SVD) with a QR-Preconditioned QR SVD Method for GE matrices</b>
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZGESVDQ + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zgesvdq.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zgesvdq.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zgesvdq.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE ZGESVDQ( JOBA, JOBP, JOBR, JOBU, JOBV, M, N, A, LDA,
+*                          S, U, LDU, V, LDV, NUMRANK, IWORK, LIWORK,
+*                          CWORK, LCWORK, RWORK, LRWORK, INFO )
+*
+*     .. Scalar Arguments ..
+*      IMPLICIT    NONE
+*      CHARACTER   JOBA, JOBP, JOBR, JOBU, JOBV
+*      INTEGER     M, N, LDA, LDU, LDV, NUMRANK, LIWORK, LCWORK, LRWORK,
+*                  INFO
+*     ..
+*     .. Array Arguments ..
+*      COMPLEX*16       A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( * )
+*      DOUBLE PRECISION S( * ), RWORK( * )
+*      INTEGER          IWORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+* ZCGESVDQ computes the singular value decomposition (SVD) of a complex
+*> M-by-N matrix A, where M >= N. The SVD of A is written as
+*>                                    [++]   [xx]   [x0]   [xx]
+*>              A = U * SIGMA * V^*,  [++] = [xx] * [ox] * [xx]
+*>                                    [++]   [xx]
+*> where SIGMA is an N-by-N diagonal matrix, U is an M-by-N orthonormal
+*> matrix, and V is an N-by-N unitary matrix. The diagonal elements
+*> of SIGMA are the singular values of A. The columns of U and V are the
+*> left and the right singular vectors of A, respectively.
+*> \endverbatim
+*
+*  Arguments
+*  =========
+*
+*> \param[in] JOBA
+*> \verbatim
+*>  JOBA is CHARACTER*1
+*>  Specifies the level of accuracy in the computed SVD
+*>  = 'A' The requested accuracy corresponds to having the backward
+*>        error bounded by || delta A ||_F <= f(m,n) * EPS * || A ||_F,
+*>        where EPS = DLAMCH('Epsilon'). This authorises ZGESVDQ to
+*>        truncate the computed triangular factor in a rank revealing
+*>        QR factorization whenever the truncated part is below the
+*>        threshold of the order of EPS * ||A||_F. This is aggressive
+*>        truncation level.
+*>  = 'M' Similarly as with 'A', but the truncation is more gentle: it
+*>        is allowed only when there is a drop on the diagonal of the
+*>        triangular factor in the QR factorization. This is medium
+*>        truncation level.
+*>  = 'H' High accuracy requested. No numerical rank determination based
+*>        on the rank revealing QR factorization is attempted.
+*>  = 'E' Same as 'H', and in addition the condition number of column
+*>        scaled A is estimated and returned in  RWORK(1).
+*>        N^(-1/4)*RWORK(1) <= ||pinv(A_scaled)||_2 <= N^(1/4)*RWORK(1)
+*> \endverbatim
+*>
+*> \param[in] JOBP
+*> \verbatim
+*>  JOBP is CHARACTER*1
+*>  = 'P' The rows of A are ordered in decreasing order with respect to
+*>        ||A(i,:)||_\infty. This enhances numerical accuracy at the cost
+*>        of extra data movement. Recommended for numerical robustness.
+*>  = 'N' No row pivoting.
+*> \endverbatim
+*>
+*> \param[in] JOBR
+*> \verbatim
+*>          JOBR is CHARACTER*1
+*>          = 'T' After the initial pivoted QR factorization, ZGESVD is applied to
+*>          the adjoint R**H of the computed triangular factor R. This involves
+*>          some extra data movement (matrix transpositions). Useful for
+*>          experiments, research and development.
+*>          = 'N' The triangular factor R is given as input to CGESVD. This may be
+*>          preferred as it involves less data movement.
+*> \endverbatim
+*>
+*> \param[in] JOBU
+*> \verbatim
+*>          JOBU is CHARACTER*1
+*>          = 'A' All M left singular vectors are computed and returned in the
+*>          matrix U. See the description of U.
+*>          = 'S' or 'U' N = min(M,N) left singular vectors are computed and returned
+*>          in the matrix U. See the description of U.
+*>          = 'R' Numerical rank NUMRANK is determined and only NUMRANK left singular
+*>          vectors are computed and returned in the matrix U.
+*>          = 'F' The N left singular vectors are returned in factored form as the
+*>          product of the Q factor from the initial QR factorization and the
+*>          N left singular vectors of (R**H , 0)**H. If row pivoting is used,
+*>          then the necessary information on the row pivoting is stored in
+*>          IWORK(N+1:N+M-1).
+*>          = 'N' The left singular vectors are not computed.
+*> \endverbatim
+*>
+*> \param[in] JOBV
+*> \verbatim
+*>          JOBV is CHARACTER*1
+*>          = 'A', 'V' All N right singular vectors are computed and returned in
+*>          the matrix V.
+*>          = 'R' Numerical rank NUMRANK is determined and only NUMRANK right singular
+*>          vectors are computed and returned in the matrix V. This option is
+*>          allowed only if JOBU = 'R' or JOBU = 'N'; otherwise it is illegal.
+*>          = 'N' The right singular vectors are not computed.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the input matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the input matrix A.  M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array of dimensions LDA x N
+*>          On entry, the input matrix A.
+*>          On exit, if JOBU .NE. 'N' or JOBV .NE. 'N', the lower triangle of A contains
+*>          the Householder vectors as stored by ZGEQP3. If JOBU = 'F', these Householder
+*>          vectors together with CWORK(1:N) can be used to restore the Q factors from
+*>          the initial pivoted QR factorization of A. See the description of U.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER.
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] S
+*> \verbatim
+*>          S is DOUBLE PRECISION array of dimension N.
+*>          The singular values of A, ordered so that S(i) >= S(i+1).
+*> \endverbatim
+*>
+*> \param[out] U
+*> \verbatim
+*>          U is COMPLEX*16 array, dimension
+*>          LDU x M if JOBU = 'A'; see the description of LDU. In this case,
+*>          on exit, U contains the M left singular vectors.
+*>          LDU x N if JOBU = 'S', 'U', 'R' ; see the description of LDU. In this
+*>          case, U contains the leading N or the leading NUMRANK left singular vectors.
+*>          LDU x N if JOBU = 'F' ; see the description of LDU. In this case U
+*>          contains N x N unitary matrix that can be used to form the left
+*>          singular vectors.
+*>          If JOBU = 'N', U is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDU
+*> \verbatim
+*>          LDU is INTEGER.
+*>          The leading dimension of the array U.
+*>          If JOBU = 'A', 'S', 'U', 'R',  LDU >= max(1,M).
+*>          If JOBU = 'F',                 LDU >= max(1,N).
+*>          Otherwise,                     LDU >= 1.
+*> \endverbatim
+*>
+*> \param[out] V
+*> \verbatim
+*>          V is COMPLEX*16 array, dimension
+*>          LDV x N if JOBV = 'A', 'V', 'R' or if JOBA = 'E' .
+*>          If JOBV = 'A', or 'V',  V contains the N-by-N unitary matrix  V**H;
+*>          If JOBV = 'R', V contains the first NUMRANK rows of V**H (the right
+*>          singular vectors, stored rowwise, of the NUMRANK largest singular values).
+*>          If JOBV = 'N' and JOBA = 'E', V is used as a workspace.
+*>          If JOBV = 'N', and JOBA.NE.'E', V is not referenced.
+*> \endverbatim
+*>
+*> \param[in] LDV
+*> \verbatim
+*>          LDV is INTEGER
+*>          The leading dimension of the array V.
+*>          If JOBV = 'A', 'V', 'R',  or JOBA = 'E', LDV >= max(1,N).
+*>          Otherwise,                               LDV >= 1.
+*> \endverbatim
+*>
+*> \param[out] NUMRANK
+*> \verbatim
+*>          NUMRANK is INTEGER
+*>          NUMRANK is the numerical rank first determined after the rank
+*>          revealing QR factorization, following the strategy specified by the
+*>          value of JOBA. If JOBV = 'R' and JOBU = 'R', only NUMRANK
+*>          leading singular values and vectors are then requested in the call
+*>          of CGESVD. The final value of NUMRANK might be further reduced if
+*>          some singular values are computed as zeros.
+*> \endverbatim
+*>
+*> \param[out] IWORK
+*> \verbatim
+*>          IWORK is INTEGER array, dimension (max(1, LIWORK)).
+*>          On exit, IWORK(1:N) contains column pivoting permutation of the
+*>          rank revealing QR factorization.
+*>          If JOBP = 'P', IWORK(N+1:N+M-1) contains the indices of the sequence
+*>          of row swaps used in row pivoting. These can be used to restore the
+*>          left singular vectors in the case JOBU = 'F'.
+*
+*>          If LIWORK, LCWORK, or LRWORK = -1, then on exit, if INFO = 0,
+*>          LIWORK(1) returns the minimal LIWORK.
+*> \endverbatim
+*>
+*> \param[in] LIWORK
+*> \verbatim
+*>          LIWORK is INTEGER
+*>          The dimension of the array IWORK.
+*>          LIWORK >= N + M - 1,  if JOBP = 'P';
+*>          LIWORK >= N           if JOBP = 'N'.
+*>
+*>          If LIWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates and returns the optimal and minimal sizes
+*>          for the CWORK, IWORK, and RWORK arrays, and no error
+*>          message related to LCWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] CWORK
+*> \verbatim
+*>          CWORK is COMPLEX*12 array, dimension (max(2, LCWORK)), used as a workspace.
+*>          On exit, if, on entry, LCWORK.NE.-1, CWORK(1:N) contains parameters
+*>          needed to recover the Q factor from the QR factorization computed by
+*>          ZGEQP3.
+*
+*>          If LIWORK, LCWORK, or LRWORK = -1, then on exit, if INFO = 0,
+*>          CWORK(1) returns the optimal LCWORK, and
+*>          CWORK(2) returns the minimal LCWORK.
+*> \endverbatim
+*>
+*> \param[in,out] LCWORK
+*> \verbatim
+*>          LCWORK is INTEGER
+*>          The dimension of the array CWORK. It is determined as follows:
+*>          Let  LWQP3 = N+1,  LWCON = 2*N, and let
+*>          LWUNQ = { MAX( N, 1 ),  if JOBU = 'R', 'S', or 'U'
+*>          { MAX( M, 1 ),  if JOBU = 'A'
+*>          LWSVD = MAX( 3*N, 1 )
+*>          LWLQF = MAX( N/2, 1 ), LWSVD2 = MAX( 3*(N/2), 1 ), LWUNLQ = MAX( N, 1 ),
+*>          LWQRF = MAX( N/2, 1 ), LWUNQ2 = MAX( N, 1 )
+*>          Then the minimal value of LCWORK is:
+*>          = MAX( N + LWQP3, LWSVD )        if only the singular values are needed;
+*>          = MAX( N + LWQP3, LWCON, LWSVD ) if only the singular values are needed,
+*>                                   and a scaled condition estimate requested;
+*>
+*>          = N + MAX( LWQP3, LWSVD, LWUNQ ) if the singular values and the left
+*>                                   singular vectors are requested;
+*>          = N + MAX( LWQP3, LWCON, LWSVD, LWUNQ ) if the singular values and the left
+*>                                   singular vectors are requested, and also
+*>                                   a scaled condition estimate requested;
+*>
+*>          = N + MAX( LWQP3, LWSVD )        if the singular values and the right
+*>                                   singular vectors are requested;
+*>          = N + MAX( LWQP3, LWCON, LWSVD ) if the singular values and the right
+*>                                   singular vectors are requested, and also
+*>                                   a scaled condition etimate requested;
+*>
+*>          = N + MAX( LWQP3, LWSVD, LWUNQ ) if the full SVD is requested with JOBV = 'R';
+*>                                   independent of JOBR;
+*>          = N + MAX( LWQP3, LWCON, LWSVD, LWUNQ ) if the full SVD is requested,
+*>                                   JOBV = 'R' and, also a scaled condition
+*>                                   estimate requested; independent of JOBR;
+*>          = MAX( N + MAX( LWQP3, LWSVD, LWUNQ ),
+*>         N + MAX( LWQP3, N/2+LWLQF, N/2+LWSVD2, N/2+LWUNLQ, LWUNQ) ) if the
+*>                         full SVD is requested with JOBV = 'A' or 'V', and
+*>                         JOBR ='N'
+*>          = MAX( N + MAX( LWQP3, LWCON, LWSVD, LWUNQ ),
+*>         N + MAX( LWQP3, LWCON, N/2+LWLQF, N/2+LWSVD2, N/2+LWUNLQ, LWUNQ ) )
+*>                         if the full SVD is requested with JOBV = 'A' or 'V', and
+*>                         JOBR ='N', and also a scaled condition number estimate
+*>                         requested.
+*>          = MAX( N + MAX( LWQP3, LWSVD, LWUNQ ),
+*>         N + MAX( LWQP3, N/2+LWQRF, N/2+LWSVD2, N/2+LWUNQ2, LWUNQ ) ) if the
+*>                         full SVD is requested with JOBV = 'A', 'V', and JOBR ='T'
+*>          = MAX( N + MAX( LWQP3, LWCON, LWSVD, LWUNQ ),
+*>         N + MAX( LWQP3, LWCON, N/2+LWQRF, N/2+LWSVD2, N/2+LWUNQ2, LWUNQ ) )
+*>                         if the full SVD is requested with JOBV = 'A', 'V' and
+*>                         JOBR ='T', and also a scaled condition number estimate
+*>                         requested.
+*>          Finally, LCWORK must be at least two: LCWORK = MAX( 2, LCWORK ).
+*>
+*>          If LCWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates and returns the optimal and minimal sizes
+*>          for the CWORK, IWORK, and RWORK arrays, and no error
+*>          message related to LCWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] RWORK
+*> \verbatim
+*>          RWORK is DOUBLE PRECISION array, dimension (max(1, LRWORK)).
+*>          On exit,
+*>          1. If JOBA = 'E', RWORK(1) contains an estimate of the condition
+*>          number of column scaled A. If A = C * D where D is diagonal and C
+*>          has unit columns in the Euclidean norm, then, assuming full column rank,
+*>          N^(-1/4) * RWORK(1) <= ||pinv(C)||_2 <= N^(1/4) * RWORK(1).
+*>          Otherwise, RWORK(1) = -1.
+*>          2. RWORK(2) contains the number of singular values computed as
+*>          exact zeros in ZGESVD applied to the upper triangular or trapeziodal
+*>          R (from the initial QR factorization). In case of early exit (no call to
+*>          ZGESVD, such as in the case of zero matrix) RWORK(2) = -1.
+*
+*>          If LIWORK, LCWORK, or LRWORK = -1, then on exit, if INFO = 0,
+*>          RWORK(1) returns the minimal LRWORK.
+*> \endverbatim
+*>
+*> \param[in] LRWORK
+*> \verbatim
+*>          LRWORK is INTEGER.
+*>          The dimension of the array RWORK.
+*>          If JOBP ='P', then LRWORK >= MAX(2, M, 5*N);
+*>          Otherwise, LRWORK >= MAX(2, 5*N).
+*
+*>          If LRWORK = -1, then a workspace query is assumed; the routine
+*>          only calculates and returns the optimal and minimal sizes
+*>          for the CWORK, IWORK, and RWORK arrays, and no error
+*>          message related to LCWORK is issued by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value.
+*>          > 0:  if ZBDSQR did not converge, INFO specifies how many superdiagonals
+*>          of an intermediate bidiagonal form B (computed in ZGESVD) did not
+*>          converge to zero.
+*> \endverbatim
+*
+*> \par Further Details:
+*  ========================
+*>
+*> \verbatim
+*>
+*>   1. The data movement (matrix transpose) is coded using simple nested
+*>   DO-loops because BLAS and LAPACK do not provide corresponding subroutines.
+*>   Those DO-loops are easily identified in this source code - by the CONTINUE
+*>   statements labeled with 11**. In an optimized version of this code, the
+*>   nested DO loops should be replaced with calls to an optimized subroutine.
+*>   2. This code scales A by 1/SQRT(M) if the largest ABS(A(i,j)) could cause
+*>   column norm overflow. This is the minial precaution and it is left to the
+*>   SVD routine (CGESVD) to do its own preemptive scaling if potential over-
+*>   or underflows are detected. To avoid repeated scanning of the array A,
+*>   an optimal implementation would do all necessary scaling before calling
+*>   CGESVD and the scaling in CGESVD can be switched off.
+*>   3. Other comments related to code optimization are given in comments in the
+*>   code, enlosed in [[double brackets]].
+*> \endverbatim
+*
+*> \par Bugs, examples and comments
+*  ===========================
+*
+*> \verbatim
+*>  Please report all bugs and send interesting examples and/or comments to
+*>  drmac@math.hr. Thank you.
+*> \endverbatim
+*
+*> \par References
+*  ===============
+*
+*> \verbatim
+*>  [1] Zlatko Drmac, Algorithm 977: A QR-Preconditioned QR SVD Method for
+*>      Computing the SVD with High Accuracy. ACM Trans. Math. Softw.
+*>      44(1): 11:1-11:30 (2017)
+*>
+*>  SIGMA library, xGESVDQ section updated February 2016.
+*>  Developed and coded by Zlatko Drmac, Department of Mathematics
+*>  University of Zagreb, Croatia, drmac@math.hr
+*> \endverbatim
+*
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*> Developed and coded by Zlatko Drmac, Department of Mathematics
+*>  University of Zagreb, Croatia, drmac@math.hr
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2018
+*
+*> \ingroup complex16GEsing
+*
+*  =====================================================================
+      SUBROUTINE ZGESVDQ( JOBA, JOBP, JOBR, JOBU, JOBV, M, N, A, LDA,
+     $                    S, U, LDU, V, LDV, NUMRANK, IWORK, LIWORK,
+     $                    CWORK, LCWORK, RWORK, LRWORK, INFO )
+*     .. Scalar Arguments ..
+      IMPLICIT    NONE
+      CHARACTER   JOBA, JOBP, JOBR, JOBU, JOBV
+      INTEGER     M, N, LDA, LDU, LDV, NUMRANK, LIWORK, LCWORK, LRWORK,
+     $            INFO
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16       A( LDA, * ), U( LDU, * ), V( LDV, * ), CWORK( * )
+      DOUBLE PRECISION S( * ), RWORK( * )
+      INTEGER          IWORK( * )
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION ZERO,         ONE
+      PARAMETER      ( ZERO = 0.0D0, ONE = 1.0D0 )
+      COMPLEX*16       CZERO,                 CONE
+      PARAMETER      ( CZERO = (0.0D0,0.0D0), CONE = (1.0D0,0.0D0) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER     IERR, NR, N1, OPTRATIO, p, q
+      INTEGER     LWCON, LWQP3, LWRK_ZGELQF, LWRK_ZGESVD, LWRK_ZGESVD2,
+     $            LWRK_ZGEQP3, LWRK_ZGEQRF, LWRK_ZUNMLQ, LWRK_ZUNMQR,
+     $            LWRK_ZUNMQR2, LWLQF, LWQRF, LWSVD, LWSVD2, LWUNQ,
+     $            LWUNQ2, LWUNLQ, MINWRK, MINWRK2, OPTWRK, OPTWRK2,
+     $            IMINWRK, RMINWRK
+      LOGICAL     ACCLA,  ACCLM, ACCLH, ASCALED, CONDA, DNTWU,  DNTWV,
+     $            LQUERY, LSVC0, LSVEC, ROWPRM,  RSVEC, RTRANS, WNTUA,
+     $            WNTUF,  WNTUR, WNTUS, WNTVA,   WNTVR
+      DOUBLE PRECISION BIG, EPSLN, RTMP, SCONDA, SFMIN
+      COMPLEX*16       CTMP
+*     ..
+*     .. Local Arrays
+      COMPLEX*16         CDUMMY(1)
+      DOUBLE PRECISION   RDUMMY(1)
+*     ..
+*     .. External Subroutines (BLAS, LAPACK)
+      EXTERNAL    ZGELQF, ZGEQP3, ZGEQRF, ZGESVD, ZLACPY, ZLAPMT,
+     $            ZLASCL, ZLASET, ZLASWP, ZDSCAL, DLASET, DLASCL,
+     $            ZPOCON, ZUNMLQ, ZUNMQR, XERBLA
+*     ..
+*     .. External Functions (BLAS, LAPACK)
+      LOGICAL     LSAME
+      INTEGER                     IDAMAX
+      DOUBLE PRECISION   ZLANGE,          DZNRM2, DLAMCH
+      EXTERNAL    LSAME, ZLANGE,  IDAMAX, DZNRM2, DLAMCH
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC   ABS, CONJG, MAX, MIN, DBLE, SQRT
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input arguments
+*
+      WNTUS  = LSAME( JOBU, 'S' ) .OR. LSAME( JOBU, 'U' )
+      WNTUR  = LSAME( JOBU, 'R' )
+      WNTUA  = LSAME( JOBU, 'A' )
+      WNTUF  = LSAME( JOBU, 'F' )
+      LSVC0  = WNTUS .OR. WNTUR .OR. WNTUA
+      LSVEC  = LSVC0 .OR. WNTUF
+      DNTWU  = LSAME( JOBU, 'N' )
+*
+      WNTVR  = LSAME( JOBV, 'R' )
+      WNTVA  = LSAME( JOBV, 'A' ) .OR. LSAME( JOBV, 'V' )
+      RSVEC  = WNTVR .OR. WNTVA
+      DNTWV  = LSAME( JOBV, 'N' )
+*
+      ACCLA  = LSAME( JOBA, 'A' )
+      ACCLM  = LSAME( JOBA, 'M' )
+      CONDA  = LSAME( JOBA, 'E' )
+      ACCLH  = LSAME( JOBA, 'H' ) .OR. CONDA
+*
+      ROWPRM = LSAME( JOBP, 'P' )
+      RTRANS = LSAME( JOBR, 'T' )
+*
+      IF ( ROWPRM ) THEN
+         IMINWRK = MAX( 1, N + M - 1 )
+         RMINWRK = MAX( 2, M, 5*N )
+      ELSE
+         IMINWRK = MAX( 1, N )
+         RMINWRK = MAX( 2, 5*N )
+      END IF
+      LQUERY = (LIWORK .EQ. -1 .OR. LCWORK .EQ. -1 .OR. LRWORK .EQ. -1)
+      INFO  = 0
+      IF ( .NOT. ( ACCLA .OR. ACCLM .OR. ACCLH ) ) THEN
+         INFO = -1
+      ELSE IF ( .NOT.( ROWPRM .OR. LSAME( JOBP, 'N' ) ) ) THEN
+          INFO = -2
+      ELSE IF ( .NOT.( RTRANS .OR. LSAME( JOBR, 'N' ) ) ) THEN
+          INFO = -3
+      ELSE IF ( .NOT.( LSVEC .OR. DNTWU ) ) THEN
+         INFO = -4
+      ELSE IF ( WNTUR .AND. WNTVA ) THEN
+         INFO = -5
+      ELSE IF ( .NOT.( RSVEC .OR. DNTWV )) THEN
+         INFO = -5
+      ELSE IF ( M.LT.0 ) THEN
+         INFO = -6
+      ELSE IF ( ( N.LT.0 ) .OR. ( N.GT.M ) ) THEN
+         INFO = -7
+      ELSE IF ( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -9
+      ELSE IF ( LDU.LT.1 .OR. ( LSVC0 .AND. LDU.LT.M ) .OR.
+     $       ( WNTUF .AND. LDU.LT.N ) ) THEN
+         INFO = -12
+      ELSE IF ( LDV.LT.1 .OR. ( RSVEC .AND. LDV.LT.N ) .OR.
+     $          ( CONDA .AND. LDV.LT.N ) ) THEN
+         INFO = -14
+      ELSE IF ( LIWORK .LT. IMINWRK .AND. .NOT. LQUERY ) THEN
+         INFO = -17
+      END IF
+*
+*
+      IF ( INFO .EQ. 0 ) THEN
+*        .. compute the minimal and the optimal workspace lengths
+*        [[The expressions for computing the minimal and the optimal
+*        values of LCWORK are written with a lot of redundancy and
+*        can be simplified. However, this detailed form is easier for
+*        maintenance and modifications of the code.]]
+*
+*        .. minimal workspace length for ZGEQP3 of an M x N matrix
+         LWQP3 = N+1
+*        .. minimal workspace length for ZUNMQR to build left singular vectors
+         IF ( WNTUS .OR. WNTUR ) THEN
+             LWUNQ  = MAX( N  , 1 )
+         ELSE IF ( WNTUA ) THEN
+             LWUNQ = MAX( M , 1 )
+         END IF
+*        .. minimal workspace length for ZPOCON of an N x N matrix
+         LWCON = 2 * N
+*        .. ZGESVD of an N x N matrix
+         LWSVD = MAX( 3 * N, 1 )
+         IF ( LQUERY ) THEN
+             CALL ZGEQP3( M, N, A, LDA, IWORK, CDUMMY, CDUMMY, -1,
+     $            RDUMMY, IERR )
+             LWRK_ZGEQP3 = INT( CDUMMY(1) )
+             IF ( WNTUS .OR. WNTUR ) THEN
+                 CALL ZUNMQR( 'L', 'N', M, N, N, A, LDA, CDUMMY, U,
+     $                LDU, CDUMMY, -1, IERR )
+                 LWRK_ZUNMQR = INT( CDUMMY(1) )
+             ELSE IF ( WNTUA ) THEN
+                 CALL ZUNMQR( 'L', 'N', M, M, N, A, LDA, CDUMMY, U,
+     $                LDU, CDUMMY, -1, IERR )
+                 LWRK_ZUNMQR = INT( CDUMMY(1) )
+             ELSE
+                 LWRK_ZUNMQR = 0
+             END IF
+         END IF
+         MINWRK = 2
+         OPTWRK = 2
+         IF ( .NOT. (LSVEC .OR. RSVEC ) ) THEN
+*            .. minimal and optimal sizes of the complex workspace if
+*            only the singular values are requested
+             IF ( CONDA ) THEN
+                MINWRK = MAX( N+LWQP3, LWCON, LWSVD )
+             ELSE
+                MINWRK = MAX( N+LWQP3, LWSVD )
+             END IF
+             IF ( LQUERY ) THEN
+                 CALL ZGESVD( 'N', 'N', N, N, A, LDA, S, U, LDU,
+     $                V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                 LWRK_ZGESVD = INT( CDUMMY(1) )
+                 IF ( CONDA ) THEN
+                    OPTWRK = MAX( N+LWRK_ZGEQP3, N+LWCON, LWRK_ZGESVD )
+                 ELSE
+                    OPTWRK = MAX( N+LWRK_ZGEQP3, LWRK_ZGESVD )
+                 END IF
+             END IF
+         ELSE IF ( LSVEC .AND. (.NOT.RSVEC) ) THEN
+*            .. minimal and optimal sizes of the complex workspace if the
+*            singular values and the left singular vectors are requested
+             IF ( CONDA ) THEN
+                 MINWRK = N + MAX( LWQP3, LWCON, LWSVD, LWUNQ )
+             ELSE
+                 MINWRK = N + MAX( LWQP3, LWSVD, LWUNQ )
+             END IF
+             IF ( LQUERY ) THEN
+                IF ( RTRANS ) THEN
+                   CALL ZGESVD( 'N', 'O', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                ELSE
+                   CALL ZGESVD( 'O', 'N', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                END IF
+                LWRK_ZGESVD = INT( CDUMMY(1) )
+                IF ( CONDA ) THEN
+                    OPTWRK = N + MAX( LWRK_ZGEQP3, LWCON, LWRK_ZGESVD,
+     $                               LWRK_ZUNMQR )
+                ELSE
+                    OPTWRK = N + MAX( LWRK_ZGEQP3, LWRK_ZGESVD,
+     $                               LWRK_ZUNMQR )
+                END IF
+             END IF
+         ELSE IF ( RSVEC .AND. (.NOT.LSVEC) ) THEN
+*            .. minimal and optimal sizes of the complex workspace if the
+*            singular values and the right singular vectors are requested
+             IF ( CONDA ) THEN
+                 MINWRK = N + MAX( LWQP3, LWCON, LWSVD )
+             ELSE
+                 MINWRK = N + MAX( LWQP3, LWSVD )
+             END IF
+             IF ( LQUERY ) THEN
+                 IF ( RTRANS ) THEN
+                     CALL ZGESVD( 'O', 'N', N, N, A, LDA, S, U, LDU,
+     $                    V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                 ELSE
+                     CALL ZGESVD( 'N', 'O', N, N, A, LDA, S, U, LDU,
+     $                    V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                 END IF
+                 LWRK_ZGESVD = INT( CDUMMY(1) )
+                 IF ( CONDA ) THEN
+                     OPTWRK = N + MAX( LWRK_ZGEQP3, LWCON, LWRK_ZGESVD )
+                 ELSE
+                     OPTWRK = N + MAX( LWRK_ZGEQP3, LWRK_ZGESVD )
+                 END IF
+             END IF
+         ELSE
+*            .. minimal and optimal sizes of the complex workspace if the
+*            full SVD is requested
+             IF ( RTRANS ) THEN
+                 MINWRK = MAX( LWQP3, LWSVD, LWUNQ )
+                 IF ( CONDA ) MINWRK = MAX( MINWRK, LWCON )
+                 MINWRK = MINWRK + N
+                 IF ( WNTVA ) THEN
+*                   .. minimal workspace length for N x N/2 ZGEQRF
+                    LWQRF  = MAX( N/2, 1 )
+*                   .. minimal workspace lengt for N/2 x N/2 ZGESVD
+                    LWSVD2 = MAX( 3 * (N/2), 1 )
+                    LWUNQ2 = MAX( N, 1 )
+                    MINWRK2 = MAX( LWQP3, N/2+LWQRF, N/2+LWSVD2,
+     $                        N/2+LWUNQ2, LWUNQ )
+                    IF ( CONDA ) MINWRK2 = MAX( MINWRK2, LWCON )
+                    MINWRK2 = N + MINWRK2
+                    MINWRK = MAX( MINWRK, MINWRK2 )
+                 END IF
+             ELSE
+                 MINWRK = MAX( LWQP3, LWSVD, LWUNQ )
+                 IF ( CONDA ) MINWRK = MAX( MINWRK, LWCON )
+                 MINWRK = MINWRK + N
+                 IF ( WNTVA ) THEN
+*                   .. minimal workspace length for N/2 x N ZGELQF
+                    LWLQF  = MAX( N/2, 1 )
+                    LWSVD2 = MAX( 3 * (N/2), 1 )
+                    LWUNLQ = MAX( N , 1 )
+                    MINWRK2 = MAX( LWQP3, N/2+LWLQF, N/2+LWSVD2,
+     $                        N/2+LWUNLQ, LWUNQ )
+                    IF ( CONDA ) MINWRK2 = MAX( MINWRK2, LWCON )
+                    MINWRK2 = N + MINWRK2
+                    MINWRK = MAX( MINWRK, MINWRK2 )
+                 END IF
+             END IF
+             IF ( LQUERY ) THEN
+                IF ( RTRANS ) THEN
+                   CALL ZGESVD( 'O', 'A', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                   LWRK_ZGESVD = INT( CDUMMY(1) )
+                   OPTWRK = MAX(LWRK_ZGEQP3,LWRK_ZGESVD,LWRK_ZUNMQR)
+                   IF ( CONDA ) OPTWRK = MAX( OPTWRK, LWCON )
+                   OPTWRK = N + OPTWRK
+                   IF ( WNTVA ) THEN
+                       CALL ZGEQRF(N,N/2,U,LDU,CDUMMY,CDUMMY,-1,IERR)
+                       LWRK_ZGEQRF = INT( CDUMMY(1) )
+                       CALL ZGESVD( 'S', 'O', N/2,N/2, V,LDV, S, U,LDU,
+     $                      V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                       LWRK_ZGESVD2 = INT( CDUMMY(1) )
+                       CALL ZUNMQR( 'R', 'C', N, N, N/2, U, LDU, CDUMMY,
+     $                      V, LDV, CDUMMY, -1, IERR )
+                       LWRK_ZUNMQR2 = INT( CDUMMY(1) )
+                       OPTWRK2 = MAX( LWRK_ZGEQP3, N/2+LWRK_ZGEQRF,
+     $                           N/2+LWRK_ZGESVD2, N/2+LWRK_ZUNMQR2 )
+                       IF ( CONDA ) OPTWRK2 = MAX( OPTWRK2, LWCON )
+                       OPTWRK2 = N + OPTWRK2
+                       OPTWRK = MAX( OPTWRK, OPTWRK2 )
+                   END IF
+                ELSE
+                   CALL ZGESVD( 'S', 'O', N, N, A, LDA, S, U, LDU,
+     $                  V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                   LWRK_ZGESVD = INT( CDUMMY(1) )
+                   OPTWRK = MAX(LWRK_ZGEQP3,LWRK_ZGESVD,LWRK_ZUNMQR)
+                   IF ( CONDA ) OPTWRK = MAX( OPTWRK, LWCON )
+                   OPTWRK = N + OPTWRK
+                   IF ( WNTVA ) THEN
+                      CALL ZGELQF(N/2,N,U,LDU,CDUMMY,CDUMMY,-1,IERR)
+                      LWRK_ZGELQF = INT( CDUMMY(1) )
+                      CALL ZGESVD( 'S','O', N/2,N/2, V, LDV, S, U, LDU,
+     $                     V, LDV, CDUMMY, -1, RDUMMY, IERR )
+                      LWRK_ZGESVD2 = INT( CDUMMY(1) )
+                      CALL ZUNMLQ( 'R', 'N', N, N, N/2, U, LDU, CDUMMY,
+     $                     V, LDV, CDUMMY,-1,IERR )
+                      LWRK_ZUNMLQ = INT( CDUMMY(1) )
+                      OPTWRK2 = MAX( LWRK_ZGEQP3, N/2+LWRK_ZGELQF,
+     $                           N/2+LWRK_ZGESVD2, N/2+LWRK_ZUNMLQ )
+                       IF ( CONDA ) OPTWRK2 = MAX( OPTWRK2, LWCON )
+                       OPTWRK2 = N + OPTWRK2
+                       OPTWRK = MAX( OPTWRK, OPTWRK2 )
+                   END IF
+                END IF
+             END IF
+         END IF
+*
+         MINWRK = MAX( 2, MINWRK )
+         OPTWRK = MAX( 2, OPTWRK )
+         IF ( LCWORK .LT. MINWRK .AND. (.NOT.LQUERY) ) INFO = -19
+*
+      END IF
+*
+      IF (INFO .EQ. 0 .AND. LRWORK .LT. RMINWRK .AND. .NOT. LQUERY) THEN
+         INFO = -21
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZGESVDQ', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+*
+*     Return optimal workspace
+*
+          IWORK(1) = IMINWRK
+          CWORK(1) = OPTWRK
+          CWORK(2) = MINWRK
+          RWORK(1) = RMINWRK
+          RETURN
+      END IF
+*
+*     Quick return if the matrix is void.
+*
+      IF( ( M.EQ.0 ) .OR. ( N.EQ.0 ) ) THEN
+*     .. all output is void.
+         RETURN
+      END IF
+*
+      BIG = DLAMCH('O')
+      ASCALED = .FALSE.
+      IF ( ROWPRM ) THEN
+*           .. reordering the rows in decreasing sequence in the
+*           ell-infinity norm - this enhances numerical robustness in
+*           the case of differently scaled rows.
+            DO 1904 p = 1, M
+*               RWORK(p) = ABS( A(p,IZAMAX(N,A(p,1),LDA)) )
+*               [[ZLANGE will return NaN if an entry of the p-th row is Nan]]
+                RWORK(p) = ZLANGE( 'M', 1, N, A(p,1), LDA, RDUMMY )
+*               .. check for NaN's and Inf's
+                IF ( ( RWORK(p) .NE. RWORK(p) ) .OR.
+     $               ( (RWORK(p)*ZERO) .NE. ZERO ) ) THEN
+                    INFO = -8
+                    CALL XERBLA( 'ZGESVDQ', -INFO )
+                    RETURN
+                END IF
+ 1904       CONTINUE
+            DO 1952 p = 1, M - 1
+            q = IDAMAX( M-p+1, RWORK(p), 1 ) + p - 1
+            IWORK(N+p) = q
+            IF ( p .NE. q ) THEN
+               RTMP     = RWORK(p)
+               RWORK(p) = RWORK(q)
+               RWORK(q) = RTMP
+            END IF
+ 1952       CONTINUE
+*
+            IF ( RWORK(1) .EQ. ZERO ) THEN
+*              Quick return: A is the M x N zero matrix.
+               NUMRANK = 0
+               CALL DLASET( 'G', N, 1, ZERO, ZERO, S, N )
+               IF ( WNTUS ) CALL ZLASET('G', M, N, CZERO, CONE, U, LDU)
+               IF ( WNTUA ) CALL ZLASET('G', M, M, CZERO, CONE, U, LDU)
+               IF ( WNTVA ) CALL ZLASET('G', N, N, CZERO, CONE, V, LDV)
+               IF ( WNTUF ) THEN
+                   CALL ZLASET( 'G', N, 1, CZERO, CZERO, CWORK, N )
+                   CALL ZLASET( 'G', M, N, CZERO, CONE, U, LDU )
+               END IF
+               DO 5001 p = 1, N
+                   IWORK(p) = p
+ 5001          CONTINUE
+               IF ( ROWPRM ) THEN
+                   DO 5002 p = N + 1, N + M - 1
+                       IWORK(p) = p - N
+ 5002              CONTINUE
+               END IF
+               IF ( CONDA ) RWORK(1) = -1
+               RWORK(2) = -1
+               RETURN
+            END IF
+*
+            IF ( RWORK(1) .GT. BIG / SQRT(DBLE(M)) ) THEN
+*               .. to prevent overflow in the QR factorization, scale the
+*               matrix by 1/sqrt(M) if too large entry detected
+                CALL ZLASCL('G',0,0,SQRT(DBLE(M)),ONE, M,N, A,LDA, IERR)
+                ASCALED = .TRUE.
+            END IF
+            CALL ZLASWP( N, A, LDA, 1, M-1, IWORK(N+1), 1 )
+      END IF
+*
+*    .. At this stage, preemptive scaling is done only to avoid column
+*    norms overflows during the QR factorization. The SVD procedure should
+*    have its own scaling to save the singular values from overflows and
+*    underflows. That depends on the SVD procedure.
+*
+      IF ( .NOT.ROWPRM ) THEN
+          RTMP = ZLANGE( 'M', M, N, A, LDA, RWORK )
+          IF ( ( RTMP .NE. RTMP ) .OR.
+     $         ( (RTMP*ZERO) .NE. ZERO ) ) THEN
+               INFO = -8
+               CALL XERBLA( 'ZGESVDQ', -INFO )
+               RETURN
+          END IF
+          IF ( RTMP .GT. BIG / SQRT(DBLE(M)) ) THEN
+*             .. to prevent overflow in the QR factorization, scale the
+*             matrix by 1/sqrt(M) if too large entry detected
+              CALL ZLASCL('G',0,0, SQRT(DBLE(M)),ONE, M,N, A,LDA, IERR)
+              ASCALED = .TRUE.
+          END IF
+      END IF
+*
+*     .. QR factorization with column pivoting
+*
+*     A * P = Q * [ R ]
+*                 [ 0 ]
+*
+      DO 1963 p = 1, N
+*        .. all columns are free columns
+         IWORK(p) = 0
+ 1963 CONTINUE
+      CALL ZGEQP3( M, N, A, LDA, IWORK, CWORK, CWORK(N+1), LCWORK-N,
+     $     RWORK, IERR )
+*
+*    If the user requested accuracy level allows truncation in the
+*    computed upper triangular factor, the matrix R is examined and,
+*    if possible, replaced with its leading upper trapezoidal part.
+*
+      EPSLN = DLAMCH('E')
+      SFMIN = DLAMCH('S')
+*     SMALL = SFMIN / EPSLN
+      NR = N
+*
+      IF ( ACCLA ) THEN
+*
+*        Standard absolute error bound suffices. All sigma_i with
+*        sigma_i < N*EPS*||A||_F are flushed to zero. This is an
+*        aggressive enforcement of lower numerical rank by introducing a
+*        backward error of the order of N*EPS*||A||_F.
+         NR = 1
+         RTMP = SQRT(DBLE(N))*EPSLN
+         DO 3001 p = 2, N
+            IF ( ABS(A(p,p)) .LT. (RTMP*ABS(A(1,1))) ) GO TO 3002
+               NR = NR + 1
+ 3001    CONTINUE
+ 3002    CONTINUE
+*
+      ELSEIF ( ACCLM ) THEN
+*        .. similarly as above, only slightly more gentle (less aggressive).
+*        Sudden drop on the diagonal of R is used as the criterion for being
+*        close-to-rank-deficient. The threshold is set to EPSLN=DLAMCH('E').
+*        [[This can be made more flexible by replacing this hard-coded value
+*        with a user specified threshold.]] Also, the values that underflow
+*        will be truncated.
+         NR = 1
+         DO 3401 p = 2, N
+            IF ( ( ABS(A(p,p)) .LT. (EPSLN*ABS(A(p-1,p-1))) ) .OR.
+     $           ( ABS(A(p,p)) .LT. SFMIN ) ) GO TO 3402
+            NR = NR + 1
+ 3401    CONTINUE
+ 3402    CONTINUE
+*
+      ELSE
+*        .. RRQR not authorized to determine numerical rank except in the
+*        obvious case of zero pivots.
+*        .. inspect R for exact zeros on the diagonal;
+*        R(i,i)=0 => R(i:N,i:N)=0.
+         NR = 1
+         DO 3501 p = 2, N
+            IF ( ABS(A(p,p)) .EQ. ZERO ) GO TO 3502
+            NR = NR + 1
+ 3501    CONTINUE
+ 3502    CONTINUE
+*
+         IF ( CONDA ) THEN
+*           Estimate the scaled condition number of A. Use the fact that it is
+*           the same as the scaled condition number of R.
+*              .. V is used as workspace
+               CALL ZLACPY( 'U', N, N, A, LDA, V, LDV )
+*              Only the leading NR x NR submatrix of the triangular factor
+*              is considered. Only if NR=N will this give a reliable error
+*              bound. However, even for NR < N, this can be used on an
+*              expert level and obtain useful information in the sense of
+*              perturbation theory.
+               DO 3053 p = 1, NR
+                  RTMP = DZNRM2( p, V(1,p), 1 )
+                  CALL ZDSCAL( p, ONE/RTMP, V(1,p), 1 )
+ 3053          CONTINUE
+               IF ( .NOT. ( LSVEC .OR. RSVEC ) ) THEN
+                   CALL ZPOCON( 'U', NR, V, LDV, ONE, RTMP,
+     $                  CWORK, RWORK, IERR )
+               ELSE
+                   CALL ZPOCON( 'U', NR, V, LDV, ONE, RTMP,
+     $                  CWORK(N+1), RWORK, IERR )
+               END IF
+               SCONDA = ONE / SQRT(RTMP)
+*           For NR=N, SCONDA is an estimate of SQRT(||(R^* * R)^(-1)||_1),
+*           N^(-1/4) * SCONDA <= ||R^(-1)||_2 <= N^(1/4) * SCONDA
+*           See the reference [1] for more details.
+         END IF
+*
+      ENDIF
+*
+      IF ( WNTUR ) THEN
+          N1 = NR
+      ELSE IF ( WNTUS .OR. WNTUF) THEN
+          N1 = N
+      ELSE IF ( WNTUA ) THEN
+          N1 = M
+      END IF
+*
+      IF ( .NOT. ( RSVEC .OR. LSVEC ) ) THEN
+*.......................................................................
+*        .. only the singular values are requested
+*.......................................................................
+         IF ( RTRANS ) THEN
+*
+*         .. compute the singular values of R**H = [A](1:NR,1:N)**H
+*           .. set the lower triangle of [A] to [A](1:NR,1:N)**H and
+*           the upper triangle of [A] to zero.
+            DO 1146 p = 1, MIN( N, NR )
+               A(p,p) = CONJG(A(p,p))
+               DO 1147 q = p + 1, N
+                  A(q,p) = CONJG(A(p,q))
+                  IF ( q .LE. NR ) A(p,q) = CZERO
+ 1147          CONTINUE
+ 1146       CONTINUE
+*
+            CALL ZGESVD( 'N', 'N', N, NR, A, LDA, S, U, LDU,
+     $           V, LDV, CWORK, LCWORK, RWORK, INFO )
+*
+         ELSE
+*
+*           .. compute the singular values of R = [A](1:NR,1:N)
+*
+            IF ( NR .GT. 1 )
+     $          CALL ZLASET( 'L', NR-1,NR-1, CZERO,CZERO, A(2,1), LDA )
+            CALL ZGESVD( 'N', 'N', NR, N, A, LDA, S, U, LDU,
+     $           V, LDV, CWORK, LCWORK, RWORK, INFO )
+*
+         END IF
+*
+      ELSE IF ( LSVEC .AND. ( .NOT. RSVEC) ) THEN
+*.......................................................................
+*       .. the singular values and the left singular vectors requested
+*.......................................................................""""""""
+         IF ( RTRANS ) THEN
+*            .. apply ZGESVD to R**H
+*            .. copy R**H into [U] and overwrite [U] with the right singular
+*            vectors of R
+            DO 1192 p = 1, NR
+               DO 1193 q = p, N
+                  U(q,p) = CONJG(A(p,q))
+ 1193          CONTINUE
+ 1192       CONTINUE
+            IF ( NR .GT. 1 )
+     $          CALL ZLASET( 'U', NR-1,NR-1, CZERO,CZERO, U(1,2), LDU )
+*           .. the left singular vectors not computed, the NR right singular
+*           vectors overwrite [U](1:NR,1:NR) as conjugate transposed. These
+*           will be pre-multiplied by Q to build the left singular vectors of A.
+               CALL ZGESVD( 'N', 'O', N, NR, U, LDU, S, U, LDU,
+     $              U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO )
+*
+               DO 1119 p = 1, NR
+                   U(p,p) = CONJG(U(p,p))
+                   DO 1120 q = p + 1, NR
+                      CTMP   = CONJG(U(q,p))
+                      U(q,p) = CONJG(U(p,q))
+                      U(p,q) = CTMP
+ 1120              CONTINUE
+ 1119          CONTINUE
+*
+         ELSE
+*            .. apply ZGESVD to R
+*            .. copy R into [U] and overwrite [U] with the left singular vectors
+             CALL ZLACPY( 'U', NR, N, A, LDA, U, LDU )
+             IF ( NR .GT. 1 )
+     $         CALL ZLASET( 'L', NR-1, NR-1, CZERO, CZERO, U(2,1), LDU )
+*            .. the right singular vectors not computed, the NR left singular
+*            vectors overwrite [U](1:NR,1:NR)
+                CALL ZGESVD( 'O', 'N', NR, N, U, LDU, S, U, LDU,
+     $               V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO )
+*               .. now [U](1:NR,1:NR) contains the NR left singular vectors of
+*               R. These will be pre-multiplied by Q to build the left singular
+*               vectors of A.
+         END IF
+*
+*           .. assemble the left singular vector matrix U of dimensions
+*              (M x NR) or (M x N) or (M x M).
+         IF ( ( NR .LT. M ) .AND. ( .NOT.WNTUF ) ) THEN
+             CALL ZLASET('A', M-NR, NR, CZERO, CZERO, U(NR+1,1), LDU)
+             IF ( NR .LT. N1 ) THEN
+                CALL ZLASET( 'A',NR,N1-NR,CZERO,CZERO,U(1,NR+1), LDU )
+                CALL ZLASET( 'A',M-NR,N1-NR,CZERO,CONE,
+     $               U(NR+1,NR+1), LDU )
+             END IF
+         END IF
+*
+*           The Q matrix from the first QRF is built into the left singular
+*           vectors matrix U.
+*
+         IF ( .NOT.WNTUF )
+     $       CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $            LDU, CWORK(N+1), LCWORK-N, IERR )
+         IF ( ROWPRM .AND. .NOT.WNTUF )
+     $          CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(N+1), -1 )
+*
+      ELSE IF ( RSVEC .AND. ( .NOT. LSVEC ) ) THEN
+*.......................................................................
+*       .. the singular values and the right singular vectors requested
+*.......................................................................
+          IF ( RTRANS ) THEN
+*            .. apply ZGESVD to R**H
+*            .. copy R**H into V and overwrite V with the left singular vectors
+            DO 1165 p = 1, NR
+               DO 1166 q = p, N
+                  V(q,p) = CONJG(A(p,q))
+ 1166          CONTINUE
+ 1165       CONTINUE
+            IF ( NR .GT. 1 )
+     $          CALL ZLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV )
+*           .. the left singular vectors of R**H overwrite V, the right singular
+*           vectors not computed
+            IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+               CALL ZGESVD( 'O', 'N', N, NR, V, LDV, S, U, LDU,
+     $              U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO )
+*
+               DO 1121 p = 1, NR
+                   V(p,p) = CONJG(V(p,p))
+                   DO 1122 q = p + 1, NR
+                      CTMP   = CONJG(V(q,p))
+                      V(q,p) = CONJG(V(p,q))
+                      V(p,q) = CTMP
+ 1122              CONTINUE
+ 1121          CONTINUE
+*
+               IF ( NR .LT. N ) THEN
+                   DO 1103 p = 1, NR
+                      DO 1104 q = NR + 1, N
+                          V(p,q) = CONJG(V(q,p))
+ 1104                 CONTINUE
+ 1103              CONTINUE
+               END IF
+               CALL ZLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+            ELSE
+*               .. need all N right singular vectors and NR < N
+*               [!] This is simple implementation that augments [V](1:N,1:NR)
+*               by padding a zero block. In the case NR << N, a more efficient
+*               way is to first use the QR factorization. For more details
+*               how to implement this, see the " FULL SVD " branch.
+                CALL ZLASET('G', N, N-NR, CZERO, CZERO, V(1,NR+1), LDV)
+                CALL ZGESVD( 'O', 'N', N, N, V, LDV, S, U, LDU,
+     $               U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO )
+*
+                DO 1123 p = 1, N
+                   V(p,p) = CONJG(V(p,p))
+                   DO 1124 q = p + 1, N
+                      CTMP   = CONJG(V(q,p))
+                      V(q,p) = CONJG(V(p,q))
+                      V(p,q) = CTMP
+ 1124              CONTINUE
+ 1123           CONTINUE
+                CALL ZLAPMT( .FALSE., N, N, V, LDV, IWORK )
+            END IF
+*
+          ELSE
+*            .. aply ZGESVD to R
+*            .. copy R into V and overwrite V with the right singular vectors
+             CALL ZLACPY( 'U', NR, N, A, LDA, V, LDV )
+             IF ( NR .GT. 1 )
+     $         CALL ZLASET( 'L', NR-1, NR-1, CZERO, CZERO, V(2,1), LDV )
+*            .. the right singular vectors overwrite V, the NR left singular
+*            vectors stored in U(1:NR,1:NR)
+             IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+                CALL ZGESVD( 'N', 'O', NR, N, V, LDV, S, U, LDU,
+     $               V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO )
+                CALL ZLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+*               .. now [V](1:NR,1:N) contains V(1:N,1:NR)**H
+             ELSE
+*               .. need all N right singular vectors and NR < N
+*               [!] This is simple implementation that augments [V](1:NR,1:N)
+*               by padding a zero block. In the case NR << N, a more efficient
+*               way is to first use the LQ factorization. For more details
+*               how to implement this, see the " FULL SVD " branch.
+                 CALL ZLASET('G', N-NR, N, CZERO,CZERO, V(NR+1,1), LDV)
+                 CALL ZGESVD( 'N', 'O', N, N, V, LDV, S, U, LDU,
+     $                V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO )
+                 CALL ZLAPMT( .FALSE., N, N, V, LDV, IWORK )
+             END IF
+*            .. now [V] contains the adjoint of the matrix of the right singular
+*            vectors of A.
+          END IF
+*
+      ELSE
+*.......................................................................
+*       .. FULL SVD requested
+*.......................................................................
+         IF ( RTRANS ) THEN
+*
+*            .. apply ZGESVD to R**H [[this option is left for R&D&T]]
+*
+            IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+*            .. copy R**H into [V] and overwrite [V] with the left singular
+*            vectors of R**H
+            DO 1168 p = 1, NR
+               DO 1169 q = p, N
+                  V(q,p) = CONJG(A(p,q))
+ 1169          CONTINUE
+ 1168       CONTINUE
+            IF ( NR .GT. 1 )
+     $          CALL ZLASET( 'U', NR-1,NR-1, CZERO,CZERO, V(1,2), LDV )
+*
+*           .. the left singular vectors of R**H overwrite [V], the NR right
+*           singular vectors of R**H stored in [U](1:NR,1:NR) as conjugate
+*           transposed
+               CALL ZGESVD( 'O', 'A', N, NR, V, LDV, S, V, LDV,
+     $              U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO )
+*              .. assemble V
+               DO 1115 p = 1, NR
+                  V(p,p) = CONJG(V(p,p))
+                  DO 1116 q = p + 1, NR
+                     CTMP   = CONJG(V(q,p))
+                     V(q,p) = CONJG(V(p,q))
+                     V(p,q) = CTMP
+ 1116             CONTINUE
+ 1115          CONTINUE
+               IF ( NR .LT. N ) THEN
+                   DO 1101 p = 1, NR
+                      DO 1102 q = NR+1, N
+                         V(p,q) = CONJG(V(q,p))
+ 1102                 CONTINUE
+ 1101              CONTINUE
+               END IF
+               CALL ZLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+*
+                DO 1117 p = 1, NR
+                   U(p,p) = CONJG(U(p,p))
+                   DO 1118 q = p + 1, NR
+                      CTMP   = CONJG(U(q,p))
+                      U(q,p) = CONJG(U(p,q))
+                      U(p,q) = CTMP
+ 1118              CONTINUE
+ 1117           CONTINUE
+*
+                IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                  CALL ZLASET('A', M-NR,NR, CZERO,CZERO, U(NR+1,1), LDU)
+                  IF ( NR .LT. N1 ) THEN
+                     CALL ZLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU)
+                     CALL ZLASET( 'A',M-NR,N1-NR,CZERO,CONE,
+     $                    U(NR+1,NR+1), LDU )
+                  END IF
+               END IF
+*
+            ELSE
+*               .. need all N right singular vectors and NR < N
+*            .. copy R**H into [V] and overwrite [V] with the left singular
+*            vectors of R**H
+*               [[The optimal ratio N/NR for using QRF instead of padding
+*                 with zeros. Here hard coded to 2; it must be at least
+*                 two due to work space constraints.]]
+*               OPTRATIO = ILAENV(6, 'ZGESVD', 'S' // 'O', NR,N,0,0)
+*               OPTRATIO = MAX( OPTRATIO, 2 )
+                OPTRATIO = 2
+                IF ( OPTRATIO*NR .GT. N ) THEN
+                   DO 1198 p = 1, NR
+                      DO 1199 q = p, N
+                         V(q,p) = CONJG(A(p,q))
+ 1199                 CONTINUE
+ 1198              CONTINUE
+                   IF ( NR .GT. 1 )
+     $             CALL ZLASET('U',NR-1,NR-1, CZERO,CZERO, V(1,2),LDV)
+*
+                   CALL ZLASET('A',N,N-NR,CZERO,CZERO,V(1,NR+1),LDV)
+                   CALL ZGESVD( 'O', 'A', N, N, V, LDV, S, V, LDV,
+     $                  U, LDU, CWORK(N+1), LCWORK-N, RWORK, INFO )
+*
+                   DO 1113 p = 1, N
+                      V(p,p) = CONJG(V(p,p))
+                      DO 1114 q = p + 1, N
+                         CTMP   = CONJG(V(q,p))
+                         V(q,p) = CONJG(V(p,q))
+                         V(p,q) = CTMP
+ 1114                 CONTINUE
+ 1113              CONTINUE
+                   CALL ZLAPMT( .FALSE., N, N, V, LDV, IWORK )
+*              .. assemble the left singular vector matrix U of dimensions
+*              (M x N1), i.e. (M x N) or (M x M).
+*
+                   DO 1111 p = 1, N
+                      U(p,p) = CONJG(U(p,p))
+                      DO 1112 q = p + 1, N
+                         CTMP   = CONJG(U(q,p))
+                         U(q,p) = CONJG(U(p,q))
+                         U(p,q) = CTMP
+ 1112                 CONTINUE
+ 1111              CONTINUE
+*
+                   IF ( ( N .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                      CALL ZLASET('A',M-N,N,CZERO,CZERO,U(N+1,1),LDU)
+                      IF ( N .LT. N1 ) THEN
+                        CALL ZLASET('A',N,N1-N,CZERO,CZERO,U(1,N+1),LDU)
+                        CALL ZLASET('A',M-N,N1-N,CZERO,CONE,
+     $                       U(N+1,N+1), LDU )
+                      END IF
+                   END IF
+                ELSE
+*                  .. copy R**H into [U] and overwrite [U] with the right
+*                  singular vectors of R
+                   DO 1196 p = 1, NR
+                      DO 1197 q = p, N
+                         U(q,NR+p) = CONJG(A(p,q))
+ 1197                 CONTINUE
+ 1196              CONTINUE
+                   IF ( NR .GT. 1 )
+     $             CALL ZLASET('U',NR-1,NR-1,CZERO,CZERO,U(1,NR+2),LDU)
+                   CALL ZGEQRF( N, NR, U(1,NR+1), LDU, CWORK(N+1),
+     $                  CWORK(N+NR+1), LCWORK-N-NR, IERR )
+                   DO 1143 p = 1, NR
+                       DO 1144 q = 1, N
+                           V(q,p) = CONJG(U(p,NR+q))
+ 1144                  CONTINUE
+ 1143              CONTINUE
+                  CALL ZLASET('U',NR-1,NR-1,CZERO,CZERO,V(1,2),LDV)
+                  CALL ZGESVD( 'S', 'O', NR, NR, V, LDV, S, U, LDU,
+     $                 V,LDV, CWORK(N+NR+1),LCWORK-N-NR,RWORK, INFO )
+                  CALL ZLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV)
+                  CALL ZLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV)
+                  CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+                  CALL ZUNMQR('R','C', N, N, NR, U(1,NR+1), LDU,
+     $                 CWORK(N+1),V,LDV,CWORK(N+NR+1),LCWORK-N-NR,IERR)
+                  CALL ZLAPMT( .FALSE., N, N, V, LDV, IWORK )
+*                 .. assemble the left singular vector matrix U of dimensions
+*                 (M x NR) or (M x N) or (M x M).
+                  IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                     CALL ZLASET('A',M-NR,NR,CZERO,CZERO,U(NR+1,1),LDU)
+                     IF ( NR .LT. N1 ) THEN
+                     CALL ZLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU)
+                     CALL ZLASET( 'A',M-NR,N1-NR,CZERO,CONE,
+     $                    U(NR+1,NR+1),LDU)
+                     END IF
+                  END IF
+                END IF
+            END IF
+*
+         ELSE
+*
+*            .. apply ZGESVD to R [[this is the recommended option]]
+*
+             IF ( WNTVR .OR. ( NR .EQ. N ) ) THEN
+*                .. copy R into [V] and overwrite V with the right singular vectors
+                 CALL ZLACPY( 'U', NR, N, A, LDA, V, LDV )
+                IF ( NR .GT. 1 )
+     $          CALL ZLASET( 'L', NR-1,NR-1, CZERO,CZERO, V(2,1), LDV )
+*               .. the right singular vectors of R overwrite [V], the NR left
+*               singular vectors of R stored in [U](1:NR,1:NR)
+                CALL ZGESVD( 'S', 'O', NR, N, V, LDV, S, U, LDU,
+     $               V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO )
+                CALL ZLAPMT( .FALSE., NR, N, V, LDV, IWORK )
+*               .. now [V](1:NR,1:N) contains V(1:N,1:NR)**H
+*               .. assemble the left singular vector matrix U of dimensions
+*              (M x NR) or (M x N) or (M x M).
+               IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                  CALL ZLASET('A', M-NR,NR, CZERO,CZERO, U(NR+1,1), LDU)
+                  IF ( NR .LT. N1 ) THEN
+                     CALL ZLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU)
+                     CALL ZLASET( 'A',M-NR,N1-NR,CZERO,CONE,
+     $                    U(NR+1,NR+1), LDU )
+                  END IF
+               END IF
+*
+             ELSE
+*              .. need all N right singular vectors and NR < N
+*              .. the requested number of the left singular vectors
+*               is then N1 (N or M)
+*               [[The optimal ratio N/NR for using LQ instead of padding
+*                 with zeros. Here hard coded to 2; it must be at least
+*                 two due to work space constraints.]]
+*               OPTRATIO = ILAENV(6, 'ZGESVD', 'S' // 'O', NR,N,0,0)
+*               OPTRATIO = MAX( OPTRATIO, 2 )
+               OPTRATIO = 2
+               IF ( OPTRATIO * NR .GT. N ) THEN
+                  CALL ZLACPY( 'U', NR, N, A, LDA, V, LDV )
+                  IF ( NR .GT. 1 )
+     $            CALL ZLASET('L', NR-1,NR-1, CZERO,CZERO, V(2,1),LDV)
+*              .. the right singular vectors of R overwrite [V], the NR left
+*                 singular vectors of R stored in [U](1:NR,1:NR)
+                  CALL ZLASET('A', N-NR,N, CZERO,CZERO, V(NR+1,1),LDV)
+                  CALL ZGESVD( 'S', 'O', N, N, V, LDV, S, U, LDU,
+     $                 V, LDV, CWORK(N+1), LCWORK-N, RWORK, INFO )
+                  CALL ZLAPMT( .FALSE., N, N, V, LDV, IWORK )
+*                 .. now [V] contains the adjoint of the matrix of the right
+*                 singular vectors of A. The leading N left singular vectors
+*                 are in [U](1:N,1:N)
+*                 .. assemble the left singular vector matrix U of dimensions
+*                 (M x N1), i.e. (M x N) or (M x M).
+                  IF ( ( N .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                      CALL ZLASET('A',M-N,N,CZERO,CZERO,U(N+1,1),LDU)
+                      IF ( N .LT. N1 ) THEN
+                        CALL ZLASET('A',N,N1-N,CZERO,CZERO,U(1,N+1),LDU)
+                        CALL ZLASET( 'A',M-N,N1-N,CZERO,CONE,
+     $                       U(N+1,N+1), LDU )
+                      END IF
+                  END IF
+               ELSE
+                  CALL ZLACPY( 'U', NR, N, A, LDA, U(NR+1,1), LDU )
+                  IF ( NR .GT. 1 )
+     $            CALL ZLASET('L',NR-1,NR-1,CZERO,CZERO,U(NR+2,1),LDU)
+                  CALL ZGELQF( NR, N, U(NR+1,1), LDU, CWORK(N+1),
+     $                 CWORK(N+NR+1), LCWORK-N-NR, IERR )
+                  CALL ZLACPY('L',NR,NR,U(NR+1,1),LDU,V,LDV)
+                  IF ( NR .GT. 1 )
+     $            CALL ZLASET('U',NR-1,NR-1,CZERO,CZERO,V(1,2),LDV)
+                  CALL ZGESVD( 'S', 'O', NR, NR, V, LDV, S, U, LDU,
+     $                 V, LDV, CWORK(N+NR+1), LCWORK-N-NR, RWORK, INFO )
+                  CALL ZLASET('A',N-NR,NR,CZERO,CZERO,V(NR+1,1),LDV)
+                  CALL ZLASET('A',NR,N-NR,CZERO,CZERO,V(1,NR+1),LDV)
+                  CALL ZLASET('A',N-NR,N-NR,CZERO,CONE,V(NR+1,NR+1),LDV)
+                  CALL ZUNMLQ('R','N',N,N,NR,U(NR+1,1),LDU,CWORK(N+1),
+     $                 V, LDV, CWORK(N+NR+1),LCWORK-N-NR,IERR)
+                  CALL ZLAPMT( .FALSE., N, N, V, LDV, IWORK )
+*               .. assemble the left singular vector matrix U of dimensions
+*              (M x NR) or (M x N) or (M x M).
+                  IF ( ( NR .LT. M ) .AND. .NOT.(WNTUF)) THEN
+                     CALL ZLASET('A',M-NR,NR,CZERO,CZERO,U(NR+1,1),LDU)
+                     IF ( NR .LT. N1 ) THEN
+                     CALL ZLASET('A',NR,N1-NR,CZERO,CZERO,U(1,NR+1),LDU)
+                     CALL ZLASET( 'A',M-NR,N1-NR,CZERO,CONE,
+     $                    U(NR+1,NR+1), LDU )
+                     END IF
+                  END IF
+               END IF
+             END IF
+*        .. end of the "R**H or R" branch
+         END IF
+*
+*           The Q matrix from the first QRF is built into the left singular
+*           vectors matrix U.
+*
+         IF ( .NOT. WNTUF )
+     $       CALL ZUNMQR( 'L', 'N', M, N1, N, A, LDA, CWORK, U,
+     $            LDU, CWORK(N+1), LCWORK-N, IERR )
+         IF ( ROWPRM .AND. .NOT.WNTUF )
+     $          CALL ZLASWP( N1, U, LDU, 1, M-1, IWORK(N+1), -1 )
+*
+*     ... end of the "full SVD" branch
+      END IF
+*
+*     Check whether some singular values are returned as zeros, e.g.
+*     due to underflow, and update the numerical rank.
+      p = NR
+      DO 4001 q = p, 1, -1
+          IF ( S(q) .GT. ZERO ) GO TO 4002
+          NR = NR - 1
+ 4001 CONTINUE
+ 4002 CONTINUE
+*
+*     .. if numerical rank deficiency is detected, the truncated
+*     singular values are set to zero.
+      IF ( NR .LT. N ) CALL DLASET( 'G', N-NR,1, ZERO,ZERO, S(NR+1), N )
+*     .. undo scaling; this may cause overflow in the largest singular
+*     values.
+      IF ( ASCALED )
+     $   CALL DLASCL( 'G',0,0, ONE,SQRT(DBLE(M)), NR,1, S, N, IERR )
+      IF ( CONDA ) RWORK(1) = SCONDA
+      RWORK(2) = p - NR
+*     .. p-NR is the number of singular values that are computed as
+*     exact zeros in ZGESVD() applied to the (possibly truncated)
+*     full row rank triangular (trapezoidal) factor of A.
+      NUMRANK = NR
+*
+      RETURN
+*
+*     End of ZGESVDQ
+*
+      END
diff --git a/lapack-netlib/SRC/zgesvdx.f b/lapack-netlib/SRC/zgesvdx.f
index 56b5cd4f2..12b20c0ba 100644
--- a/lapack-netlib/SRC/zgesvdx.f
+++ b/lapack-netlib/SRC/zgesvdx.f
@@ -18,7 +18,7 @@
 *  Definition:
 *  ===========
 *
-*     SUBROUTINE CGESVDX( JOBU, JOBVT, RANGE, M, N, A, LDA, VL, VU,
+*     SUBROUTINE ZGESVDX( JOBU, JOBVT, RANGE, M, N, A, LDA, VL, VU,
 *    $                    IL, IU, NS, S, U, LDU, VT, LDVT, WORK,
 *    $                    LWORK, RWORK, IWORK, INFO )
 *
diff --git a/lapack-netlib/SRC/zgesvj.f b/lapack-netlib/SRC/zgesvj.f
index fd32f92d8..7c25a3495 100644
--- a/lapack-netlib/SRC/zgesvj.f
+++ b/lapack-netlib/SRC/zgesvj.f
@@ -89,12 +89,12 @@
 *>          Specifies whether to compute the right singular vectors, that
 *>          is, the matrix V:
 *>          = 'V' or 'J': the matrix V is computed and returned in the array V
-*>          = 'A' : the Jacobi rotations are applied to the MV-by-N
+*>          = 'A':  the Jacobi rotations are applied to the MV-by-N
 *>                  array V. In other words, the right singular vector
 *>                  matrix V is not computed explicitly; instead it is
 *>                  applied to an MV-by-N matrix initially stored in the
 *>                  first MV rows of V.
-*>          = 'N' : the matrix V is not computed and the array V is not
+*>          = 'N':  the matrix V is not computed and the array V is not
 *>                  referenced
 *> \endverbatim
 *>
@@ -116,8 +116,8 @@
 *>          A is COMPLEX*16 array, dimension (LDA,N)
 *>          On entry, the M-by-N matrix A.
 *>          On exit,
-*>          If JOBU .EQ. 'U' .OR. JOBU .EQ. 'C':
-*>                 If INFO .EQ. 0 :
+*>          If JOBU = 'U' .OR. JOBU = 'C':
+*>                 If INFO = 0 :
 *>                 RANKA orthonormal columns of U are returned in the
 *>                 leading RANKA columns of the array A. Here RANKA <= N
 *>                 is the number of computed singular values of A that are
@@ -127,9 +127,9 @@
 *>                 in the array RWORK as RANKA=NINT(RWORK(2)). Also see the
 *>                 descriptions of SVA and RWORK. The computed columns of U
 *>                 are mutually numerically orthogonal up to approximately
-*>                 TOL=SQRT(M)*EPS (default); or TOL=CTOL*EPS (JOBU.EQ.'C'),
+*>                 TOL=SQRT(M)*EPS (default); or TOL=CTOL*EPS (JOBU = 'C'),
 *>                 see the description of JOBU.
-*>                 If INFO .GT. 0,
+*>                 If INFO > 0,
 *>                 the procedure ZGESVJ did not converge in the given number
 *>                 of iterations (sweeps). In that case, the computed
 *>                 columns of U may not be orthogonal up to TOL. The output
@@ -137,8 +137,8 @@
 *>                 values in SVA(1:N)) and V is still a decomposition of the
 *>                 input matrix A in the sense that the residual
 *>                 || A - SCALE * U * SIGMA * V^* ||_2 / ||A||_2 is small.
-*>          If JOBU .EQ. 'N':
-*>                 If INFO .EQ. 0 :
+*>          If JOBU = 'N':
+*>                 If INFO = 0 :
 *>                 Note that the left singular vectors are 'for free' in the
 *>                 one-sided Jacobi SVD algorithm. However, if only the
 *>                 singular values are needed, the level of numerical
@@ -147,7 +147,7 @@
 *>                 numerically orthogonal up to approximately M*EPS. Thus,
 *>                 on exit, A contains the columns of U scaled with the
 *>                 corresponding singular values.
-*>                 If INFO .GT. 0 :
+*>                 If INFO > 0:
 *>                 the procedure ZGESVJ did not converge in the given number
 *>                 of iterations (sweeps).
 *> \endverbatim
@@ -162,9 +162,9 @@
 *> \verbatim
 *>          SVA is DOUBLE PRECISION array, dimension (N)
 *>          On exit,
-*>          If INFO .EQ. 0 :
+*>          If INFO = 0 :
 *>          depending on the value SCALE = RWORK(1), we have:
-*>                 If SCALE .EQ. ONE:
+*>                 If SCALE = ONE:
 *>                 SVA(1:N) contains the computed singular values of A.
 *>                 During the computation SVA contains the Euclidean column
 *>                 norms of the iterated matrices in the array A.
@@ -173,7 +173,7 @@
 *>                 factored representation is due to the fact that some of the
 *>                 singular values of A might underflow or overflow.
 *>
-*>          If INFO .GT. 0 :
+*>          If INFO > 0:
 *>          the procedure ZGESVJ did not converge in the given number of
 *>          iterations (sweeps) and SCALE*SVA(1:N) may not be accurate.
 *> \endverbatim
@@ -181,7 +181,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV .EQ. 'A', then the product of Jacobi rotations in ZGESVJ
+*>          If JOBV = 'A', then the product of Jacobi rotations in ZGESVJ
 *>          is applied to the first MV rows of V. See the description of JOBV.
 *> \endverbatim
 *>
@@ -199,16 +199,16 @@
 *> \param[in] LDV
 *> \verbatim
 *>          LDV is INTEGER
-*>          The leading dimension of the array V, LDV .GE. 1.
-*>          If JOBV .EQ. 'V', then LDV .GE. max(1,N).
-*>          If JOBV .EQ. 'A', then LDV .GE. max(1,MV) .
+*>          The leading dimension of the array V, LDV >= 1.
+*>          If JOBV = 'V', then LDV >= max(1,N).
+*>          If JOBV = 'A', then LDV >= max(1,MV) .
 *> \endverbatim
 *>
 *> \param[in,out] CWORK
 *> \verbatim
 *>          CWORK is COMPLEX*16 array, dimension (max(1,LWORK))
 *>          Used as workspace.
-*>          If on entry LWORK .EQ. -1, then a workspace query is assumed and
+*>          If on entry LWORK = -1, then a workspace query is assumed and
 *>          no computation is done; CWORK(1) is set to the minial (and optimal)
 *>          length of CWORK.
 *> \endverbatim
@@ -223,7 +223,7 @@
 *> \verbatim
 *>          RWORK is DOUBLE PRECISION array, dimension (max(6,LRWORK))
 *>          On entry,
-*>          If JOBU .EQ. 'C' :
+*>          If JOBU = 'C' :
 *>          RWORK(1) = CTOL, where CTOL defines the threshold for convergence.
 *>                    The process stops if all columns of A are mutually
 *>                    orthogonal up to CTOL*EPS, EPS=DLAMCH('E').
@@ -243,11 +243,11 @@
 *>          RWORK(5) = max_{i.NE.j} |COS(A(:,i),A(:,j))| in the last sweep.
 *>                    This is useful information in cases when ZGESVJ did
 *>                    not converge, as it can be used to estimate whether
-*>                    the output is stil useful and for post festum analysis.
+*>                    the output is still useful and for post festum analysis.
 *>          RWORK(6) = the largest absolute value over all sines of the
 *>                    Jacobi rotation angles in the last sweep. It can be
 *>                    useful for a post festum analysis.
-*>         If on entry LRWORK .EQ. -1, then a workspace query is assumed and
+*>         If on entry LRWORK = -1, then a workspace query is assumed and
 *>         no computation is done; RWORK(1) is set to the minial (and optimal)
 *>         length of RWORK.
 *> \endverbatim
@@ -261,9 +261,9 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0 : successful exit.
-*>          < 0 : if INFO = -i, then the i-th argument had an illegal value
-*>          > 0 : ZGESVJ did not converge in the maximal allowed number
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, then the i-th argument had an illegal value
+*>          > 0:  ZGESVJ did not converge in the maximal allowed number
 *>                (NSWEEP=30) of sweeps. The output may still be useful.
 *>                See the description of RWORK.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zgesvxx.f b/lapack-netlib/SRC/zgesvxx.f
index c3727b70e..60bb71cd3 100644
--- a/lapack-netlib/SRC/zgesvxx.f
+++ b/lapack-netlib/SRC/zgesvxx.f
@@ -411,7 +411,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -447,14 +447,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -462,9 +462,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the extra-precise refinement algorithm.
+*>            = 1.0:  Use the extra-precise refinement algorithm.
 *>              (other values are reserved for future use)
 *>
 *>       PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual
diff --git a/lapack-netlib/SRC/zgetsls.f b/lapack-netlib/SRC/zgetsls.f
index 5ce11efef..1aab3c662 100644
--- a/lapack-netlib/SRC/zgetsls.f
+++ b/lapack-netlib/SRC/zgetsls.f
@@ -1,3 +1,5 @@
+*> \brief \b ZGETSLS
+*
 *  Definition:
 *  ===========
 *
diff --git a/lapack-netlib/SRC/zggesx.f b/lapack-netlib/SRC/zggesx.f
index 661523465..c546e61f1 100644
--- a/lapack-netlib/SRC/zggesx.f
+++ b/lapack-netlib/SRC/zggesx.f
@@ -120,10 +120,10 @@
 *> \verbatim
 *>          SENSE is CHARACTER*1
 *>          Determines which reciprocal condition numbers are computed.
-*>          = 'N' : None are computed;
-*>          = 'E' : Computed for average of selected eigenvalues only;
-*>          = 'V' : Computed for selected deflating subspaces only;
-*>          = 'B' : Computed for both.
+*>          = 'N': None are computed;
+*>          = 'E': Computed for average of selected eigenvalues only;
+*>          = 'V': Computed for selected deflating subspaces only;
+*>          = 'B': Computed for both.
 *>          If SENSE = 'E', 'V', or 'B', SORT must equal 'S'.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/zgsvj0.f b/lapack-netlib/SRC/zgsvj0.f
index c4a6bd38a..ab7e31725 100644
--- a/lapack-netlib/SRC/zgsvj0.f
+++ b/lapack-netlib/SRC/zgsvj0.f
@@ -117,7 +117,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV .EQ. 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -125,9 +125,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is COMPLEX*16 array, dimension (LDV,N)
-*>          If JOBV .EQ. 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV .EQ. 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
@@ -136,8 +136,8 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of the array V,  LDV >= 1.
-*>          If JOBV = 'V', LDV .GE. N.
-*>          If JOBV = 'A', LDV .GE. MV.
+*>          If JOBV = 'V', LDV >= N.
+*>          If JOBV = 'A', LDV >= MV.
 *> \endverbatim
 *>
 *> \param[in] EPS
@@ -157,7 +157,7 @@
 *>          TOL is DOUBLE PRECISION
 *>          TOL is the threshold for Jacobi rotations. For a pair
 *>          A(:,p), A(:,q) of pivot columns, the Jacobi rotation is
-*>          applied only if ABS(COS(angle(A(:,p),A(:,q)))) .GT. TOL.
+*>          applied only if ABS(COS(angle(A(:,p),A(:,q)))) > TOL.
 *> \endverbatim
 *>
 *> \param[in] NSWEEP
@@ -175,14 +175,14 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          LWORK is the dimension of WORK. LWORK .GE. M.
+*>          LWORK is the dimension of WORK. LWORK >= M.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0 : successful exit.
-*>          < 0 : if INFO = -i, then the i-th argument had an illegal value
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, then the i-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/zgsvj1.f b/lapack-netlib/SRC/zgsvj1.f
index 91e39ca8a..f0a23034b 100644
--- a/lapack-netlib/SRC/zgsvj1.f
+++ b/lapack-netlib/SRC/zgsvj1.f
@@ -61,7 +61,7 @@
 *> In terms of the columns of A, the first N1 columns are rotated 'against'
 *> the remaining N-N1 columns, trying to increase the angle between the
 *> corresponding subspaces. The off-diagonal block is N1-by(N-N1) and it is
-*> tiled using quadratic tiles of side KBL. Here, KBL is a tunning parmeter.
+*> tiled using quadratic tiles of side KBL. Here, KBL is a tunning parameter.
 *> The number of sweeps is given in NSWEEP and the orthogonality threshold
 *> is given in TOL.
 *> \endverbatim
@@ -147,7 +147,7 @@
 *> \param[in] MV
 *> \verbatim
 *>          MV is INTEGER
-*>          If JOBV .EQ. 'A', then MV rows of V are post-multipled by a
+*>          If JOBV = 'A', then MV rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then MV is not referenced.
 *> \endverbatim
@@ -155,9 +155,9 @@
 *> \param[in,out] V
 *> \verbatim
 *>          V is COMPLEX*16 array, dimension (LDV,N)
-*>          If JOBV .EQ. 'V' then N rows of V are post-multipled by a
+*>          If JOBV = 'V' then N rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
-*>          If JOBV .EQ. 'A' then MV rows of V are post-multipled by a
+*>          If JOBV = 'A' then MV rows of V are post-multipled by a
 *>                           sequence of Jacobi rotations.
 *>          If JOBV = 'N',   then V is not referenced.
 *> \endverbatim
@@ -166,8 +166,8 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of the array V,  LDV >= 1.
-*>          If JOBV = 'V', LDV .GE. N.
-*>          If JOBV = 'A', LDV .GE. MV.
+*>          If JOBV = 'V', LDV >= N.
+*>          If JOBV = 'A', LDV >= MV.
 *> \endverbatim
 *>
 *> \param[in] EPS
@@ -187,7 +187,7 @@
 *>          TOL is DOUBLE PRECISION
 *>          TOL is the threshold for Jacobi rotations. For a pair
 *>          A(:,p), A(:,q) of pivot columns, the Jacobi rotation is
-*>          applied only if ABS(COS(angle(A(:,p),A(:,q)))) .GT. TOL.
+*>          applied only if ABS(COS(angle(A(:,p),A(:,q)))) > TOL.
 *> \endverbatim
 *>
 *> \param[in] NSWEEP
@@ -205,14 +205,14 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>          LWORK is the dimension of WORK. LWORK .GE. M.
+*>          LWORK is the dimension of WORK. LWORK >= M.
 *> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>          = 0 : successful exit.
-*>          < 0 : if INFO = -i, then the i-th argument had an illegal value
+*>          = 0:  successful exit.
+*>          < 0:  if INFO = -i, then the i-th argument had an illegal value
 *> \endverbatim
 *
 *  Authors:
diff --git a/lapack-netlib/SRC/zhb2st_kernels.f b/lapack-netlib/SRC/zhb2st_kernels.f
index a440b5c0d..2c0cb6870 100644
--- a/lapack-netlib/SRC/zhb2st_kernels.f
+++ b/lapack-netlib/SRC/zhb2st_kernels.f
@@ -1,26 +1,26 @@
 *> \brief \b ZHB2ST_KERNELS
 *
 *  @precisions fortran z -> s d c
-*      
+*
 *  =========== DOCUMENTATION ===========
 *
-* Online html documentation available at 
-*            http://www.netlib.org/lapack/explore-html/ 
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
 *
 *> \htmlonly
-*> Download ZHB2ST_KERNELS + dependencies 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhb2st_kernels.f"> 
-*> [TGZ]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhb2st_kernels.f"> 
-*> [ZIP]</a> 
-*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhb2st_kernels.f"> 
+*> Download ZHB2ST_KERNELS + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zhb2st_kernels.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zhb2st_kernels.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zhb2st_kernels.f">
 *> [TXT]</a>
-*> \endhtmlonly 
+*> \endhtmlonly
 *
 *  Definition:
 *  ===========
 *
-*       SUBROUTINE  ZHB2ST_KERNELS( UPLO, WANTZ, TTYPE, 
+*       SUBROUTINE  ZHB2ST_KERNELS( UPLO, WANTZ, TTYPE,
 *                                   ST, ED, SWEEP, N, NB, IB,
 *                                   A, LDA, V, TAU, LDVT, WORK)
 *
@@ -32,9 +32,9 @@
 *       INTEGER            TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
 *       ..
 *       .. Array Arguments ..
-*       COMPLEX*16         A( LDA, * ), V( * ), 
+*       COMPLEX*16         A( LDA, * ), V( * ),
 *                          TAU( * ), WORK( * )
-*  
+*
 *> \par Purpose:
 *  =============
 *>
@@ -124,7 +124,7 @@
 *>          LDVT is INTEGER.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX*16 array. Workspace of size nb.
 *> \endverbatim
@@ -147,7 +147,7 @@
 *>  http://doi.acm.org/10.1145/2063384.2063394
 *>
 *>  A. Haidar, J. Kurzak, P. Luszczek, 2013.
-*>  An improved parallel singular value algorithm and its implementation 
+*>  An improved parallel singular value algorithm and its implementation
 *>  for multicore hardware, In Proceedings of 2013 International Conference
 *>  for High Performance Computing, Networking, Storage and Analysis (SC '13).
 *>  Denver, Colorado, USA, 2013.
@@ -155,16 +155,16 @@
 *>  http://doi.acm.org/10.1145/2503210.2503292
 *>
 *>  A. Haidar, R. Solca, S. Tomov, T. Schulthess and J. Dongarra.
-*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure 
+*>  A novel hybrid CPU-GPU generalized eigensolver for electronic structure
 *>  calculations based on fine-grained memory aware tasks.
 *>  International Journal of High Performance Computing Applications.
 *>  Volume 28 Issue 2, Pages 196-209, May 2014.
-*>  http://hpc.sagepub.com/content/28/2/196 
+*>  http://hpc.sagepub.com/content/28/2/196
 *>
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE  ZHB2ST_KERNELS( UPLO, WANTZ, TTYPE, 
+      SUBROUTINE  ZHB2ST_KERNELS( UPLO, WANTZ, TTYPE,
      $                            ST, ED, SWEEP, N, NB, IB,
      $                            A, LDA, V, TAU, LDVT, WORK)
 *
@@ -181,7 +181,7 @@
       INTEGER            TTYPE, ST, ED, SWEEP, N, NB, IB, LDA, LDVT
 *     ..
 *     .. Array Arguments ..
-      COMPLEX*16         A( LDA, * ), V( * ), 
+      COMPLEX*16         A( LDA, * ), V( * ),
      $                   TAU( * ), WORK( * )
 *     ..
 *
@@ -195,8 +195,8 @@
 *     .. Local Scalars ..
       LOGICAL            UPPER
       INTEGER            I, J1, J2, LM, LN, VPOS, TAUPOS,
-     $                   DPOS, OFDPOS, AJETER 
-      COMPLEX*16         CTMP 
+     $                   DPOS, OFDPOS, AJETER
+      COMPLEX*16         CTMP
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ZLARFG, ZLARFX, ZLARFY
@@ -209,7 +209,7 @@
 *     ..
 *     ..
 *     .. Executable Statements ..
-*      
+*
       AJETER = IB + LDVT
       UPPER = LSAME( UPLO, 'U' )
 
@@ -240,10 +240,10 @@
               V( VPOS ) = ONE
               DO 10 I = 1, LM-1
                   V( VPOS+I )         = DCONJG( A( OFDPOS-I, ST+I ) )
-                  A( OFDPOS-I, ST+I ) = ZERO  
+                  A( OFDPOS-I, ST+I ) = ZERO
    10         CONTINUE
               CTMP = DCONJG( A( OFDPOS, ST ) )
-              CALL ZLARFG( LM, CTMP, V( VPOS+1 ), 1, 
+              CALL ZLARFG( LM, CTMP, V( VPOS+1 ), 1,
      $                                       TAU( TAUPOS ) )
               A( OFDPOS, ST ) = CTMP
 *
@@ -281,14 +281,14 @@
 *
                   V( VPOS ) = ONE
                   DO 30 I = 1, LM-1
-                      V( VPOS+I )          = 
+                      V( VPOS+I )          =
      $                                    DCONJG( A( DPOS-NB-I, J1+I ) )
                       A( DPOS-NB-I, J1+I ) = ZERO
    30             CONTINUE
                   CTMP = DCONJG( A( DPOS-NB, J1 ) )
                   CALL ZLARFG( LM, CTMP, V( VPOS+1 ), 1, TAU( TAUPOS ) )
                   A( DPOS-NB, J1 ) = CTMP
-*                 
+*
                   CALL ZLARFX( 'Right', LN-1, LM, V( VPOS ),
      $                         TAU( TAUPOS ),
      $                         A( DPOS-NB+1, J1 ), LDA-1, WORK)
@@ -296,9 +296,9 @@
           ENDIF
 *
 *     Lower case
-*  
+*
       ELSE
-*      
+*
           IF( WANTZ ) THEN
               VPOS   = MOD( SWEEP-1, 2 ) * N + ST
               TAUPOS = MOD( SWEEP-1, 2 ) * N + ST
@@ -313,9 +313,9 @@
               V( VPOS ) = ONE
               DO 20 I = 1, LM-1
                   V( VPOS+I )         = A( OFDPOS+I, ST-1 )
-                  A( OFDPOS+I, ST-1 ) = ZERO  
+                  A( OFDPOS+I, ST-1 ) = ZERO
    20         CONTINUE
-              CALL ZLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1, 
+              CALL ZLARFG( LM, A( OFDPOS, ST-1 ), V( VPOS+1 ), 1,
      $                                       TAU( TAUPOS ) )
 *
               LM = ED - ST + 1
@@ -342,7 +342,7 @@
               LM = J2-J1+1
 *
               IF( LM.GT.0) THEN
-                  CALL ZLARFX( 'Right', LM, LN, V( VPOS ), 
+                  CALL ZLARFX( 'Right', LM, LN, V( VPOS ),
      $                         TAU( TAUPOS ), A( DPOS+NB, ST ),
      $                         LDA-1, WORK)
 *
@@ -359,13 +359,13 @@
                       V( VPOS+I )        = A( DPOS+NB+I, ST )
                       A( DPOS+NB+I, ST ) = ZERO
    40             CONTINUE
-                  CALL ZLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1, 
+                  CALL ZLARFG( LM, A( DPOS+NB, ST ), V( VPOS+1 ), 1,
      $                                        TAU( TAUPOS ) )
 *
-                  CALL ZLARFX( 'Left', LM, LN-1, V( VPOS ), 
+                  CALL ZLARFX( 'Left', LM, LN-1, V( VPOS ),
      $                         DCONJG( TAU( TAUPOS ) ),
      $                         A( DPOS+NB-1, ST+1 ), LDA-1, WORK)
-             
+
               ENDIF
           ENDIF
       ENDIF
@@ -374,4 +374,4 @@
 *
 *     END OF ZHB2ST_KERNELS
 *
-      END      
+      END
diff --git a/lapack-netlib/SRC/zhecon_3.f b/lapack-netlib/SRC/zhecon_3.f
index 8c3a9f32b..9d2a240b6 100644
--- a/lapack-netlib/SRC/zhecon_3.f
+++ b/lapack-netlib/SRC/zhecon_3.f
@@ -19,7 +19,7 @@
 *  ===========
 *
 *       SUBROUTINE ZHECON_3( UPLO, N, A, LDA, E, IPIV, ANORM, RCOND,
-*                            WORK, IWORK, INFO )
+*                            WORK, INFO )
 *
 *       .. Scalar Arguments ..
 *       CHARACTER          UPLO
@@ -27,7 +27,7 @@
 *       DOUBLE PRECISION   ANORM, RCOND
 *       ..
 *       .. Array Arguments ..
-*       INTEGER            IPIV( * ), IWORK( * )
+*       INTEGER            IPIV( * )
 *       COMPLEX*16         A( LDA, * ), E ( * ), WORK( * )
 *       ..
 *
@@ -129,11 +129,6 @@
 *>          WORK is COMPLEX*16 array, dimension (2*N)
 *> \endverbatim
 *>
-*> \param[out] IWORK
-*> \verbatim
-*>          IWORK is INTEGER array, dimension (N)
-*> \endverbatim
-*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
diff --git a/lapack-netlib/SRC/zheevr.f b/lapack-netlib/SRC/zheevr.f
index 810373c83..def2d1f9d 100644
--- a/lapack-netlib/SRC/zheevr.f
+++ b/lapack-netlib/SRC/zheevr.f
@@ -210,7 +210,7 @@
 *>          eigenvalues are computed to high relative accuracy when
 *>          possible in future releases.  The current code does not
 *>          make any guarantees about high relative accuracy, but
-*>          furutre releases will. See J. Barlow and J. Demmel,
+*>          future releases will. See J. Barlow and J. Demmel,
 *>          "Computing Accurate Eigensystems of Scaled Diagonally
 *>          Dominant Matrices", LAPACK Working Note #7, for a discussion
 *>          of which matrices define their eigenvalues to high relative
diff --git a/lapack-netlib/SRC/zheevr_2stage.f b/lapack-netlib/SRC/zheevr_2stage.f
index ab7f3374e..fe4a72160 100644
--- a/lapack-netlib/SRC/zheevr_2stage.f
+++ b/lapack-netlib/SRC/zheevr_2stage.f
@@ -217,7 +217,7 @@
 *>          eigenvalues are computed to high relative accuracy when
 *>          possible in future releases.  The current code does not
 *>          make any guarantees about high relative accuracy, but
-*>          furutre releases will. See J. Barlow and J. Demmel,
+*>          future releases will. See J. Barlow and J. Demmel,
 *>          "Computing Accurate Eigensystems of Scaled Diagonally
 *>          Dominant Matrices", LAPACK Working Note #7, for a discussion
 *>          of which matrices define their eigenvalues to high relative
diff --git a/lapack-netlib/SRC/zhegs2.f b/lapack-netlib/SRC/zhegs2.f
index 0bdc653b9..aec526353 100644
--- a/lapack-netlib/SRC/zhegs2.f
+++ b/lapack-netlib/SRC/zhegs2.f
@@ -97,6 +97,7 @@
 *>          B is COMPLEX*16 array, dimension (LDB,N)
 *>          The triangular factor from the Cholesky factorization of B,
 *>          as returned by ZPOTRF.
+*>          B is modified by the routine but restored on exit.
 *> \endverbatim
 *>
 *> \param[in] LDB
diff --git a/lapack-netlib/SRC/zhegst.f b/lapack-netlib/SRC/zhegst.f
index d0c08a8f6..dcf5fe8b5 100644
--- a/lapack-netlib/SRC/zhegst.f
+++ b/lapack-netlib/SRC/zhegst.f
@@ -97,6 +97,7 @@
 *>          B is COMPLEX*16 array, dimension (LDB,N)
 *>          The triangular factor from the Cholesky factorization of B,
 *>          as returned by ZPOTRF.
+*>          B is modified by the routine but restored on exit.
 *> \endverbatim
 *>
 *> \param[in] LDB
diff --git a/lapack-netlib/SRC/zherfsx.f b/lapack-netlib/SRC/zherfsx.f
index d176b102c..fa11702a8 100644
--- a/lapack-netlib/SRC/zherfsx.f
+++ b/lapack-netlib/SRC/zherfsx.f
@@ -102,7 +102,7 @@
 *> \param[in] A
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
-*>     The symmetric matrix A.  If UPLO = 'U', the leading N-by-N
+*>     The Hermitian matrix A.  If UPLO = 'U', the leading N-by-N
 *>     upper triangular part of A contains the upper triangular
 *>     part of the matrix A, and the strictly lower triangular
 *>     part of A is not referenced.  If UPLO = 'L', the leading
@@ -270,7 +270,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -306,14 +306,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -321,9 +321,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/zhesv_aa.f b/lapack-netlib/SRC/zhesv_aa.f
index 8511f0e7d..5f1a9f4b3 100644
--- a/lapack-netlib/SRC/zhesv_aa.f
+++ b/lapack-netlib/SRC/zhesv_aa.f
@@ -42,7 +42,7 @@
 *> matrices.
 *>
 *> Aasen's algorithm is used to factor A as
-*>    A = U * T * U**H,  if UPLO = 'U', or
+*>    A = U**H * T * U,  if UPLO = 'U', or
 *>    A = L * T * L**H,  if UPLO = 'L',
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is Hermitian and tridiagonal. The factored form
@@ -86,7 +86,7 @@
 *>
 *>          On exit, if INFO = 0, the tridiagonal matrix T and the
 *>          multipliers used to obtain the factor U or L from the
-*>          factorization A = U*T*U**H or A = L*T*L**H as computed by
+*>          factorization A = U**H*T*U or A = L*T*L**H as computed by
 *>          ZHETRF_AA.
 *> \endverbatim
 *>
@@ -230,7 +230,7 @@
          RETURN
       END IF
 *
-*     Compute the factorization A = U*T*U**H or A = L*T*L**H.
+*     Compute the factorization A = U**H*T*U or A = L*T*L**H.
 *
       CALL ZHETRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
       IF( INFO.EQ.0 ) THEN
diff --git a/lapack-netlib/SRC/zhesv_aa_2stage.f b/lapack-netlib/SRC/zhesv_aa_2stage.f
index ed221dc69..7a4e35f45 100644
--- a/lapack-netlib/SRC/zhesv_aa_2stage.f
+++ b/lapack-netlib/SRC/zhesv_aa_2stage.f
@@ -44,7 +44,7 @@
 *> matrices.
 *>
 *> Aasen's 2-stage algorithm is used to factor A as
-*>    A = U * T * U**H,  if UPLO = 'U', or
+*>    A = U**H * T * U,  if UPLO = 'U', or
 *>    A = L * T * L**H,  if UPLO = 'L',
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is Hermitian and band. The matrix T is
@@ -211,9 +211,7 @@
 *
 *     .. Local Scalars ..
       LOGICAL            UPPER, TQUERY, WQUERY
-      INTEGER            I, J, K, I1, I2, TD
-      INTEGER            LDTB, LWKOPT, NB, KB, NT, IINFO
-      COMPLEX            PIV
+      INTEGER            LWKOPT
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME
@@ -263,7 +261,7 @@
          RETURN
       END IF
 *
-*     Compute the factorization A = U*T*U**H or A = L*T*L**H.
+*     Compute the factorization A = U**H*T*U or A = L*T*L**H.
 *
       CALL ZHETRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2,
      $                       WORK, LWORK, INFO )
diff --git a/lapack-netlib/SRC/zhesvxx.f b/lapack-netlib/SRC/zhesvxx.f
index 375fc072d..20168185c 100644
--- a/lapack-netlib/SRC/zhesvxx.f
+++ b/lapack-netlib/SRC/zhesvxx.f
@@ -46,7 +46,7 @@
 *>
 *>    ZHESVXX uses the diagonal pivoting factorization to compute the
 *>    solution to a complex*16 system of linear equations A * X = B, where
-*>    A is an N-by-N symmetric matrix and X and B are N-by-NRHS
+*>    A is an N-by-N Hermitian matrix and X and B are N-by-NRHS
 *>    matrices.
 *>
 *>    If requested, both normwise and maximum componentwise error bounds
@@ -88,7 +88,7 @@
 *>       A = L * D * L**T,  if UPLO = 'L',
 *>
 *>    where U (or L) is a product of permutation and unit upper (lower)
-*>    triangular matrices, and D is symmetric and block diagonal with
+*>    triangular matrices, and D is Hermitian and block diagonal with
 *>    1-by-1 and 2-by-2 diagonal blocks.
 *>
 *>    3. If some D(i,i)=0, so that D is exactly singular, then the
@@ -161,7 +161,7 @@
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
-*>     The symmetric matrix A.  If UPLO = 'U', the leading N-by-N
+*>     The Hermitian matrix A.  If UPLO = 'U', the leading N-by-N
 *>     upper triangular part of A contains the upper triangular
 *>     part of the matrix A, and the strictly lower triangular
 *>     part of A is not referenced.  If UPLO = 'L', the leading
@@ -378,7 +378,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -414,14 +414,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -429,9 +429,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the extra-precise refinement algorithm.
+*>            = 1.0:  Use the extra-precise refinement algorithm.
 *>              (other values are reserved for future use)
 *>
 *>       PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual
diff --git a/lapack-netlib/SRC/zhetf2_rk.f b/lapack-netlib/SRC/zhetf2_rk.f
index 84d3a0248..6578214df 100644
--- a/lapack-netlib/SRC/zhetf2_rk.f
+++ b/lapack-netlib/SRC/zhetf2_rk.f
@@ -322,7 +322,7 @@
 *
 *        Factorize A as U*D*U**H using the upper triangle of A
 *
-*        Initilize the first entry of array E, where superdiagonal
+*        Initialize the first entry of array E, where superdiagonal
 *        elements of D are stored
 *
          E( 1 ) = CZERO
@@ -676,7 +676,7 @@
 *
 *        Factorize A as L*D*L**H using the lower triangle of A
 *
-*        Initilize the unused last entry of the subdiagonal array E.
+*        Initialize the unused last entry of the subdiagonal array E.
 *
          E( N ) = CZERO
 *
diff --git a/lapack-netlib/SRC/zhetrd_2stage.f b/lapack-netlib/SRC/zhetrd_2stage.f
index 9d6a426a3..1a2c00a2f 100644
--- a/lapack-netlib/SRC/zhetrd_2stage.f
+++ b/lapack-netlib/SRC/zhetrd_2stage.f
@@ -123,23 +123,22 @@
 *>
 *> \param[out] HOUS2
 *> \verbatim
-*>          HOUS2 is COMPLEX*16 array, dimension LHOUS2, that
-*>          store the Householder representation of the stage2
+*>          HOUS2 is COMPLEX*16 array, dimension (LHOUS2)
+*>          Stores the Householder representation of the stage2
 *>          band to tridiagonal.
 *> \endverbatim
 *>
 *> \param[in] LHOUS2
 *> \verbatim
 *>          LHOUS2 is INTEGER
-*>          The dimension of the array HOUS2. LHOUS2 = MAX(1, dimension)
-*>          If LWORK = -1, or LHOUS2=-1,
+*>          The dimension of the array HOUS2.
+*>          If LWORK = -1, or LHOUS2 = -1,
 *>          then a query is assumed; the routine
 *>          only calculates the optimal size of the HOUS2 array, returns
 *>          this value as the first entry of the HOUS2 array, and no error
 *>          message related to LHOUS2 is issued by XERBLA.
-*>          LHOUS2 = MAX(1, dimension) where
-*>          dimension = 4*N if VECT='N'
-*>          not available now if VECT='H'
+*>          If VECT='N', LHOUS2 = max(1, 4*n);
+*>          if VECT='V', option not yet available.
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/zhetrd_hb2st.F b/lapack-netlib/SRC/zhetrd_hb2st.F
index 86122cccc..4ba7bfc21 100644
--- a/lapack-netlib/SRC/zhetrd_hb2st.F
+++ b/lapack-netlib/SRC/zhetrd_hb2st.F
@@ -50,9 +50,9 @@
 *  Arguments:
 *  ==========
 *
-*> \param[in] STAGE
+*> \param[in] STAGE1
 *> \verbatim
-*>          STAGE is CHARACTER*1
+*>          STAGE1 is CHARACTER*1
 *>          = 'N':  "No": to mention that the stage 1 of the reduction  
 *>                  from dense to band using the zhetrd_he2hb routine
 *>                  was not called before this routine to reproduce AB. 
diff --git a/lapack-netlib/SRC/zhetrd_he2hb.f b/lapack-netlib/SRC/zhetrd_he2hb.f
index e33bf4b2b..b85b3889a 100644
--- a/lapack-netlib/SRC/zhetrd_he2hb.f
+++ b/lapack-netlib/SRC/zhetrd_he2hb.f
@@ -363,7 +363,7 @@
 *
 *
 *     Set the workspace of the triangular matrix T to zero once such a
-*     way everytime T is generated the upper/lower portion will be always zero  
+*     way every time T is generated the upper/lower portion will be always zero
 *   
       CALL ZLASET( "A", LDT, KD, ZERO, ZERO, WORK( TPOS ), LDT )
 *
diff --git a/lapack-netlib/SRC/zhetrf_aa.f b/lapack-netlib/SRC/zhetrf_aa.f
index e355aed14..b80a84118 100644
--- a/lapack-netlib/SRC/zhetrf_aa.f
+++ b/lapack-netlib/SRC/zhetrf_aa.f
@@ -37,7 +37,7 @@
 *> ZHETRF_AA computes the factorization of a complex hermitian matrix A
 *> using the Aasen's algorithm.  The form of the factorization is
 *>
-*>    A = U*T*U**H  or  A = L*T*L**H
+*>    A = U**H*T*U  or  A = L*T*L**H
 *>
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is a hermitian tridiagonal matrix.
@@ -223,7 +223,7 @@
       IF( UPPER ) THEN
 *
 *        .....................................................
-*        Factorize A as L*D*L**H using the upper triangle of A
+*        Factorize A as U**H*D*U using the upper triangle of A
 *        .....................................................
 *
 *        copy first row A(1, 1:N) into H(1:n) (stored in WORK(1:N))
@@ -256,7 +256,7 @@
      $                      A( MAX(1, J), J+1 ), LDA,
      $                      IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) )
 *
-*        Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot)
+*        Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot)
 *
          DO J2 = J+2, MIN(N, J+JB+1)
             IPIV( J2 ) = IPIV( J2 ) + J
@@ -376,7 +376,7 @@
      $                      A( J+1, MAX(1, J) ), LDA,
      $                      IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) )
 *
-*        Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot)
+*        Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot)
 *
          DO J2 = J+2, MIN(N, J+JB+1)
             IPIV( J2 ) = IPIV( J2 ) + J
diff --git a/lapack-netlib/SRC/zhetrf_aa_2stage.f b/lapack-netlib/SRC/zhetrf_aa_2stage.f
index 73c0ebe9a..f63713664 100644
--- a/lapack-netlib/SRC/zhetrf_aa_2stage.f
+++ b/lapack-netlib/SRC/zhetrf_aa_2stage.f
@@ -38,7 +38,7 @@
 *> ZHETRF_AA_2STAGE computes the factorization of a double hermitian matrix A
 *> using the Aasen's algorithm.  The form of the factorization is
 *>
-*>    A = U*T*U**T  or  A = L*T*L**T
+*>    A = U**H*T*U  or  A = L*T*L**H
 *>
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is a hermitian band matrix with the
@@ -66,7 +66,7 @@
 *>
 *> \param[in,out] A
 *> \verbatim
-*>          A is COMPLEX array, dimension (LDA,N)
+*>          A is COMPLEX*16 array, dimension (LDA,N)
 *>          On entry, the hermitian matrix A.  If UPLO = 'U', the leading
 *>          N-by-N upper triangular part of A contains the upper
 *>          triangular part of the matrix A, and the strictly lower
@@ -87,7 +87,7 @@
 *>
 *> \param[out] TB
 *> \verbatim
-*>          TB is COMPLEX array, dimension (LTB)
+*>          TB is COMPLEX*16 array, dimension (LTB)
 *>          On exit, details of the LU factorization of the band matrix.
 *> \endverbatim
 *>
@@ -121,7 +121,7 @@
 *>
 *> \param[out] WORK
 *> \verbatim
-*>          WORK is COMPLEX workspace of size LWORK
+*>          WORK is COMPLEX*16 workspace of size LWORK
 *> \endverbatim
 *>
 *> \param[in] LWORK
@@ -276,7 +276,7 @@
       IF( UPPER ) THEN
 *
 *        .....................................................
-*        Factorize A as L*D*L**T using the upper triangle of A
+*        Factorize A as U**H*D*U using the upper triangle of A
 *        .....................................................
 *
          DO J = 0, NT-1
@@ -452,14 +452,17 @@ c               END IF
 *                    > Apply pivots to previous columns of L
                      CALL ZSWAP( K-1, A( (J+1)*NB+1, I1 ), 1, 
      $                                A( (J+1)*NB+1, I2 ), 1 )
-*                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)               
-                     CALL ZSWAP( I2-I1-1, A( I1, I1+1 ), LDA,
-     $                                    A( I1+1, I2 ), 1 )
+*                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)
+                     IF( I2.GT.(I1+1) ) THEN
+                        CALL ZSWAP( I2-I1-1, A( I1, I1+1 ), LDA,
+     $                                       A( I1+1, I2 ), 1 )
+                        CALL ZLACGV( I2-I1-1, A( I1+1, I2 ), 1 )
+                     END IF
                      CALL ZLACGV( I2-I1, A( I1, I1+1 ), LDA )
-                     CALL ZLACGV( I2-I1-1, A( I1+1, I2 ), 1 )
 *                    > Swap A(I2+1:M, I1) with A(I2+1:M, I2)
-                     CALL ZSWAP( N-I2, A( I1, I2+1 ), LDA,
-     $                                 A( I2, I2+1 ), LDA ) 
+                     IF( I2.LT.N )
+     $                  CALL ZSWAP( N-I2, A( I1, I2+1 ), LDA,
+     $                                    A( I2, I2+1 ), LDA ) 
 *                    > Swap A(I1, I1) with A(I2, I2)
                      PIV = A( I1, I1 )
                      A( I1, I1 ) = A( I2, I2 )
@@ -476,7 +479,7 @@ c               END IF
       ELSE
 *
 *        .....................................................
-*        Factorize A as L*D*L**T using the lower triangle of A
+*        Factorize A as L*D*L**H using the lower triangle of A
 *        .....................................................
 *
          DO J = 0, NT-1
@@ -629,14 +632,17 @@ c               END IF
 *                    > Apply pivots to previous columns of L
                      CALL ZSWAP( K-1, A( I1, (J+1)*NB+1 ), LDA, 
      $                                A( I2, (J+1)*NB+1 ), LDA )
-*                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)               
-                     CALL ZSWAP( I2-I1-1, A( I1+1, I1 ), 1,
-     $                                    A( I2, I1+1 ), LDA )
+*                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)
+                     IF( I2.GT.(I1+1) ) THEN
+                        CALL ZSWAP( I2-I1-1, A( I1+1, I1 ), 1,
+     $                                       A( I2, I1+1 ), LDA )
+                        CALL ZLACGV( I2-I1-1, A( I2, I1+1 ), LDA )
+                     END IF
                      CALL ZLACGV( I2-I1, A( I1+1, I1 ), 1 )
-                     CALL ZLACGV( I2-I1-1, A( I2, I1+1 ), LDA )
 *                    > Swap A(I2+1:M, I1) with A(I2+1:M, I2)
-                     CALL ZSWAP( N-I2, A( I2+1, I1 ), 1,
-     $                                 A( I2+1, I2 ), 1 ) 
+                     IF( I2.LT.N )
+     $                  CALL ZSWAP( N-I2, A( I2+1, I1 ), 1,
+     $                                    A( I2+1, I2 ), 1 ) 
 *                    > Swap A(I1, I1) with A(I2, I2)
                      PIV = A( I1, I1 )
                      A( I1, I1 ) = A( I2, I2 )
diff --git a/lapack-netlib/SRC/zhetri2.f b/lapack-netlib/SRC/zhetri2.f
index a7acff49f..ae43b14fe 100644
--- a/lapack-netlib/SRC/zhetri2.f
+++ b/lapack-netlib/SRC/zhetri2.f
@@ -62,7 +62,7 @@
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
-*>          On entry, the NB diagonal matrix D and the multipliers
+*>          On entry, the block diagonal matrix D and the multipliers
 *>          used to obtain the factor U or L as computed by ZHETRF.
 *>
 *>          On exit, if INFO = 0, the (symmetric) inverse of the original
@@ -82,7 +82,7 @@
 *> \param[in] IPIV
 *> \verbatim
 *>          IPIV is INTEGER array, dimension (N)
-*>          Details of the interchanges and the NB structure of D
+*>          Details of the interchanges and the block structure of D
 *>          as determined by ZHETRF.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/zhetrs_aa.f b/lapack-netlib/SRC/zhetrs_aa.f
index 9d302b9cd..4b3253abc 100644
--- a/lapack-netlib/SRC/zhetrs_aa.f
+++ b/lapack-netlib/SRC/zhetrs_aa.f
@@ -38,8 +38,8 @@
 *> \verbatim
 *>
 *> ZHETRS_AA solves a system of linear equations A*X = B with a complex
-*> hermitian matrix A using the factorization A = U*T*U**H or
-*> A = L*T*L**T computed by ZHETRF_AA.
+*> hermitian matrix A using the factorization A = U**H*T*U or
+*> A = L*T*L**H computed by ZHETRF_AA.
 *> \endverbatim
 *
 *  Arguments:
@@ -50,7 +50,7 @@
 *>          UPLO is CHARACTER*1
 *>          Specifies whether the details of the factorization are stored
 *>          as an upper or lower triangular matrix.
-*>          = 'U':  Upper triangular, form is A = U*T*U**H;
+*>          = 'U':  Upper triangular, form is A = U**H*T*U;
 *>          = 'L':  Lower triangular, form is A = L*T*L**H.
 *> \endverbatim
 *>
@@ -98,14 +98,16 @@
 *>          The leading dimension of the array B.  LDB >= max(1,N).
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE array, dimension (MAX(1,LWORK))
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
-*>          LWORK is INTEGER, LWORK >= MAX(1,3*N-2).
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK. LWORK >= max(1,3*N-2).
+*> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
@@ -199,61 +201,80 @@
 *
       IF( UPPER ) THEN
 *
-*        Solve A*X = B, where A = U*T*U**T.
+*        Solve A*X = B, where A = U**H*T*U.
 *
-*        Pivot, P**T * B
+*        1) Forward substitution with U**H
 *
-         DO K = 1, N
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $          CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-         END DO
+         IF( N.GT.1 ) THEN
 *
-*        Compute (U \P**T * B) -> B    [ (U \P**T * B) ]
+*           Pivot, P**T * B -> B
 *
-         CALL ZTRSM('L', 'U', 'C', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA,
-     $               B( 2, 1 ), LDB)
+            DO K = 1, N
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+            END DO
 *
-*        Compute T \ B -> B   [ T \ (U \P**T * B) ]
+*           Compute U**H \ B -> B    [ (U**H \P**T * B) ]
 *
-         CALL ZLACPY( 'F', 1, N, A(1, 1), LDA+1, WORK(N), 1)
+            CALL ZTRSM( 'L', 'U', 'C', 'U', N-1, NRHS, ONE, A( 1, 2 ),
+     $                  LDA, B( 2, 1 ), LDB )
+         END IF
+*
+*        2) Solve with triangular matrix T
+*
+*        Compute T \ B -> B   [ T \ (U**H \P**T * B) ]
+*
+         CALL ZLACPY( 'F', 1, N, A(1, 1), LDA+1, WORK(N), 1 )
          IF( N.GT.1 ) THEN
              CALL ZLACPY( 'F', 1, N-1, A( 1, 2 ), LDA+1, WORK( 2*N ), 1)
-             CALL ZLACPY( 'F', 1, N-1, A( 1, 2 ), LDA+1, WORK( 1 ), 1)
+             CALL ZLACPY( 'F', 1, N-1, A( 1, 2 ), LDA+1, WORK( 1 ), 1 )
              CALL ZLACGV( N-1, WORK( 1 ), 1 )
          END IF
-         CALL ZGTSV(N, NRHS, WORK(1), WORK(N), WORK(2*N), B, LDB,
-     $              INFO)
+         CALL ZGTSV( N, NRHS, WORK(1), WORK(N), WORK(2*N), B, LDB,
+     $               INFO )
+*
+*        3) Backward substitution with U
+*
+         IF( N.GT.1 ) THEN
 *
-*        Compute (U**T \ B) -> B   [ U**T \ (T \ (U \P**T * B) ) ]
+*           Compute U \ B -> B   [ U \ (T \ (U**H \P**T * B) ) ]
 *
-         CALL ZTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA,
-     $               B(2, 1), LDB)
+            CALL ZTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ),
+     $                  LDA, B(2, 1), LDB)
 *
-*        Pivot, P * B  [ P * (U**T \ (T \ (U \P**T * B) )) ]
+*           Pivot, P * B  [ P * (U**H \ (T \ (U \P**T * B) )) ]
 *
-         DO K = N, 1, -1
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-         END DO
+            DO K = N, 1, -1
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+            END DO
+         END IF
 *
       ELSE
 *
-*        Solve A*X = B, where A = L*T*L**T.
+*        Solve A*X = B, where A = L*T*L**H.
+*
+*        1) Forward substitution with L
 *
-*        Pivot, P**T * B
+         IF( N.GT.1 ) THEN
+*
+*           Pivot, P**T * B -> B
 *
-         DO K = 1, N
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-         END DO
+            DO K = 1, N
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+            END DO
 *
-*        Compute (L \P**T * B) -> B    [ (L \P**T * B) ]
+*           Compute L \ B -> B    [ (L \P**T * B) ]
 *
-         CALL ZTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA,
-     $               B(2, 1), LDB)
+            CALL ZTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1 ),
+     $                  LDA, B(2, 1), LDB)
+         END IF
+*
+*        2) Solve with triangular matrix T
 *
 *        Compute T \ B -> B   [ T \ (L \P**T * B) ]
 *
@@ -266,18 +287,23 @@
          CALL ZGTSV(N, NRHS, WORK(1), WORK(N), WORK(2*N), B, LDB,
      $              INFO)
 *
-*        Compute (L**T \ B) -> B   [ L**T \ (T \ (L \P**T * B) ) ]
+*        3) Backward substitution with L**H
+*
+         IF( N.GT.1 ) THEN
 *
-         CALL ZTRSM( 'L', 'L', 'C', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA,
-     $              B( 2, 1 ), LDB)
+*           Compute L**H \ B -> B   [ L**H \ (T \ (L \P**T * B) ) ]
 *
-*        Pivot, P * B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
+            CALL ZTRSM( 'L', 'L', 'C', 'U', N-1, NRHS, ONE, A( 2, 1 ),
+     $                  LDA, B( 2, 1 ), LDB)
 *
-         DO K = N, 1, -1
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-         END DO
+*           Pivot, P * B  [ P * (L**H \ (T \ (L \P**T * B) )) ]
+*
+            DO K = N, 1, -1
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+            END DO
+         END IF
 *
       END IF
 *
diff --git a/lapack-netlib/SRC/zhetrs_aa_2stage.f b/lapack-netlib/SRC/zhetrs_aa_2stage.f
index 7fcee1118..c621bd571 100644
--- a/lapack-netlib/SRC/zhetrs_aa_2stage.f
+++ b/lapack-netlib/SRC/zhetrs_aa_2stage.f
@@ -38,8 +38,8 @@
 *> \verbatim
 *>
 *> ZHETRS_AA_2STAGE solves a system of linear equations A*X = B with a 
-*> hermitian matrix A using the factorization A = U*T*U**T or
-*> A = L*T*L**T computed by ZHETRF_AA_2STAGE.
+*> hermitian matrix A using the factorization A = U**H*T*U or
+*> A = L*T*L**H computed by ZHETRF_AA_2STAGE.
 *> \endverbatim
 *
 *  Arguments:
@@ -50,8 +50,8 @@
 *>          UPLO is CHARACTER*1
 *>          Specifies whether the details of the factorization are stored
 *>          as an upper or lower triangular matrix.
-*>          = 'U':  Upper triangular, form is A = U*T*U**T;
-*>          = 'L':  Lower triangular, form is A = L*T*L**T.
+*>          = 'U':  Upper triangular, form is A = U**H*T*U;
+*>          = 'L':  Lower triangular, form is A = L*T*L**H.
 *> \endverbatim
 *>
 *> \param[in] N
@@ -210,33 +210,33 @@
 *
       IF( UPPER ) THEN
 *
-*        Solve A*X = B, where A = U*T*U**T.
+*        Solve A*X = B, where A = U**H*T*U.
 *
          IF( N.GT.NB ) THEN
 *
-*           Pivot, P**T * B
+*           Pivot, P**T * B -> B
 *
             CALL ZLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 )
 *
-*           Compute (U**T \P**T * B) -> B    [ (U**T \P**T * B) ]
+*           Compute (U**H \ B) -> B    [ (U**H \P**T * B) ]
 *
             CALL ZTRSM( 'L', 'U', 'C', 'U', N-NB, NRHS, ONE, A(1, NB+1),
      $                 LDA, B(NB+1, 1), LDB)
 *
          END IF
 *
-*        Compute T \ B -> B   [ T \ (U**T \P**T * B) ]
+*        Compute T \ B -> B   [ T \ (U**H \P**T * B) ]
 *
          CALL ZGBTRS( 'N', N, NB, NB, NRHS, TB, LDTB, IPIV2, B, LDB,
      $               INFO)
          IF( N.GT.NB ) THEN
 *
-*           Compute (U \ B) -> B   [ U \ (T \ (U**T \P**T * B) ) ]
+*           Compute (U \ B) -> B   [ U \ (T \ (U**H \P**T * B) ) ]
 *
             CALL ZTRSM( 'L', 'U', 'N', 'U', N-NB, NRHS, ONE, A(1, NB+1),
      $                  LDA, B(NB+1, 1), LDB)
 *
-*           Pivot, P * B  [ P * (U \ (T \ (U**T \P**T * B) )) ]
+*           Pivot, P * B -> B  [ P * (U \ (T \ (U**H \P**T * B) )) ]
 *
             CALL ZLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 )
 *
@@ -244,15 +244,15 @@
 *
       ELSE
 *
-*        Solve A*X = B, where A = L*T*L**T.
+*        Solve A*X = B, where A = L*T*L**H.
 *
          IF( N.GT.NB ) THEN
 *
-*           Pivot, P**T * B
+*           Pivot, P**T * B -> B
 *
             CALL ZLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 )
 *
-*           Compute (L \P**T * B) -> B    [ (L \P**T * B) ]
+*           Compute (L \ B) -> B    [ (L \P**T * B) ]
 *
             CALL ZTRSM( 'L', 'L', 'N', 'U', N-NB, NRHS, ONE, A(NB+1, 1),
      $                 LDA, B(NB+1, 1), LDB)
@@ -265,12 +265,12 @@
      $               INFO)
          IF( N.GT.NB ) THEN
 *
-*           Compute (L**T \ B) -> B   [ L**T \ (T \ (L \P**T * B) ) ]
+*           Compute (L**H \ B) -> B   [ L**H \ (T \ (L \P**T * B) ) ]
 *
             CALL ZTRSM( 'L', 'L', 'C', 'U', N-NB, NRHS, ONE, A(NB+1, 1),
      $                  LDA, B(NB+1, 1), LDB)
 *
-*           Pivot, P * B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
+*           Pivot, P * B -> B  [ P * (L**H \ (T \ (L \P**T * B) )) ]
 *
             CALL ZLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 )
 *
diff --git a/lapack-netlib/SRC/zhseqr.f b/lapack-netlib/SRC/zhseqr.f
index 1e8134c39..2ee874dfd 100644
--- a/lapack-netlib/SRC/zhseqr.f
+++ b/lapack-netlib/SRC/zhseqr.f
@@ -69,7 +69,7 @@
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
-*>           The order of the matrix H.  N .GE. 0.
+*>           The order of the matrix H.  N >= 0.
 *> \endverbatim
 *>
 *> \param[in] ILO
@@ -86,7 +86,7 @@
 *>           set by a previous call to ZGEBAL, and then passed to ZGEHRD
 *>           when the matrix output by ZGEBAL is reduced to Hessenberg
 *>           form. Otherwise ILO and IHI should be set to 1 and N
-*>           respectively.  If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+*>           respectively.  If N > 0, then 1 <= ILO <= IHI <= N.
 *>           If N = 0, then ILO = 1 and IHI = 0.
 *> \endverbatim
 *>
@@ -98,17 +98,17 @@
 *>           triangular matrix T from the Schur decomposition (the
 *>           Schur form). If INFO = 0 and JOB = 'E', the contents of
 *>           H are unspecified on exit.  (The output value of H when
-*>           INFO.GT.0 is given under the description of INFO below.)
+*>           INFO > 0 is given under the description of INFO below.)
 *>
 *>           Unlike earlier versions of ZHSEQR, this subroutine may
-*>           explicitly H(i,j) = 0 for i.GT.j and j = 1, 2, ... ILO-1
+*>           explicitly H(i,j) = 0 for i > j and j = 1, 2, ... ILO-1
 *>           or j = IHI+1, IHI+2, ... N.
 *> \endverbatim
 *>
 *> \param[in] LDH
 *> \verbatim
 *>          LDH is INTEGER
-*>           The leading dimension of the array H. LDH .GE. max(1,N).
+*>           The leading dimension of the array H. LDH >= max(1,N).
 *> \endverbatim
 *>
 *> \param[out] W
@@ -131,7 +131,7 @@
 *>           if INFO = 0, Z contains Q*Z.
 *>           Normally Q is the unitary matrix generated by ZUNGHR
 *>           after the call to ZGEHRD which formed the Hessenberg matrix
-*>           H. (The output value of Z when INFO.GT.0 is given under
+*>           H. (The output value of Z when INFO > 0 is given under
 *>           the description of INFO below.)
 *> \endverbatim
 *>
@@ -139,7 +139,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>           The leading dimension of the array Z.  if COMPZ = 'I' or
-*>           COMPZ = 'V', then LDZ.GE.MAX(1,N).  Otherwize, LDZ.GE.1.
+*>           COMPZ = 'V', then LDZ >= MAX(1,N).  Otherwise, LDZ >= 1.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -152,7 +152,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>           The dimension of the array WORK.  LWORK .GE. max(1,N)
+*>           The dimension of the array WORK.  LWORK >= max(1,N)
 *>           is sufficient and delivers very good and sometimes
 *>           optimal performance.  However, LWORK as large as 11*N
 *>           may be required for optimal performance.  A workspace
@@ -170,21 +170,21 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>             =  0:  successful exit
-*>           .LT. 0:  if INFO = -i, the i-th argument had an illegal
+*>             = 0:  successful exit
+*>             < 0:  if INFO = -i, the i-th argument had an illegal
 *>                    value
-*>           .GT. 0:  if INFO = i, ZHSEQR failed to compute all of
-*>                the eigenvalues.  Elements 1:ilo-1 and i+1:n of WR
-*>                and WI contain those eigenvalues which have been
+*>             > 0:  if INFO = i, ZHSEQR failed to compute all of
+*>                the eigenvalues.  Elements 1:ilo-1 and i+1:n of W
+*>                contain those eigenvalues which have been
 *>                successfully computed.  (Failures are rare.)
 *>
-*>                If INFO .GT. 0 and JOB = 'E', then on exit, the
+*>                If INFO > 0 and JOB = 'E', then on exit, the
 *>                remaining unconverged eigenvalues are the eigen-
 *>                values of the upper Hessenberg matrix rows and
 *>                columns ILO through INFO of the final, output
 *>                value of H.
 *>
-*>                If INFO .GT. 0 and JOB   = 'S', then on exit
+*>                If INFO > 0 and JOB   = 'S', then on exit
 *>
 *>           (*)  (initial value of H)*U  = U*(final value of H)
 *>
@@ -192,19 +192,19 @@
 *>                value of  H is upper Hessenberg and triangular in
 *>                rows and columns INFO+1 through IHI.
 *>
-*>                If INFO .GT. 0 and COMPZ = 'V', then on exit
+*>                If INFO > 0 and COMPZ = 'V', then on exit
 *>
 *>                  (final value of Z)  =  (initial value of Z)*U
 *>
 *>                where U is the unitary matrix in (*) (regard-
 *>                less of the value of JOB.)
 *>
-*>                If INFO .GT. 0 and COMPZ = 'I', then on exit
+*>                If INFO > 0 and COMPZ = 'I', then on exit
 *>                      (final value of Z)  = U
 *>                where U is the unitary matrix in (*) (regard-
 *>                less of the value of JOB.)
 *>
-*>                If INFO .GT. 0 and COMPZ = 'N', then Z is not
+*>                If INFO > 0 and COMPZ = 'N', then Z is not
 *>                accessed.
 *> \endverbatim
 *
@@ -244,8 +244,8 @@
 *>                      This depends on ILO, IHI and NS.  NS is the
 *>                      number of simultaneous shifts returned
 *>                      by ILAENV(ISPEC=15).  (See ISPEC=15 below.)
-*>                      The default for (IHI-ILO+1).LE.500 is NS.
-*>                      The default for (IHI-ILO+1).GT.500 is 3*NS/2.
+*>                      The default for (IHI-ILO+1) <= 500 is NS.
+*>                      The default for (IHI-ILO+1) >  500 is 3*NS/2.
 *>
 *>            ISPEC=14: Nibble crossover point. (See IPARMQ for
 *>                      details.)  Default: 14% of deflation window
@@ -323,8 +323,8 @@
       PARAMETER          ( NTINY = 11 )
 *
 *     ==== NL allocates some local workspace to help small matrices
-*     .    through a rare ZLAHQR failure.  NL .GT. NTINY = 11 is
-*     .    required and NL .LE. NMIN = ILAENV(ISPEC=12,...) is recom-
+*     .    through a rare ZLAHQR failure.  NL > NTINY = 11 is
+*     .    required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom-
 *     .    mended.  (The default value of NMIN is 75.)  Using NL = 49
 *     .    allows up to six simultaneous shifts and a 16-by-16
 *     .    deflation window.  ====
diff --git a/lapack-netlib/SRC/zla_gbrcond_c.f b/lapack-netlib/SRC/zla_gbrcond_c.f
index 20109124b..5b2dc46fc 100644
--- a/lapack-netlib/SRC/zla_gbrcond_c.f
+++ b/lapack-netlib/SRC/zla_gbrcond_c.f
@@ -133,13 +133,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX*16 array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is DOUBLE PRECISION array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/zla_gbrcond_x.f b/lapack-netlib/SRC/zla_gbrcond_x.f
index 7e6c12ea5..17e9eede7 100644
--- a/lapack-netlib/SRC/zla_gbrcond_x.f
+++ b/lapack-netlib/SRC/zla_gbrcond_x.f
@@ -126,13 +126,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX*16 array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is DOUBLE PRECISION array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/zla_gbrfsx_extended.f b/lapack-netlib/SRC/zla_gbrfsx_extended.f
index 7a850f1aa..a22b5592e 100644
--- a/lapack-netlib/SRC/zla_gbrfsx_extended.f
+++ b/lapack-netlib/SRC/zla_gbrfsx_extended.f
@@ -65,19 +65,19 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] TRANS_TYPE
 *> \verbatim
 *>          TRANS_TYPE is INTEGER
 *>     Specifies the transposition operation on A.
-*>     The value is defined by ILATRANS(T) where T is a CHARACTER and
-*>     T    = 'N':  No transpose
+*>     The value is defined by ILATRANS(T) where T is a CHARACTER and T
+*>          = 'N':  No transpose
 *>          = 'T':  Transpose
 *>          = 'C':  Conjugate transpose
 *> \endverbatim
@@ -269,7 +269,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/zla_gercond_c.f b/lapack-netlib/SRC/zla_gercond_c.f
index e629f90e8..a1c0df588 100644
--- a/lapack-netlib/SRC/zla_gercond_c.f
+++ b/lapack-netlib/SRC/zla_gercond_c.f
@@ -22,7 +22,7 @@
 *                                                LDAF, IPIV, C, CAPPLY,
 *                                                INFO, WORK, RWORK )
 *
-*       .. Scalar Aguments ..
+*       .. Scalar Arguments ..
 *       CHARACTER          TRANS
 *       LOGICAL            CAPPLY
 *       INTEGER            N, LDA, LDAF, INFO
@@ -114,13 +114,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX*16 array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is DOUBLE PRECISION array, dimension (N).
 *>     Workspace.
@@ -148,7 +148,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
-*     .. Scalar Aguments ..
+*     .. Scalar Arguments ..
       CHARACTER          TRANS
       LOGICAL            CAPPLY
       INTEGER            N, LDA, LDAF, INFO
diff --git a/lapack-netlib/SRC/zla_gercond_x.f b/lapack-netlib/SRC/zla_gercond_x.f
index 244bf58a3..3aa63ea84 100644
--- a/lapack-netlib/SRC/zla_gercond_x.f
+++ b/lapack-netlib/SRC/zla_gercond_x.f
@@ -107,13 +107,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX*16 array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is DOUBLE PRECISION array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/zla_gerfsx_extended.f b/lapack-netlib/SRC/zla_gerfsx_extended.f
index 2e93e265e..e42ffa8e2 100644
--- a/lapack-netlib/SRC/zla_gerfsx_extended.f
+++ b/lapack-netlib/SRC/zla_gerfsx_extended.f
@@ -64,19 +64,19 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] TRANS_TYPE
 *> \verbatim
 *>          TRANS_TYPE is INTEGER
 *>     Specifies the transposition operation on A.
-*>     The value is defined by ILATRANS(T) where T is a CHARACTER and
-*>     T    = 'N':  No transpose
+*>     The value is defined by ILATRANS(T) where T is a CHARACTER and T
+*>          = 'N':  No transpose
 *>          = 'T':  Transpose
 *>          = 'C':  Conjugate transpose
 *> \endverbatim
@@ -256,7 +256,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERRS_C is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERRS_C is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERRS_C(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/zla_hercond_c.f b/lapack-netlib/SRC/zla_hercond_c.f
index 61cfe95f1..7c933cc3c 100644
--- a/lapack-netlib/SRC/zla_hercond_c.f
+++ b/lapack-netlib/SRC/zla_hercond_c.f
@@ -111,13 +111,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX*16 array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is DOUBLE PRECISION array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/zla_hercond_x.f b/lapack-netlib/SRC/zla_hercond_x.f
index 9c19b487d..ee283c0b5 100644
--- a/lapack-netlib/SRC/zla_hercond_x.f
+++ b/lapack-netlib/SRC/zla_hercond_x.f
@@ -104,13 +104,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX*16 array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is DOUBLE PRECISION array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/zla_herfsx_extended.f b/lapack-netlib/SRC/zla_herfsx_extended.f
index 5b43a58b9..8329080ef 100644
--- a/lapack-netlib/SRC/zla_herfsx_extended.f
+++ b/lapack-netlib/SRC/zla_herfsx_extended.f
@@ -66,11 +66,11 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -254,7 +254,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/zla_herpvgrw.f b/lapack-netlib/SRC/zla_herpvgrw.f
index 557d6e830..d414c371f 100644
--- a/lapack-netlib/SRC/zla_herpvgrw.f
+++ b/lapack-netlib/SRC/zla_herpvgrw.f
@@ -102,7 +102,7 @@
 *>     as determined by ZHETRF.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION array, dimension (2*N)
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zla_porcond_c.f b/lapack-netlib/SRC/zla_porcond_c.f
index a74295b41..2e591dd09 100644
--- a/lapack-netlib/SRC/zla_porcond_c.f
+++ b/lapack-netlib/SRC/zla_porcond_c.f
@@ -103,13 +103,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX*16 array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is DOUBLE PRECISION array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/zla_porcond_x.f b/lapack-netlib/SRC/zla_porcond_x.f
index 0b2c84f42..4f409544f 100644
--- a/lapack-netlib/SRC/zla_porcond_x.f
+++ b/lapack-netlib/SRC/zla_porcond_x.f
@@ -96,13 +96,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX*16 array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is DOUBLE PRECISION array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/zla_porfsx_extended.f b/lapack-netlib/SRC/zla_porfsx_extended.f
index 85dd42780..169a9a5d4 100644
--- a/lapack-netlib/SRC/zla_porfsx_extended.f
+++ b/lapack-netlib/SRC/zla_porfsx_extended.f
@@ -65,11 +65,11 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -246,7 +246,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/zla_porpvgrw.f b/lapack-netlib/SRC/zla_porpvgrw.f
index cd71635ec..f669b2864 100644
--- a/lapack-netlib/SRC/zla_porpvgrw.f
+++ b/lapack-netlib/SRC/zla_porpvgrw.f
@@ -86,7 +86,7 @@
 *>     The leading dimension of the array AF.  LDAF >= max(1,N).
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION array, dimension (2*N)
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zla_syrcond_c.f b/lapack-netlib/SRC/zla_syrcond_c.f
index be9d14bd0..ff44d6c3b 100644
--- a/lapack-netlib/SRC/zla_syrcond_c.f
+++ b/lapack-netlib/SRC/zla_syrcond_c.f
@@ -111,13 +111,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX*16 array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is DOUBLE PRECISION array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/zla_syrcond_x.f b/lapack-netlib/SRC/zla_syrcond_x.f
index 2d0269092..53022bbfb 100644
--- a/lapack-netlib/SRC/zla_syrcond_x.f
+++ b/lapack-netlib/SRC/zla_syrcond_x.f
@@ -104,13 +104,13 @@
 *>     i > 0:  The ith argument is invalid.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is COMPLEX*16 array, dimension (2*N).
 *>     Workspace.
 *> \endverbatim
 *>
-*> \param[in] RWORK
+*> \param[out] RWORK
 *> \verbatim
 *>          RWORK is DOUBLE PRECISION array, dimension (N).
 *>     Workspace.
diff --git a/lapack-netlib/SRC/zla_syrfsx_extended.f b/lapack-netlib/SRC/zla_syrfsx_extended.f
index a9716fd23..69844c94b 100644
--- a/lapack-netlib/SRC/zla_syrfsx_extended.f
+++ b/lapack-netlib/SRC/zla_syrfsx_extended.f
@@ -66,11 +66,11 @@
 *> \verbatim
 *>          PREC_TYPE is INTEGER
 *>     Specifies the intermediate precision to be used in refinement.
-*>     The value is defined by ILAPREC(P) where P is a CHARACTER and
-*>     P    = 'S':  Single
+*>     The value is defined by ILAPREC(P) where P is a CHARACTER and P
+*>          = 'S':  Single
 *>          = 'D':  Double
 *>          = 'I':  Indigenous
-*>          = 'X', 'E':  Extra
+*>          = 'X' or 'E':  Extra
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -254,7 +254,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
diff --git a/lapack-netlib/SRC/zla_syrpvgrw.f b/lapack-netlib/SRC/zla_syrpvgrw.f
index ccf4fc2d6..82c9f52f8 100644
--- a/lapack-netlib/SRC/zla_syrpvgrw.f
+++ b/lapack-netlib/SRC/zla_syrpvgrw.f
@@ -102,7 +102,7 @@
 *>     as determined by ZSYTRF.
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
 *>          WORK is DOUBLE PRECISION array, dimension (2*N)
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zla_wwaddw.f b/lapack-netlib/SRC/zla_wwaddw.f
index b4f9df332..f06113a95 100644
--- a/lapack-netlib/SRC/zla_wwaddw.f
+++ b/lapack-netlib/SRC/zla_wwaddw.f
@@ -36,7 +36,7 @@
 *>    ZLA_WWADDW adds a vector W into a doubled-single vector (X, Y).
 *>
 *>    This works for all extant IBM's hex and binary floating point
-*>    arithmetics, but not for decimal.
+*>    arithmetic, but not for decimal.
 *> \endverbatim
 *
 *  Arguments:
diff --git a/lapack-netlib/SRC/zlahef_aa.f b/lapack-netlib/SRC/zlahef_aa.f
index 8bad4aba9..ddd1e9493 100644
--- a/lapack-netlib/SRC/zlahef_aa.f
+++ b/lapack-netlib/SRC/zlahef_aa.f
@@ -288,8 +288,9 @@
 *
 *              Swap A(I1, I2+1:N) with A(I2, I2+1:N)
 *
-               CALL ZSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA,
-     $                           A( J1+I2-1, I2+1 ), LDA )
+               IF( I2.LT.M )
+     $            CALL ZSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA,
+     $                              A( J1+I2-1, I2+1 ), LDA )
 *
 *              Swap A(I1, I1) with A(I2,I2)
 *
@@ -329,13 +330,15 @@
 *           Compute L(J+2, J+1) = WORK( 3:N ) / T(J, J+1),
 *            where A(J, J+1) = T(J, J+1) and A(J+2:N, J) = L(J+2:N, J+1)
 *
-            IF( A( K, J+1 ).NE.ZERO ) THEN
-               ALPHA = ONE / A( K, J+1 )
-               CALL ZCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA )
-               CALL ZSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA )
-            ELSE
-               CALL ZLASET( 'Full', 1, M-J-1, ZERO, ZERO,
-     $                      A( K, J+2 ), LDA)
+            IF( J.LT.(M-1) ) THEN
+               IF( A( K, J+1 ).NE.ZERO ) THEN
+                  ALPHA = ONE / A( K, J+1 )
+                  CALL ZCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA )
+                  CALL ZSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA )
+               ELSE
+                  CALL ZLASET( 'Full', 1, M-J-1, ZERO, ZERO,
+     $                         A( K, J+2 ), LDA)
+               END IF
             END IF
          END IF
          J = J + 1
@@ -440,8 +443,9 @@
 *
 *              Swap A(I2+1:N, I1) with A(I2+1:N, I2)
 *
-               CALL ZSWAP( M-I2, A( I2+1, J1+I1-1 ), 1,
-     $                           A( I2+1, J1+I2-1 ), 1 )
+               IF( I2.LT.M )
+     $            CALL ZSWAP( M-I2, A( I2+1, J1+I1-1 ), 1,
+     $                              A( I2+1, J1+I2-1 ), 1 )
 *
 *              Swap A(I1, I1) with A(I2, I2)
 *
@@ -481,13 +485,15 @@
 *           Compute L(J+2, J+1) = WORK( 3:N ) / T(J, J+1),
 *            where A(J, J+1) = T(J, J+1) and A(J+2:N, J) = L(J+2:N, J+1)
 *
-            IF( A( J+1, K ).NE.ZERO ) THEN
-               ALPHA = ONE / A( J+1, K )
-               CALL ZCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 )
-               CALL ZSCAL( M-J-1, ALPHA, A( J+2, K ), 1 )
-            ELSE
-               CALL ZLASET( 'Full', M-J-1, 1, ZERO, ZERO,
-     $                      A( J+2, K ), LDA )
+            IF( J.LT.(M-1) ) THEN
+               IF( A( J+1, K ).NE.ZERO ) THEN
+                  ALPHA = ONE / A( J+1, K )
+                  CALL ZCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 )
+                  CALL ZSCAL( M-J-1, ALPHA, A( J+2, K ), 1 )
+               ELSE
+                  CALL ZLASET( 'Full', M-J-1, 1, ZERO, ZERO,
+     $                         A( J+2, K ), LDA )
+               END IF
             END IF
          END IF
          J = J + 1
diff --git a/lapack-netlib/SRC/zlahef_rk.f b/lapack-netlib/SRC/zlahef_rk.f
index d8d54f4ce..6a8549cf5 100644
--- a/lapack-netlib/SRC/zlahef_rk.f
+++ b/lapack-netlib/SRC/zlahef_rk.f
@@ -331,7 +331,7 @@
 *        Factorize the trailing columns of A using the upper triangle
 *        of A and working backwards, and compute the matrix W = U12*D
 *        for use in updating A11 (note that conjg(W) is actually stored)
-*        Initilize the first entry of array E, where superdiagonal
+*        Initialize the first entry of array E, where superdiagonal
 *        elements of D are stored
 *
          E( 1 ) = CZERO
@@ -789,7 +789,7 @@
 *        of A and working forwards, and compute the matrix W = L21*D
 *        for use in updating A22 (note that conjg(W) is actually stored)
 *
-*        Initilize the unused last entry of the subdiagonal array E.
+*        Initialize the unused last entry of the subdiagonal array E.
 *
          E( N ) = CZERO
 *
diff --git a/lapack-netlib/SRC/zlahqr.f b/lapack-netlib/SRC/zlahqr.f
index 19015b3fa..0a8318874 100644
--- a/lapack-netlib/SRC/zlahqr.f
+++ b/lapack-netlib/SRC/zlahqr.f
@@ -138,26 +138,26 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>           =   0: successful exit
-*>          .GT. 0: if INFO = i, ZLAHQR failed to compute all the
+*>           = 0:   successful exit
+*>           > 0:   if INFO = i, ZLAHQR failed to compute all the
 *>                  eigenvalues ILO to IHI in a total of 30 iterations
 *>                  per eigenvalue; elements i+1:ihi of W contain
 *>                  those eigenvalues which have been successfully
 *>                  computed.
 *>
-*>                  If INFO .GT. 0 and WANTT is .FALSE., then on exit,
+*>                  If INFO > 0 and WANTT is .FALSE., then on exit,
 *>                  the remaining unconverged eigenvalues are the
 *>                  eigenvalues of the upper Hessenberg matrix
-*>                  rows and columns ILO thorugh INFO of the final,
+*>                  rows and columns ILO through INFO of the final,
 *>                  output value of H.
 *>
-*>                  If INFO .GT. 0 and WANTT is .TRUE., then on exit
+*>                  If INFO > 0 and WANTT is .TRUE., then on exit
 *>          (*)       (initial value of H)*U  = U*(final value of H)
-*>                  where U is an orthognal matrix.    The final
+*>                  where U is an orthogonal matrix.    The final
 *>                  value of H is upper Hessenberg and triangular in
 *>                  rows and columns INFO+1 through IHI.
 *>
-*>                  If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+*>                  If INFO > 0 and WANTZ is .TRUE., then on exit
 *>                      (final value of Z)  = (initial value of Z)*U
 *>                  where U is the orthogonal matrix in (*)
 *>                  (regardless of the value of WANTT.)
diff --git a/lapack-netlib/SRC/zlamswlq.f b/lapack-netlib/SRC/zlamswlq.f
index 0e0b0a1da..f32f5667c 100644
--- a/lapack-netlib/SRC/zlamswlq.f
+++ b/lapack-netlib/SRC/zlamswlq.f
@@ -1,3 +1,4 @@
+*> \brief \b ZLAMSWLQ
 *
 *  Definition:
 *  ===========
diff --git a/lapack-netlib/SRC/zlamtsqr.f b/lapack-netlib/SRC/zlamtsqr.f
index 1ee732425..034c45505 100644
--- a/lapack-netlib/SRC/zlamtsqr.f
+++ b/lapack-netlib/SRC/zlamtsqr.f
@@ -1,3 +1,4 @@
+*> \brief \b ZLAMTSQR
 *
 *  Definition:
 *  ===========
diff --git a/lapack-netlib/SRC/zlangb.f b/lapack-netlib/SRC/zlangb.f
index 949bb2c01..e40a470fd 100644
--- a/lapack-netlib/SRC/zlangb.f
+++ b/lapack-netlib/SRC/zlangb.f
@@ -130,6 +130,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM
       INTEGER            KL, KU, LDAB, N
@@ -147,14 +148,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J, K, L
-      DOUBLE PRECISION   SCALE, SUM, VALUE, TEMP
+      DOUBLE PRECISION   SUM, VALUE, TEMP
+*     ..
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ZLASSQ
+      EXTERNAL           ZLASSQ, DCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN, SQRT
@@ -207,15 +211,22 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
-         SCALE = ZERO
-         SUM = ONE
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
          DO 90 J = 1, N
             L = MAX( 1, J-KU )
             K = KU + 1 - J + L
-            CALL ZLASSQ( MIN( N, J+KL )-L+1, AB( K, J ), 1, SCALE, SUM )
+            COLSSQ( 1 ) = ZERO
+            COLSSQ( 2 ) = ONE
+            CALL ZLASSQ( MIN( N, J+KL )-L+1, AB( K, J ), 1,
+     $                   COLSSQ( 1 ), COLSSQ( 2 ) )
+            CALL DCOMBSSQ( SSQ, COLSSQ )
    90    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       ZLANGB = VALUE
diff --git a/lapack-netlib/SRC/zlange.f b/lapack-netlib/SRC/zlange.f
index 5407decef..8162786fb 100644
--- a/lapack-netlib/SRC/zlange.f
+++ b/lapack-netlib/SRC/zlange.f
@@ -120,6 +120,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM
       INTEGER            LDA, M, N
@@ -137,14 +138,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J
-      DOUBLE PRECISION   SCALE, SUM, VALUE, TEMP
+      DOUBLE PRECISION   SUM, VALUE, TEMP
+*     ..
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ZLASSQ
+      EXTERNAL           ZLASSQ, DCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MIN, SQRT
@@ -196,13 +200,19 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
-         SCALE = ZERO
-         SUM = ONE
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
          DO 90 J = 1, N
-            CALL ZLASSQ( M, A( 1, J ), 1, SCALE, SUM )
+            COLSSQ( 1 ) = ZERO
+            COLSSQ( 2 ) = ONE
+            CALL ZLASSQ( M, A( 1, J ), 1, COLSSQ( 1 ), COLSSQ( 2 ) )
+            CALL DCOMBSSQ( SSQ, COLSSQ )
    90    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       ZLANGE = VALUE
diff --git a/lapack-netlib/SRC/zlanhb.f b/lapack-netlib/SRC/zlanhb.f
index b3717804f..16b5c117c 100644
--- a/lapack-netlib/SRC/zlanhb.f
+++ b/lapack-netlib/SRC/zlanhb.f
@@ -137,6 +137,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            K, LDAB, N
@@ -154,14 +155,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J, L
-      DOUBLE PRECISION   ABSA, SCALE, SUM, VALUE
+      DOUBLE PRECISION   ABSA, SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ZLASSQ
+      EXTERNAL           ZLASSQ, DCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, MAX, MIN, SQRT
@@ -233,39 +237,57 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          IF( K.GT.0 ) THEN
             IF( LSAME( UPLO, 'U' ) ) THEN
                DO 110 J = 2, N
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
                   CALL ZLASSQ( MIN( J-1, K ), AB( MAX( K+2-J, 1 ), J ),
-     $                         1, SCALE, SUM )
+     $                         1, COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   110          CONTINUE
                L = K + 1
             ELSE
                DO 120 J = 1, N - 1
-                  CALL ZLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE,
-     $                         SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL ZLASSQ( MIN( N-J, K ), AB( 2, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   120          CONTINUE
                L = 1
             END IF
-            SUM = 2*SUM
+            SSQ( 2 ) = 2*SSQ( 2 )
          ELSE
             L = 1
          END IF
+*
+*        Sum diagonal
+*
+         COLSSQ( 1 ) = ZERO
+         COLSSQ( 2 ) = ONE
          DO 130 J = 1, N
             IF( DBLE( AB( L, J ) ).NE.ZERO ) THEN
                ABSA = ABS( DBLE( AB( L, J ) ) )
-               IF( SCALE.LT.ABSA ) THEN
-                  SUM = ONE + SUM*( SCALE / ABSA )**2
-                  SCALE = ABSA
+               IF( COLSSQ( 1 ).LT.ABSA ) THEN
+                  COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2
+                  COLSSQ( 1 ) = ABSA
                ELSE
-                  SUM = SUM + ( ABSA / SCALE )**2
+                  COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2
                END IF
             END IF
   130    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         CALL DCOMBSSQ( SSQ, COLSSQ )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       ZLANHB = VALUE
diff --git a/lapack-netlib/SRC/zlanhe.f b/lapack-netlib/SRC/zlanhe.f
index 7c7f7f3be..5aef9a756 100644
--- a/lapack-netlib/SRC/zlanhe.f
+++ b/lapack-netlib/SRC/zlanhe.f
@@ -129,6 +129,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            LDA, N
@@ -146,14 +147,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J
-      DOUBLE PRECISION   ABSA, SCALE, SUM, VALUE
+      DOUBLE PRECISION   ABSA, SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ZLASSQ
+      EXTERNAL           ZLASSQ, DCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, SQRT
@@ -223,31 +227,48 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          IF( LSAME( UPLO, 'U' ) ) THEN
             DO 110 J = 2, N
-               CALL ZLASSQ( J-1, A( 1, J ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL ZLASSQ( J-1, A( 1, J ), 1,
+     $                      COLSSQ( 1 ), COLSSQ( 2 ) )
+               CALL DCOMBSSQ( SSQ, COLSSQ )
   110       CONTINUE
          ELSE
             DO 120 J = 1, N - 1
-               CALL ZLASSQ( N-J, A( J+1, J ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL ZLASSQ( N-J, A( J+1, J ), 1,
+     $                      COLSSQ( 1 ), COLSSQ( 2 ) )
+               CALL DCOMBSSQ( SSQ, COLSSQ )
   120       CONTINUE
          END IF
-         SUM = 2*SUM
+         SSQ( 2 ) = 2*SSQ( 2 )
+*
+*        Sum diagonal
+*
          DO 130 I = 1, N
             IF( DBLE( A( I, I ) ).NE.ZERO ) THEN
                ABSA = ABS( DBLE( A( I, I ) ) )
-               IF( SCALE.LT.ABSA ) THEN
-                  SUM = ONE + SUM*( SCALE / ABSA )**2
-                  SCALE = ABSA
+               IF( SSQ( 1 ).LT.ABSA ) THEN
+                  SSQ( 2 ) = ONE + SSQ( 2 )*( SSQ( 1 ) / ABSA )**2
+                  SSQ( 1 ) = ABSA
                ELSE
-                  SUM = SUM + ( ABSA / SCALE )**2
+                  SSQ( 2 ) = SSQ( 2 ) + ( ABSA / SSQ( 1 ) )**2
                END IF
             END IF
   130    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       ZLANHE = VALUE
diff --git a/lapack-netlib/SRC/zlanhp.f b/lapack-netlib/SRC/zlanhp.f
index 9ded60746..d795aeca9 100644
--- a/lapack-netlib/SRC/zlanhp.f
+++ b/lapack-netlib/SRC/zlanhp.f
@@ -122,6 +122,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            N
@@ -139,14 +140,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J, K
-      DOUBLE PRECISION   ABSA, SCALE, SUM, VALUE
+      DOUBLE PRECISION   ABSA, SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ZLASSQ
+      EXTERNAL           ZLASSQ, DCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, SQRT
@@ -225,31 +229,48 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          K = 2
          IF( LSAME( UPLO, 'U' ) ) THEN
             DO 110 J = 2, N
-               CALL ZLASSQ( J-1, AP( K ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL ZLASSQ( J-1, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) )
+               CALL DCOMBSSQ( SSQ, COLSSQ )
                K = K + J
   110       CONTINUE
          ELSE
             DO 120 J = 1, N - 1
-               CALL ZLASSQ( N-J, AP( K ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL ZLASSQ( N-J, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) )
+               CALL DCOMBSSQ( SSQ, COLSSQ )
                K = K + N - J + 1
   120       CONTINUE
          END IF
-         SUM = 2*SUM
+         SSQ( 2 ) = 2*SSQ( 2 )
+*
+*        Sum diagonal
+*
          K = 1
+         COLSSQ( 1 ) = ZERO
+         COLSSQ( 2 ) = ONE
          DO 130 I = 1, N
             IF( DBLE( AP( K ) ).NE.ZERO ) THEN
                ABSA = ABS( DBLE( AP( K ) ) )
-               IF( SCALE.LT.ABSA ) THEN
-                  SUM = ONE + SUM*( SCALE / ABSA )**2
-                  SCALE = ABSA
+               IF( COLSSQ( 1 ).LT.ABSA ) THEN
+                  COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2
+                  COLSSQ( 1 ) = ABSA
                ELSE
-                  SUM = SUM + ( ABSA / SCALE )**2
+                  COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2
                END IF
             END IF
             IF( LSAME( UPLO, 'U' ) ) THEN
@@ -258,7 +279,8 @@
                K = K + N - I + 1
             END IF
   130    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         CALL DCOMBSSQ( SSQ, COLSSQ )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       ZLANHP = VALUE
diff --git a/lapack-netlib/SRC/zlanhs.f b/lapack-netlib/SRC/zlanhs.f
index f2d36b304..bd8e86be9 100644
--- a/lapack-netlib/SRC/zlanhs.f
+++ b/lapack-netlib/SRC/zlanhs.f
@@ -114,6 +114,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM
       INTEGER            LDA, N
@@ -131,14 +132,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J
-      DOUBLE PRECISION   SCALE, SUM, VALUE
+      DOUBLE PRECISION   SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ZLASSQ
+      EXTERNAL           ZLASSQ, DCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MIN, SQRT
@@ -190,13 +194,20 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
-         SCALE = ZERO
-         SUM = ONE
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
          DO 90 J = 1, N
-            CALL ZLASSQ( MIN( N, J+1 ), A( 1, J ), 1, SCALE, SUM )
+            COLSSQ( 1 ) = ZERO
+            COLSSQ( 2 ) = ONE
+            CALL ZLASSQ( MIN( N, J+1 ), A( 1, J ), 1,
+     $                   COLSSQ( 1 ), COLSSQ( 2 ) )
+            CALL DCOMBSSQ( SSQ, COLSSQ )
    90    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       ZLANHS = VALUE
diff --git a/lapack-netlib/SRC/zlansb.f b/lapack-netlib/SRC/zlansb.f
index 3468c49b3..245dcaf4b 100644
--- a/lapack-netlib/SRC/zlansb.f
+++ b/lapack-netlib/SRC/zlansb.f
@@ -135,6 +135,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            K, LDAB, N
@@ -152,14 +153,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J, L
-      DOUBLE PRECISION   ABSA, SCALE, SUM, VALUE
+      DOUBLE PRECISION   ABSA, SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ZLASSQ
+      EXTERNAL           ZLASSQ, DCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN, SQRT
@@ -227,29 +231,47 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          IF( K.GT.0 ) THEN
             IF( LSAME( UPLO, 'U' ) ) THEN
                DO 110 J = 2, N
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
                   CALL ZLASSQ( MIN( J-1, K ), AB( MAX( K+2-J, 1 ), J ),
-     $                         1, SCALE, SUM )
+     $                         1, COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   110          CONTINUE
                L = K + 1
             ELSE
                DO 120 J = 1, N - 1
-                  CALL ZLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE,
-     $                         SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL ZLASSQ( MIN( N-J, K ), AB( 2, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   120          CONTINUE
                L = 1
             END IF
-            SUM = 2*SUM
+            SSQ( 2 ) = 2*SSQ( 2 )
          ELSE
             L = 1
          END IF
-         CALL ZLASSQ( N, AB( L, 1 ), LDAB, SCALE, SUM )
-         VALUE = SCALE*SQRT( SUM )
+*
+*        Sum diagonal
+*
+         COLSSQ( 1 ) = ZERO
+         COLSSQ( 2 ) = ONE
+         CALL ZLASSQ( N, AB( L, 1 ), LDAB, COLSSQ( 1 ), COLSSQ( 2 ) )
+         CALL DCOMBSSQ( SSQ, COLSSQ )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       ZLANSB = VALUE
diff --git a/lapack-netlib/SRC/zlansp.f b/lapack-netlib/SRC/zlansp.f
index 84fb972bb..fa9220487 100644
--- a/lapack-netlib/SRC/zlansp.f
+++ b/lapack-netlib/SRC/zlansp.f
@@ -120,6 +120,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            N
@@ -137,14 +138,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J, K
-      DOUBLE PRECISION   ABSA, SCALE, SUM, VALUE
+      DOUBLE PRECISION   ABSA, SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ZLASSQ
+      EXTERNAL           ZLASSQ, DCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, DIMAG, SQRT
@@ -219,40 +223,57 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          K = 2
          IF( LSAME( UPLO, 'U' ) ) THEN
             DO 110 J = 2, N
-               CALL ZLASSQ( J-1, AP( K ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL ZLASSQ( J-1, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) )
+               CALL DCOMBSSQ( SSQ, COLSSQ )
                K = K + J
   110       CONTINUE
          ELSE
             DO 120 J = 1, N - 1
-               CALL ZLASSQ( N-J, AP( K ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL ZLASSQ( N-J, AP( K ), 1, COLSSQ( 1 ), COLSSQ( 2 ) )
+               CALL DCOMBSSQ( SSQ, COLSSQ )
                K = K + N - J + 1
   120       CONTINUE
          END IF
-         SUM = 2*SUM
+         SSQ( 2 ) = 2*SSQ( 2 )
+*
+*        Sum diagonal
+*
          K = 1
+         COLSSQ( 1 ) = ZERO
+         COLSSQ( 2 ) = ONE
          DO 130 I = 1, N
             IF( DBLE( AP( K ) ).NE.ZERO ) THEN
                ABSA = ABS( DBLE( AP( K ) ) )
-               IF( SCALE.LT.ABSA ) THEN
-                  SUM = ONE + SUM*( SCALE / ABSA )**2
-                  SCALE = ABSA
+               IF( COLSSQ( 1 ).LT.ABSA ) THEN
+                  COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2
+                  COLSSQ( 1 ) = ABSA
                ELSE
-                  SUM = SUM + ( ABSA / SCALE )**2
+                  COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2
                END IF
             END IF
             IF( DIMAG( AP( K ) ).NE.ZERO ) THEN
                ABSA = ABS( DIMAG( AP( K ) ) )
-               IF( SCALE.LT.ABSA ) THEN
-                  SUM = ONE + SUM*( SCALE / ABSA )**2
-                  SCALE = ABSA
+               IF( COLSSQ( 1 ).LT.ABSA ) THEN
+                  COLSSQ( 2 ) = ONE + COLSSQ(2)*( COLSSQ(1) / ABSA )**2
+                  COLSSQ( 1 ) = ABSA
                ELSE
-                  SUM = SUM + ( ABSA / SCALE )**2
+                  COLSSQ( 2 ) = COLSSQ( 2 ) + ( ABSA / COLSSQ( 1 ) )**2
                END IF
             END IF
             IF( LSAME( UPLO, 'U' ) ) THEN
@@ -261,7 +282,8 @@
                K = K + N - I + 1
             END IF
   130    CONTINUE
-         VALUE = SCALE*SQRT( SUM )
+         CALL DCOMBSSQ( SSQ, COLSSQ )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       ZLANSP = VALUE
diff --git a/lapack-netlib/SRC/zlansy.f b/lapack-netlib/SRC/zlansy.f
index 58269a911..e022f85e1 100644
--- a/lapack-netlib/SRC/zlansy.f
+++ b/lapack-netlib/SRC/zlansy.f
@@ -128,6 +128,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          NORM, UPLO
       INTEGER            LDA, N
@@ -145,14 +146,17 @@
 *     ..
 *     .. Local Scalars ..
       INTEGER            I, J
-      DOUBLE PRECISION   ABSA, SCALE, SUM, VALUE
+      DOUBLE PRECISION   ABSA, SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ZLASSQ
+      EXTERNAL           ZLASSQ, DCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, SQRT
@@ -218,21 +222,39 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
+*
+         SSQ( 1 ) = ZERO
+         SSQ( 2 ) = ONE
+*
+*        Sum off-diagonals
 *
-         SCALE = ZERO
-         SUM = ONE
          IF( LSAME( UPLO, 'U' ) ) THEN
             DO 110 J = 2, N
-               CALL ZLASSQ( J-1, A( 1, J ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL ZLASSQ( J-1, A( 1, J ), 1, COLSSQ(1), COLSSQ(2) )
+               CALL DCOMBSSQ( SSQ, COLSSQ )
   110       CONTINUE
          ELSE
             DO 120 J = 1, N - 1
-               CALL ZLASSQ( N-J, A( J+1, J ), 1, SCALE, SUM )
+               COLSSQ( 1 ) = ZERO
+               COLSSQ( 2 ) = ONE
+               CALL ZLASSQ( N-J, A( J+1, J ), 1, COLSSQ(1), COLSSQ(2) )
+               CALL DCOMBSSQ( SSQ, COLSSQ )
   120       CONTINUE
          END IF
-         SUM = 2*SUM
-         CALL ZLASSQ( N, A, LDA+1, SCALE, SUM )
-         VALUE = SCALE*SQRT( SUM )
+         SSQ( 2 ) = 2*SSQ( 2 )
+*
+*        Sum diagonal
+*
+         COLSSQ( 1 ) = ZERO
+         COLSSQ( 2 ) = ONE
+         CALL ZLASSQ( N, A, LDA+1, COLSSQ( 1 ), COLSSQ( 2 ) )
+         CALL DCOMBSSQ( SSQ, COLSSQ )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       ZLANSY = VALUE
diff --git a/lapack-netlib/SRC/zlantb.f b/lapack-netlib/SRC/zlantb.f
index 3077ba151..f02509223 100644
--- a/lapack-netlib/SRC/zlantb.f
+++ b/lapack-netlib/SRC/zlantb.f
@@ -146,6 +146,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          DIAG, NORM, UPLO
       INTEGER            K, LDAB, N
@@ -164,14 +165,17 @@
 *     .. Local Scalars ..
       LOGICAL            UDIAG
       INTEGER            I, J, L
-      DOUBLE PRECISION   SCALE, SUM, VALUE
+      DOUBLE PRECISION   SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ZLASSQ
+      EXTERNAL           ZLASSQ, DCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MAX, MIN, SQRT
@@ -313,46 +317,61 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
          IF( LSAME( UPLO, 'U' ) ) THEN
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = N
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = N
                IF( K.GT.0 ) THEN
                   DO 280 J = 2, N
+                     COLSSQ( 1 ) = ZERO
+                     COLSSQ( 2 ) = ONE
                      CALL ZLASSQ( MIN( J-1, K ),
-     $                            AB( MAX( K+2-J, 1 ), J ), 1, SCALE,
-     $                            SUM )
+     $                            AB( MAX( K+2-J, 1 ), J ), 1,
+     $                            COLSSQ( 1 ), COLSSQ( 2 ) )
+                     CALL DCOMBSSQ( SSQ, COLSSQ )
   280             CONTINUE
                END IF
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                DO 290 J = 1, N
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
                   CALL ZLASSQ( MIN( J, K+1 ), AB( MAX( K+2-J, 1 ), J ),
-     $                         1, SCALE, SUM )
+     $                         1, COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   290          CONTINUE
             END IF
          ELSE
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = N
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = N
                IF( K.GT.0 ) THEN
                   DO 300 J = 1, N - 1
-                     CALL ZLASSQ( MIN( N-J, K ), AB( 2, J ), 1, SCALE,
-     $                            SUM )
+                     COLSSQ( 1 ) = ZERO
+                     COLSSQ( 2 ) = ONE
+                     CALL ZLASSQ( MIN( N-J, K ), AB( 2, J ), 1,
+     $                            COLSSQ( 1 ), COLSSQ( 2 ) )
+                     CALL DCOMBSSQ( SSQ, COLSSQ )
   300             CONTINUE
                END IF
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                DO 310 J = 1, N
-                  CALL ZLASSQ( MIN( N-J+1, K+1 ), AB( 1, J ), 1, SCALE,
-     $                         SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL ZLASSQ( MIN( N-J+1, K+1 ), AB( 1, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   310          CONTINUE
             END IF
          END IF
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       ZLANTB = VALUE
diff --git a/lapack-netlib/SRC/zlantp.f b/lapack-netlib/SRC/zlantp.f
index 69dbaa5bc..d32a00f13 100644
--- a/lapack-netlib/SRC/zlantp.f
+++ b/lapack-netlib/SRC/zlantp.f
@@ -130,6 +130,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          DIAG, NORM, UPLO
       INTEGER            N
@@ -148,14 +149,17 @@
 *     .. Local Scalars ..
       LOGICAL            UDIAG
       INTEGER            I, J, K
-      DOUBLE PRECISION   SCALE, SUM, VALUE
+      DOUBLE PRECISION   SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ZLASSQ
+      EXTERNAL           ZLASSQ, DCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, SQRT
@@ -308,45 +312,64 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
          IF( LSAME( UPLO, 'U' ) ) THEN
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = N
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = N
                K = 2
                DO 280 J = 2, N
-                  CALL ZLASSQ( J-1, AP( K ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL ZLASSQ( J-1, AP( K ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
                   K = K + J
   280          CONTINUE
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                K = 1
                DO 290 J = 1, N
-                  CALL ZLASSQ( J, AP( K ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL ZLASSQ( J, AP( K ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
                   K = K + J
   290          CONTINUE
             END IF
          ELSE
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = N
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = N
                K = 2
                DO 300 J = 1, N - 1
-                  CALL ZLASSQ( N-J, AP( K ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL ZLASSQ( N-J, AP( K ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
                   K = K + N - J + 1
   300          CONTINUE
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                K = 1
                DO 310 J = 1, N
-                  CALL ZLASSQ( N-J+1, AP( K ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL ZLASSQ( N-J+1, AP( K ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
                   K = K + N - J + 1
   310          CONTINUE
             END IF
          END IF
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       ZLANTP = VALUE
diff --git a/lapack-netlib/SRC/zlantr.f b/lapack-netlib/SRC/zlantr.f
index 04ee482f7..7d63c972e 100644
--- a/lapack-netlib/SRC/zlantr.f
+++ b/lapack-netlib/SRC/zlantr.f
@@ -147,6 +147,7 @@
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     December 2016
 *
+      IMPLICIT NONE
 *     .. Scalar Arguments ..
       CHARACTER          DIAG, NORM, UPLO
       INTEGER            LDA, M, N
@@ -165,14 +166,17 @@
 *     .. Local Scalars ..
       LOGICAL            UDIAG
       INTEGER            I, J
-      DOUBLE PRECISION   SCALE, SUM, VALUE
+      DOUBLE PRECISION   SUM, VALUE
+*     ..
+*     .. Local Arrays ..
+      DOUBLE PRECISION   SSQ( 2 ), COLSSQ( 2 )
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAME, DISNAN
       EXTERNAL           LSAME, DISNAN
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           ZLASSQ
+      EXTERNAL           ZLASSQ, DCOMBSSQ
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, MIN, SQRT
@@ -283,7 +287,7 @@
             END IF
          ELSE
             IF( LSAME( DIAG, 'U' ) ) THEN
-               DO 210 I = 1, N
+               DO 210 I = 1, MIN( M, N )
                   WORK( I ) = ONE
   210          CONTINUE
                DO 220 I = N + 1, M
@@ -313,38 +317,56 @@
       ELSE IF( ( LSAME( NORM, 'F' ) ) .OR. ( LSAME( NORM, 'E' ) ) ) THEN
 *
 *        Find normF(A).
+*        SSQ(1) is scale
+*        SSQ(2) is sum-of-squares
+*        For better accuracy, sum each column separately.
 *
          IF( LSAME( UPLO, 'U' ) ) THEN
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = MIN( M, N )
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = MIN( M, N )
                DO 290 J = 2, N
-                  CALL ZLASSQ( MIN( M, J-1 ), A( 1, J ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL ZLASSQ( MIN( M, J-1 ), A( 1, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   290          CONTINUE
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                DO 300 J = 1, N
-                  CALL ZLASSQ( MIN( M, J ), A( 1, J ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL ZLASSQ( MIN( M, J ), A( 1, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   300          CONTINUE
             END IF
          ELSE
             IF( LSAME( DIAG, 'U' ) ) THEN
-               SCALE = ONE
-               SUM = MIN( M, N )
+               SSQ( 1 ) = ONE
+               SSQ( 2 ) = MIN( M, N )
                DO 310 J = 1, N
-                  CALL ZLASSQ( M-J, A( MIN( M, J+1 ), J ), 1, SCALE,
-     $                         SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL ZLASSQ( M-J, A( MIN( M, J+1 ), J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   310          CONTINUE
             ELSE
-               SCALE = ZERO
-               SUM = ONE
+               SSQ( 1 ) = ZERO
+               SSQ( 2 ) = ONE
                DO 320 J = 1, N
-                  CALL ZLASSQ( M-J+1, A( J, J ), 1, SCALE, SUM )
+                  COLSSQ( 1 ) = ZERO
+                  COLSSQ( 2 ) = ONE
+                  CALL ZLASSQ( M-J+1, A( J, J ), 1,
+     $                         COLSSQ( 1 ), COLSSQ( 2 ) )
+                  CALL DCOMBSSQ( SSQ, COLSSQ )
   320          CONTINUE
             END IF
          END IF
-         VALUE = SCALE*SQRT( SUM )
+         VALUE = SSQ( 1 )*SQRT( SSQ( 2 ) )
       END IF
 *
       ZLANTR = VALUE
diff --git a/lapack-netlib/SRC/zlaqps.f b/lapack-netlib/SRC/zlaqps.f
index c142e8c69..66c721517 100644
--- a/lapack-netlib/SRC/zlaqps.f
+++ b/lapack-netlib/SRC/zlaqps.f
@@ -127,7 +127,7 @@
 *> \param[in,out] AUXV
 *> \verbatim
 *>          AUXV is COMPLEX*16 array, dimension (NB)
-*>          Auxiliar vector.
+*>          Auxiliary vector.
 *> \endverbatim
 *>
 *> \param[in,out] F
diff --git a/lapack-netlib/SRC/zlaqr0.f b/lapack-netlib/SRC/zlaqr0.f
index 59b8ed7a6..feffe9782 100644
--- a/lapack-netlib/SRC/zlaqr0.f
+++ b/lapack-netlib/SRC/zlaqr0.f
@@ -66,7 +66,7 @@
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
-*>           The order of the matrix H.  N .GE. 0.
+*>           The order of the matrix H.  N >= 0.
 *> \endverbatim
 *>
 *> \param[in] ILO
@@ -79,12 +79,12 @@
 *>          IHI is INTEGER
 *>
 *>           It is assumed that H is already upper triangular in rows
-*>           and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1,
+*>           and columns 1:ILO-1 and IHI+1:N and, if ILO > 1,
 *>           H(ILO,ILO-1) is zero. ILO and IHI are normally set by a
 *>           previous call to ZGEBAL, and then passed to ZGEHRD when the
 *>           matrix output by ZGEBAL is reduced to Hessenberg form.
 *>           Otherwise, ILO and IHI should be set to 1 and N,
-*>           respectively.  If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+*>           respectively.  If N > 0, then 1 <= ILO <= IHI <= N.
 *>           If N = 0, then ILO = 1 and IHI = 0.
 *> \endverbatim
 *>
@@ -96,17 +96,17 @@
 *>           contains the upper triangular matrix T from the Schur
 *>           decomposition (the Schur form). If INFO = 0 and WANT is
 *>           .FALSE., then the contents of H are unspecified on exit.
-*>           (The output value of H when INFO.GT.0 is given under the
+*>           (The output value of H when INFO > 0 is given under the
 *>           description of INFO below.)
 *>
-*>           This subroutine may explicitly set H(i,j) = 0 for i.GT.j and
+*>           This subroutine may explicitly set H(i,j) = 0 for i > j and
 *>           j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N.
 *> \endverbatim
 *>
 *> \param[in] LDH
 *> \verbatim
 *>          LDH is INTEGER
-*>           The leading dimension of the array H. LDH .GE. max(1,N).
+*>           The leading dimension of the array H. LDH >= max(1,N).
 *> \endverbatim
 *>
 *> \param[out] W
@@ -128,7 +128,7 @@
 *>          IHIZ is INTEGER
 *>           Specify the rows of Z to which transformations must be
 *>           applied if WANTZ is .TRUE..
-*>           1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N.
+*>           1 <= ILOZ <= ILO; IHI <= IHIZ <= N.
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -138,7 +138,7 @@
 *>           If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is
 *>           replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the
 *>           orthogonal Schur factor of H(ILO:IHI,ILO:IHI).
-*>           (The output value of Z when INFO.GT.0 is given under
+*>           (The output value of Z when INFO > 0 is given under
 *>           the description of INFO below.)
 *> \endverbatim
 *>
@@ -146,7 +146,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>           The leading dimension of the array Z.  if WANTZ is .TRUE.
-*>           then LDZ.GE.MAX(1,IHIZ).  Otherwize, LDZ.GE.1.
+*>           then LDZ >= MAX(1,IHIZ).  Otherwise, LDZ >= 1.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -159,7 +159,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>           The dimension of the array WORK.  LWORK .GE. max(1,N)
+*>           The dimension of the array WORK.  LWORK >= max(1,N)
 *>           is sufficient, but LWORK typically as large as 6*N may
 *>           be required for optimal performance.  A workspace query
 *>           to determine the optimal workspace size is recommended.
@@ -175,19 +175,19 @@
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
-*>             =  0:  successful exit
-*>           .GT. 0:  if INFO = i, ZLAQR0 failed to compute all of
+*>             = 0:  successful exit
+*>             > 0:  if INFO = i, ZLAQR0 failed to compute all of
 *>                the eigenvalues.  Elements 1:ilo-1 and i+1:n of WR
 *>                and WI contain those eigenvalues which have been
 *>                successfully computed.  (Failures are rare.)
 *>
-*>                If INFO .GT. 0 and WANT is .FALSE., then on exit,
+*>                If INFO > 0 and WANT is .FALSE., then on exit,
 *>                the remaining unconverged eigenvalues are the eigen-
 *>                values of the upper Hessenberg matrix rows and
 *>                columns ILO through INFO of the final, output
 *>                value of H.
 *>
-*>                If INFO .GT. 0 and WANTT is .TRUE., then on exit
+*>                If INFO > 0 and WANTT is .TRUE., then on exit
 *>
 *>           (*)  (initial value of H)*U  = U*(final value of H)
 *>
@@ -195,7 +195,7 @@
 *>                value of  H is upper Hessenberg and triangular in
 *>                rows and columns INFO+1 through IHI.
 *>
-*>                If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+*>                If INFO > 0 and WANTZ is .TRUE., then on exit
 *>
 *>                  (final value of Z(ILO:IHI,ILOZ:IHIZ)
 *>                   =  (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U
@@ -203,7 +203,7 @@
 *>                where U is the unitary matrix in (*) (regard-
 *>                less of the value of WANTT.)
 *>
-*>                If INFO .GT. 0 and WANTZ is .FALSE., then Z is not
+*>                If INFO > 0 and WANTZ is .FALSE., then Z is not
 *>                accessed.
 *> \endverbatim
 *
@@ -641,7 +641,7 @@
                   END IF
                END IF
 *
-*              ==== Use up to NS of the the smallest magnatiude
+*              ==== Use up to NS of the the smallest magnitude
 *              .    shifts.  If there aren't NS shifts available,
 *              .    then use them all, possibly dropping one to
 *              .    make the number of shifts even. ====
diff --git a/lapack-netlib/SRC/zlaqr1.f b/lapack-netlib/SRC/zlaqr1.f
index 34341cb10..fc2df3cb4 100644
--- a/lapack-netlib/SRC/zlaqr1.f
+++ b/lapack-netlib/SRC/zlaqr1.f
@@ -64,7 +64,7 @@
 *> \verbatim
 *>          LDH is INTEGER
 *>              The leading dimension of H as declared in
-*>              the calling procedure.  LDH.GE.N
+*>              the calling procedure.  LDH >= N
 *> \endverbatim
 *>
 *> \param[in] S1
diff --git a/lapack-netlib/SRC/zlaqr2.f b/lapack-netlib/SRC/zlaqr2.f
index e6e2ea48c..b5434e899 100644
--- a/lapack-netlib/SRC/zlaqr2.f
+++ b/lapack-netlib/SRC/zlaqr2.f
@@ -103,7 +103,7 @@
 *> \param[in] NW
 *> \verbatim
 *>          NW is INTEGER
-*>          Deflation window size.  1 .LE. NW .LE. (KBOT-KTOP+1).
+*>          Deflation window size.  1 <= NW <= (KBOT-KTOP+1).
 *> \endverbatim
 *>
 *> \param[in,out] H
@@ -121,7 +121,7 @@
 *> \verbatim
 *>          LDH is INTEGER
 *>          Leading dimension of H just as declared in the calling
-*>          subroutine.  N .LE. LDH
+*>          subroutine.  N <= LDH
 *> \endverbatim
 *>
 *> \param[in] ILOZ
@@ -133,7 +133,7 @@
 *> \verbatim
 *>          IHIZ is INTEGER
 *>          Specify the rows of Z to which transformations must be
-*>          applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N.
+*>          applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N.
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -149,7 +149,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>          The leading dimension of Z just as declared in the
-*>          calling subroutine.  1 .LE. LDZ.
+*>          calling subroutine.  1 <= LDZ.
 *> \endverbatim
 *>
 *> \param[out] NS
@@ -186,13 +186,13 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of V just as declared in the
-*>          calling subroutine.  NW .LE. LDV
+*>          calling subroutine.  NW <= LDV
 *> \endverbatim
 *>
 *> \param[in] NH
 *> \verbatim
 *>          NH is INTEGER
-*>          The number of columns of T.  NH.GE.NW.
+*>          The number of columns of T.  NH >= NW.
 *> \endverbatim
 *>
 *> \param[out] T
@@ -204,14 +204,14 @@
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of T just as declared in the
-*>          calling subroutine.  NW .LE. LDT
+*>          calling subroutine.  NW <= LDT
 *> \endverbatim
 *>
 *> \param[in] NV
 *> \verbatim
 *>          NV is INTEGER
 *>          The number of rows of work array WV available for
-*>          workspace.  NV.GE.NW.
+*>          workspace.  NV >= NW.
 *> \endverbatim
 *>
 *> \param[out] WV
@@ -223,7 +223,7 @@
 *> \verbatim
 *>          LDWV is INTEGER
 *>          The leading dimension of W just as declared in the
-*>          calling subroutine.  NW .LE. LDV
+*>          calling subroutine.  NW <= LDV
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/zlaqr3.f b/lapack-netlib/SRC/zlaqr3.f
index 64ab59f31..dfb798ca9 100644
--- a/lapack-netlib/SRC/zlaqr3.f
+++ b/lapack-netlib/SRC/zlaqr3.f
@@ -100,7 +100,7 @@
 *> \param[in] NW
 *> \verbatim
 *>          NW is INTEGER
-*>          Deflation window size.  1 .LE. NW .LE. (KBOT-KTOP+1).
+*>          Deflation window size.  1 <= NW <= (KBOT-KTOP+1).
 *> \endverbatim
 *>
 *> \param[in,out] H
@@ -118,7 +118,7 @@
 *> \verbatim
 *>          LDH is INTEGER
 *>          Leading dimension of H just as declared in the calling
-*>          subroutine.  N .LE. LDH
+*>          subroutine.  N <= LDH
 *> \endverbatim
 *>
 *> \param[in] ILOZ
@@ -130,7 +130,7 @@
 *> \verbatim
 *>          IHIZ is INTEGER
 *>          Specify the rows of Z to which transformations must be
-*>          applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N.
+*>          applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N.
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -146,7 +146,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>          The leading dimension of Z just as declared in the
-*>          calling subroutine.  1 .LE. LDZ.
+*>          calling subroutine.  1 <= LDZ.
 *> \endverbatim
 *>
 *> \param[out] NS
@@ -183,13 +183,13 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>          The leading dimension of V just as declared in the
-*>          calling subroutine.  NW .LE. LDV
+*>          calling subroutine.  NW <= LDV
 *> \endverbatim
 *>
 *> \param[in] NH
 *> \verbatim
 *>          NH is INTEGER
-*>          The number of columns of T.  NH.GE.NW.
+*>          The number of columns of T.  NH >= NW.
 *> \endverbatim
 *>
 *> \param[out] T
@@ -201,14 +201,14 @@
 *> \verbatim
 *>          LDT is INTEGER
 *>          The leading dimension of T just as declared in the
-*>          calling subroutine.  NW .LE. LDT
+*>          calling subroutine.  NW <= LDT
 *> \endverbatim
 *>
 *> \param[in] NV
 *> \verbatim
 *>          NV is INTEGER
 *>          The number of rows of work array WV available for
-*>          workspace.  NV.GE.NW.
+*>          workspace.  NV >= NW.
 *> \endverbatim
 *>
 *> \param[out] WV
@@ -220,7 +220,7 @@
 *> \verbatim
 *>          LDWV is INTEGER
 *>          The leading dimension of W just as declared in the
-*>          calling subroutine.  NW .LE. LDV
+*>          calling subroutine.  NW <= LDV
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/zlaqr4.f b/lapack-netlib/SRC/zlaqr4.f
index 012fa37e2..a88f6508e 100644
--- a/lapack-netlib/SRC/zlaqr4.f
+++ b/lapack-netlib/SRC/zlaqr4.f
@@ -73,7 +73,7 @@
 *> \param[in] N
 *> \verbatim
 *>          N is INTEGER
-*>           The order of the matrix H.  N .GE. 0.
+*>           The order of the matrix H.  N >= 0.
 *> \endverbatim
 *>
 *> \param[in] ILO
@@ -85,12 +85,12 @@
 *> \verbatim
 *>          IHI is INTEGER
 *>           It is assumed that H is already upper triangular in rows
-*>           and columns 1:ILO-1 and IHI+1:N and, if ILO.GT.1,
+*>           and columns 1:ILO-1 and IHI+1:N and, if ILO > 1,
 *>           H(ILO,ILO-1) is zero. ILO and IHI are normally set by a
 *>           previous call to ZGEBAL, and then passed to ZGEHRD when the
 *>           matrix output by ZGEBAL is reduced to Hessenberg form.
 *>           Otherwise, ILO and IHI should be set to 1 and N,
-*>           respectively.  If N.GT.0, then 1.LE.ILO.LE.IHI.LE.N.
+*>           respectively.  If N > 0, then 1 <= ILO <= IHI <= N.
 *>           If N = 0, then ILO = 1 and IHI = 0.
 *> \endverbatim
 *>
@@ -102,17 +102,17 @@
 *>           contains the upper triangular matrix T from the Schur
 *>           decomposition (the Schur form). If INFO = 0 and WANT is
 *>           .FALSE., then the contents of H are unspecified on exit.
-*>           (The output value of H when INFO.GT.0 is given under the
+*>           (The output value of H when INFO > 0 is given under the
 *>           description of INFO below.)
 *>
-*>           This subroutine may explicitly set H(i,j) = 0 for i.GT.j and
+*>           This subroutine may explicitly set H(i,j) = 0 for i > j and
 *>           j = 1, 2, ... ILO-1 or j = IHI+1, IHI+2, ... N.
 *> \endverbatim
 *>
 *> \param[in] LDH
 *> \verbatim
 *>          LDH is INTEGER
-*>           The leading dimension of the array H. LDH .GE. max(1,N).
+*>           The leading dimension of the array H. LDH >= max(1,N).
 *> \endverbatim
 *>
 *> \param[out] W
@@ -134,7 +134,7 @@
 *>          IHIZ is INTEGER
 *>           Specify the rows of Z to which transformations must be
 *>           applied if WANTZ is .TRUE..
-*>           1 .LE. ILOZ .LE. ILO; IHI .LE. IHIZ .LE. N.
+*>           1 <= ILOZ <= ILO; IHI <= IHIZ <= N.
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -144,7 +144,7 @@
 *>           If WANTZ is .TRUE., then Z(ILO:IHI,ILOZ:IHIZ) is
 *>           replaced by Z(ILO:IHI,ILOZ:IHIZ)*U where U is the
 *>           orthogonal Schur factor of H(ILO:IHI,ILO:IHI).
-*>           (The output value of Z when INFO.GT.0 is given under
+*>           (The output value of Z when INFO > 0 is given under
 *>           the description of INFO below.)
 *> \endverbatim
 *>
@@ -152,7 +152,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>           The leading dimension of the array Z.  if WANTZ is .TRUE.
-*>           then LDZ.GE.MAX(1,IHIZ).  Otherwize, LDZ.GE.1.
+*>           then LDZ >= MAX(1,IHIZ).  Otherwise, LDZ >= 1.
 *> \endverbatim
 *>
 *> \param[out] WORK
@@ -165,7 +165,7 @@
 *> \param[in] LWORK
 *> \verbatim
 *>          LWORK is INTEGER
-*>           The dimension of the array WORK.  LWORK .GE. max(1,N)
+*>           The dimension of the array WORK.  LWORK >= max(1,N)
 *>           is sufficient, but LWORK typically as large as 6*N may
 *>           be required for optimal performance.  A workspace query
 *>           to determine the optimal workspace size is recommended.
@@ -182,18 +182,18 @@
 *> \verbatim
 *>          INFO is INTEGER
 *>             =  0:  successful exit
-*>           .GT. 0:  if INFO = i, ZLAQR4 failed to compute all of
+*>             > 0:  if INFO = i, ZLAQR4 failed to compute all of
 *>                the eigenvalues.  Elements 1:ilo-1 and i+1:n of WR
 *>                and WI contain those eigenvalues which have been
 *>                successfully computed.  (Failures are rare.)
 *>
-*>                If INFO .GT. 0 and WANT is .FALSE., then on exit,
+*>                If INFO > 0 and WANT is .FALSE., then on exit,
 *>                the remaining unconverged eigenvalues are the eigen-
 *>                values of the upper Hessenberg matrix rows and
 *>                columns ILO through INFO of the final, output
 *>                value of H.
 *>
-*>                If INFO .GT. 0 and WANTT is .TRUE., then on exit
+*>                If INFO > 0 and WANTT is .TRUE., then on exit
 *>
 *>           (*)  (initial value of H)*U  = U*(final value of H)
 *>
@@ -201,7 +201,7 @@
 *>                value of  H is upper Hessenberg and triangular in
 *>                rows and columns INFO+1 through IHI.
 *>
-*>                If INFO .GT. 0 and WANTZ is .TRUE., then on exit
+*>                If INFO > 0 and WANTZ is .TRUE., then on exit
 *>
 *>                  (final value of Z(ILO:IHI,ILOZ:IHIZ)
 *>                   =  (initial value of Z(ILO:IHI,ILOZ:IHIZ)*U
@@ -209,7 +209,7 @@
 *>                where U is the unitary matrix in (*) (regard-
 *>                less of the value of WANTT.)
 *>
-*>                If INFO .GT. 0 and WANTZ is .FALSE., then Z is not
+*>                If INFO > 0 and WANTZ is .FALSE., then Z is not
 *>                accessed.
 *> \endverbatim
 *
@@ -641,7 +641,7 @@
                   END IF
                END IF
 *
-*              ==== Use up to NS of the the smallest magnatiude
+*              ==== Use up to NS of the the smallest magnitude
 *              .    shifts.  If there aren't NS shifts available,
 *              .    then use them all, possibly dropping one to
 *              .    make the number of shifts even. ====
diff --git a/lapack-netlib/SRC/zlaqr5.f b/lapack-netlib/SRC/zlaqr5.f
index 0dfbce82c..9ff7e7eca 100644
--- a/lapack-netlib/SRC/zlaqr5.f
+++ b/lapack-netlib/SRC/zlaqr5.f
@@ -125,7 +125,7 @@
 *> \verbatim
 *>          LDH is INTEGER
 *>             LDH is the leading dimension of H just as declared in the
-*>             calling procedure.  LDH.GE.MAX(1,N).
+*>             calling procedure.  LDH >= MAX(1,N).
 *> \endverbatim
 *>
 *> \param[in] ILOZ
@@ -137,7 +137,7 @@
 *> \verbatim
 *>          IHIZ is INTEGER
 *>             Specify the rows of Z to which transformations must be
-*>             applied if WANTZ is .TRUE.. 1 .LE. ILOZ .LE. IHIZ .LE. N
+*>             applied if WANTZ is .TRUE.. 1 <= ILOZ <= IHIZ <= N
 *> \endverbatim
 *>
 *> \param[in,out] Z
@@ -153,7 +153,7 @@
 *> \verbatim
 *>          LDZ is INTEGER
 *>             LDA is the leading dimension of Z just as declared in
-*>             the calling procedure. LDZ.GE.N.
+*>             the calling procedure. LDZ >= N.
 *> \endverbatim
 *>
 *> \param[out] V
@@ -165,7 +165,7 @@
 *> \verbatim
 *>          LDV is INTEGER
 *>             LDV is the leading dimension of V as declared in the
-*>             calling procedure.  LDV.GE.3.
+*>             calling procedure.  LDV >= 3.
 *> \endverbatim
 *>
 *> \param[out] U
@@ -177,33 +177,14 @@
 *> \verbatim
 *>          LDU is INTEGER
 *>             LDU is the leading dimension of U just as declared in the
-*>             in the calling subroutine.  LDU.GE.3*NSHFTS-3.
-*> \endverbatim
-*>
-*> \param[in] NH
-*> \verbatim
-*>          NH is INTEGER
-*>             NH is the number of columns in array WH available for
-*>             workspace. NH.GE.1.
-*> \endverbatim
-*>
-*> \param[out] WH
-*> \verbatim
-*>          WH is COMPLEX*16 array, dimension (LDWH,NH)
-*> \endverbatim
-*>
-*> \param[in] LDWH
-*> \verbatim
-*>          LDWH is INTEGER
-*>             Leading dimension of WH just as declared in the
-*>             calling procedure.  LDWH.GE.3*NSHFTS-3.
+*>             in the calling subroutine.  LDU >= 3*NSHFTS-3.
 *> \endverbatim
 *>
 *> \param[in] NV
 *> \verbatim
 *>          NV is INTEGER
 *>             NV is the number of rows in WV agailable for workspace.
-*>             NV.GE.1.
+*>             NV >= 1.
 *> \endverbatim
 *>
 *> \param[out] WV
@@ -215,9 +196,28 @@
 *> \verbatim
 *>          LDWV is INTEGER
 *>             LDWV is the leading dimension of WV as declared in the
-*>             in the calling subroutine.  LDWV.GE.NV.
+*>             in the calling subroutine.  LDWV >= NV.
 *> \endverbatim
 *
+*> \param[in] NH
+*> \verbatim
+*>          NH is INTEGER
+*>             NH is the number of columns in array WH available for
+*>             workspace. NH >= 1.
+*> \endverbatim
+*>
+*> \param[out] WH
+*> \verbatim
+*>          WH is COMPLEX*16 array, dimension (LDWH,NH)
+*> \endverbatim
+*>
+*> \param[in] LDWH
+*> \verbatim
+*>          LDWH is INTEGER
+*>             Leading dimension of WH just as declared in the
+*>             calling procedure.  LDWH >= 3*NSHFTS-3.
+*> \endverbatim
+*>
 *  Authors:
 *  ========
 *
diff --git a/lapack-netlib/SRC/zlarfb.f b/lapack-netlib/SRC/zlarfb.f
index b4a2b4d1a..3da49f2fc 100644
--- a/lapack-netlib/SRC/zlarfb.f
+++ b/lapack-netlib/SRC/zlarfb.f
@@ -92,6 +92,8 @@
 *>          K is INTEGER
 *>          The order of the matrix T (= the number of elementary
 *>          reflectors whose product defines the block reflector).
+*>          If SIDE = 'L', M >= K >= 0;
+*>          if SIDE = 'R', N >= K >= 0.
 *> \endverbatim
 *>
 *> \param[in] V
diff --git a/lapack-netlib/SRC/zlarfx.f b/lapack-netlib/SRC/zlarfx.f
index 685d164eb..ba6d4ed74 100644
--- a/lapack-netlib/SRC/zlarfx.f
+++ b/lapack-netlib/SRC/zlarfx.f
@@ -94,7 +94,7 @@
 *> \param[in] LDC
 *> \verbatim
 *>          LDC is INTEGER
-*>          The leading dimension of the array C. LDA >= max(1,M).
+*>          The leading dimension of the array C. LDC >= max(1,M).
 *> \endverbatim
 *>
 *> \param[out] WORK
diff --git a/lapack-netlib/SRC/zlarfy.f b/lapack-netlib/SRC/zlarfy.f
index 57605731b..4c9e08bac 100644
--- a/lapack-netlib/SRC/zlarfy.f
+++ b/lapack-netlib/SRC/zlarfy.f
@@ -103,7 +103,7 @@
 *
 *> \date December 2016
 *
-*> \ingroup complex16_eig
+*> \ingroup complex16OTHERauxiliary
 *
 *  =====================================================================
       SUBROUTINE ZLARFY( UPLO, N, V, INCV, TAU, C, LDC, WORK )
diff --git a/lapack-netlib/SRC/zlarrv.f b/lapack-netlib/SRC/zlarrv.f
index 67a67584c..23976dbef 100644
--- a/lapack-netlib/SRC/zlarrv.f
+++ b/lapack-netlib/SRC/zlarrv.f
@@ -143,7 +143,7 @@
 *>          RTOL2 is DOUBLE PRECISION
 *>           Parameters for bisection.
 *>           An interval [LEFT,RIGHT] has converged if
-*>           RIGHT-LEFT.LT.MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
+*>           RIGHT-LEFT < MAX( RTOL1*GAP, RTOL2*MAX(|LEFT|,|RIGHT|) )
 *> \endverbatim
 *>
 *> \param[in,out] W
diff --git a/lapack-netlib/SRC/zlassq.f b/lapack-netlib/SRC/zlassq.f
index fd13811bd..dccec988d 100644
--- a/lapack-netlib/SRC/zlassq.f
+++ b/lapack-netlib/SRC/zlassq.f
@@ -41,7 +41,7 @@
 *> where x( i ) = abs( X( 1 + ( i - 1 )*INCX ) ). The value of sumsq is
 *> assumed to be at least unity and the value of ssq will then satisfy
 *>
-*>    1.0 .le. ssq .le. ( sumsq + 2*n ).
+*>    1.0 <= ssq <= ( sumsq + 2*n ).
 *>
 *> scale is assumed to be non-negative and scl returns the value
 *>
@@ -65,7 +65,7 @@
 *>
 *> \param[in] X
 *> \verbatim
-*>          X is COMPLEX*16 array, dimension (N)
+*>          X is COMPLEX*16 array, dimension (1+(N-1)*INCX)
 *>          The vector x as described above.
 *>             x( i )  = X( 1 + ( i - 1 )*INCX ), 1 <= i <= n.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zlaswlq.f b/lapack-netlib/SRC/zlaswlq.f
index 24dd41d79..990630925 100644
--- a/lapack-netlib/SRC/zlaswlq.f
+++ b/lapack-netlib/SRC/zlaswlq.f
@@ -1,3 +1,4 @@
+*> \brief \b ZLASWLQ
 *
 *  Definition:
 *  ===========
@@ -18,9 +19,20 @@
 *>
 *> \verbatim
 *>
-*>          ZLASWLQ computes a blocked Short-Wide LQ factorization of a
-*>          M-by-N matrix A, where N >= M:
-*>          A = L * Q
+*> ZLASWLQ computes a blocked Tall-Skinny LQ factorization of
+*> a complexx M-by-N matrix A for M <= N:
+*>
+*>    A = ( L 0 ) *  Q,
+*>
+*> where:
+*>
+*>    Q is a n-by-N orthogonal matrix, stored on exit in an implicit
+*>    form in the elements above the digonal of the array A and in
+*>    the elemenst of the array T;
+*>    L is an lower-triangular M-by-M matrix stored on exit in
+*>    the elements on and below the diagonal of the array A.
+*>    0 is a M-by-(N-M) zero matrix, if M < N, and is not stored.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -150,7 +162,7 @@
       SUBROUTINE ZLASWLQ( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
      $                  INFO)
 *
-*  -- LAPACK computational routine (version 3.7.1) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
 *     June 2017
diff --git a/lapack-netlib/SRC/zlasyf_aa.f b/lapack-netlib/SRC/zlasyf_aa.f
index f321b72de..b1f1c2790 100644
--- a/lapack-netlib/SRC/zlasyf_aa.f
+++ b/lapack-netlib/SRC/zlasyf_aa.f
@@ -284,8 +284,9 @@
 *
 *              Swap A(I1, I2+1:M) with A(I2, I2+1:M)
 *
-               CALL ZSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA,
-     $                           A( J1+I2-1, I2+1 ), LDA )
+               IF( I2.LT.M )
+     $            CALL ZSWAP( M-I2, A( J1+I1-1, I2+1 ), LDA,
+     $                              A( J1+I2-1, I2+1 ), LDA )
 *
 *              Swap A(I1, I1) with A(I2,I2)
 *
@@ -325,13 +326,15 @@
 *           Compute L(J+2, J+1) = WORK( 3:M ) / T(J, J+1),
 *            where A(J, J+1) = T(J, J+1) and A(J+2:M, J) = L(J+2:M, J+1)
 *
-            IF( A( K, J+1 ).NE.ZERO ) THEN
-               ALPHA = ONE / A( K, J+1 )
-               CALL ZCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA )
-               CALL ZSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA )
-            ELSE
-               CALL ZLASET( 'Full', 1, M-J-1, ZERO, ZERO,
-     $                      A( K, J+2 ), LDA)
+            IF( J.LT.(M-1) ) THEN
+               IF( A( K, J+1 ).NE.ZERO ) THEN
+                  ALPHA = ONE / A( K, J+1 )
+                  CALL ZCOPY( M-J-1, WORK( 3 ), 1, A( K, J+2 ), LDA )
+                  CALL ZSCAL( M-J-1, ALPHA, A( K, J+2 ), LDA )
+               ELSE
+                  CALL ZLASET( 'Full', 1, M-J-1, ZERO, ZERO,
+     $                         A( K, J+2 ), LDA)
+               END IF
             END IF
          END IF
          J = J + 1
@@ -432,8 +435,9 @@
 *
 *              Swap A(I2+1:M, I1) with A(I2+1:M, I2)
 *
-               CALL ZSWAP( M-I2, A( I2+1, J1+I1-1 ), 1,
-     $                           A( I2+1, J1+I2-1 ), 1 )
+               IF( I2.LT.M )
+     $            CALL ZSWAP( M-I2, A( I2+1, J1+I1-1 ), 1,
+     $                              A( I2+1, J1+I2-1 ), 1 )
 *
 *              Swap A(I1, I1) with A(I2, I2)
 *
@@ -473,13 +477,15 @@
 *           Compute L(J+2, J+1) = WORK( 3:M ) / T(J, J+1),
 *            where A(J, J+1) = T(J, J+1) and A(J+2:M, J) = L(J+2:M, J+1)
 *
-            IF( A( J+1, K ).NE.ZERO ) THEN
-               ALPHA = ONE / A( J+1, K )
-               CALL ZCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 )
-               CALL ZSCAL( M-J-1, ALPHA, A( J+2, K ), 1 )
-            ELSE
-               CALL ZLASET( 'Full', M-J-1, 1, ZERO, ZERO,
-     $                      A( J+2, K ), LDA )
+            IF( J.LT.(M-1) ) THEN
+               IF( A( J+1, K ).NE.ZERO ) THEN
+                  ALPHA = ONE / A( J+1, K )
+                  CALL ZCOPY( M-J-1, WORK( 3 ), 1, A( J+2, K ), 1 )
+                  CALL ZSCAL( M-J-1, ALPHA, A( J+2, K ), 1 )
+               ELSE
+                  CALL ZLASET( 'Full', M-J-1, 1, ZERO, ZERO,
+     $                         A( J+2, K ), LDA )
+               END IF
             END IF
          END IF
          J = J + 1
diff --git a/lapack-netlib/SRC/zlasyf_rk.f b/lapack-netlib/SRC/zlasyf_rk.f
index 664ed93f3..b6c5a27c6 100644
--- a/lapack-netlib/SRC/zlasyf_rk.f
+++ b/lapack-netlib/SRC/zlasyf_rk.f
@@ -330,7 +330,7 @@
 *        of A and working backwards, and compute the matrix W = U12*D
 *        for use in updating A11
 *
-*        Initilize the first entry of array E, where superdiagonal
+*        Initialize the first entry of array E, where superdiagonal
 *        elements of D are stored
 *
          E( 1 ) = CZERO
@@ -658,7 +658,7 @@
 *        of A and working forwards, and compute the matrix W = L21*D
 *        for use in updating A22
 *
-*        Initilize the unused last entry of the subdiagonal array E.
+*        Initialize the unused last entry of the subdiagonal array E.
 *
          E( N ) = CZERO
 *
diff --git a/lapack-netlib/SRC/zlatdf.f b/lapack-netlib/SRC/zlatdf.f
index ab88570c5..4b8b5e330 100644
--- a/lapack-netlib/SRC/zlatdf.f
+++ b/lapack-netlib/SRC/zlatdf.f
@@ -261,7 +261,7 @@
 *
 *        Solve for U- part, lockahead for RHS(N) = +-1. This is not done
 *        In BSOLVE and will hopefully give us a better estimate because
-*        any ill-conditioning of the original matrix is transfered to U
+*        any ill-conditioning of the original matrix is transferred to U
 *        and not to L. U(N, N) is an approximation to sigma_min(LU).
 *
          CALL ZCOPY( N-1, RHS, 1, WORK, 1 )
diff --git a/lapack-netlib/SRC/zlatsqr.f b/lapack-netlib/SRC/zlatsqr.f
index 1fdf3be24..0f98cae93 100644
--- a/lapack-netlib/SRC/zlatsqr.f
+++ b/lapack-netlib/SRC/zlatsqr.f
@@ -1,3 +1,4 @@
+*> \brief \b ZLATSQR
 *
 *  Definition:
 *  ===========
@@ -18,9 +19,23 @@
 *>
 *> \verbatim
 *>
-*> SLATSQR computes a blocked Tall-Skinny QR factorization of
-*> an M-by-N matrix A, where M >= N:
-*> A = Q * R .
+*> ZLATSQR computes a blocked Tall-Skinny QR factorization of
+*> a complex M-by-N matrix A for M >= N:
+*>
+*>    A = Q * ( R ),
+*>            ( 0 )
+*>
+*> where:
+*>
+*>    Q is a M-by-M orthogonal matrix, stored on exit in an implicit
+*>    form in the elements below the digonal of the array A and in
+*>    the elemenst of the array T;
+*>
+*>    R is an upper-triangular N-by-N matrix, stored on exit in
+*>    the elements on and above the diagonal of the array A.
+*>
+*>    0 is a (M-N)-by-N zero matrix, and is not stored.
+*>
 *> \endverbatim
 *
 *  Arguments:
@@ -149,10 +164,10 @@
       SUBROUTINE ZLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK,
      $                    LWORK, INFO)
 *
-*  -- LAPACK computational routine (version 3.7.0) --
+*  -- LAPACK computational routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd. --
-*     December 2016
+*     November 2019
 *
 *     .. Scalar Arguments ..
       INTEGER           INFO, LDA, M, N, MB, NB, LDT, LWORK
diff --git a/lapack-netlib/SRC/zlaunhr_col_getrfnp.f b/lapack-netlib/SRC/zlaunhr_col_getrfnp.f
new file mode 100644
index 000000000..0ab7f0349
--- /dev/null
+++ b/lapack-netlib/SRC/zlaunhr_col_getrfnp.f
@@ -0,0 +1,248 @@
+*> \brief \b ZLAUNHR_COL_GETRFNP
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZLAUNHR_COL_GETRFNP + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zlaunhr_col_getrfnp.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zlaunhr_col_getrfnp.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zlaunhr_col_getrfnp.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZLAUNHR_COL_GETRFNP( M, N, A, LDA, D, INFO )
+*
+*       .. Scalar Arguments ..
+*       INTEGER            INFO, LDA, M, N
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16         A( LDA, * ), D( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLAUNHR_COL_GETRFNP computes the modified LU factorization without
+*> pivoting of a complex general M-by-N matrix A. The factorization has
+*> the form:
+*>
+*>     A - S = L * U,
+*>
+*> where:
+*>    S is a m-by-n diagonal sign matrix with the diagonal D, so that
+*>    D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+*>    as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+*>    i-1 steps of Gaussian elimination. This means that the diagonal
+*>    element at each step of "modified" Gaussian elimination is
+*>    at least one in absolute value (so that division-by-zero not
+*>    not possible during the division by the diagonal element);
+*>
+*>    L is a M-by-N lower triangular matrix with unit diagonal elements
+*>    (lower trapezoidal if M > N);
+*>
+*>    and U is a M-by-N upper triangular matrix
+*>    (upper trapezoidal if M < N).
+*>
+*> This routine is an auxiliary routine used in the Householder
+*> reconstruction routine ZUNHR_COL. In ZUNHR_COL, this routine is
+*> applied to an M-by-N matrix A with orthonormal columns, where each
+*> element is bounded by one in absolute value. With the choice of
+*> the matrix S above, one can show that the diagonal element at each
+*> step of Gaussian elimination is the largest (in absolute value) in
+*> the column on or below the diagonal, so that no pivoting is required
+*> for numerical stability [1].
+*>
+*> For more details on the Householder reconstruction algorithm,
+*> including the modified LU factorization, see [1].
+*>
+*> This is the blocked right-looking version of the algorithm,
+*> calling Level 3 BLAS to update the submatrix. To factorize a block,
+*> this routine calls the recursive routine ZLAUNHR_COL_GETRFNP2.
+*>
+*> [1] "Reconstructing Householder vectors from tall-skinny QR",
+*>     G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+*>     E. Solomonik, J. Parallel Distrib. Comput.,
+*>     vol. 85, pp. 3-31, 2015.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix to be factored.
+*>          On exit, the factors L and U from the factorization
+*>          A-S=L*U; the unit diagonal elements of L are not stored.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is COMPLEX*16 array, dimension min(M,N)
+*>          The diagonal elements of the diagonal M-by-N sign matrix S,
+*>          D(i) = S(i,i), where 1 <= i <= min(M,N). The elements can be
+*>          only ( +1.0, 0.0 ) or (-1.0, 0.0 ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup complex16GEcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2019, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZLAUNHR_COL_GETRFNP( M, N, A, LDA, D, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, LDA, M, N
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), D( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX*16         CONE
+      PARAMETER          ( CONE = ( 1.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            IINFO, J, JB, NB
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZGEMM, ZLAUNHR_COL_GETRFNP2, ZTRSM, XERBLA
+*     ..
+*     .. External Functions ..
+      INTEGER            ILAENV
+      EXTERNAL           ILAENV
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters.
+*
+      INFO = 0
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -4
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZLAUNHR_COL_GETRFNP', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 )
+     $   RETURN
+*
+*     Determine the block size for this environment.
+*
+
+      NB = ILAENV( 1, 'ZLAUNHR_COL_GETRFNP', ' ', M, N, -1, -1 )
+
+      IF( NB.LE.1 .OR. NB.GE.MIN( M, N ) ) THEN
+*
+*        Use unblocked code.
+*
+         CALL ZLAUNHR_COL_GETRFNP2( M, N, A, LDA, D, INFO )
+      ELSE
+*
+*        Use blocked code.
+*
+         DO J = 1, MIN( M, N ), NB
+            JB = MIN( MIN( M, N )-J+1, NB )
+*
+*           Factor diagonal and subdiagonal blocks.
+*
+            CALL ZLAUNHR_COL_GETRFNP2( M-J+1, JB, A( J, J ), LDA,
+     $                                 D( J ), IINFO )
+*
+            IF( J+JB.LE.N ) THEN
+*
+*              Compute block row of U.
+*
+               CALL ZTRSM( 'Left', 'Lower', 'No transpose', 'Unit', JB,
+     $                     N-J-JB+1, CONE, A( J, J ), LDA, A( J, J+JB ),
+     $                     LDA )
+               IF( J+JB.LE.M ) THEN
+*
+*                 Update trailing submatrix.
+*
+                  CALL ZGEMM( 'No transpose', 'No transpose', M-J-JB+1,
+     $                        N-J-JB+1, JB, -CONE, A( J+JB, J ), LDA,
+     $                        A( J, J+JB ), LDA, CONE, A( J+JB, J+JB ),
+     $                        LDA )
+               END IF
+            END IF
+         END DO
+      END IF
+      RETURN
+*
+*     End of ZLAUNHR_COL_GETRFNP
+*
+      END
diff --git a/lapack-netlib/SRC/zlaunhr_col_getrfnp2.f b/lapack-netlib/SRC/zlaunhr_col_getrfnp2.f
new file mode 100644
index 000000000..0057e430d
--- /dev/null
+++ b/lapack-netlib/SRC/zlaunhr_col_getrfnp2.f
@@ -0,0 +1,314 @@
+*> \brief \b ZLAUNHR_COL_GETRFNP2
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZLAUNHR_COL_GETRFNP2 + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zlaunhr_col_getrfnp2.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zlaunhr_col_getrfnp2.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zlaunhr_col_getrfnp2.f">
+*> [TXT]</a>
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       RECURSIVE SUBROUTINE ZLAUNHR_COL_GETRFNP2( M, N, A, LDA, D, INFO )
+*
+*       .. Scalar Arguments ..
+*       INTEGER            INFO, LDA, M, N
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16         A( LDA, * ), D( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZLAUNHR_COL_GETRFNP2 computes the modified LU factorization without
+*> pivoting of a complex general M-by-N matrix A. The factorization has
+*> the form:
+*>
+*>     A - S = L * U,
+*>
+*> where:
+*>    S is a m-by-n diagonal sign matrix with the diagonal D, so that
+*>    D(i) = S(i,i), 1 <= i <= min(M,N). The diagonal D is constructed
+*>    as D(i)=-SIGN(A(i,i)), where A(i,i) is the value after performing
+*>    i-1 steps of Gaussian elimination. This means that the diagonal
+*>    element at each step of "modified" Gaussian elimination is at
+*>    least one in absolute value (so that division-by-zero not
+*>    possible during the division by the diagonal element);
+*>
+*>    L is a M-by-N lower triangular matrix with unit diagonal elements
+*>    (lower trapezoidal if M > N);
+*>
+*>    and U is a M-by-N upper triangular matrix
+*>    (upper trapezoidal if M < N).
+*>
+*> This routine is an auxiliary routine used in the Householder
+*> reconstruction routine ZUNHR_COL. In ZUNHR_COL, this routine is
+*> applied to an M-by-N matrix A with orthonormal columns, where each
+*> element is bounded by one in absolute value. With the choice of
+*> the matrix S above, one can show that the diagonal element at each
+*> step of Gaussian elimination is the largest (in absolute value) in
+*> the column on or below the diagonal, so that no pivoting is required
+*> for numerical stability [1].
+*>
+*> For more details on the Householder reconstruction algorithm,
+*> including the modified LU factorization, see [1].
+*>
+*> This is the recursive version of the LU factorization algorithm.
+*> Denote A - S by B. The algorithm divides the matrix B into four
+*> submatrices:
+*>
+*>        [  B11 | B12  ]  where B11 is n1 by n1,
+*>    B = [ -----|----- ]        B21 is (m-n1) by n1,
+*>        [  B21 | B22  ]        B12 is n1 by n2,
+*>                               B22 is (m-n1) by n2,
+*>                               with n1 = min(m,n)/2, n2 = n-n1.
+*>
+*>
+*> The subroutine calls itself to factor B11, solves for B21,
+*> solves for B12, updates B22, then calls itself to factor B22.
+*>
+*> For more details on the recursive LU algorithm, see [2].
+*>
+*> ZLAUNHR_COL_GETRFNP2 is called to factorize a block by the blocked
+*> routine ZLAUNHR_COL_GETRFNP, which uses blocked code calling
+*. Level 3 BLAS to update the submatrix. However, ZLAUNHR_COL_GETRFNP2
+*> is self-sufficient and can be used without ZLAUNHR_COL_GETRFNP.
+*>
+*> [1] "Reconstructing Householder vectors from tall-skinny QR",
+*>     G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+*>     E. Solomonik, J. Parallel Distrib. Comput.,
+*>     vol. 85, pp. 3-31, 2015.
+*>
+*> [2] "Recursion leads to automatic variable blocking for dense linear
+*>     algebra algorithms", F. Gustavson, IBM J. of Res. and Dev.,
+*>     vol. 41, no. 6, pp. 737-755, 1997.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A.  N >= 0.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N)
+*>          On entry, the M-by-N matrix to be factored.
+*>          On exit, the factors L and U from the factorization
+*>          A-S=L*U; the unit diagonal elements of L are not stored.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is COMPLEX*16 array, dimension min(M,N)
+*>          The diagonal elements of the diagonal M-by-N sign matrix S,
+*>          D(i) = S(i,i), where 1 <= i <= min(M,N). The elements can be
+*>          only ( +1.0, 0.0 ) or (-1.0, 0.0 ).
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup complex16GEcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2019, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      RECURSIVE SUBROUTINE ZLAUNHR_COL_GETRFNP2( M, N, A, LDA, D, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, LDA, M, N
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16         A( LDA, * ), D( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE
+      PARAMETER          ( ONE = 1.0D+0 )
+      COMPLEX*16         CONE
+      PARAMETER          ( CONE = ( 1.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      DOUBLE PRECISION   SFMIN
+      INTEGER            I, IINFO, N1, N2
+      COMPLEX*16         Z
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION   DLAMCH
+      EXTERNAL           DLAMCH
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZGEMM, ZSCAL, ZTRSM, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          ABS, DBLE, DCMPLX, DIMAG, DSIGN, MAX, MIN
+*     ..
+*     .. Statement Functions ..
+      DOUBLE PRECISION   CABS1
+*     ..
+*     .. Statement Function definitions ..
+      CABS1( Z ) = ABS( DBLE( Z ) ) + ABS( DIMAG( Z ) )
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters
+*
+      INFO = 0
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 ) THEN
+         INFO = -2
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -4
+      END IF
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZLAUNHR_COL_GETRFNP2', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 )
+     $   RETURN
+
+      IF ( M.EQ.1 ) THEN
+*
+*        One row case, (also recursion termination case),
+*        use unblocked code
+*
+*        Transfer the sign
+*
+         D( 1 ) = DCMPLX( -DSIGN( ONE, DBLE( A( 1, 1 ) ) ) )
+*
+*        Construct the row of U
+*
+         A( 1, 1 ) = A( 1, 1 ) - D( 1 )
+*
+      ELSE IF( N.EQ.1 ) THEN
+*
+*        One column case, (also recursion termination case),
+*        use unblocked code
+*
+*        Transfer the sign
+*
+         D( 1 ) = DCMPLX( -DSIGN( ONE, DBLE( A( 1, 1 ) ) ) )
+*
+*        Construct the row of U
+*
+         A( 1, 1 ) = A( 1, 1 ) - D( 1 )
+*
+*        Scale the elements 2:M of the column
+*
+*        Determine machine safe minimum
+*
+         SFMIN = DLAMCH('S')
+*
+*        Construct the subdiagonal elements of L
+*
+         IF( CABS1( A( 1, 1 ) ) .GE. SFMIN ) THEN
+            CALL ZSCAL( M-1, CONE / A( 1, 1 ), A( 2, 1 ), 1 )
+         ELSE
+            DO I = 2, M
+               A( I, 1 ) = A( I, 1 ) / A( 1, 1 )
+            END DO
+         END IF
+*
+      ELSE
+*
+*        Divide the matrix B into four submatrices
+*
+         N1 = MIN( M, N ) / 2
+         N2 = N-N1
+
+*
+*        Factor B11, recursive call
+*
+         CALL ZLAUNHR_COL_GETRFNP2( N1, N1, A, LDA, D, IINFO )
+*
+*        Solve for B21
+*
+         CALL ZTRSM( 'R', 'U', 'N', 'N', M-N1, N1, CONE, A, LDA,
+     $               A( N1+1, 1 ), LDA )
+*
+*        Solve for B12
+*
+         CALL ZTRSM( 'L', 'L', 'N', 'U', N1, N2, CONE, A, LDA,
+     $               A( 1, N1+1 ), LDA )
+*
+*        Update B22, i.e. compute the Schur complement
+*        B22 := B22 - B21*B12
+*
+         CALL ZGEMM( 'N', 'N', M-N1, N2, N1, -CONE, A( N1+1, 1 ), LDA,
+     $               A( 1, N1+1 ), LDA, CONE, A( N1+1, N1+1 ), LDA )
+*
+*        Factor B22, recursive call
+*
+         CALL ZLAUNHR_COL_GETRFNP2( M-N1, N2, A( N1+1, N1+1 ), LDA,
+     $                              D( N1+1 ), IINFO )
+*
+      END IF
+      RETURN
+*
+*     End of ZLAUNHR_COL_GETRFNP2
+*
+      END
diff --git a/lapack-netlib/SRC/zporfsx.f b/lapack-netlib/SRC/zporfsx.f
index ee8cfbc6a..bbff4331e 100644
--- a/lapack-netlib/SRC/zporfsx.f
+++ b/lapack-netlib/SRC/zporfsx.f
@@ -44,7 +44,7 @@
 *> \verbatim
 *>
 *>    ZPORFSX improves the computed solution to a system of linear
-*>    equations when the coefficient matrix is symmetric positive
+*>    equations when the coefficient matrix is Hermitian positive
 *>    definite, and provides error bounds and backward error estimates
 *>    for the solution.  In addition to normwise error bound, the code
 *>    provides maximum componentwise error bound if possible.  See
@@ -103,7 +103,7 @@
 *> \param[in] A
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
-*>     The symmetric matrix A.  If UPLO = 'U', the leading N-by-N
+*>     The Hermitian matrix A.  If UPLO = 'U', the leading N-by-N
 *>     upper triangular part of A contains the upper triangular part
 *>     of the matrix A, and the strictly lower triangular part of A
 *>     is not referenced.  If UPLO = 'L', the leading N-by-N lower
@@ -134,7 +134,7 @@
 *> \param[in,out] S
 *> \verbatim
 *>          S is DOUBLE PRECISION array, dimension (N)
-*>     The row scale factors for A.  If EQUED = 'Y', A is multiplied on
+*>     The scale factors for A.  If EQUED = 'Y', A is multiplied on
 *>     the left and right by diag(S).  S is an input argument if FACT =
 *>     'F'; otherwise, S is an output argument.  If FACT = 'F' and EQUED
 *>     = 'Y', each element of S must be positive.  If S is output, each
@@ -262,7 +262,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -298,14 +298,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -313,9 +313,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/zposvxx.f b/lapack-netlib/SRC/zposvxx.f
index 8126f14be..913d16cb2 100644
--- a/lapack-netlib/SRC/zposvxx.f
+++ b/lapack-netlib/SRC/zposvxx.f
@@ -45,7 +45,7 @@
 *>
 *>    ZPOSVXX uses the Cholesky factorization A = U**T*U or A = L*L**T
 *>    to compute the solution to a complex*16 system of linear equations
-*>    A * X = B, where A is an N-by-N symmetric positive definite matrix
+*>    A * X = B, where A is an N-by-N Hermitian positive definite matrix
 *>    and X and B are N-by-NRHS matrices.
 *>
 *>    If requested, both normwise and maximum componentwise error bounds
@@ -157,7 +157,7 @@
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
-*>     On entry, the symmetric matrix A, except if FACT = 'F' and EQUED =
+*>     On entry, the Hermitian matrix A, except if FACT = 'F' and EQUED =
 *>     'Y', then A must contain the equilibrated matrix
 *>     diag(S)*A*diag(S).  If UPLO = 'U', the leading N-by-N upper
 *>     triangular part of A contains the upper triangular part of the
@@ -365,7 +365,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -401,14 +401,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -416,9 +416,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the extra-precise refinement algorithm.
+*>            = 1.0:  Use the extra-precise refinement algorithm.
 *>              (other values are reserved for future use)
 *>
 *>       PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual
diff --git a/lapack-netlib/SRC/zpotrf2.f b/lapack-netlib/SRC/zpotrf2.f
index e37c9f6d6..85c434d47 100644
--- a/lapack-netlib/SRC/zpotrf2.f
+++ b/lapack-netlib/SRC/zpotrf2.f
@@ -24,7 +24,7 @@
 *>
 *> \verbatim
 *>
-*> ZPOTRF2 computes the Cholesky factorization of a real symmetric
+*> ZPOTRF2 computes the Cholesky factorization of a Hermitian
 *> positive definite matrix A using the recursive algorithm.
 *>
 *> The factorization has the form
@@ -63,7 +63,7 @@
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
-*>          On entry, the symmetric matrix A.  If UPLO = 'U', the leading
+*>          On entry, the Hermitian matrix A.  If UPLO = 'U', the leading
 *>          N-by-N upper triangular part of A contains the upper
 *>          triangular part of the matrix A, and the strictly lower
 *>          triangular part of A is not referenced.  If UPLO = 'L', the
diff --git a/lapack-netlib/SRC/zstemr.f b/lapack-netlib/SRC/zstemr.f
index ac7552a6a..8685542de 100644
--- a/lapack-netlib/SRC/zstemr.f
+++ b/lapack-netlib/SRC/zstemr.f
@@ -250,13 +250,13 @@
 *> \param[in,out] TRYRAC
 *> \verbatim
 *>          TRYRAC is LOGICAL
-*>          If TRYRAC.EQ..TRUE., indicates that the code should check whether
+*>          If TRYRAC = .TRUE., indicates that the code should check whether
 *>          the tridiagonal matrix defines its eigenvalues to high relative
 *>          accuracy.  If so, the code uses relative-accuracy preserving
 *>          algorithms that might be (a bit) slower depending on the matrix.
 *>          If the matrix does not define its eigenvalues to high relative
 *>          accuracy, the code can uses possibly faster algorithms.
-*>          If TRYRAC.EQ..FALSE., the code is not required to guarantee
+*>          If TRYRAC = .FALSE., the code is not required to guarantee
 *>          relatively accurate eigenvalues and can use the fastest possible
 *>          techniques.
 *>          On exit, a .TRUE. TRYRAC will be set to .FALSE. if the matrix
diff --git a/lapack-netlib/SRC/zsycon_3.f b/lapack-netlib/SRC/zsycon_3.f
index 856845960..33bd23849 100644
--- a/lapack-netlib/SRC/zsycon_3.f
+++ b/lapack-netlib/SRC/zsycon_3.f
@@ -19,7 +19,7 @@
 *  ===========
 *
 *       SUBROUTINE ZSYCON_3( UPLO, N, A, LDA, E, IPIV, ANORM, RCOND,
-*                            WORK, IWORK, INFO )
+*                            WORK, INFO )
 *
 *       .. Scalar Arguments ..
 *       CHARACTER          UPLO
@@ -27,7 +27,7 @@
 *       DOUBLE PRECISION   ANORM, RCOND
 *       ..
 *       .. Array Arguments ..
-*       INTEGER            IPIV( * ), IWORK( * )
+*       INTEGER            IPIV( * )
 *       COMPLEX*16         A( LDA, * ), E ( * ), WORK( * )
 *       ..
 *
@@ -129,11 +129,6 @@
 *>          WORK is COMPLEX*16 array, dimension (2*N)
 *> \endverbatim
 *>
-*> \param[out] IWORK
-*> \verbatim
-*>          IWORK is INTEGER array, dimension (N)
-*> \endverbatim
-*>
 *> \param[out] INFO
 *> \verbatim
 *>          INFO is INTEGER
diff --git a/lapack-netlib/SRC/zsyconvf.f b/lapack-netlib/SRC/zsyconvf.f
index b26bfd63b..2d5ce882e 100644
--- a/lapack-netlib/SRC/zsyconvf.f
+++ b/lapack-netlib/SRC/zsyconvf.f
@@ -294,7 +294,7 @@
 *
 *           Convert PERMUTATIONS and IPIV
 *
-*           Apply permutaions to submatrices of upper part of A
+*           Apply permutations to submatrices of upper part of A
 *           in factorization order where i decreases from N to 1
 *
             I = N
@@ -347,7 +347,7 @@
 *
 *           Revert PERMUTATIONS and IPIV
 *
-*           Apply permutaions to submatrices of upper part of A
+*           Apply permutations to submatrices of upper part of A
 *           in reverse factorization order where i increases from 1 to N
 *
             I = 1
@@ -438,7 +438,7 @@
 *
 *           Convert PERMUTATIONS and IPIV
 *
-*           Apply permutaions to submatrices of lower part of A
+*           Apply permutations to submatrices of lower part of A
 *           in factorization order where k increases from 1 to N
 *
             I = 1
@@ -491,7 +491,7 @@
 *
 *           Revert PERMUTATIONS and IPIV
 *
-*           Apply permutaions to submatrices of lower part of A
+*           Apply permutations to submatrices of lower part of A
 *           in reverse factorization order where i decreases from N to 1
 *
             I = N
diff --git a/lapack-netlib/SRC/zsyconvf_rook.f b/lapack-netlib/SRC/zsyconvf_rook.f
index 5c36f4bcd..410d2eb34 100644
--- a/lapack-netlib/SRC/zsyconvf_rook.f
+++ b/lapack-netlib/SRC/zsyconvf_rook.f
@@ -285,7 +285,7 @@
 *
 *           Convert PERMUTATIONS
 *
-*           Apply permutaions to submatrices of upper part of A
+*           Apply permutations to submatrices of upper part of A
 *           in factorization order where i decreases from N to 1
 *
             I = N
@@ -336,7 +336,7 @@
 *
 *           Revert PERMUTATIONS
 *
-*           Apply permutaions to submatrices of upper part of A
+*           Apply permutations to submatrices of upper part of A
 *           in reverse factorization order where i increases from 1 to N
 *
             I = 1
@@ -426,7 +426,7 @@
 *
 *           Convert PERMUTATIONS
 *
-*           Apply permutaions to submatrices of lower part of A
+*           Apply permutations to submatrices of lower part of A
 *           in factorization order where i increases from 1 to N
 *
             I = 1
@@ -477,7 +477,7 @@
 *
 *           Revert PERMUTATIONS
 *
-*           Apply permutaions to submatrices of lower part of A
+*           Apply permutations to submatrices of lower part of A
 *           in reverse factorization order where i decreases from N to 1
 *
             I = N
diff --git a/lapack-netlib/SRC/zsyrfsx.f b/lapack-netlib/SRC/zsyrfsx.f
index 3420d70cd..d086510d8 100644
--- a/lapack-netlib/SRC/zsyrfsx.f
+++ b/lapack-netlib/SRC/zsyrfsx.f
@@ -271,7 +271,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -307,14 +307,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -322,9 +322,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the double-precision refinement algorithm,
+*>            = 1.0:  Use the double-precision refinement algorithm,
 *>                    possibly with doubled-single computations if the
 *>                    compilation environment does not support DOUBLE
 *>                    PRECISION.
diff --git a/lapack-netlib/SRC/zsysv_aa.f b/lapack-netlib/SRC/zsysv_aa.f
index 325d07c54..4e87bd105 100644
--- a/lapack-netlib/SRC/zsysv_aa.f
+++ b/lapack-netlib/SRC/zsysv_aa.f
@@ -42,7 +42,7 @@
 *> matrices.
 *>
 *> Aasen's algorithm is used to factor A as
-*>    A = U * T * U**T,  if UPLO = 'U', or
+*>    A = U**T * T * U,  if UPLO = 'U', or
 *>    A = L * T * L**T,  if UPLO = 'L',
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is symmetric tridiagonal. The factored
@@ -86,7 +86,7 @@
 *>
 *>          On exit, if INFO = 0, the tridiagonal matrix T and the
 *>          multipliers used to obtain the factor U or L from the
-*>          factorization A = U*T*U**T or A = L*T*L**T as computed by
+*>          factorization A = U**T*T*U or A = L*T*L**T as computed by
 *>          ZSYTRF.
 *> \endverbatim
 *>
@@ -230,7 +230,7 @@
          RETURN
       END IF
 *
-*     Compute the factorization A = U*T*U**T or A = L*T*L**T.
+*     Compute the factorization A = U**T*T*U or A = L*T*L**T.
 *
       CALL ZSYTRF_AA( UPLO, N, A, LDA, IPIV, WORK, LWORK, INFO )
       IF( INFO.EQ.0 ) THEN
diff --git a/lapack-netlib/SRC/zsysv_aa_2stage.f b/lapack-netlib/SRC/zsysv_aa_2stage.f
index 029ed587d..923eaaec0 100644
--- a/lapack-netlib/SRC/zsysv_aa_2stage.f
+++ b/lapack-netlib/SRC/zsysv_aa_2stage.f
@@ -43,8 +43,8 @@
 *> matrices.
 *>
 *> Aasen's 2-stage algorithm is used to factor A as
-*>    A = U * T * U**H,  if UPLO = 'U', or
-*>    A = L * T * L**H,  if UPLO = 'L',
+*>    A = U**T * T * U,  if UPLO = 'U', or
+*>    A = L * T * L**T,  if UPLO = 'L',
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is symmetric and band. The matrix T is
 *> then LU-factored with partial pivoting. The factored form of A
@@ -257,7 +257,7 @@
       END IF
 *
 *
-*     Compute the factorization A = U*T*U**H or A = L*T*L**H.
+*     Compute the factorization A = U**T*T*U or A = L*T*L**T.
 *
       CALL ZSYTRF_AA_2STAGE( UPLO, N, A, LDA, TB, LTB, IPIV, IPIV2,
      $                       WORK, LWORK, INFO )
diff --git a/lapack-netlib/SRC/zsysvxx.f b/lapack-netlib/SRC/zsysvxx.f
index ef44d09d3..e29439385 100644
--- a/lapack-netlib/SRC/zsysvxx.f
+++ b/lapack-netlib/SRC/zsysvxx.f
@@ -378,7 +378,7 @@
 *>     information as described below. There currently are up to three
 *>     pieces of information returned for each right-hand side. If
 *>     componentwise accuracy is not requested (PARAMS(3) = 0.0), then
-*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS .LT. 3, then at most
+*>     ERR_BNDS_COMP is not accessed.  If N_ERR_BNDS < 3, then at most
 *>     the first (:,N_ERR_BNDS) entries are returned.
 *>
 *>     The first index in ERR_BNDS_COMP(i,:) corresponds to the ith
@@ -414,14 +414,14 @@
 *> \param[in] NPARAMS
 *> \verbatim
 *>          NPARAMS is INTEGER
-*>     Specifies the number of parameters set in PARAMS.  If .LE. 0, the
+*>     Specifies the number of parameters set in PARAMS.  If <= 0, the
 *>     PARAMS array is never referenced and default values are used.
 *> \endverbatim
 *>
 *> \param[in,out] PARAMS
 *> \verbatim
 *>          PARAMS is DOUBLE PRECISION array, dimension NPARAMS
-*>     Specifies algorithm parameters.  If an entry is .LT. 0.0, then
+*>     Specifies algorithm parameters.  If an entry is < 0.0, then
 *>     that entry will be filled with default value used for that
 *>     parameter.  Only positions up to NPARAMS are accessed; defaults
 *>     are used for higher-numbered parameters.
@@ -429,9 +429,9 @@
 *>       PARAMS(LA_LINRX_ITREF_I = 1) : Whether to perform iterative
 *>            refinement or not.
 *>         Default: 1.0D+0
-*>            = 0.0 : No refinement is performed, and no error bounds are
+*>            = 0.0:  No refinement is performed, and no error bounds are
 *>                    computed.
-*>            = 1.0 : Use the extra-precise refinement algorithm.
+*>            = 1.0:  Use the extra-precise refinement algorithm.
 *>              (other values are reserved for future use)
 *>
 *>       PARAMS(LA_LINRX_ITHRESH_I = 2) : Maximum number of residual
diff --git a/lapack-netlib/SRC/zsytf2_rk.f b/lapack-netlib/SRC/zsytf2_rk.f
index b1a02f4a5..4ae1a4a22 100644
--- a/lapack-netlib/SRC/zsytf2_rk.f
+++ b/lapack-netlib/SRC/zsytf2_rk.f
@@ -321,7 +321,7 @@
 *
 *        Factorize A as U*D*U**T using the upper triangle of A
 *
-*        Initilize the first entry of array E, where superdiagonal
+*        Initialize the first entry of array E, where superdiagonal
 *        elements of D are stored
 *
          E( 1 ) = CZERO
@@ -632,7 +632,7 @@
 *
 *        Factorize A as L*D*L**T using the lower triangle of A
 *
-*        Initilize the unused last entry of the subdiagonal array E.
+*        Initialize the unused last entry of the subdiagonal array E.
 *
          E( N ) = CZERO
 *
diff --git a/lapack-netlib/SRC/zsytrf.f b/lapack-netlib/SRC/zsytrf.f
index 663199c8a..54e22cca1 100644
--- a/lapack-netlib/SRC/zsytrf.f
+++ b/lapack-netlib/SRC/zsytrf.f
@@ -43,7 +43,7 @@
 *>
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and D is symmetric and block diagonal with
-*> with 1-by-1 and 2-by-2 diagonal blocks.
+*> 1-by-1 and 2-by-2 diagonal blocks.
 *>
 *> This is the blocked version of the algorithm, calling Level 3 BLAS.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zsytrf_aa.f b/lapack-netlib/SRC/zsytrf_aa.f
index b25b1fbce..e547c6a60 100644
--- a/lapack-netlib/SRC/zsytrf_aa.f
+++ b/lapack-netlib/SRC/zsytrf_aa.f
@@ -37,7 +37,7 @@
 *> ZSYTRF_AA computes the factorization of a complex symmetric matrix A
 *> using the Aasen's algorithm.  The form of the factorization is
 *>
-*>    A = U*T*U**T  or  A = L*T*L**T
+*>    A = U**T*T*U  or  A = L*T*L**T
 *>
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is a complex symmetric tridiagonal matrix.
@@ -223,7 +223,7 @@
       IF( UPPER ) THEN
 *
 *        .....................................................
-*        Factorize A as L*D*L**T using the upper triangle of A
+*        Factorize A as U**T*D*U using the upper triangle of A
 *        .....................................................
 *
 *        Copy first row A(1, 1:N) into H(1:n) (stored in WORK(1:N))
@@ -256,7 +256,7 @@
      $                   A( MAX(1, J), J+1 ), LDA,
      $                   IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) )
 *
-*        Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot)
+*        Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot)
 *
          DO J2 = J+2, MIN(N, J+JB+1)
             IPIV( J2 ) = IPIV( J2 ) + J
@@ -375,7 +375,7 @@
      $                   A( J+1, MAX(1, J) ), LDA,
      $                   IPIV( J+1 ), WORK, N, WORK( N*NB+1 ) )
 *
-*        Ajust IPIV and apply it back (J-th step picks (J+1)-th pivot)
+*        Adjust IPIV and apply it back (J-th step picks (J+1)-th pivot)
 *
          DO J2 = J+2, MIN(N, J+JB+1)
             IPIV( J2 ) = IPIV( J2 ) + J
diff --git a/lapack-netlib/SRC/zsytrf_aa_2stage.f b/lapack-netlib/SRC/zsytrf_aa_2stage.f
index d3486c1a7..67a1c1f6f 100644
--- a/lapack-netlib/SRC/zsytrf_aa_2stage.f
+++ b/lapack-netlib/SRC/zsytrf_aa_2stage.f
@@ -38,7 +38,7 @@
 *> ZSYTRF_AA_2STAGE computes the factorization of a complex symmetric matrix A
 *> using the Aasen's algorithm.  The form of the factorization is
 *>
-*>    A = U*T*U**T  or  A = L*T*L**T
+*>    A = U**T*T*U  or  A = L*T*L**T
 *>
 *> where U (or L) is a product of permutation and unit upper (lower)
 *> triangular matrices, and T is a complex symmetric band matrix with the
@@ -275,7 +275,7 @@
       IF( UPPER ) THEN
 *
 *        .....................................................
-*        Factorize A as L*D*L**T using the upper triangle of A
+*        Factorize A as U**T*D*U using the upper triangle of A
 *        .....................................................
 *
          DO J = 0, NT-1
@@ -448,12 +448,14 @@ c               END IF
 *                    > Apply pivots to previous columns of L
                      CALL ZSWAP( K-1, A( (J+1)*NB+1, I1 ), 1, 
      $                                A( (J+1)*NB+1, I2 ), 1 )
-*                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)               
-                     CALL ZSWAP( I2-I1-1, A( I1, I1+1 ), LDA,
-     $                                    A( I1+1, I2 ), 1 )
+*                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)
+                     IF( I2.GT.(I1+1) )
+     $                  CALL ZSWAP( I2-I1-1, A( I1, I1+1 ), LDA,
+     $                                       A( I1+1, I2 ), 1 )
 *                    > Swap A(I2+1:M, I1) with A(I2+1:M, I2)
-                     CALL ZSWAP( N-I2, A( I1, I2+1 ), LDA,
-     $                                 A( I2, I2+1 ), LDA ) 
+                     IF( I2.LT.N )
+     $                  CALL ZSWAP( N-I2, A( I1, I2+1 ), LDA,
+     $                                    A( I2, I2+1 ), LDA ) 
 *                    > Swap A(I1, I1) with A(I2, I2)
                      PIV = A( I1, I1 )
                      A( I1, I1 ) = A( I2, I2 )
@@ -637,11 +639,13 @@ c               END IF
                      CALL ZSWAP( K-1, A( I1, (J+1)*NB+1 ), LDA, 
      $                                A( I2, (J+1)*NB+1 ), LDA )
 *                    > Swap A(I1+1:M, I1) with A(I2, I1+1:M)               
-                     CALL ZSWAP( I2-I1-1, A( I1+1, I1 ), 1,
-     $                                    A( I2, I1+1 ), LDA )
+                     IF( I2.GT.(I1+1) )
+     $                  CALL ZSWAP( I2-I1-1, A( I1+1, I1 ), 1,
+     $                                       A( I2, I1+1 ), LDA )
 *                    > Swap A(I2+1:M, I1) with A(I2+1:M, I2)
-                     CALL ZSWAP( N-I2, A( I2+1, I1 ), 1,
-     $                                 A( I2+1, I2 ), 1 ) 
+                     IF( I2.LT.N )
+     $                  CALL ZSWAP( N-I2, A( I2+1, I1 ), 1,
+     $                                    A( I2+1, I2 ), 1 ) 
 *                    > Swap A(I1, I1) with A(I2, I2)
                      PIV = A( I1, I1 )
                      A( I1, I1 ) = A( I2, I2 )
diff --git a/lapack-netlib/SRC/zsytri2.f b/lapack-netlib/SRC/zsytri2.f
index e7303c90b..9929eb2c6 100644
--- a/lapack-netlib/SRC/zsytri2.f
+++ b/lapack-netlib/SRC/zsytri2.f
@@ -62,7 +62,7 @@
 *> \param[in,out] A
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
-*>          On entry, the NB diagonal matrix D and the multipliers
+*>          On entry, the block diagonal matrix D and the multipliers
 *>          used to obtain the factor U or L as computed by ZSYTRF.
 *>
 *>          On exit, if INFO = 0, the (symmetric) inverse of the original
@@ -82,7 +82,7 @@
 *> \param[in] IPIV
 *> \verbatim
 *>          IPIV is INTEGER array, dimension (N)
-*>          Details of the interchanges and the NB structure of D
+*>          Details of the interchanges and the block structure of D
 *>          as determined by ZSYTRF.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/zsytrs2.f b/lapack-netlib/SRC/zsytrs2.f
index c0ee206a5..6e9cca425 100644
--- a/lapack-netlib/SRC/zsytrs2.f
+++ b/lapack-netlib/SRC/zsytrs2.f
@@ -36,7 +36,7 @@
 *>
 *> \verbatim
 *>
-*> ZSYTRS2 solves a system of linear equations A*X = B with a real
+*> ZSYTRS2 solves a system of linear equations A*X = B with a complex
 *> symmetric matrix A using the factorization A = U*D*U**T or
 *> A = L*D*L**T computed by ZSYTRF and converted by ZSYCONV.
 *> \endverbatim
diff --git a/lapack-netlib/SRC/zsytrs_aa.f b/lapack-netlib/SRC/zsytrs_aa.f
index e62e9e486..0f0664009 100644
--- a/lapack-netlib/SRC/zsytrs_aa.f
+++ b/lapack-netlib/SRC/zsytrs_aa.f
@@ -37,7 +37,7 @@
 *> \verbatim
 *>
 *> ZSYTRS_AA solves a system of linear equations A*X = B with a complex
-*> symmetric matrix A using the factorization A = U*T*U**T or
+*> symmetric matrix A using the factorization A = U**T*T*U or
 *> A = L*T*L**T computed by ZSYTRF_AA.
 *> \endverbatim
 *
@@ -49,7 +49,7 @@
 *>          UPLO is CHARACTER*1
 *>          Specifies whether the details of the factorization are stored
 *>          as an upper or lower triangular matrix.
-*>          = 'U':  Upper triangular, form is A = U*T*U**T;
+*>          = 'U':  Upper triangular, form is A = U**T*T*U;
 *>          = 'L':  Lower triangular, form is A = L*T*L**T.
 *> \endverbatim
 *>
@@ -97,14 +97,16 @@
 *>          The leading dimension of the array B.  LDB >= max(1,N).
 *> \endverbatim
 *>
-*> \param[in] WORK
+*> \param[out] WORK
 *> \verbatim
-*>          WORK is DOUBLE array, dimension (MAX(1,LWORK))
+*>          WORK is COMPLEX*16 array, dimension (MAX(1,LWORK))
 *> \endverbatim
 *>
 *> \param[in] LWORK
 *> \verbatim
-*>          LWORK is INTEGER, LWORK >= MAX(1,3*N-2).
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK. LWORK >= max(1,3*N-2).
+*> \endverbatim
 *>
 *> \param[out] INFO
 *> \verbatim
@@ -198,22 +200,29 @@
 *
       IF( UPPER ) THEN
 *
-*        Solve A*X = B, where A = U*T*U**T.
+*        Solve A*X = B, where A = U**T*T*U.
+*
+*        1) Forward substitution with U**T
+*
+         IF( N.GT.1 ) THEN
+*
+*           Pivot, P**T * B -> B
 *
-*        Pivot, P**T * B
+            DO K = 1, N
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $         CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+            END DO
 *
-         DO K = 1, N
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $          CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-         END DO
+*           Compute U**T \ B -> B    [ (U**T \P**T * B) ]
 *
-*        Compute (U \P**T * B) -> B    [ (U \P**T * B) ]
+            CALL ZTRSM( 'L', 'U', 'T', 'U', N-1, NRHS, ONE, A( 1, 2 ),
+     $                  LDA, B( 2, 1 ), LDB)
+         END IF
 *
-         CALL ZTRSM('L', 'U', 'T', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA,
-     $               B( 2, 1 ), LDB)
+*        2) Solve with triangular matrix T
 *
-*        Compute T \ B -> B   [ T \ (U \P**T * B) ]
+*        Compute T \ B -> B   [ T \ (U**T \P**T * B) ]
 *
          CALL ZLACPY( 'F', 1, N, A( 1, 1 ), LDA+1, WORK( N ), 1)
          IF( N.GT.1 ) THEN
@@ -223,35 +232,47 @@
          CALL ZGTSV( N, NRHS, WORK( 1 ), WORK( N ), WORK( 2*N ), B, LDB,
      $               INFO )
 *
-*        Compute (U**T \ B) -> B   [ U**T \ (T \ (U \P**T * B) ) ]
+*        3) Backward substitution with U
 *
-         CALL ZTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ), LDA,
-     $               B( 2, 1 ), LDB)
+         IF( N.GT.1 ) THEN
 *
-*        Pivot, P * B  [ P * (U**T \ (T \ (U \P**T * B) )) ]
+*           Compute U \ B -> B   [ U \ (T \ (U**T \P**T * B) ) ]
 *
-         DO K = N, 1, -1
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-         END DO
+            CALL ZTRSM( 'L', 'U', 'N', 'U', N-1, NRHS, ONE, A( 1, 2 ),
+     $                  LDA, B( 2, 1 ), LDB)
+*
+*           Pivot, P * B -> B  [ P * (U \ (T \ (U**T \P**T * B) )) ]
+*
+            DO K = N, 1, -1
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+            END DO
+         END IF
 *
       ELSE
 *
 *        Solve A*X = B, where A = L*T*L**T.
 *
-*        Pivot, P**T * B
+*        1) Forward substitution with L
 *
-         DO K = 1, N
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-         END DO
+         IF( N.GT.1 ) THEN
+*
+*           Pivot, P**T * B -> B
 *
-*        Compute (L \P**T * B) -> B    [ (L \P**T * B) ]
+            DO K = 1, N
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+            END DO
+*
+*           Compute L \ B -> B    [ (L \P**T * B) ]
+*
+            CALL ZTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1 ),
+     $                  LDA, B( 2, 1 ), LDB)
+         END IF
 *
-         CALL ZTRSM( 'L', 'L', 'N', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA,
-     $               B( 2, 1 ), LDB)
+*        2) Solve with triangular matrix T
 *
 *        Compute T \ B -> B   [ T \ (L \P**T * B) ]
 *
@@ -263,18 +284,23 @@
          CALL ZGTSV( N, NRHS, WORK( 1 ), WORK(N), WORK( 2*N ), B, LDB,
      $               INFO)
 *
-*        Compute (L**T \ B) -> B   [ L**T \ (T \ (L \P**T * B) ) ]
+*        3) Backward substitution with L**T
 *
-         CALL ZTRSM( 'L', 'L', 'T', 'U', N-1, NRHS, ONE, A( 2, 1 ), LDA,
-     $              B( 2, 1 ), LDB)
+         IF( N.GT.1 ) THEN
 *
-*        Pivot, P * B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
+*           Compute (L**T \ B) -> B   [ L**T \ (T \ (L \P**T * B) ) ]
 *
-         DO K = N, 1, -1
-            KP = IPIV( K )
-            IF( KP.NE.K )
-     $         CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
-         END DO
+            CALL ZTRSM( 'L', 'L', 'T', 'U', N-1, NRHS, ONE, A( 2, 1 ),
+     $                  LDA, B( 2, 1 ), LDB)
+*
+*           Pivot, P * B -> B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
+*
+            DO K = N, 1, -1
+               KP = IPIV( K )
+               IF( KP.NE.K )
+     $            CALL ZSWAP( NRHS, B( K, 1 ), LDB, B( KP, 1 ), LDB )
+            END DO
+         END IF
 *
       END IF
 *
diff --git a/lapack-netlib/SRC/zsytrs_aa_2stage.f b/lapack-netlib/SRC/zsytrs_aa_2stage.f
index fa15eee90..bf060b2d3 100644
--- a/lapack-netlib/SRC/zsytrs_aa_2stage.f
+++ b/lapack-netlib/SRC/zsytrs_aa_2stage.f
@@ -36,7 +36,7 @@
 *> \verbatim
 *>
 *> ZSYTRS_AA_2STAGE solves a system of linear equations A*X = B with a complex
-*> symmetric matrix A using the factorization A = U*T*U**T or
+*> symmetric matrix A using the factorization A = U**T*T*U or
 *> A = L*T*L**T computed by ZSYTRF_AA_2STAGE.
 *> \endverbatim
 *
@@ -48,7 +48,7 @@
 *>          UPLO is CHARACTER*1
 *>          Specifies whether the details of the factorization are stored
 *>          as an upper or lower triangular matrix.
-*>          = 'U':  Upper triangular, form is A = U*T*U**T;
+*>          = 'U':  Upper triangular, form is A = U**T*T*U;
 *>          = 'L':  Lower triangular, form is A = L*T*L**T.
 *> \endverbatim
 *>
@@ -208,15 +208,15 @@
 *
       IF( UPPER ) THEN
 *
-*        Solve A*X = B, where A = U*T*U**T.
+*        Solve A*X = B, where A = U**T*T*U.
 *
          IF( N.GT.NB ) THEN
 *
-*           Pivot, P**T * B
+*           Pivot, P**T * B -> B
 *
             CALL ZLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 )
 *
-*           Compute (U**T \P**T * B) -> B    [ (U**T \P**T * B) ]
+*           Compute (U**T \ B) -> B    [ (U**T \P**T * B) ]
 *
             CALL ZTRSM( 'L', 'U', 'T', 'U', N-NB, NRHS, ONE, A(1, NB+1),
      $                 LDA, B(NB+1, 1), LDB)
@@ -234,7 +234,7 @@
             CALL ZTRSM( 'L', 'U', 'N', 'U', N-NB, NRHS, ONE, A(1, NB+1),
      $                  LDA, B(NB+1, 1), LDB)
 *
-*           Pivot, P * B  [ P * (U \ (T \ (U**T \P**T * B) )) ]
+*           Pivot, P * B -> B  [ P * (U \ (T \ (U**T \P**T * B) )) ]
 *
             CALL ZLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 )
 *
@@ -246,11 +246,11 @@
 *
          IF( N.GT.NB ) THEN
 *
-*           Pivot, P**T * B
+*           Pivot, P**T * B -> B
 *
             CALL ZLASWP( NRHS, B, LDB, NB+1, N, IPIV, 1 )
 *
-*           Compute (L \P**T * B) -> B    [ (L \P**T * B) ]
+*           Compute (L \ B) -> B    [ (L \P**T * B) ]
 *
             CALL ZTRSM( 'L', 'L', 'N', 'U', N-NB, NRHS, ONE, A(NB+1, 1),
      $                 LDA, B(NB+1, 1), LDB)
@@ -268,7 +268,7 @@
             CALL ZTRSM( 'L', 'L', 'T', 'U', N-NB, NRHS, ONE, A(NB+1, 1),
      $                  LDA, B(NB+1, 1), LDB)
 *
-*           Pivot, P * B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
+*           Pivot, P * B -> B  [ P * (L**T \ (T \ (L \P**T * B) )) ]
 *
             CALL ZLASWP( NRHS, B, LDB, NB+1, N, IPIV, -1 )
 *
diff --git a/lapack-netlib/SRC/ztgsy2.f b/lapack-netlib/SRC/ztgsy2.f
index f89effd6c..028ddfd3d 100644
--- a/lapack-netlib/SRC/ztgsy2.f
+++ b/lapack-netlib/SRC/ztgsy2.f
@@ -67,7 +67,7 @@
 *>             R  * B**H + L  * E**H  = scale * -F
 *>
 *> This case is used to compute an estimate of Dif[(A, D), (B, E)] =
-*> = sigma_min(Z) using reverse communicaton with ZLACON.
+*> = sigma_min(Z) using reverse communication with ZLACON.
 *>
 *> ZTGSY2 also (IJOB >= 1) contributes to the computation in ZTGSYL
 *> of an upper bound on the separation between to matrix pairs. Then
@@ -81,7 +81,7 @@
 *> \param[in] TRANS
 *> \verbatim
 *>          TRANS is CHARACTER*1
-*>          = 'N', solve the generalized Sylvester equation (1).
+*>          = 'N': solve the generalized Sylvester equation (1).
 *>          = 'T': solve the 'transposed' system (3).
 *> \endverbatim
 *>
diff --git a/lapack-netlib/SRC/ztpmlqt.f b/lapack-netlib/SRC/ztpmlqt.f
index 6a67e4443..cc333f5a2 100644
--- a/lapack-netlib/SRC/ztpmlqt.f
+++ b/lapack-netlib/SRC/ztpmlqt.f
@@ -94,7 +94,7 @@
 *>
 *> \param[in] V
 *> \verbatim
-*>          V is COMPLEX*16 array, dimension (LDA,K)
+*>          V is COMPLEX*16 array, dimension (LDV,K)
 *>          The i-th row must contain the vector which defines the
 *>          elementary reflector H(i), for i = 1,2,...,k, as returned by
 *>          DTPLQT in B.  See Further Details.
diff --git a/lapack-netlib/SRC/ztpmqrt.f b/lapack-netlib/SRC/ztpmqrt.f
index aca7ff00f..530dca458 100644
--- a/lapack-netlib/SRC/ztpmqrt.f
+++ b/lapack-netlib/SRC/ztpmqrt.f
@@ -94,7 +94,7 @@
 *>
 *> \param[in] V
 *> \verbatim
-*>          V is COMPLEX*16 array, dimension (LDA,K)
+*>          V is COMPLEX*16 array, dimension (LDV,K)
 *>          The i-th column must contain the vector which defines the
 *>          elementary reflector H(i), for i = 1,2,...,k, as returned by
 *>          CTPQRT in B.  See Further Details.
diff --git a/lapack-netlib/SRC/ztprfb.f b/lapack-netlib/SRC/ztprfb.f
index 1a62829d5..f96c237ee 100644
--- a/lapack-netlib/SRC/ztprfb.f
+++ b/lapack-netlib/SRC/ztprfb.f
@@ -152,8 +152,8 @@
 *> \verbatim
 *>          LDA is INTEGER
 *>          The leading dimension of the array A.
-*>          If SIDE = 'L', LDC >= max(1,K);
-*>          If SIDE = 'R', LDC >= max(1,M).
+*>          If SIDE = 'L', LDA >= max(1,K);
+*>          If SIDE = 'R', LDA >= max(1,M).
 *> \endverbatim
 *>
 *> \param[in,out] B
diff --git a/lapack-netlib/SRC/zungtsqr.f b/lapack-netlib/SRC/zungtsqr.f
new file mode 100644
index 000000000..7b04e9a29
--- /dev/null
+++ b/lapack-netlib/SRC/zungtsqr.f
@@ -0,0 +1,307 @@
+*> \brief \b ZUNGTSQR
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZUNGTSQR + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zuntsqr.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zungtsqr.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zungtsqr.f">
+*> [TXT]</a>
+*>
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZUNGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
+*      $                     INFO )
+*
+*       .. Scalar Arguments ..
+*       INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16        A( LDA, * ), T( LDT, * ), WORK( * )
+*       ..
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZUNGTSQR generates an M-by-N complex matrix Q_out with orthonormal
+*> columns, which are the first N columns of a product of comlpex unitary
+*> matrices of order M which are returned by ZLATSQR
+*>
+*>      Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+*>
+*> See the documentation for ZLATSQR.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB
+*> \verbatim
+*>          MB is INTEGER
+*>          The row block size used by DLATSQR to return
+*>          arrays A and T. MB > N.
+*>          (Note that if MB > M, then M is used instead of MB
+*>          as the row block size).
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          The column block size used by ZLATSQR to return
+*>          arrays A and T. NB >= 1.
+*>          (Note that if NB > N, then N is used instead of NB
+*>          as the column block size).
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N)
+*>
+*>          On entry:
+*>
+*>             The elements on and above the diagonal are not accessed.
+*>             The elements below the diagonal represent the unit
+*>             lower-trapezoidal blocked matrix V computed by ZLATSQR
+*>             that defines the input matrices Q_in(k) (ones on the
+*>             diagonal are not stored) (same format as the output A
+*>             below the diagonal in ZLATSQR).
+*>
+*>          On exit:
+*>
+*>             The array A contains an M-by-N orthonormal matrix Q_out,
+*>             i.e the columns of A are orthogonal unit vectors.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is COMPLEX*16 array,
+*>          dimension (LDT, N * NIRB)
+*>          where NIRB = Number_of_input_row_blocks
+*>                     = MAX( 1, CEIL((M-N)/(MB-N)) )
+*>          Let NICB = Number_of_input_col_blocks
+*>                   = CEIL(N/NB)
+*>
+*>          The upper-triangular block reflectors used to define the
+*>          input matrices Q_in(k), k=(1:NIRB*NICB). The block
+*>          reflectors are stored in compact form in NIRB block
+*>          reflector sequences. Each of NIRB block reflector sequences
+*>          is stored in a larger NB-by-N column block of T and consists
+*>          of NICB smaller NB-by-NB upper-triangular column blocks.
+*>          (same format as the output T in ZLATSQR).
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T.
+*>          LDT >= max(1,min(NB1,N)).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          (workspace) COMPLEX*16 array, dimension (MAX(2,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          The dimension of the array WORK.  LWORK >= (M+NB)*N.
+*>          If LWORK = -1, then a workspace query is assumed.
+*>          The routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup comlex16OTHERcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2019, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZUNGTSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
+     $                     INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16        A( LDA, * ), T( LDT, * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX*16         CONE, CZERO
+      PARAMETER          ( CONE = ( 1.0D+0, 0.0D+0 ),
+     $                     CZERO = ( 0.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY
+      INTEGER            IINFO, LDC, LWORKOPT, LC, LW, NBLOCAL, J
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZCOPY, ZLAMTSQR, ZLASET, XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DCMPLX, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters
+*
+      LQUERY  = LWORK.EQ.-1
+      INFO = 0
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
+         INFO = -2
+      ELSE IF( MB.LE.N ) THEN
+         INFO = -3
+      ELSE IF( NB.LT.1 ) THEN
+         INFO = -4
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -6
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
+         INFO = -8
+      ELSE
+*
+*        Test the input LWORK for the dimension of the array WORK.
+*        This workspace is used to store array C(LDC, N) and WORK(LWORK)
+*        in the call to ZLAMTSQR. See the documentation for ZLAMTSQR.
+*
+         IF( LWORK.LT.2 .AND. (.NOT.LQUERY) ) THEN
+            INFO = -10
+         ELSE
+*
+*           Set block size for column blocks
+*
+            NBLOCAL = MIN( NB, N )
+*
+*           LWORK = -1, then set the size for the array C(LDC,N)
+*           in ZLAMTSQR call and set the optimal size of the work array
+*           WORK(LWORK) in ZLAMTSQR call.
+*
+            LDC = M
+            LC = LDC*N
+            LW = N * NBLOCAL
+*
+            LWORKOPT = LC+LW
+*
+            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
+               INFO = -10
+            END IF
+         END IF
+*
+      END IF
+*
+*     Handle error in the input parameters and return workspace query.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZUNGTSQR', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+         WORK( 1 ) = DCMPLX( LWORKOPT )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 ) THEN
+         WORK( 1 ) = DCMPLX( LWORKOPT )
+         RETURN
+      END IF
+*
+*     (1) Form explicitly the tall-skinny M-by-N left submatrix Q1_in
+*     of M-by-M orthogonal matrix Q_in, which is implicitly stored in
+*     the subdiagonal part of input array A and in the input array T.
+*     Perform by the following operation using the routine ZLAMTSQR.
+*
+*         Q1_in = Q_in * ( I ), where I is a N-by-N identity matrix,
+*                        ( 0 )        0 is a (M-N)-by-N zero matrix.
+*
+*     (1a) Form M-by-N matrix in the array WORK(1:LDC*N) with ones
+*     on the diagonal and zeros elsewhere.
+*
+      CALL ZLASET( 'F', M, N, CZERO, CONE, WORK, LDC )
+*
+*     (1b)  On input, WORK(1:LDC*N) stores ( I );
+*                                          ( 0 )
+*
+*           On output, WORK(1:LDC*N) stores Q1_in.
+*
+      CALL ZLAMTSQR( 'L', 'N', M, N, N, MB, NBLOCAL, A, LDA, T, LDT,
+     $               WORK, LDC, WORK( LC+1 ), LW, IINFO )
+*
+*     (2) Copy the result from the part of the work array (1:M,1:N)
+*     with the leading dimension LDC that starts at WORK(1) into
+*     the output array A(1:M,1:N) column-by-column.
+*
+      DO J = 1, N
+         CALL ZCOPY( M, WORK( (J-1)*LDC + 1 ), 1, A( 1, J ), 1 )
+      END DO
+*
+      WORK( 1 ) = DCMPLX( LWORKOPT )
+      RETURN
+*
+*     End of ZUNGTSQR
+*
+      END
\ No newline at end of file
diff --git a/lapack-netlib/SRC/zunhr_col.f b/lapack-netlib/SRC/zunhr_col.f
new file mode 100644
index 000000000..71039fddb
--- /dev/null
+++ b/lapack-netlib/SRC/zunhr_col.f
@@ -0,0 +1,441 @@
+*> \brief \b ZUNHR_COL
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download ZUNHR_COL + dependencies
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.tgz?format=tgz&filename=/lapack/lapack_routine/zunhr_col.f">
+*> [TGZ]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.zip?format=zip&filename=/lapack/lapack_routine/zunhr_col.f">
+*> [ZIP]</a>
+*> <a href="http://www.netlib.org/cgi-bin/netlibfiles.txt?format=txt&filename=/lapack/lapack_routine/zunhr_col.f">
+*> [TXT]</a>
+*>
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZUNHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO )
+*
+*       .. Scalar Arguments ..
+*       INTEGER           INFO, LDA, LDT, M, N, NB
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX*16        A( LDA, * ), D( * ), T( LDT, * )
+*       ..
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*>  ZUNHR_COL takes an M-by-N complex matrix Q_in with orthonormal columns
+*>  as input, stored in A, and performs Householder Reconstruction (HR),
+*>  i.e. reconstructs Householder vectors V(i) implicitly representing
+*>  another M-by-N matrix Q_out, with the property that Q_in = Q_out*S,
+*>  where S is an N-by-N diagonal matrix with diagonal entries
+*>  equal to +1 or -1. The Householder vectors (columns V(i) of V) are
+*>  stored in A on output, and the diagonal entries of S are stored in D.
+*>  Block reflectors are also returned in T
+*>  (same output format as ZGEQRT).
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A. M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          The column block size to be used in the reconstruction
+*>          of Householder column vector blocks in the array A and
+*>          corresponding block reflectors in the array T. NB >= 1.
+*>          (Note that if NB > N, then N is used instead of NB
+*>          as the column block size.)
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX*16 array, dimension (LDA,N)
+*>
+*>          On entry:
+*>
+*>             The array A contains an M-by-N orthonormal matrix Q_in,
+*>             i.e the columns of A are orthogonal unit vectors.
+*>
+*>          On exit:
+*>
+*>             The elements below the diagonal of A represent the unit
+*>             lower-trapezoidal matrix V of Householder column vectors
+*>             V(i). The unit diagonal entries of V are not stored
+*>             (same format as the output below the diagonal in A from
+*>             ZGEQRT). The matrix T and the matrix V stored on output
+*>             in A implicitly define Q_out.
+*>
+*>             The elements above the diagonal contain the factor U
+*>             of the "modified" LU-decomposition:
+*>                Q_in - ( S ) = V * U
+*>                       ( 0 )
+*>             where 0 is a (M-N)-by-(M-N) zero matrix.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] T
+*> \verbatim
+*>          T is COMPLEX*16 array,
+*>          dimension (LDT, N)
+*>
+*>          Let NOCB = Number_of_output_col_blocks
+*>                   = CEIL(N/NB)
+*>
+*>          On exit, T(1:NB, 1:N) contains NOCB upper-triangular
+*>          block reflectors used to define Q_out stored in compact
+*>          form as a sequence of upper-triangular NB-by-NB column
+*>          blocks (same format as the output T in ZGEQRT).
+*>          The matrix T and the matrix V stored on output in A
+*>          implicitly define Q_out. NOTE: The lower triangles
+*>          below the upper-triangular blcoks will be filled with
+*>          zeros. See Further Details.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T.
+*>          LDT >= max(1,min(NB,N)).
+*> \endverbatim
+*>
+*> \param[out] D
+*> \verbatim
+*>          D is COMPLEX*16 array, dimension min(M,N).
+*>          The elements can be only plus or minus one.
+*>
+*>          D(i) is constructed as D(i) = -SIGN(Q_in_i(i,i)), where
+*>          1 <= i <= min(M,N), and Q_in_i is Q_in after performing
+*>          i-1 steps of “modified” Gaussian elimination.
+*>          See Further Details.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*>
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*> The computed M-by-M unitary factor Q_out is defined implicitly as
+*> a product of unitary matrices Q_out(i). Each Q_out(i) is stored in
+*> the compact WY-representation format in the corresponding blocks of
+*> matrices V (stored in A) and T.
+*>
+*> The M-by-N unit lower-trapezoidal matrix V stored in the M-by-N
+*> matrix A contains the column vectors V(i) in NB-size column
+*> blocks VB(j). For example, VB(1) contains the columns
+*> V(1), V(2), ... V(NB). NOTE: The unit entries on
+*> the diagonal of Y are not stored in A.
+*>
+*> The number of column blocks is
+*>
+*>     NOCB = Number_of_output_col_blocks = CEIL(N/NB)
+*>
+*> where each block is of order NB except for the last block, which
+*> is of order LAST_NB = N - (NOCB-1)*NB.
+*>
+*> For example, if M=6,  N=5 and NB=2, the matrix V is
+*>
+*>
+*>     V = (    VB(1),   VB(2), VB(3) ) =
+*>
+*>       = (   1                      )
+*>         ( v21    1                 )
+*>         ( v31  v32    1            )
+*>         ( v41  v42  v43   1        )
+*>         ( v51  v52  v53  v54    1  )
+*>         ( v61  v62  v63  v54   v65 )
+*>
+*>
+*> For each of the column blocks VB(i), an upper-triangular block
+*> reflector TB(i) is computed. These blocks are stored as
+*> a sequence of upper-triangular column blocks in the NB-by-N
+*> matrix T. The size of each TB(i) block is NB-by-NB, except
+*> for the last block, whose size is LAST_NB-by-LAST_NB.
+*>
+*> For example, if M=6,  N=5 and NB=2, the matrix T is
+*>
+*>     T  = (    TB(1),    TB(2), TB(3) ) =
+*>
+*>        = ( t11  t12  t13  t14   t15  )
+*>          (      t22       t24        )
+*>
+*>
+*> The M-by-M factor Q_out is given as a product of NOCB
+*> unitary M-by-M matrices Q_out(i).
+*>
+*>     Q_out = Q_out(1) * Q_out(2) * ... * Q_out(NOCB),
+*>
+*> where each matrix Q_out(i) is given by the WY-representation
+*> using corresponding blocks from the matrices V and T:
+*>
+*>     Q_out(i) = I - VB(i) * TB(i) * (VB(i))**T,
+*>
+*> where I is the identity matrix. Here is the formula with matrix
+*> dimensions:
+*>
+*>  Q(i){M-by-M} = I{M-by-M} -
+*>    VB(i){M-by-INB} * TB(i){INB-by-INB} * (VB(i))**T {INB-by-M},
+*>
+*> where INB = NB, except for the last block NOCB
+*> for which INB=LAST_NB.
+*>
+*> =====
+*> NOTE:
+*> =====
+*>
+*> If Q_in is the result of doing a QR factorization
+*> B = Q_in * R_in, then:
+*>
+*> B = (Q_out*S) * R_in = Q_out * (S * R_in) = O_out * R_out.
+*>
+*> So if one wants to interpret Q_out as the result
+*> of the QR factorization of B, then corresponding R_out
+*> should be obtained by R_out = S * R_in, i.e. some rows of R_in
+*> should be multiplied by -1.
+*>
+*> For the details of the algorithm, see [1].
+*>
+*> [1] "Reconstructing Householder vectors from tall-skinny QR",
+*>     G. Ballard, J. Demmel, L. Grigori, M. Jacquelin, H.D. Nguyen,
+*>     E. Solomonik, J. Parallel Distrib. Comput.,
+*>     vol. 85, pp. 3-31, 2015.
+*> \endverbatim
+*>
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup complex16OTHERcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November   2019, Igor Kozachenko,
+*>            Computer Science Division,
+*>            University of California, Berkeley
+*>
+*> \endverbatim
+*
+*  =====================================================================
+      SUBROUTINE ZUNHR_COL( M, N, NB, A, LDA, T, LDT, D, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER           INFO, LDA, LDT, M, N, NB
+*     ..
+*     .. Array Arguments ..
+      COMPLEX*16        A( LDA, * ), D( * ), T( LDT, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX*16         CONE, CZERO
+      PARAMETER          ( CONE = ( 1.0D+0, 0.0D+0 ),
+     $                     CZERO = ( 0.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, IINFO, J, JB, JBTEMP1, JBTEMP2, JNB,
+     $                   NPLUSONE
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZCOPY, ZLAUNHR_COL_GETRFNP, ZSCAL, ZTRSM,
+     $                   XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input parameters
+*
+      INFO = 0
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 .OR. N.GT.M ) THEN
+         INFO = -2
+      ELSE IF( NB.LT.1 ) THEN
+         INFO = -3
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -5
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN
+         INFO = -7
+      END IF
+*
+*     Handle error in the input parameters.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'ZUNHR_COL', -INFO )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 ) THEN
+         RETURN
+      END IF
+*
+*     On input, the M-by-N matrix A contains the unitary
+*     M-by-N matrix Q_in.
+*
+*     (1) Compute the unit lower-trapezoidal V (ones on the diagonal
+*     are not stored) by performing the "modified" LU-decomposition.
+*
+*     Q_in - ( S ) = V * U = ( V1 ) * U,
+*            ( 0 )           ( V2 )
+*
+*     where 0 is an (M-N)-by-N zero matrix.
+*
+*     (1-1) Factor V1 and U.
+
+      CALL ZLAUNHR_COL_GETRFNP( N, N, A, LDA, D, IINFO )
+*
+*     (1-2) Solve for V2.
+*
+      IF( M.GT.N ) THEN
+         CALL ZTRSM( 'R', 'U', 'N', 'N', M-N, N, CONE, A, LDA,
+     $               A( N+1, 1 ), LDA )
+      END IF
+*
+*     (2) Reconstruct the block reflector T stored in T(1:NB, 1:N)
+*     as a sequence of upper-triangular blocks with NB-size column
+*     blocking.
+*
+*     Loop over the column blocks of size NB of the array A(1:M,1:N)
+*     and the array T(1:NB,1:N), JB is the column index of a column
+*     block, JNB is the column block size at each step JB.
+*
+      NPLUSONE = N + 1
+      DO JB = 1, N, NB
+*
+*        (2-0) Determine the column block size JNB.
+*
+         JNB = MIN( NPLUSONE-JB, NB )
+*
+*        (2-1) Copy the upper-triangular part of the current JNB-by-JNB
+*        diagonal block U(JB) (of the N-by-N matrix U) stored
+*        in A(JB:JB+JNB-1,JB:JB+JNB-1) into the upper-triangular part
+*        of the current JNB-by-JNB block T(1:JNB,JB:JB+JNB-1)
+*        column-by-column, total JNB*(JNB+1)/2 elements.
+*
+         JBTEMP1 = JB - 1
+         DO J = JB, JB+JNB-1
+            CALL ZCOPY( J-JBTEMP1, A( JB, J ), 1, T( 1, J ), 1 )
+         END DO
+*
+*        (2-2) Perform on the upper-triangular part of the current
+*        JNB-by-JNB diagonal block U(JB) (of the N-by-N matrix U) stored
+*        in T(1:JNB,JB:JB+JNB-1) the following operation in place:
+*        (-1)*U(JB)*S(JB), i.e the result will be stored in the upper-
+*        triangular part of T(1:JNB,JB:JB+JNB-1). This multiplication
+*        of the JNB-by-JNB diagonal block U(JB) by the JNB-by-JNB
+*        diagonal block S(JB) of the N-by-N sign matrix S from the
+*        right means changing the sign of each J-th column of the block
+*        U(JB) according to the sign of the diagonal element of the block
+*        S(JB), i.e. S(J,J) that is stored in the array element D(J).
+*
+         DO J = JB, JB+JNB-1
+            IF( D( J ).EQ.CONE ) THEN
+               CALL ZSCAL( J-JBTEMP1, -CONE, T( 1, J ), 1 )
+            END IF
+         END DO
+*
+*        (2-3) Perform the triangular solve for the current block
+*        matrix X(JB):
+*
+*               X(JB) * (A(JB)**T) = B(JB), where:
+*
+*               A(JB)**T  is a JNB-by-JNB unit upper-triangular
+*                         coefficient block, and A(JB)=V1(JB), which
+*                         is a JNB-by-JNB unit lower-triangular block
+*                         stored in A(JB:JB+JNB-1,JB:JB+JNB-1).
+*                         The N-by-N matrix V1 is the upper part
+*                         of the M-by-N lower-trapezoidal matrix V
+*                         stored in A(1:M,1:N);
+*
+*               B(JB)     is a JNB-by-JNB  upper-triangular right-hand
+*                         side block, B(JB) = (-1)*U(JB)*S(JB), and
+*                         B(JB) is stored in T(1:JNB,JB:JB+JNB-1);
+*
+*               X(JB)     is a JNB-by-JNB upper-triangular solution
+*                         block, X(JB) is the upper-triangular block
+*                         reflector T(JB), and X(JB) is stored
+*                         in T(1:JNB,JB:JB+JNB-1).
+*
+*             In other words, we perform the triangular solve for the
+*             upper-triangular block T(JB):
+*
+*               T(JB) * (V1(JB)**T) = (-1)*U(JB)*S(JB).
+*
+*             Even though the blocks X(JB) and B(JB) are upper-
+*             triangular, the routine ZTRSM will access all JNB**2
+*             elements of the square T(1:JNB,JB:JB+JNB-1). Therefore,
+*             we need to set to zero the elements of the block
+*             T(1:JNB,JB:JB+JNB-1) below the diagonal before the call
+*             to ZTRSM.
+*
+*        (2-3a) Set the elements to zero.
+*
+         JBTEMP2 = JB - 2
+         DO J = JB, JB+JNB-2
+            DO I = J-JBTEMP2, NB
+               T( I, J ) = CZERO
+            END DO
+         END DO
+*
+*        (2-3b) Perform the triangular solve.
+*
+         CALL ZTRSM( 'R', 'L', 'C', 'U', JNB, JNB, CONE,
+     $               A( JB, JB ), LDA, T( 1, JB ), LDT )
+*
+      END DO
+*
+      RETURN
+*
+*     End of ZUNHR_COL
+*
+      END
\ No newline at end of file
diff --git a/lapack-netlib/TESTING/CMakeLists.txt b/lapack-netlib/TESTING/CMakeLists.txt
index ec3d85221..d5ca95013 100644
--- a/lapack-netlib/TESTING/CMakeLists.txt
+++ b/lapack-netlib/TESTING/CMakeLists.txt
@@ -161,7 +161,7 @@ endif()
 # Only run this test if python 2.7 or greater is found
 if(PYTHONINTERP_FOUND)
   message(STATUS "Running Summary")
-  execute_process(COMMAND ${CMAKE_COMMAND} -E copy ${LAPACK_SOURCE_DIR}/lapack_testing.py ${LAPACK_BINARY_DIR})
+  file(COPY ${LAPACK_SOURCE_DIR}/lapack_testing.py DESTINATION ${LAPACK_BINARY_DIR})
   add_test(
     NAME LAPACK_Test_Summary
     WORKING_DIRECTORY ${LAPACK_BINARY_DIR}
diff --git a/lapack-netlib/TESTING/EIG/Makefile b/lapack-netlib/TESTING/EIG/Makefile
index 78046125a..b3efebcd0 100644
--- a/lapack-netlib/TESTING/EIG/Makefile
+++ b/lapack-netlib/TESTING/EIG/Makefile
@@ -1,5 +1,3 @@
-include ../../make.inc
-
 ########################################################################
 #  This is the makefile for the eigenvalue test program from LAPACK.
 #  The test files are organized as follows:
@@ -33,6 +31,9 @@ include ../../make.inc
 #
 ########################################################################
 
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc
+
 AEIGTST = \
    alahdg.o \
    alasum.o \
@@ -117,24 +118,26 @@ ZEIGTST = zchkee.o \
    zsgt01.o zslect.o \
    zstt21.o zstt22.o zunt01.o zunt03.o
 
+.PHONY: all
 all: single complex double complex16
 
+.PHONY: single complex double complex16
 single: xeigtsts
 complex: xeigtstc
 double: xeigtstd
 complex16: xeigtstz
 
-xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+xeigtsts: $(SEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+xeigtstc: $(CEIGTST) $(SCIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+xeigtstd: $(DEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+xeigtstz: $(ZEIGTST) $(DZIGTST) $(AEIGTST) $(TMGLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 $(AEIGTST): $(FRC)
 $(SCIGTST): $(FRC)
@@ -147,6 +150,7 @@ $(ZEIGTST): $(FRC)
 FRC:
 	@FRC=$(FRC)
 
+.PHONY: clean cleanobj cleanexe
 clean: cleanobj cleanexe
 cleanobj:
 	rm -f *.o
@@ -154,13 +158,10 @@ cleanexe:
 	rm -f xeigtst*
 
 schkee.o: schkee.f
-	$(FORTRAN) $(DRVOPTS) -c -o $@ $<
+	$(FC) $(FFLAGS_DRV) -c -o $@ $<
 dchkee.o: dchkee.f
-	$(FORTRAN) $(DRVOPTS) -c -o $@ $<
+	$(FC) $(FFLAGS_DRV) -c -o $@ $<
 cchkee.o: cchkee.f
-	$(FORTRAN) $(DRVOPTS) -c -o $@ $<
+	$(FC) $(FFLAGS_DRV) -c -o $@ $<
 zchkee.o: zchkee.f
-	$(FORTRAN) $(DRVOPTS) -c -o $@ $<
-
-.f.o:
-	$(FORTRAN) $(OPTS) -c -o $@ $<
+	$(FC) $(FFLAGS_DRV) -c -o $@ $<
diff --git a/lapack-netlib/TESTING/EIG/cbdt05.f b/lapack-netlib/TESTING/EIG/cbdt05.f
index 192a8d0b6..5a08ccce3 100644
--- a/lapack-netlib/TESTING/EIG/cbdt05.f
+++ b/lapack-netlib/TESTING/EIG/cbdt05.f
@@ -52,6 +52,7 @@
 *> \verbatim
 *>          A is COMPLEX array, dimension (LDA,N)
 *>          The m by n matrix A.
+*> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
diff --git a/lapack-netlib/TESTING/EIG/cchkst.f b/lapack-netlib/TESTING/EIG/cchkst.f
index 471fe9c92..2d25f3fb1 100644
--- a/lapack-netlib/TESTING/EIG/cchkst.f
+++ b/lapack-netlib/TESTING/EIG/cchkst.f
@@ -167,7 +167,7 @@
 *>                                              CSTEMR('V', 'I')
 *>
 *> Tests 29 through 34 are disable at present because CSTEMR
-*> does not handle partial specturm requests.
+*> does not handle partial spectrum requests.
 *>
 *> (29)    | S - Z D Z* | / ( |S| n ulp )    CSTEMR('V', 'I')
 *>
diff --git a/lapack-netlib/TESTING/EIG/cchkst2stg.f b/lapack-netlib/TESTING/EIG/cchkst2stg.f
index df610c207..5c478577f 100644
--- a/lapack-netlib/TESTING/EIG/cchkst2stg.f
+++ b/lapack-netlib/TESTING/EIG/cchkst2stg.f
@@ -188,7 +188,7 @@
 *>                                              CSTEMR('V', 'I')
 *>
 *> Tests 29 through 34 are disable at present because CSTEMR
-*> does not handle partial specturm requests.
+*> does not handle partial spectrum requests.
 *>
 *> (29)    | S - Z D Z* | / ( |S| n ulp )    CSTEMR('V', 'I')
 *>
diff --git a/lapack-netlib/TESTING/EIG/cdrgsx.f b/lapack-netlib/TESTING/EIG/cdrgsx.f
index 4e0f8b468..746946d07 100644
--- a/lapack-netlib/TESTING/EIG/cdrgsx.f
+++ b/lapack-netlib/TESTING/EIG/cdrgsx.f
@@ -737,7 +737,7 @@
       CALL CLACPY( 'Full', MPLUSN, MPLUSN, AI, LDA, A, LDA )
       CALL CLACPY( 'Full', MPLUSN, MPLUSN, BI, LDA, B, LDA )
 *
-*     Compute the Schur factorization while swaping the
+*     Compute the Schur factorization while swapping the
 *     m-by-m (1,1)-blocks with n-by-n (2,2)-blocks.
 *
       CALL CGGESX( 'V', 'V', 'S', CLCTSX, 'B', MPLUSN, AI, LDA, BI, LDA,
diff --git a/lapack-netlib/TESTING/EIG/cdrvbd.f b/lapack-netlib/TESTING/EIG/cdrvbd.f
index 64bed3b13..7b7b01b47 100644
--- a/lapack-netlib/TESTING/EIG/cdrvbd.f
+++ b/lapack-netlib/TESTING/EIG/cdrvbd.f
@@ -33,8 +33,9 @@
 *>
 *> \verbatim
 *>
-*> CDRVBD checks the singular value decomposition (SVD) driver CGESVD
-*> and CGESDD.
+*> CDRVBD checks the singular value decomposition (SVD) driver CGESVD,
+*> CGESDD, CGESVJ, CGEJSV, CGESVDX, and CGESVDQ.
+*>
 *> CGESVD and CGESDD factors A = U diag(S) VT, where U and VT are
 *> unitary and diag(S) is diagonal with the entries of the array S on
 *> its diagonal. The entries of S are the singular values, nonnegative
@@ -73,81 +74,92 @@
 *>
 *> Test for CGESDD:
 *>
-*> (1)   | A - U diag(S) VT | / ( |A| max(M,N) ulp )
+*> (8)   | A - U diag(S) VT | / ( |A| max(M,N) ulp )
 *>
-*> (2)   | I - U'U | / ( M ulp )
+*> (9)   | I - U'U | / ( M ulp )
 *>
-*> (3)   | I - VT VT' | / ( N ulp )
+*> (10)  | I - VT VT' | / ( N ulp )
 *>
-*> (4)   S contains MNMIN nonnegative values in decreasing order.
+*> (11)  S contains MNMIN nonnegative values in decreasing order.
 *>       (Return 0 if true, 1/ULP if false.)
 *>
-*> (5)   | U - Upartial | / ( M ulp ) where Upartial is a partially
+*> (12)  | U - Upartial | / ( M ulp ) where Upartial is a partially
 *>       computed U.
 *>
-*> (6)   | VT - VTpartial | / ( N ulp ) where VTpartial is a partially
+*> (13)  | VT - VTpartial | / ( N ulp ) where VTpartial is a partially
 *>       computed VT.
 *>
-*> (7)   | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the
+*> (14)  | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the
 *>       vector of singular values from the partial SVD
 *>
+*> Test for CGESVDQ:
+*>
+*> (36)  | A - U diag(S) VT | / ( |A| max(M,N) ulp )
+*>
+*> (37)  | I - U'U | / ( M ulp )
+*>
+*> (38)  | I - VT VT' | / ( N ulp )
+*>
+*> (39)  S contains MNMIN nonnegative values in decreasing order.
+*>       (Return 0 if true, 1/ULP if false.)
+*>
 *> Test for CGESVJ:
 *>
-*> (1)   | A - U diag(S) VT | / ( |A| max(M,N) ulp )
+*> (15)  | A - U diag(S) VT | / ( |A| max(M,N) ulp )
 *>
-*> (2)   | I - U'U | / ( M ulp )
+*> (16)  | I - U'U | / ( M ulp )
 *>
-*> (3)   | I - VT VT' | / ( N ulp )
+*> (17)  | I - VT VT' | / ( N ulp )
 *>
-*> (4)   S contains MNMIN nonnegative values in decreasing order.
+*> (18)  S contains MNMIN nonnegative values in decreasing order.
 *>       (Return 0 if true, 1/ULP if false.)
 *>
 *> Test for CGEJSV:
 *>
-*> (1)   | A - U diag(S) VT | / ( |A| max(M,N) ulp )
+*> (19)  | A - U diag(S) VT | / ( |A| max(M,N) ulp )
 *>
-*> (2)   | I - U'U | / ( M ulp )
+*> (20)  | I - U'U | / ( M ulp )
 *>
-*> (3)   | I - VT VT' | / ( N ulp )
+*> (21)  | I - VT VT' | / ( N ulp )
 *>
-*> (4)   S contains MNMIN nonnegative values in decreasing order.
+*> (22)  S contains MNMIN nonnegative values in decreasing order.
 *>        (Return 0 if true, 1/ULP if false.)
 *>
 *> Test for CGESVDX( 'V', 'V', 'A' )/CGESVDX( 'N', 'N', 'A' )
 *>
-*> (1)   | A - U diag(S) VT | / ( |A| max(M,N) ulp )
+*> (23)  | A - U diag(S) VT | / ( |A| max(M,N) ulp )
 *>
-*> (2)   | I - U'U | / ( M ulp )
+*> (24)  | I - U'U | / ( M ulp )
 *>
-*> (3)   | I - VT VT' | / ( N ulp )
+*> (25)  | I - VT VT' | / ( N ulp )
 *>
-*> (4)   S contains MNMIN nonnegative values in decreasing order.
+*> (26)  S contains MNMIN nonnegative values in decreasing order.
 *>       (Return 0 if true, 1/ULP if false.)
 *>
-*> (5)   | U - Upartial | / ( M ulp ) where Upartial is a partially
+*> (27)  | U - Upartial | / ( M ulp ) where Upartial is a partially
 *>       computed U.
 *>
-*> (6)   | VT - VTpartial | / ( N ulp ) where VTpartial is a partially
+*> (28)  | VT - VTpartial | / ( N ulp ) where VTpartial is a partially
 *>       computed VT.
 *>
-*> (7)   | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the
+*> (29)  | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the
 *>       vector of singular values from the partial SVD
 *>
 *> Test for CGESVDX( 'V', 'V', 'I' )
 *>
-*> (8)   | U' A VT''' - diag(S) | / ( |A| max(M,N) ulp )
+*> (30)  | U' A VT''' - diag(S) | / ( |A| max(M,N) ulp )
 *>
-*> (9)   | I - U'U | / ( M ulp )
+*> (31)  | I - U'U | / ( M ulp )
 *>
-*> (10)  | I - VT VT' | / ( N ulp )
+*> (32)  | I - VT VT' | / ( N ulp )
 *>
 *> Test for CGESVDX( 'V', 'V', 'V' )
 *>
-*> (11)   | U' A VT''' - diag(S) | / ( |A| max(M,N) ulp )
+*> (33)   | U' A VT''' - diag(S) | / ( |A| max(M,N) ulp )
 *>
-*> (12)   | I - U'U | / ( M ulp )
+*> (34)   | I - U'U | / ( M ulp )
 *>
-*> (13)   | I - VT VT' | / ( N ulp )
+*> (35)   | I - VT VT' | / ( N ulp )
 *>
 *> The "sizes" are specified by the arrays MM(1:NSIZES) and
 *> NN(1:NSIZES); the value of each element pair (MM(j),NN(j))
@@ -393,6 +405,8 @@
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     June 2016
+*
+      IMPLICIT NONE
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, LDU, LDVT, LWORK, NOUNIT, NSIZES,
@@ -411,7 +425,7 @@
 *  =====================================================================
 *
 *     .. Parameters ..
-      REAL              ZERO, ONE, TWO, HALF
+      REAL               ZERO, ONE, TWO, HALF
       PARAMETER          ( ZERO = 0.0E0, ONE = 1.0E0, TWO = 2.0E0,
      $                   HALF = 0.5E0 )
       COMPLEX            CZERO, CONE
@@ -431,10 +445,13 @@
       REAL               ANORM, DIF, DIV, OVFL, RTUNFL, ULP, ULPINV,
      $                   UNFL, VL, VU
 *     ..
+*     .. Local Scalars for CGESVDQ ..
+      INTEGER            LIWORK, NUMRANK
+*     ..
 *     .. Local Arrays ..
       CHARACTER          CJOB( 4 ), CJOBR( 3 ), CJOBV( 2 )
       INTEGER            IOLDSD( 4 ), ISEED2( 4 )
-      REAL               RESULT( 35 )
+      REAL               RESULT( 39 )
 *     ..
 *     .. External Functions ..
       REAL               SLAMCH, SLARND
@@ -442,8 +459,8 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALASVM, XERBLA, CBDT01, CBDT05, CGESDD,
-     $                   CGESVD, CGESVJ, CGEJSV, CGESVDX, CLACPY,
-     $                   CLASET, CLATMS, CUNT01, CUNT03
+     $                   CGESVD, CGESVDQ, CGESVJ, CGEJSV, CGESVDX,
+     $                   CLACPY, CLASET, CLATMS, CUNT01, CUNT03
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, REAL, MAX, MIN
@@ -838,8 +855,64 @@
   130          CONTINUE
 
 *
-*              Test CGESVJ: Factorize A
-*              Note: CGESVJ does not work for M < N
+*              Test CGESVDQ
+*              Note: CGESVDQ only works for M >= N
+*
+               RESULT( 36 ) = ZERO
+               RESULT( 37 ) = ZERO
+               RESULT( 38 ) = ZERO
+               RESULT( 39 ) = ZERO
+*
+               IF( M.GE.N ) THEN
+                  IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N )
+                  LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3
+                  LSWORK = MIN( LSWORK, LWORK )
+                  LSWORK = MAX( LSWORK, 1 )
+                  IF( IWSPC.EQ.4 )
+     $               LSWORK = LWORK
+*
+                  CALL CLACPY( 'F', M, N, ASAV, LDA, A, LDA )
+                  SRNAMT = 'CGESVDQ'
+*
+                  LRWORK = MAX(2, M, 5*N)
+                  LIWORK = MAX( N, 1 )
+                  CALL CGESVDQ( 'H', 'N', 'N', 'A', 'A', 
+     $                          M, N, A, LDA, SSAV, USAV, LDU,
+     $                          VTSAV, LDVT, NUMRANK, IWORK, LIWORK,
+     $                          WORK, LWORK, RWORK, LRWORK, IINFO )
+*
+                  IF( IINFO.NE.0 ) THEN
+                     WRITE( NOUNIT, FMT = 9995 )'CGESVDQ', IINFO, M, N,
+     $               JTYPE, LSWORK, IOLDSD
+                     INFO = ABS( IINFO )
+                     RETURN
+                  END IF
+*
+*                 Do tests 36--39
+*
+                  CALL CBDT01( M, N, 0, ASAV, LDA, USAV, LDU, SSAV, E,
+     $                         VTSAV, LDVT, WORK, RWORK, RESULT( 36 ) )
+                  IF( M.NE.0 .AND. N.NE.0 ) THEN
+                     CALL CUNT01( 'Columns', M, M, USAV, LDU, WORK,
+     $                            LWORK, RWORK, RESULT( 37 ) )
+                     CALL CUNT01( 'Rows', N, N, VTSAV, LDVT, WORK,
+     $                            LWORK, RWORK, RESULT( 38 ) )
+                  END IF
+                  RESULT( 39 ) = ZERO
+                  DO 199 I = 1, MNMIN - 1
+                     IF( SSAV( I ).LT.SSAV( I+1 ) )
+     $                  RESULT( 39 ) = ULPINV
+                     IF( SSAV( I ).LT.ZERO )
+     $                  RESULT( 39 ) = ULPINV
+  199             CONTINUE
+                  IF( MNMIN.GE.1 ) THEN
+                     IF( SSAV( MNMIN ).LT.ZERO )
+     $                  RESULT( 39 ) = ULPINV
+                  END IF
+               END IF
+*
+*              Test CGESVJ
+*              Note: CGESVJ only works for M >= N
 *
                RESULT( 15 ) = ZERO
                RESULT( 16 ) = ZERO
@@ -847,13 +920,13 @@
                RESULT( 18 ) = ZERO
 *
                IF( M.GE.N ) THEN
-               IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N )
-               LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3
-               LSWORK = MIN( LSWORK, LWORK )
-               LSWORK = MAX( LSWORK, 1 )
-               LRWORK = MAX(6,N)
-               IF( IWSPC.EQ.4 )
-     $            LSWORK = LWORK
+                  IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N )
+                  LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3
+                  LSWORK = MIN( LSWORK, LWORK )
+                  LSWORK = MAX( LSWORK, 1 )
+                  LRWORK = MAX(6,N)
+                  IF( IWSPC.EQ.4 )
+     $               LSWORK = LWORK
 *
                   CALL CLACPY( 'F', M, N, ASAV, LDA, USAV, LDA )
                   SRNAMT = 'CGESVJ'
@@ -861,8 +934,7 @@
      &                        0, A, LDVT, WORK, LWORK, RWORK,
      &                        LRWORK, IINFO )
 *
-*                 CGESVJ retuns V not VT, so we transpose to use the same
-*                 test suite.
+*                 CGESVJ returns V not VH
 *
                   DO J=1,N
                      DO I=1,N
@@ -900,31 +972,30 @@
                   END IF
                END IF
 *
-*              Test CGEJSV: Factorize A
-*              Note: CGEJSV does not work for M < N
+*              Test CGEJSV
+*              Note: CGEJSV only works for M >= N
 *
                RESULT( 19 ) = ZERO
                RESULT( 20 ) = ZERO
                RESULT( 21 ) = ZERO
                RESULT( 22 ) = ZERO
                IF( M.GE.N ) THEN
-               IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N )
-               LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3
-               LSWORK = MIN( LSWORK, LWORK )
-               LSWORK = MAX( LSWORK, 1 )
-               IF( IWSPC.EQ.4 )
-     $            LSWORK = LWORK
-               LRWORK = MAX( 7, N + 2*M)
-*
-                 CALL CLACPY( 'F', M, N, ASAV, LDA, VTSAV, LDA )
+                  IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N )
+                  LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3
+                  LSWORK = MIN( LSWORK, LWORK )
+                  LSWORK = MAX( LSWORK, 1 )
+                  IF( IWSPC.EQ.4 )
+     $               LSWORK = LWORK
+                  LRWORK = MAX( 7, N + 2*M)
+*
+                  CALL CLACPY( 'F', M, N, ASAV, LDA, VTSAV, LDA )
                   SRNAMT = 'CGEJSV'
                   CALL CGEJSV( 'G', 'U', 'V', 'R', 'N', 'N',
      &                   M, N, VTSAV, LDA, SSAV, USAV, LDU, A, LDVT,
      &                   WORK, LWORK, RWORK,
      &                   LRWORK, IWORK, IINFO )
 *
-*                 CGEJSV retuns V not VT, so we transpose to use the same
-*                 test suite.
+*                 CGEJSV returns V not VH
 *
                   DO 133 J=1,N
                      DO 132 I=1,N
@@ -933,7 +1004,7 @@
   133             END DO
 *
                   IF( IINFO.NE.0 ) THEN
-                     WRITE( NOUNIT, FMT = 9995 )'GESVJ', IINFO, M, N,
+                     WRITE( NOUNIT, FMT = 9995 )'GEJSV', IINFO, M, N,
      $               JTYPE, LSWORK, IOLDSD
                      INFO = ABS( IINFO )
                      RETURN
@@ -1160,7 +1231,7 @@
 *
                NTEST = 0
                NFAIL = 0
-               DO 190 J = 1, 35
+               DO 190 J = 1, 39
                   IF( RESULT( J ).GE.ZERO )
      $               NTEST = NTEST + 1
                   IF( RESULT( J ).GE.THRESH )
@@ -1175,7 +1246,7 @@
                   NTESTF = 2
                END IF
 *
-               DO 200 J = 1, 35
+               DO 200 J = 1, 39
                   IF( RESULT( J ).GE.THRESH ) THEN
                      WRITE( NOUNIT, FMT = 9997 )M, N, JTYPE, IWSPC,
      $                  IOLDSD, J, RESULT( J )
@@ -1251,6 +1322,12 @@
      $      / '33 = | U**T A VT**T - diag(S) | / ( |A| max(M,N) ulp )',
      $      / '34 = | I - U**T U | / ( M ulp ) ',
      $      / '35 = | I - VT VT**T | / ( N ulp ) ',
+     $      ' CGESVDQ(H,N,N,A,A',
+     $      / '36 = | A - U diag(S) VT | / ( |A| max(M,N) ulp ) ',
+     $      / '37 = | I - U**T U | / ( M ulp ) ',
+     $      / '38 = | I - VT VT**T | / ( N ulp ) ',
+     $      / '39 = 0 if S contains min(M,N) nonnegative values in',
+     $      ' decreasing order, else 1/ulp',
      $      / / )
  9997 FORMAT( ' M=', I5, ', N=', I5, ', type ', I1, ', IWS=', I1,
      $      ', seed=', 4( I4, ',' ), ' test(', I2, ')=', G11.4 )
diff --git a/lapack-netlib/TESTING/EIG/cerred.f b/lapack-netlib/TESTING/EIG/cerred.f
index f1670e983..a0ceff76e 100644
--- a/lapack-netlib/TESTING/EIG/cerred.f
+++ b/lapack-netlib/TESTING/EIG/cerred.f
@@ -36,6 +36,8 @@
 *>       CGEJSV   compute SVD of an M-by-N matrix A where M >= N
 *>       CGESVDX  compute SVD of an M-by-N matrix A(by bisection
 *>                and inverse iteration)
+*>       CGESVDQ  compute SVD of an M-by-N matrix A(with a 
+*>                QR-Preconditioned )
 *> \endverbatim
 *
 *  Arguments:
@@ -101,7 +103,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHKXER, CGEES, CGEESX, CGEEV, CGEEVX, CGEJSV,
-     $                   CGESDD, CGESVD
+     $                   CGESDD, CGESVD, CGESVDX, CGESVDQ
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAMEN, CSLECT
@@ -495,6 +497,61 @@
          ELSE
             WRITE( NOUT, FMT = 9998 )
          END IF
+*
+*        Test CGESVDQ
+*
+         SRNAMT = 'CGESVDQ'
+         INFOT = 1
+         CALL CGESVDQ( 'X', 'P', 'T', 'A', 'A', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL CGESVDQ( 'A', 'X', 'T', 'A', 'A', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL CGESVDQ( 'A', 'P', 'X', 'A', 'A', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL CGESVDQ( 'A', 'P', 'T', 'X', 'A', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL CGESVDQ( 'A', 'P', 'T', 'A', 'X', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 6
+         CALL CGESVDQ( 'A', 'P', 'T', 'A', 'A', -1, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 7
+         CALL CGESVDQ( 'A', 'P', 'T', 'A', 'A', 0, 1, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 9
+         CALL CGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 0, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 12
+         CALL CGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U,
+     $                 -1, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 14
+         CALL CGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U,
+     $                 1, VT, -1, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 17
+         CALL CGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U,
+     $                 1, VT, 1, NS, IW, -5, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'CGESVDQ', INFOT, NOUT, LERR, OK )
+         NT = 11
+         IF( OK ) THEN
+            WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
+     $           NT
+         ELSE
+            WRITE( NOUT, FMT = 9998 )
+         END IF
       END IF
 *
 *     Print a summary line.
diff --git a/lapack-netlib/TESTING/EIG/cget51.f b/lapack-netlib/TESTING/EIG/cget51.f
index ce1108aa4..ec58086d4 100644
--- a/lapack-netlib/TESTING/EIG/cget51.f
+++ b/lapack-netlib/TESTING/EIG/cget51.f
@@ -29,12 +29,13 @@
 *>
 *>      CGET51  generally checks a decomposition of the form
 *>
-*>              A = U B VC>
-*>      where * means conjugate transpose and U and V are unitary.
+*>              A = U B V**H
+*>
+*>      where **H means conjugate transpose and U and V are unitary.
 *>
 *>      Specifically, if ITYPE=1
 *>
-*>              RESULT = | A - U B V* | / ( |A| n ulp )
+*>              RESULT = | A - U B V**H | / ( |A| n ulp )
 *>
 *>      If ITYPE=2, then:
 *>
@@ -42,7 +43,7 @@
 *>
 *>      If ITYPE=3, then:
 *>
-*>              RESULT = | I - UU* | / ( n ulp )
+*>              RESULT = | I - U U**H | / ( n ulp )
 *> \endverbatim
 *
 *  Arguments:
@@ -52,9 +53,9 @@
 *> \verbatim
 *>          ITYPE is INTEGER
 *>          Specifies the type of tests to be performed.
-*>          =1: RESULT = | A - U B V* | / ( |A| n ulp )
+*>          =1: RESULT = | A - U B V**H | / ( |A| n ulp )
 *>          =2: RESULT = | A - B | / ( |A| n ulp )
-*>          =3: RESULT = | I - UU* | / ( n ulp )
+*>          =3: RESULT = | I - U U**H | / ( n ulp )
 *> \endverbatim
 *>
 *> \param[in] N
@@ -218,7 +219,7 @@
 *
          IF( ITYPE.EQ.1 ) THEN
 *
-*           ITYPE=1: Compute W = A - UBV'
+*           ITYPE=1: Compute W = A - U B V**H
 *
             CALL CLACPY( ' ', N, N, A, LDA, WORK, N )
             CALL CGEMM( 'N', 'N', N, N, N, CONE, U, LDU, B, LDB, CZERO,
@@ -259,7 +260,7 @@
 *
 *        Tests not scaled by norm(A)
 *
-*        ITYPE=3: Compute  UU' - I
+*        ITYPE=3: Compute  U U**H - I
 *
          CALL CGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO,
      $               WORK, N )
diff --git a/lapack-netlib/TESTING/EIG/chbt21.f b/lapack-netlib/TESTING/EIG/chbt21.f
index 90ec74c23..76eb7d115 100644
--- a/lapack-netlib/TESTING/EIG/chbt21.f
+++ b/lapack-netlib/TESTING/EIG/chbt21.f
@@ -28,14 +28,16 @@
 *>
 *> CHBT21  generally checks a decomposition of the form
 *>
-*>         A = U S UC>
-*> where * means conjugate transpose, A is hermitian banded, U is
+*>         A = U S U**H
+*>
+*> where **H means conjugate transpose, A is hermitian banded, U is
 *> unitary, and S is diagonal (if KS=0) or symmetric
 *> tridiagonal (if KS=1).
 *>
 *> Specifically:
 *>
-*>         RESULT(1) = | A - U S U* | / ( |A| n ulp ) *andC>         RESULT(2) = | I - UU* | / ( n ulp )
+*>         RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and
+*>         RESULT(2) = | I - U U**H | / ( n ulp )
 *> \endverbatim
 *
 *  Arguments:
@@ -220,7 +222,7 @@
 *
       ANORM = MAX( CLANHB( '1', CUPLO, N, IKA, A, LDA, RWORK ), UNFL )
 *
-*     Compute error matrix:    Error = A - U S U*
+*     Compute error matrix:    Error = A - U S U**H
 *
 *     Copy A from SB to SP storage format.
 *
@@ -271,7 +273,7 @@
 *
 *     Do Test 2
 *
-*     Compute  UU* - I
+*     Compute  U U**H - I
 *
       CALL CGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO, WORK,
      $            N )
diff --git a/lapack-netlib/TESTING/EIG/chet21.f b/lapack-netlib/TESTING/EIG/chet21.f
index 5aff64904..d5c4f1348 100644
--- a/lapack-netlib/TESTING/EIG/chet21.f
+++ b/lapack-netlib/TESTING/EIG/chet21.f
@@ -29,8 +29,9 @@
 *>
 *> CHET21 generally checks a decomposition of the form
 *>
-*>    A = U S UC>
-*> where * means conjugate transpose, A is hermitian, U is unitary, and
+*>    A = U S U**H
+*>
+*> where **H means conjugate transpose, A is hermitian, U is unitary, and
 *> S is diagonal (if KBAND=0) or (real) symmetric tridiagonal (if
 *> KBAND=1).
 *>
@@ -42,18 +43,19 @@
 *>
 *> Specifically, if ITYPE=1, then:
 *>
-*>    RESULT(1) = | A - U S U* | / ( |A| n ulp ) *andC>    RESULT(2) = | I - UU* | / ( n ulp )
+*>    RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and
+*>    RESULT(2) = | I - U U**H | / ( n ulp )
 *>
 *> If ITYPE=2, then:
 *>
-*>    RESULT(1) = | A - V S V* | / ( |A| n ulp )
+*>    RESULT(1) = | A - V S V**H | / ( |A| n ulp )
 *>
 *> If ITYPE=3, then:
 *>
-*>    RESULT(1) = | I - UV* | / ( n ulp )
+*>    RESULT(1) = | I - U V**H | / ( n ulp )
 *>
 *> For ITYPE > 1, the transformation U is expressed as a product
-*> V = H(1)...H(n-2),  where H(j) = I  -  tau(j) v(j) v(j)C> and each
+*> V = H(1)...H(n-2),  where H(j) = I  -  tau(j) v(j) v(j)**H and each
 *> vector v(j) has its first j elements 0 and the remaining n-j elements
 *> stored in V(j+1:n,j).
 *> \endverbatim
@@ -66,14 +68,15 @@
 *>          ITYPE is INTEGER
 *>          Specifies the type of tests to be performed.
 *>          1: U expressed as a dense unitary matrix:
-*>             RESULT(1) = | A - U S U* | / ( |A| n ulp )   *andC>             RESULT(2) = | I - UU* | / ( n ulp )
+*>             RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and
+*>             RESULT(2) = | I - U U**H | / ( n ulp )
 *>
 *>          2: U expressed as a product V of Housholder transformations:
-*>             RESULT(1) = | A - V S V* | / ( |A| n ulp )
+*>             RESULT(1) = | A - V S V**H | / ( |A| n ulp )
 *>
 *>          3: U expressed both as a dense unitary matrix and
 *>             as a product of Housholder transformations:
-*>             RESULT(1) = | I - UV* | / ( n ulp )
+*>             RESULT(1) = | I - U V**H | / ( n ulp )
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -171,7 +174,7 @@
 *> \verbatim
 *>          TAU is COMPLEX array, dimension (N)
 *>          If ITYPE >= 2, then TAU(j) is the scalar factor of
-*>          v(j) v(j)* in the Householder transformation H(j) of
+*>          v(j) v(j)**H in the Householder transformation H(j) of
 *>          the product  U = H(1)...H(n-2)
 *>          If ITYPE < 2, then TAU is not referenced.
 *> \endverbatim
@@ -294,7 +297,7 @@
 *
       IF( ITYPE.EQ.1 ) THEN
 *
-*        ITYPE=1: error = A - U S U*
+*        ITYPE=1: error = A - U S U**H
 *
          CALL CLASET( 'Full', N, N, CZERO, CZERO, WORK, N )
          CALL CLACPY( CUPLO, N, N, A, LDA, WORK, N )
@@ -304,7 +307,6 @@
    10    CONTINUE
 *
          IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN
-CMK            DO 20 J = 1, N - 1
             DO 20 J = 2, N - 1
                CALL CHER2( CUPLO, N, -CMPLX( E( J ) ), U( 1, J ), 1,
      $                     U( 1, J-1 ), 1, WORK, N )
@@ -314,7 +316,7 @@ CMK            DO 20 J = 1, N - 1
 *
       ELSE IF( ITYPE.EQ.2 ) THEN
 *
-*        ITYPE=2: error = V S V* - A
+*        ITYPE=2: error = V S V**H - A
 *
          CALL CLASET( 'Full', N, N, CZERO, CZERO, WORK, N )
 *
@@ -371,7 +373,7 @@ CMK            DO 20 J = 1, N - 1
 *
       ELSE IF( ITYPE.EQ.3 ) THEN
 *
-*        ITYPE=3: error = U V* - I
+*        ITYPE=3: error = U V**H - I
 *
          IF( N.LT.2 )
      $      RETURN
@@ -407,7 +409,7 @@ CMK            DO 20 J = 1, N - 1
 *
 *     Do Test 2
 *
-*     Compute  UU* - I
+*     Compute  U U**H - I
 *
       IF( ITYPE.EQ.1 ) THEN
          CALL CGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO,
diff --git a/lapack-netlib/TESTING/EIG/chet22.f b/lapack-netlib/TESTING/EIG/chet22.f
index 5087ecbca..354387f2a 100644
--- a/lapack-netlib/TESTING/EIG/chet22.f
+++ b/lapack-netlib/TESTING/EIG/chet22.f
@@ -42,7 +42,8 @@
 *>
 *>      Specifically, if ITYPE=1, then:
 *>
-*>              RESULT(1) = | U' A U - S | / ( |A| m ulp ) *andC>              RESULT(2) = | I - U'U | / ( m ulp )
+*>              RESULT(1) = | U**H A U - S | / ( |A| m ulp ) and
+*>              RESULT(2) = | I - U**H U | / ( m ulp )
 *> \endverbatim
 *
 *  Arguments:
@@ -52,7 +53,8 @@
 *>  ITYPE   INTEGER
 *>          Specifies the type of tests to be performed.
 *>          1: U expressed as a dense orthogonal matrix:
-*>             RESULT(1) = | A - U S U' | / ( |A| n ulp )   *andC>             RESULT(2) = | I - UU' | / ( n ulp )
+*>             RESULT(1) = | A - U S U**H | / ( |A| n ulp )  and
+*>             RESULT(2) = | I - U U**H | / ( n ulp )
 *>
 *>  UPLO    CHARACTER
 *>          If UPLO='U', the upper triangle of A will be used and the
@@ -122,7 +124,7 @@
 *>
 *>  TAU     COMPLEX array, dimension (N)
 *>          If ITYPE >= 2, then TAU(j) is the scalar factor of
-*>          v(j) v(j)' in the Householder transformation H(j) of
+*>          v(j) v(j)**H in the Householder transformation H(j) of
 *>          the product  U = H(1)...H(n-2)
 *>          If ITYPE < 2, then TAU is not referenced.
 *>          Not modified.
@@ -215,7 +217,7 @@
 *
 *     Compute error matrix:
 *
-*     ITYPE=1: error = U' A U - S
+*     ITYPE=1: error = U**H A U - S
 *
       CALL CHEMM( 'L', UPLO, N, M, CONE, A, LDA, U, LDU, CZERO, WORK,
      $            N )
@@ -249,7 +251,7 @@
 *
 *     Do Test 2
 *
-*     Compute  U'U - I
+*     Compute  U**H U - I
 *
       IF( ITYPE.EQ.1 )
      $   CALL CUNT01( 'Columns', N, M, U, LDU, WORK, 2*N*N, RWORK,
diff --git a/lapack-netlib/TESTING/EIG/chpt21.f b/lapack-netlib/TESTING/EIG/chpt21.f
index e151a8bd8..f20921bd9 100644
--- a/lapack-netlib/TESTING/EIG/chpt21.f
+++ b/lapack-netlib/TESTING/EIG/chpt21.f
@@ -29,8 +29,9 @@
 *>
 *> CHPT21  generally checks a decomposition of the form
 *>
-*>         A = U S UC>
-*> where * means conjugate transpose, A is hermitian, U is
+*>         A = U S U**H
+*>
+*> where **H means conjugate transpose, A is hermitian, U is
 *> unitary, and S is diagonal (if KBAND=0) or (real) symmetric
 *> tridiagonal (if KBAND=1).  If ITYPE=1, then U is represented as
 *> a dense matrix, otherwise the U is expressed as a product of
@@ -41,15 +42,16 @@
 *>
 *> Specifically, if ITYPE=1, then:
 *>
-*>         RESULT(1) = | A - U S U* | / ( |A| n ulp ) *andC>         RESULT(2) = | I - UU* | / ( n ulp )
+*>         RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and
+*>         RESULT(2) = | I - U U**H | / ( n ulp )
 *>
 *> If ITYPE=2, then:
 *>
-*>         RESULT(1) = | A - V S V* | / ( |A| n ulp )
+*>         RESULT(1) = | A - V S V**H | / ( |A| n ulp )
 *>
 *> If ITYPE=3, then:
 *>
-*>         RESULT(1) = | I - UV* | / ( n ulp )
+*>         RESULT(1) = | I - U V**H | / ( n ulp )
 *>
 *> Packed storage means that, for example, if UPLO='U', then the columns
 *> of the upper triangle of A are stored one after another, so that
@@ -70,14 +72,16 @@
 *>
 *>    If UPLO='U', then  V = H(n-1)...H(1),  where
 *>
-*>        H(j) = I  -  tau(j) v(j) v(j)C>
+*>        H(j) = I  -  tau(j) v(j) v(j)**H
+*>
 *>    and the first j-1 elements of v(j) are stored in V(1:j-1,j+1),
 *>    (i.e., VP( j*(j+1)/2 + 1 : j*(j+1)/2 + j-1 ) ),
 *>    the j-th element is 1, and the last n-j elements are 0.
 *>
 *>    If UPLO='L', then  V = H(1)...H(n-1),  where
 *>
-*>        H(j) = I  -  tau(j) v(j) v(j)C>
+*>        H(j) = I  -  tau(j) v(j) v(j)**H
+*>
 *>    and the first j elements of v(j) are 0, the (j+1)-st is 1, and the
 *>    (j+2)-nd through n-th elements are stored in V(j+2:n,j) (i.e.,
 *>    in VP( (2*n-j)*(j-1)/2 + j+2 : (2*n-j)*(j-1)/2 + n ) .)
@@ -91,14 +95,15 @@
 *>          ITYPE is INTEGER
 *>          Specifies the type of tests to be performed.
 *>          1: U expressed as a dense unitary matrix:
-*>             RESULT(1) = | A - U S U* | / ( |A| n ulp )   *andC>             RESULT(2) = | I - UU* | / ( n ulp )
+*>             RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and
+*>             RESULT(2) = | I - U U**H | / ( n ulp )
 *>
 *>          2: U expressed as a product V of Housholder transformations:
-*>             RESULT(1) = | A - V S V* | / ( |A| n ulp )
+*>             RESULT(1) = | A - V S V**H | / ( |A| n ulp )
 *>
 *>          3: U expressed both as a dense unitary matrix and
 *>             as a product of Housholder transformations:
-*>             RESULT(1) = | I - UV* | / ( n ulp )
+*>             RESULT(1) = | I - U V**H | / ( n ulp )
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -181,7 +186,7 @@
 *> \verbatim
 *>          TAU is COMPLEX array, dimension (N)
 *>          If ITYPE >= 2, then TAU(j) is the scalar factor of
-*>          v(j) v(j)* in the Householder transformation H(j) of
+*>          v(j) v(j)**H in the Householder transformation H(j) of
 *>          the product  U = H(1)...H(n-2)
 *>          If ITYPE < 2, then TAU is not referenced.
 *> \endverbatim
@@ -313,7 +318,7 @@
 *
       IF( ITYPE.EQ.1 ) THEN
 *
-*        ITYPE=1: error = A - U S U*
+*        ITYPE=1: error = A - U S U**H
 *
          CALL CLASET( 'Full', N, N, CZERO, CZERO, WORK, N )
          CALL CCOPY( LAP, AP, 1, WORK, 1 )
@@ -332,7 +337,7 @@
 *
       ELSE IF( ITYPE.EQ.2 ) THEN
 *
-*        ITYPE=2: error = V S V* - A
+*        ITYPE=2: error = V S V**H - A
 *
          CALL CLASET( 'Full', N, N, CZERO, CZERO, WORK, N )
 *
@@ -400,7 +405,7 @@
 *
       ELSE IF( ITYPE.EQ.3 ) THEN
 *
-*        ITYPE=3: error = U V* - I
+*        ITYPE=3: error = U V**H - I
 *
          IF( N.LT.2 )
      $      RETURN
@@ -431,7 +436,7 @@
 *
 *     Do Test 2
 *
-*     Compute  UU* - I
+*     Compute  U U**H - I
 *
       IF( ITYPE.EQ.1 ) THEN
          CALL CGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO,
diff --git a/lapack-netlib/TESTING/EIG/cstt21.f b/lapack-netlib/TESTING/EIG/cstt21.f
index 47d99ac49..3fdfa1675 100644
--- a/lapack-netlib/TESTING/EIG/cstt21.f
+++ b/lapack-netlib/TESTING/EIG/cstt21.f
@@ -28,14 +28,15 @@
 *>
 *> CSTT21  checks a decomposition of the form
 *>
-*>    A = U S UC>
-*> where * means conjugate transpose, A is real symmetric tridiagonal,
+*>    A = U S U**H
+*>
+*> where **H means conjugate transpose, A is real symmetric tridiagonal,
 *> U is unitary, and S is real and diagonal (if KBAND=0) or symmetric
 *> tridiagonal (if KBAND=1).  Two tests are performed:
 *>
-*>    RESULT(1) = | A - U S U* | / ( |A| n ulp )
+*>    RESULT(1) = | A - U S U**H | / ( |A| n ulp )
 *>
-*>    RESULT(2) = | I - UU* | / ( n ulp )
+*>    RESULT(2) = | I - U U**H | / ( n ulp )
 *> \endverbatim
 *
 *  Arguments:
@@ -201,7 +202,7 @@
       WORK( N**2 ) = AD( N )
       ANORM = MAX( ANORM, ABS( AD( N ) )+TEMP1, UNFL )
 *
-*     Norm of A - USU*
+*     Norm of A - U S U**H
 *
       DO 20 J = 1, N
          CALL CHER( 'L', N, -SD( J ), U( 1, J ), 1, WORK, N )
@@ -228,7 +229,7 @@
 *
 *     Do Test 2
 *
-*     Compute  UU* - I
+*     Compute  U U**H - I
 *
       CALL CGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO, WORK,
      $            N )
diff --git a/lapack-netlib/TESTING/EIG/dbdt05.f b/lapack-netlib/TESTING/EIG/dbdt05.f
index 3580aec81..356bb5fc8 100644
--- a/lapack-netlib/TESTING/EIG/dbdt05.f
+++ b/lapack-netlib/TESTING/EIG/dbdt05.f
@@ -52,6 +52,7 @@
 *> \verbatim
 *>          A is DOUBLE PRECISION array, dimension (LDA,N)
 *>          The m by n matrix A.
+*> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
diff --git a/lapack-netlib/TESTING/EIG/dchkst.f b/lapack-netlib/TESTING/EIG/dchkst.f
index f08deb529..1b4d85f79 100644
--- a/lapack-netlib/TESTING/EIG/dchkst.f
+++ b/lapack-netlib/TESTING/EIG/dchkst.f
@@ -166,7 +166,7 @@
 *>                                              DSTEMR('V', 'I')
 *>
 *> Tests 29 through 34 are disable at present because DSTEMR
-*> does not handle partial specturm requests.
+*> does not handle partial spectrum requests.
 *>
 *> (29)    | S - Z D Z' | / ( |S| n ulp )    DSTEMR('V', 'I')
 *>
diff --git a/lapack-netlib/TESTING/EIG/dchkst2stg.f b/lapack-netlib/TESTING/EIG/dchkst2stg.f
index fc015334d..ca31c9d1f 100644
--- a/lapack-netlib/TESTING/EIG/dchkst2stg.f
+++ b/lapack-netlib/TESTING/EIG/dchkst2stg.f
@@ -187,7 +187,7 @@
 *>                                              DSTEMR('V', 'I')
 *>
 *> Tests 29 through 34 are disable at present because DSTEMR
-*> does not handle partial specturm requests.
+*> does not handle partial spectrum requests.
 *>
 *> (29)    | S - Z D Z' | / ( |S| n ulp )    DSTEMR('V', 'I')
 *>
diff --git a/lapack-netlib/TESTING/EIG/ddrgsx.f b/lapack-netlib/TESTING/EIG/ddrgsx.f
index 44c36407f..7fe9dfc14 100644
--- a/lapack-netlib/TESTING/EIG/ddrgsx.f
+++ b/lapack-netlib/TESTING/EIG/ddrgsx.f
@@ -769,7 +769,7 @@
       CALL DLACPY( 'Full', MPLUSN, MPLUSN, AI, LDA, A, LDA )
       CALL DLACPY( 'Full', MPLUSN, MPLUSN, BI, LDA, B, LDA )
 *
-*     Compute the Schur factorization while swaping the
+*     Compute the Schur factorization while swapping the
 *     m-by-m (1,1)-blocks with n-by-n (2,2)-blocks.
 *
       CALL DGGESX( 'V', 'V', 'S', DLCTSX, 'B', MPLUSN, AI, LDA, BI, LDA,
diff --git a/lapack-netlib/TESTING/EIG/ddrvbd.f b/lapack-netlib/TESTING/EIG/ddrvbd.f
index 868679052..bd4ae60da 100644
--- a/lapack-netlib/TESTING/EIG/ddrvbd.f
+++ b/lapack-netlib/TESTING/EIG/ddrvbd.f
@@ -32,7 +32,7 @@
 *> \verbatim
 *>
 *> DDRVBD checks the singular value decomposition (SVD) drivers
-*> DGESVD, DGESDD, DGESVJ, and DGEJSV.
+*> DGESVD, DGESDD, DGESVDQ, DGESVJ, DGEJSV, and DGESVDX.
 *>
 *> Both DGESVD and DGESDD factor A = U diag(S) VT, where U and VT are
 *> orthogonal and diag(S) is diagonal with the entries of the array S
@@ -90,6 +90,17 @@
 *> (14)   | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the
 *>        vector of singular values from the partial SVD
 *>
+*> Test for DGESVDQ:
+*>
+*> (36)   | A - U diag(S) VT | / ( |A| max(M,N) ulp )
+*>
+*> (37)   | I - U'U | / ( M ulp )
+*>
+*> (38)   | I - VT VT' | / ( N ulp )
+*>
+*> (39)   S contains MNMIN nonnegative values in decreasing order.
+*>        (Return 0 if true, 1/ULP if false.)
+*>
 *> Test for DGESVJ:
 *>
 *> (15)   | A - U diag(S) VT | / ( |A| max(M,N) ulp )
@@ -354,6 +365,8 @@
       SUBROUTINE DDRVBD( NSIZES, MM, NN, NTYPES, DOTYPE, ISEED, THRESH,
      $                   A, LDA, U, LDU, VT, LDVT, ASAV, USAV, VTSAV, S,
      $                   SSAV, E, WORK, LWORK, IWORK, NOUT, INFO )
+*
+      IMPLICIT NONE
 *
 *  -- LAPACK test routine (version 3.7.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@@ -390,13 +403,19 @@
      $                   ITEMP, J, JSIZE, JTYPE, LSWORK, M, MINWRK,
      $                   MMAX, MNMAX, MNMIN, MTYPES, N, NFAIL,
      $                   NMAX, NS, NSI, NSV, NTEST
-      DOUBLE PRECISION  ANORM, DIF, DIV, OVFL, RTUNFL, ULP,
-     $                    ULPINV, UNFL, VL, VU
+      DOUBLE PRECISION   ANORM, DIF, DIV, OVFL, RTUNFL, ULP,
+     $                   ULPINV, UNFL, VL, VU
+*     ..
+*     .. Local Scalars for DGESVDQ ..
+      INTEGER            LIWORK, LRWORK, NUMRANK
+*     ..
+*     .. Local Arrays for DGESVDQ ..
+      DOUBLE PRECISION   RWORK( 2 )
 *     ..
 *     .. Local Arrays ..
       CHARACTER          CJOB( 4 ), CJOBR( 3 ), CJOBV( 2 )
       INTEGER            IOLDSD( 4 ), ISEED2( 4 )
-      DOUBLE PRECISION   RESULT( 40 )
+      DOUBLE PRECISION   RESULT( 39 )
 *     ..
 *     .. External Functions ..
       DOUBLE PRECISION   DLAMCH, DLARND
@@ -404,8 +423,8 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALASVM, DBDT01, DGEJSV, DGESDD, DGESVD,
-     $                   DGESVDX, DGESVJ, DLABAD, DLACPY, DLASET,
-     $                   DLATMS, DORT01, DORT03, XERBLA
+     $                   DGESVDQ, DGESVDX, DGESVJ, DLABAD, DLACPY,
+     $                   DLASET, DLATMS, DORT01, DORT03, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, INT, MAX, MIN
@@ -781,8 +800,64 @@
                   RESULT( 14 ) = MAX( RESULT( 14 ), DIF )
   110          CONTINUE
 *
-*              Test DGESVJ: Factorize A
-*              Note: DGESVJ does not work for M < N
+*              Test DGESVDQ
+*              Note: DGESVDQ only works for M >= N
+*
+               RESULT( 36 ) = ZERO
+               RESULT( 37 ) = ZERO
+               RESULT( 38 ) = ZERO
+               RESULT( 39 ) = ZERO
+*
+               IF( M.GE.N ) THEN
+                  IWTMP = 5*MNMIN*MNMIN + 9*MNMIN + MAX( M, N )
+                  LSWORK = IWTMP + ( IWS-1 )*( LWORK-IWTMP ) / 3
+                  LSWORK = MIN( LSWORK, LWORK )
+                  LSWORK = MAX( LSWORK, 1 )
+                  IF( IWS.EQ.4 )
+     $               LSWORK = LWORK
+*
+                  CALL DLACPY( 'F', M, N, ASAV, LDA, A, LDA )
+                  SRNAMT = 'DGESVDQ'
+*
+                  LRWORK = 2
+                  LIWORK = MAX( N, 1 )
+                  CALL DGESVDQ( 'H', 'N', 'N', 'A', 'A', 
+     $                          M, N, A, LDA, SSAV, USAV, LDU,
+     $                          VTSAV, LDVT, NUMRANK, IWORK, LIWORK,
+     $                          WORK, LWORK, RWORK, LRWORK, IINFO )
+*
+                  IF( IINFO.NE.0 ) THEN
+                     WRITE( NOUT, FMT = 9995 )'DGESVDQ', IINFO, M, N,
+     $               JTYPE, LSWORK, IOLDSD
+                     INFO = ABS( IINFO )
+                     RETURN
+                  END IF
+*
+*                 Do tests 36--39
+*
+                  CALL DBDT01( M, N, 0, ASAV, LDA, USAV, LDU, SSAV, E,
+     $                         VTSAV, LDVT, WORK, RESULT( 36 ) )
+                  IF( M.NE.0 .AND. N.NE.0 ) THEN
+                     CALL DORT01( 'Columns', M, M, USAV, LDU, WORK,
+     $                            LWORK, RESULT( 37 ) )
+                     CALL DORT01( 'Rows', N, N, VTSAV, LDVT, WORK,
+     $                            LWORK, RESULT( 38 ) )
+                  END IF
+                  RESULT( 39 ) = ZERO
+                  DO 199 I = 1, MNMIN - 1
+                     IF( SSAV( I ).LT.SSAV( I+1 ) )
+     $                  RESULT( 39 ) = ULPINV
+                     IF( SSAV( I ).LT.ZERO )
+     $                  RESULT( 39 ) = ULPINV
+  199             CONTINUE
+                  IF( MNMIN.GE.1 ) THEN
+                     IF( SSAV( MNMIN ).LT.ZERO )
+     $                  RESULT( 39 ) = ULPINV
+                  END IF
+               END IF
+*
+*              Test DGESVJ
+*              Note: DGESVJ only works for M >= N
 *
                RESULT( 15 ) = ZERO
                RESULT( 16 ) = ZERO
@@ -802,8 +877,7 @@
                   CALL DGESVJ( 'G', 'U', 'V', M, N, USAV, LDA, SSAV,
      &                        0, A, LDVT, WORK, LWORK, INFO )
 *
-*                 DGESVJ retuns V not VT, so we transpose to use the same
-*                 test suite.
+*                 DGESVJ returns V not VT
 *
                   DO J=1,N
                      DO I=1,N
@@ -841,8 +915,8 @@
                   END IF
                END IF
 *
-*              Test DGEJSV: Factorize A
-*              Note: DGEJSV does not work for M < N
+*              Test DGEJSV
+*              Note: DGEJSV only works for M >= N
 *
                RESULT( 19 ) = ZERO
                RESULT( 20 ) = ZERO
@@ -862,8 +936,7 @@
      &                   M, N, VTSAV, LDA, SSAV, USAV, LDU, A, LDVT,
      &                   WORK, LWORK, IWORK, INFO )
 *
-*                 DGEJSV retuns V not VT, so we transpose to use the same
-*                 test suite.
+*                 DGEJSV returns V not VT
 *
                   DO 140 J=1,N
                      DO 130 I=1,N
@@ -872,7 +945,7 @@
   140             END DO
 *
                   IF( IINFO.NE.0 ) THEN
-                     WRITE( NOUT, FMT = 9995 )'GESVJ', IINFO, M, N,
+                     WRITE( NOUT, FMT = 9995 )'GEJSV', IINFO, M, N,
      $               JTYPE, LSWORK, IOLDSD
                      INFO = ABS( IINFO )
                      RETURN
@@ -1086,7 +1159,7 @@
 *
 *              End of Loop -- Check for RESULT(j) > THRESH
 *
-               DO 210 J = 1, 35
+               DO 210 J = 1, 39
                   IF( RESULT( J ).GE.THRESH ) THEN
                      IF( NFAIL.EQ.0 ) THEN
                         WRITE( NOUT, FMT = 9999 )
@@ -1097,7 +1170,7 @@
                      NFAIL = NFAIL + 1
                   END IF
   210          CONTINUE
-               NTEST = NTEST + 35
+               NTEST = NTEST + 39
   220       CONTINUE
   230    CONTINUE
   240 CONTINUE
@@ -1158,6 +1231,12 @@
      $      ' DGESVDX(V,V,V) ',
      $      / '34 = | I - U**T U | / ( M ulp ) ',
      $      / '35 = | I - VT VT**T | / ( N ulp ) ',
+     $      ' DGESVDQ(H,N,N,A,A',
+     $      / '36 = | A - U diag(S) VT | / ( |A| max(M,N) ulp ) ',
+     $      / '37 = | I - U**T U | / ( M ulp ) ',
+     $      / '38 = | I - VT VT**T | / ( N ulp ) ',
+     $      / '39 = 0 if S contains min(M,N) nonnegative values in',
+     $      ' decreasing order, else 1/ulp',
      $      / / )
  9997 FORMAT( ' M=', I5, ', N=', I5, ', type ', I1, ', IWS=', I1,
      $      ', seed=', 4( I4, ',' ), ' test(', I2, ')=', G11.4 )
diff --git a/lapack-netlib/TESTING/EIG/derred.f b/lapack-netlib/TESTING/EIG/derred.f
index 5bde7f67d..94264e256 100644
--- a/lapack-netlib/TESTING/EIG/derred.f
+++ b/lapack-netlib/TESTING/EIG/derred.f
@@ -36,6 +36,8 @@
 *>       DGEJSV   compute SVD of an M-by-N matrix A where M >= N
 *>       DGESVDX  compute SVD of an M-by-N matrix A(by bisection
 *>                and inverse iteration)
+*>       DGESVDQ  compute SVD of an M-by-N matrix A(with a 
+*>                QR-Preconditioned )
 *> \endverbatim
 *
 *  Arguments:
@@ -100,7 +102,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHKXER, DGEES, DGEESX, DGEEV, DGEEVX, DGEJSV,
-     $                   DGESDD, DGESVD
+     $                   DGESDD, DGESVD, DGESVDX, DGESVQ
 *     ..
 *     .. External Functions ..
       LOGICAL            DSLECT, LSAMEN
@@ -486,6 +488,61 @@
          ELSE
             WRITE( NOUT, FMT = 9998 )
          END IF
+*
+*        Test DGESVDQ
+*
+         SRNAMT = 'DGESVDQ'
+         INFOT = 1
+         CALL DGESVDQ( 'X', 'P', 'T', 'A', 'A', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL DGESVDQ( 'A', 'X', 'T', 'A', 'A', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL DGESVDQ( 'A', 'P', 'X', 'A', 'A', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL DGESVDQ( 'A', 'P', 'T', 'X', 'A', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL DGESVDQ( 'A', 'P', 'T', 'A', 'X', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 6
+         CALL DGESVDQ( 'A', 'P', 'T', 'A', 'A', -1, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 7
+         CALL DGESVDQ( 'A', 'P', 'T', 'A', 'A', 0, 1, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 9
+         CALL DGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 0, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 12
+         CALL DGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U,
+     $                 -1, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 14
+         CALL DGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U,
+     $                 1, VT, -1, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 17
+         CALL DGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U,
+     $                 1, VT, 1, NS, IW, -5, W, 1, W, 1, INFO )
+         CALL CHKXER( 'DGESVDQ', INFOT, NOUT, LERR, OK )
+         NT = 11
+         IF( OK ) THEN
+            WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
+     $           NT
+         ELSE
+            WRITE( NOUT, FMT = 9998 )
+         END IF
       END IF
 *
 *     Print a summary line.
diff --git a/lapack-netlib/TESTING/EIG/dget39.f b/lapack-netlib/TESTING/EIG/dget39.f
index 1d0ec1f45..17e66c8e6 100644
--- a/lapack-netlib/TESTING/EIG/dget39.f
+++ b/lapack-netlib/TESTING/EIG/dget39.f
@@ -194,7 +194,7 @@
       VM5( 2 ) = EPS
       VM5( 3 ) = SQRT( SMLNUM )
 *
-*     Initalization
+*     Initialization
 *
       KNT = 0
       RMAX = ZERO
diff --git a/lapack-netlib/TESTING/EIG/dsbt21.f b/lapack-netlib/TESTING/EIG/dsbt21.f
index e7db231a9..54795623b 100644
--- a/lapack-netlib/TESTING/EIG/dsbt21.f
+++ b/lapack-netlib/TESTING/EIG/dsbt21.f
@@ -28,15 +28,16 @@
 *>
 *> DSBT21  generally checks a decomposition of the form
 *>
-*>         A = U S U'
+*>         A = U S U**T
 *>
-*> where ' means transpose, A is symmetric banded, U is
+*> where **T means transpose, A is symmetric banded, U is
 *> orthogonal, and S is diagonal (if KS=0) or symmetric
 *> tridiagonal (if KS=1).
 *>
 *> Specifically:
 *>
-*>         RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC>         RESULT(2) = | I - UU' | / ( n ulp )
+*>         RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and
+*>         RESULT(2) = | I - U U**T | / ( n ulp )
 *> \endverbatim
 *
 *  Arguments:
@@ -214,7 +215,7 @@
 *
       ANORM = MAX( DLANSB( '1', CUPLO, N, IKA, A, LDA, WORK ), UNFL )
 *
-*     Compute error matrix:    Error = A - U S U'
+*     Compute error matrix:    Error = A - U S U**T
 *
 *     Copy A from SB to SP storage format.
 *
@@ -265,7 +266,7 @@
 *
 *     Do Test 2
 *
-*     Compute  UU' - I
+*     Compute  U U**T - I
 *
       CALL DGEMM( 'N', 'C', N, N, N, ONE, U, LDU, U, LDU, ZERO, WORK,
      $            N )
diff --git a/lapack-netlib/TESTING/EIG/dspt21.f b/lapack-netlib/TESTING/EIG/dspt21.f
index 9f87959fe..4b1d360c5 100644
--- a/lapack-netlib/TESTING/EIG/dspt21.f
+++ b/lapack-netlib/TESTING/EIG/dspt21.f
@@ -28,9 +28,9 @@
 *>
 *> DSPT21  generally checks a decomposition of the form
 *>
-*>         A = U S U'
+*>         A = U S U**T
 *>
-*> where ' means transpose, A is symmetric (stored in packed format), U
+*> where **T means transpose, A is symmetric (stored in packed format), U
 *> is orthogonal, and S is diagonal (if KBAND=0) or symmetric
 *> tridiagonal (if KBAND=1).  If ITYPE=1, then U is represented as a
 *> dense matrix, otherwise the U is expressed as a product of
@@ -41,15 +41,16 @@
 *>
 *> Specifically, if ITYPE=1, then:
 *>
-*>         RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC>         RESULT(2) = | I - UU' | / ( n ulp )
+*>         RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and
+*>         RESULT(2) = | I - U U**T | / ( n ulp )
 *>
 *> If ITYPE=2, then:
 *>
-*>         RESULT(1) = | A - V S V' | / ( |A| n ulp )
+*>         RESULT(1) = | A - V S V**T | / ( |A| n ulp )
 *>
 *> If ITYPE=3, then:
 *>
-*>         RESULT(1) = | I - VU' | / ( n ulp )
+*>         RESULT(1) = | I - V U**T | / ( n ulp )
 *>
 *> Packed storage means that, for example, if UPLO='U', then the columns
 *> of the upper triangle of A are stored one after another, so that
@@ -70,7 +71,7 @@
 *>
 *>    If UPLO='U', then  V = H(n-1)...H(1),  where
 *>
-*>        H(j) = I  -  tau(j) v(j) v(j)'
+*>        H(j) = I  -  tau(j) v(j) v(j)**T
 *>
 *>    and the first j-1 elements of v(j) are stored in V(1:j-1,j+1),
 *>    (i.e., VP( j*(j+1)/2 + 1 : j*(j+1)/2 + j-1 ) ),
@@ -78,7 +79,7 @@
 *>
 *>    If UPLO='L', then  V = H(1)...H(n-1),  where
 *>
-*>        H(j) = I  -  tau(j) v(j) v(j)'
+*>        H(j) = I  -  tau(j) v(j) v(j)**T
 *>
 *>    and the first j elements of v(j) are 0, the (j+1)-st is 1, and the
 *>    (j+2)-nd through n-th elements are stored in V(j+2:n,j) (i.e.,
@@ -93,14 +94,15 @@
 *>          ITYPE is INTEGER
 *>          Specifies the type of tests to be performed.
 *>          1: U expressed as a dense orthogonal matrix:
-*>             RESULT(1) = | A - U S U' | / ( |A| n ulp )   *andC>             RESULT(2) = | I - UU' | / ( n ulp )
+*>             RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and
+*>             RESULT(2) = | I - U U**T | / ( n ulp )
 *>
 *>          2: U expressed as a product V of Housholder transformations:
-*>             RESULT(1) = | A - V S V' | / ( |A| n ulp )
+*>             RESULT(1) = | A - V S V**T | / ( |A| n ulp )
 *>
 *>          3: U expressed both as a dense orthogonal matrix and
 *>             as a product of Housholder transformations:
-*>             RESULT(1) = | I - VU' | / ( n ulp )
+*>             RESULT(1) = | I - V U**T | / ( n ulp )
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -183,7 +185,7 @@
 *> \verbatim
 *>          TAU is DOUBLE PRECISION array, dimension (N)
 *>          If ITYPE >= 2, then TAU(j) is the scalar factor of
-*>          v(j) v(j)' in the Householder transformation H(j) of
+*>          v(j) v(j)**T in the Householder transformation H(j) of
 *>          the product  U = H(1)...H(n-2)
 *>          If ITYPE < 2, then TAU is not referenced.
 *> \endverbatim
@@ -303,7 +305,7 @@
 *
       IF( ITYPE.EQ.1 ) THEN
 *
-*        ITYPE=1: error = A - U S U'
+*        ITYPE=1: error = A - U S U**T
 *
          CALL DLASET( 'Full', N, N, ZERO, ZERO, WORK, N )
          CALL DCOPY( LAP, AP, 1, WORK, 1 )
@@ -322,7 +324,7 @@
 *
       ELSE IF( ITYPE.EQ.2 ) THEN
 *
-*        ITYPE=2: error = V S V' - A
+*        ITYPE=2: error = V S V**T - A
 *
          CALL DLASET( 'Full', N, N, ZERO, ZERO, WORK, N )
 *
@@ -389,7 +391,7 @@
 *
       ELSE IF( ITYPE.EQ.3 ) THEN
 *
-*        ITYPE=3: error = U V' - I
+*        ITYPE=3: error = U V**T - I
 *
          IF( N.LT.2 )
      $      RETURN
@@ -420,7 +422,7 @@
 *
 *     Do Test 2
 *
-*     Compute  UU' - I
+*     Compute  U U**T - I
 *
       IF( ITYPE.EQ.1 ) THEN
          CALL DGEMM( 'N', 'C', N, N, N, ONE, U, LDU, U, LDU, ZERO, WORK,
diff --git a/lapack-netlib/TESTING/EIG/dsyt21.f b/lapack-netlib/TESTING/EIG/dsyt21.f
index 0da3e5882..e00bd0db2 100644
--- a/lapack-netlib/TESTING/EIG/dsyt21.f
+++ b/lapack-netlib/TESTING/EIG/dsyt21.f
@@ -28,9 +28,9 @@
 *>
 *> DSYT21 generally checks a decomposition of the form
 *>
-*>    A = U S U'
+*>    A = U S U**T
 *>
-*> where ' means transpose, A is symmetric, U is orthogonal, and S is
+*> where **T means transpose, A is symmetric, U is orthogonal, and S is
 *> diagonal (if KBAND=0) or symmetric tridiagonal (if KBAND=1).
 *>
 *> If ITYPE=1, then U is represented as a dense matrix; otherwise U is
@@ -41,18 +41,19 @@
 *>
 *> Specifically, if ITYPE=1, then:
 *>
-*>    RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC>    RESULT(2) = | I - UU' | / ( n ulp )
+*>    RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and
+*>    RESULT(2) = | I - U U**T | / ( n ulp )
 *>
 *> If ITYPE=2, then:
 *>
-*>    RESULT(1) = | A - V S V' | / ( |A| n ulp )
+*>    RESULT(1) = | A - V S V**T | / ( |A| n ulp )
 *>
 *> If ITYPE=3, then:
 *>
-*>    RESULT(1) = | I - VU' | / ( n ulp )
+*>    RESULT(1) = | I - V U**T | / ( n ulp )
 *>
 *> For ITYPE > 1, the transformation U is expressed as a product
-*> V = H(1)...H(n-2),  where H(j) = I  -  tau(j) v(j) v(j)' and each
+*> V = H(1)...H(n-2),  where H(j) = I  -  tau(j) v(j) v(j)**T and each
 *> vector v(j) has its first j elements 0 and the remaining n-j elements
 *> stored in V(j+1:n,j).
 *> \endverbatim
@@ -65,14 +66,15 @@
 *>          ITYPE is INTEGER
 *>          Specifies the type of tests to be performed.
 *>          1: U expressed as a dense orthogonal matrix:
-*>             RESULT(1) = | A - U S U' | / ( |A| n ulp )   *andC>             RESULT(2) = | I - UU' | / ( n ulp )
+*>             RESULT(1) = | A - U S U**T | / ( |A| n ulp )  and
+*>             RESULT(2) = | I - U U**T | / ( n ulp )
 *>
 *>          2: U expressed as a product V of Housholder transformations:
-*>             RESULT(1) = | A - V S V' | / ( |A| n ulp )
+*>             RESULT(1) = | A - V S V**T | / ( |A| n ulp )
 *>
 *>          3: U expressed both as a dense orthogonal matrix and
 *>             as a product of Housholder transformations:
-*>             RESULT(1) = | I - VU' | / ( n ulp )
+*>             RESULT(1) = | I - V U**T | / ( n ulp )
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -170,7 +172,7 @@
 *> \verbatim
 *>          TAU is DOUBLE PRECISION array, dimension (N)
 *>          If ITYPE >= 2, then TAU(j) is the scalar factor of
-*>          v(j) v(j)' in the Householder transformation H(j) of
+*>          v(j) v(j)**T in the Householder transformation H(j) of
 *>          the product  U = H(1)...H(n-2)
 *>          If ITYPE < 2, then TAU is not referenced.
 *> \endverbatim
@@ -283,7 +285,7 @@
 *
       IF( ITYPE.EQ.1 ) THEN
 *
-*        ITYPE=1: error = A - U S U'
+*        ITYPE=1: error = A - U S U**T
 *
          CALL DLASET( 'Full', N, N, ZERO, ZERO, WORK, N )
          CALL DLACPY( CUPLO, N, N, A, LDA, WORK, N )
@@ -302,7 +304,7 @@
 *
       ELSE IF( ITYPE.EQ.2 ) THEN
 *
-*        ITYPE=2: error = V S V' - A
+*        ITYPE=2: error = V S V**T - A
 *
          CALL DLASET( 'Full', N, N, ZERO, ZERO, WORK, N )
 *
@@ -359,7 +361,7 @@
 *
       ELSE IF( ITYPE.EQ.3 ) THEN
 *
-*        ITYPE=3: error = U V' - I
+*        ITYPE=3: error = U V**T - I
 *
          IF( N.LT.2 )
      $      RETURN
@@ -395,7 +397,7 @@
 *
 *     Do Test 2
 *
-*     Compute  UU' - I
+*     Compute  U U**T - I
 *
       IF( ITYPE.EQ.1 ) THEN
          CALL DGEMM( 'N', 'C', N, N, N, ONE, U, LDU, U, LDU, ZERO, WORK,
diff --git a/lapack-netlib/TESTING/EIG/dsyt22.f b/lapack-netlib/TESTING/EIG/dsyt22.f
index 479b3ba5e..09e4aeb82 100644
--- a/lapack-netlib/TESTING/EIG/dsyt22.f
+++ b/lapack-netlib/TESTING/EIG/dsyt22.f
@@ -41,7 +41,8 @@
 *>
 *>      Specifically, if ITYPE=1, then:
 *>
-*>              RESULT(1) = | U' A U - S | / ( |A| m ulp ) *andC>              RESULT(2) = | I - U'U | / ( m ulp )
+*>              RESULT(1) = | U**T A U - S | / ( |A| m ulp ) and
+*>              RESULT(2) = | I - U**T U | / ( m ulp )
 *> \endverbatim
 *
 *  Arguments:
@@ -51,7 +52,8 @@
 *>  ITYPE   INTEGER
 *>          Specifies the type of tests to be performed.
 *>          1: U expressed as a dense orthogonal matrix:
-*>             RESULT(1) = | A - U S U' | / ( |A| n ulp )   *andC>             RESULT(2) = | I - UU' | / ( n ulp )
+*>             RESULT(1) = | A - U S U**T | / ( |A| n ulp )  and
+*>             RESULT(2) = | I - U U**T | / ( n ulp )
 *>
 *>  UPLO    CHARACTER
 *>          If UPLO='U', the upper triangle of A will be used and the
@@ -122,7 +124,7 @@
 *>
 *>  TAU     DOUBLE PRECISION array, dimension (N)
 *>          If ITYPE >= 2, then TAU(j) is the scalar factor of
-*>          v(j) v(j)' in the Householder transformation H(j) of
+*>          v(j) v(j)**T in the Householder transformation H(j) of
 *>          the product  U = H(1)...H(n-2)
 *>          If ITYPE < 2, then TAU is not referenced.
 *>          Not modified.
@@ -207,7 +209,7 @@
 *
 *     Compute error matrix:
 *
-*     ITYPE=1: error = U' A U - S
+*     ITYPE=1: error = U**T A U - S
 *
       CALL DSYMM( 'L', UPLO, N, M, ONE, A, LDA, U, LDU, ZERO, WORK, N )
       NN = N*N
@@ -240,7 +242,7 @@
 *
 *     Do Test 2
 *
-*     Compute  U'U - I
+*     Compute  U**T U - I
 *
       IF( ITYPE.EQ.1 )
      $   CALL DORT01( 'Columns', N, M, U, LDU, WORK, 2*N*N,
diff --git a/lapack-netlib/TESTING/EIG/sbdt05.f b/lapack-netlib/TESTING/EIG/sbdt05.f
index 972ff952f..e3e79e91e 100644
--- a/lapack-netlib/TESTING/EIG/sbdt05.f
+++ b/lapack-netlib/TESTING/EIG/sbdt05.f
@@ -52,6 +52,7 @@
 *> \verbatim
 *>          A is REAL array, dimension (LDA,N)
 *>          The m by n matrix A.
+*> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
diff --git a/lapack-netlib/TESTING/EIG/schkst.f b/lapack-netlib/TESTING/EIG/schkst.f
index f4ae46832..a851bbbbf 100644
--- a/lapack-netlib/TESTING/EIG/schkst.f
+++ b/lapack-netlib/TESTING/EIG/schkst.f
@@ -166,7 +166,7 @@
 *>                                              SSTEMR('V', 'I')
 *>
 *> Tests 29 through 34 are disable at present because SSTEMR
-*> does not handle partial specturm requests.
+*> does not handle partial spectrum requests.
 *>
 *> (29)    | S - Z D Z' | / ( |S| n ulp )    SSTEMR('V', 'I')
 *>
diff --git a/lapack-netlib/TESTING/EIG/schkst2stg.f b/lapack-netlib/TESTING/EIG/schkst2stg.f
index 1c18e21bc..f386ab43c 100644
--- a/lapack-netlib/TESTING/EIG/schkst2stg.f
+++ b/lapack-netlib/TESTING/EIG/schkst2stg.f
@@ -187,7 +187,7 @@
 *>                                              SSTEMR('V', 'I')
 *>
 *> Tests 29 through 34 are disable at present because SSTEMR
-*> does not handle partial specturm requests.
+*> does not handle partial spectrum requests.
 *>
 *> (29)    | S - Z D Z' | / ( |S| n ulp )    SSTEMR('V', 'I')
 *>
diff --git a/lapack-netlib/TESTING/EIG/sdrgsx.f b/lapack-netlib/TESTING/EIG/sdrgsx.f
index bb5af0fd6..58e63e793 100644
--- a/lapack-netlib/TESTING/EIG/sdrgsx.f
+++ b/lapack-netlib/TESTING/EIG/sdrgsx.f
@@ -770,7 +770,7 @@ c        MINWRK = MAX( 10*( NSIZE+1 ), 5*NSIZE*NSIZE / 2-2 )
       CALL SLACPY( 'Full', MPLUSN, MPLUSN, AI, LDA, A, LDA )
       CALL SLACPY( 'Full', MPLUSN, MPLUSN, BI, LDA, B, LDA )
 *
-*     Compute the Schur factorization while swaping the
+*     Compute the Schur factorization while swapping the
 *     m-by-m (1,1)-blocks with n-by-n (2,2)-blocks.
 *
       CALL SGGESX( 'V', 'V', 'S', SLCTSX, 'B', MPLUSN, AI, LDA, BI, LDA,
diff --git a/lapack-netlib/TESTING/EIG/sdrvbd.f b/lapack-netlib/TESTING/EIG/sdrvbd.f
index b5d8a9b9a..101c8ba09 100644
--- a/lapack-netlib/TESTING/EIG/sdrvbd.f
+++ b/lapack-netlib/TESTING/EIG/sdrvbd.f
@@ -32,7 +32,7 @@
 *> \verbatim
 *>
 *> SDRVBD checks the singular value decomposition (SVD) drivers
-*> SGESVD, SGESDD, SGESVJ, and SGEJSV.
+*> SGESVD, SGESDD, SGESVDQ, SGESVJ, SGEJSV, and DGESVDX.
 *>
 *> Both SGESVD and SGESDD factor A = U diag(S) VT, where U and VT are
 *> orthogonal and diag(S) is diagonal with the entries of the array S
@@ -90,6 +90,17 @@
 *> (14)   | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the
 *>        vector of singular values from the partial SVD
 *>
+*> Test for SGESVDQ:
+*>
+*> (36)   | A - U diag(S) VT | / ( |A| max(M,N) ulp )
+*>
+*> (37)   | I - U'U | / ( M ulp )
+*>
+*> (38)   | I - VT VT' | / ( N ulp )
+*>
+*> (39)   S contains MNMIN nonnegative values in decreasing order.
+*>        (Return 0 if true, 1/ULP if false.)
+*>
 *> Test for SGESVJ:
 *>
 *> (15)   | A - U diag(S) VT | / ( |A| max(M,N) ulp )
@@ -359,6 +370,8 @@
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     June 2016
+*
+      IMPLICIT NONE
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, LDU, LDVT, LWORK, NOUT, NSIZES,
@@ -391,12 +404,18 @@
      $                   MMAX, MNMAX, MNMIN, MTYPES, N, NFAIL,
      $                   NMAX, NS, NSI, NSV, NTEST
       REAL               ANORM, DIF, DIV, OVFL, RTUNFL, ULP,
-     $                    ULPINV, UNFL, VL, VU
+     $                   ULPINV, UNFL, VL, VU
+*     ..
+*     .. Local Scalars for DGESVDQ ..
+      INTEGER            LIWORK, LRWORK, NUMRANK
+*     ..
+*     .. Local Arrays for DGESVDQ ..
+      REAL               RWORK( 2 )
 *     ..
 *     .. Local Arrays ..
       CHARACTER          CJOB( 4 ), CJOBR( 3 ), CJOBV( 2 )
       INTEGER            IOLDSD( 4 ), ISEED2( 4 )
-      REAL               RESULT( 40 )
+      REAL               RESULT( 39 )
 *     ..
 *     .. External Functions ..
       REAL               SLAMCH, SLARND
@@ -404,8 +423,8 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALASVM, SBDT01, SGEJSV, SGESDD, SGESVD,
-     $                   SGESVDX, SGESVJ, SLABAD, SLACPY, SLASET,
-     $                   SLATMS, SORT01, SORT03, XERBLA
+     $                   SGESVDQ, SGESVDX, SGESVJ, SLABAD, SLACPY,
+     $                   SLASET, SLATMS, SORT01, SORT03, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, REAL, INT, MAX, MIN
@@ -781,8 +800,64 @@
                   RESULT( 14 ) = MAX( RESULT( 14 ), DIF )
   110          CONTINUE
 *
-*              Test SGESVJ: Factorize A
-*              Note: SGESVJ does not work for M < N
+*              Test SGESVDQ
+*              Note: SGESVDQ only works for M >= N
+*
+               RESULT( 36 ) = ZERO
+               RESULT( 37 ) = ZERO
+               RESULT( 38 ) = ZERO
+               RESULT( 39 ) = ZERO
+*
+               IF( M.GE.N ) THEN
+                  IWTMP = 5*MNMIN*MNMIN + 9*MNMIN + MAX( M, N )
+                  LSWORK = IWTMP + ( IWS-1 )*( LWORK-IWTMP ) / 3
+                  LSWORK = MIN( LSWORK, LWORK )
+                  LSWORK = MAX( LSWORK, 1 )
+                  IF( IWS.EQ.4 )
+     $               LSWORK = LWORK
+*
+                  CALL SLACPY( 'F', M, N, ASAV, LDA, A, LDA )
+                  SRNAMT = 'SGESVDQ'
+*
+                  LRWORK = 2
+                  LIWORK = MAX( N, 1 )
+                  CALL SGESVDQ( 'H', 'N', 'N', 'A', 'A', 
+     $                          M, N, A, LDA, SSAV, USAV, LDU,
+     $                          VTSAV, LDVT, NUMRANK, IWORK, LIWORK,
+     $                          WORK, LWORK, RWORK, LRWORK, IINFO )
+*
+                  IF( IINFO.NE.0 ) THEN
+                     WRITE( NOUT, FMT = 9995 )'SGESVDQ', IINFO, M, N,
+     $               JTYPE, LSWORK, IOLDSD
+                     INFO = ABS( IINFO )
+                     RETURN
+                  END IF
+*
+*                 Do tests 36--39
+*
+                  CALL SBDT01( M, N, 0, ASAV, LDA, USAV, LDU, SSAV, E,
+     $                         VTSAV, LDVT, WORK, RESULT( 36 ) )
+                  IF( M.NE.0 .AND. N.NE.0 ) THEN
+                     CALL SORT01( 'Columns', M, M, USAV, LDU, WORK,
+     $                            LWORK, RESULT( 37 ) )
+                     CALL SORT01( 'Rows', N, N, VTSAV, LDVT, WORK,
+     $                            LWORK, RESULT( 38 ) )
+                  END IF
+                  RESULT( 39 ) = ZERO
+                  DO 199 I = 1, MNMIN - 1
+                     IF( SSAV( I ).LT.SSAV( I+1 ) )
+     $                  RESULT( 39 ) = ULPINV
+                     IF( SSAV( I ).LT.ZERO )
+     $                  RESULT( 39 ) = ULPINV
+  199             CONTINUE
+                  IF( MNMIN.GE.1 ) THEN
+                     IF( SSAV( MNMIN ).LT.ZERO )
+     $                  RESULT( 39 ) = ULPINV
+                  END IF
+               END IF
+*
+*              Test SGESVJ
+*              Note: SGESVJ only works for M >= N
 *
                RESULT( 15 ) = ZERO
                RESULT( 16 ) = ZERO
@@ -802,8 +877,7 @@
                   CALL SGESVJ( 'G', 'U', 'V', M, N, USAV, LDA, SSAV,
      &                        0, A, LDVT, WORK, LWORK, INFO )
 *
-*                 SGESVJ retuns V not VT, so we transpose to use the same
-*                 test suite.
+*                 SGESVJ returns V not VT
 *
                   DO J=1,N
                      DO I=1,N
@@ -841,8 +915,8 @@
                   END IF
                END IF
 *
-*              Test SGEJSV: Factorize A
-*              Note: SGEJSV does not work for M < N
+*              Test SGEJSV
+*              Note: SGEJSV only works for M >= N
 *
                RESULT( 19 ) = ZERO
                RESULT( 20 ) = ZERO
@@ -862,8 +936,7 @@
      &                   M, N, VTSAV, LDA, SSAV, USAV, LDU, A, LDVT,
      &                   WORK, LWORK, IWORK, INFO )
 *
-*                 SGEJSV retuns V not VT, so we transpose to use the same
-*                 test suite.
+*                 SGEJSV returns V not VT
 *
                   DO 140 J=1,N
                      DO 130 I=1,N
@@ -872,7 +945,7 @@
   140             END DO
 *
                   IF( IINFO.NE.0 ) THEN
-                     WRITE( NOUT, FMT = 9995 )'GESVJ', IINFO, M, N,
+                     WRITE( NOUT, FMT = 9995 )'GEJSV', IINFO, M, N,
      $               JTYPE, LSWORK, IOLDSD
                      INFO = ABS( IINFO )
                      RETURN
@@ -1086,7 +1159,7 @@
 *
 *              End of Loop -- Check for RESULT(j) > THRESH
 *
-               DO 210 J = 1, 35
+               DO 210 J = 1, 39
                   IF( RESULT( J ).GE.THRESH ) THEN
                      IF( NFAIL.EQ.0 ) THEN
                         WRITE( NOUT, FMT = 9999 )
@@ -1097,7 +1170,7 @@
                      NFAIL = NFAIL + 1
                   END IF
   210          CONTINUE
-               NTEST = NTEST + 35
+               NTEST = NTEST + 39
   220       CONTINUE
   230    CONTINUE
   240 CONTINUE
@@ -1158,6 +1231,12 @@
      $      ' SGESVDX(V,V,V) ',
      $      / '34 = | I - U**T U | / ( M ulp ) ',
      $      / '35 = | I - VT VT**T | / ( N ulp ) ',
+     $      ' SGESVDQ(H,N,N,A,A',
+     $      / '36 = | A - U diag(S) VT | / ( |A| max(M,N) ulp ) ',
+     $      / '37 = | I - U**T U | / ( M ulp ) ',
+     $      / '38 = | I - VT VT**T | / ( N ulp ) ',
+     $      / '39 = 0 if S contains min(M,N) nonnegative values in',
+     $      ' decreasing order, else 1/ulp',
      $      / / )
  9997 FORMAT( ' M=', I5, ', N=', I5, ', type ', I1, ', IWS=', I1,
      $      ', seed=', 4( I4, ',' ), ' test(', I2, ')=', G11.4 )
diff --git a/lapack-netlib/TESTING/EIG/serred.f b/lapack-netlib/TESTING/EIG/serred.f
index f478fcdb1..7d3772e84 100644
--- a/lapack-netlib/TESTING/EIG/serred.f
+++ b/lapack-netlib/TESTING/EIG/serred.f
@@ -36,6 +36,8 @@
 *>       SGEJSV   compute SVD of an M-by-N matrix A where M >= N
 *>       SGESVDX  compute SVD of an M-by-N matrix A(by bisection
 *>                and inverse iteration)
+*>       SGESVDQ  compute SVD of an M-by-N matrix A(with a 
+*>                QR-Preconditioned )
 *> \endverbatim
 *
 *  Arguments:
@@ -100,7 +102,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHKXER, SGEES, SGEESX, SGEEV, SGEEVX, SGEJSV,
-     $                   SGESDD, SGESVD
+     $                   SGESDD, SGESVD, SGESVDX, SGESVDQ
 *     ..
 *     .. External Functions ..
       LOGICAL            SSLECT, LSAMEN
@@ -486,6 +488,61 @@
          ELSE
             WRITE( NOUT, FMT = 9998 )
          END IF
+*
+*        Test SGESVDQ
+*
+         SRNAMT = 'SGESVDQ'
+         INFOT = 1
+         CALL SGESVDQ( 'X', 'P', 'T', 'A', 'A', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL SGESVDQ( 'A', 'X', 'T', 'A', 'A', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL SGESVDQ( 'A', 'P', 'X', 'A', 'A', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL SGESVDQ( 'A', 'P', 'T', 'X', 'A', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL SGESVDQ( 'A', 'P', 'T', 'A', 'X', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 6
+         CALL SGESVDQ( 'A', 'P', 'T', 'A', 'A', -1, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 7
+         CALL SGESVDQ( 'A', 'P', 'T', 'A', 'A', 0, 1, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 9
+         CALL SGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 0, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 12
+         CALL SGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U,
+     $                 -1, VT, 0, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 14
+         CALL SGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U,
+     $                 1, VT, -1, NS, IW, 1, W, 1, W, 1, INFO )
+         CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 17
+         CALL SGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U,
+     $                 1, VT, 1, NS, IW, -5, W, 1, W, 1, INFO )
+         CALL CHKXER( 'SGESVDQ', INFOT, NOUT, LERR, OK )
+         NT = 11
+         IF( OK ) THEN
+            WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
+     $           NT
+         ELSE
+            WRITE( NOUT, FMT = 9998 )
+         END IF
       END IF
 *
 *     Print a summary line.
diff --git a/lapack-netlib/TESTING/EIG/sget39.f b/lapack-netlib/TESTING/EIG/sget39.f
index f02c6f856..f6c0f7e7c 100644
--- a/lapack-netlib/TESTING/EIG/sget39.f
+++ b/lapack-netlib/TESTING/EIG/sget39.f
@@ -194,7 +194,7 @@
       VM5( 2 ) = EPS
       VM5( 3 ) = SQRT( SMLNUM )
 *
-*     Initalization
+*     Initialization
 *
       KNT = 0
       RMAX = ZERO
diff --git a/lapack-netlib/TESTING/EIG/ssbt21.f b/lapack-netlib/TESTING/EIG/ssbt21.f
index 50128ddbb..7ef5ad9b3 100644
--- a/lapack-netlib/TESTING/EIG/ssbt21.f
+++ b/lapack-netlib/TESTING/EIG/ssbt21.f
@@ -28,15 +28,16 @@
 *>
 *> SSBT21  generally checks a decomposition of the form
 *>
-*>         A = U S U'
+*>         A = U S U**T
 *>
-*> where ' means transpose, A is symmetric banded, U is
+*> where **T means transpose, A is symmetric banded, U is
 *> orthogonal, and S is diagonal (if KS=0) or symmetric
 *> tridiagonal (if KS=1).
 *>
 *> Specifically:
 *>
-*>         RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC>         RESULT(2) = | I - UU' | / ( n ulp )
+*>         RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and
+*>         RESULT(2) = | I - U U**T | / ( n ulp )
 *> \endverbatim
 *
 *  Arguments:
@@ -214,7 +215,7 @@
 *
       ANORM = MAX( SLANSB( '1', CUPLO, N, IKA, A, LDA, WORK ), UNFL )
 *
-*     Compute error matrix:    Error = A - U S U'
+*     Compute error matrix:    Error = A - U S U**T
 *
 *     Copy A from SB to SP storage format.
 *
@@ -265,7 +266,7 @@
 *
 *     Do Test 2
 *
-*     Compute  UU' - I
+*     Compute  U U**T - I
 *
       CALL SGEMM( 'N', 'C', N, N, N, ONE, U, LDU, U, LDU, ZERO, WORK,
      $            N )
diff --git a/lapack-netlib/TESTING/EIG/sspt21.f b/lapack-netlib/TESTING/EIG/sspt21.f
index 2384c87de..4ecb04c0e 100644
--- a/lapack-netlib/TESTING/EIG/sspt21.f
+++ b/lapack-netlib/TESTING/EIG/sspt21.f
@@ -28,9 +28,9 @@
 *>
 *> SSPT21  generally checks a decomposition of the form
 *>
-*>         A = U S U'
+*>         A = U S U**T
 *>
-*> where ' means transpose, A is symmetric (stored in packed format), U
+*> where **T means transpose, A is symmetric (stored in packed format), U
 *> is orthogonal, and S is diagonal (if KBAND=0) or symmetric
 *> tridiagonal (if KBAND=1).  If ITYPE=1, then U is represented as a
 *> dense matrix, otherwise the U is expressed as a product of
@@ -41,15 +41,16 @@
 *>
 *> Specifically, if ITYPE=1, then:
 *>
-*>         RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC>         RESULT(2) = | I - UU' | / ( n ulp )
+*>         RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and
+*>         RESULT(2) = | I - U U**T | / ( n ulp )
 *>
 *> If ITYPE=2, then:
 *>
-*>         RESULT(1) = | A - V S V' | / ( |A| n ulp )
+*>         RESULT(1) = | A - V S V**T | / ( |A| n ulp )
 *>
 *> If ITYPE=3, then:
 *>
-*>         RESULT(1) = | I - VU' | / ( n ulp )
+*>         RESULT(1) = | I - V U**T | / ( n ulp )
 *>
 *> Packed storage means that, for example, if UPLO='U', then the columns
 *> of the upper triangle of A are stored one after another, so that
@@ -70,7 +71,7 @@
 *>
 *>    If UPLO='U', then  V = H(n-1)...H(1),  where
 *>
-*>        H(j) = I  -  tau(j) v(j) v(j)'
+*>        H(j) = I  -  tau(j) v(j) v(j)**T
 *>
 *>    and the first j-1 elements of v(j) are stored in V(1:j-1,j+1),
 *>    (i.e., VP( j*(j+1)/2 + 1 : j*(j+1)/2 + j-1 ) ),
@@ -78,7 +79,7 @@
 *>
 *>    If UPLO='L', then  V = H(1)...H(n-1),  where
 *>
-*>        H(j) = I  -  tau(j) v(j) v(j)'
+*>        H(j) = I  -  tau(j) v(j) v(j)**T
 *>
 *>    and the first j elements of v(j) are 0, the (j+1)-st is 1, and the
 *>    (j+2)-nd through n-th elements are stored in V(j+2:n,j) (i.e.,
@@ -93,14 +94,15 @@
 *>          ITYPE is INTEGER
 *>          Specifies the type of tests to be performed.
 *>          1: U expressed as a dense orthogonal matrix:
-*>             RESULT(1) = | A - U S U' | / ( |A| n ulp )   *andC>             RESULT(2) = | I - UU' | / ( n ulp )
+*>             RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and
+*>             RESULT(2) = | I - U U**T | / ( n ulp )
 *>
 *>          2: U expressed as a product V of Housholder transformations:
-*>             RESULT(1) = | A - V S V' | / ( |A| n ulp )
+*>             RESULT(1) = | A - V S V**T | / ( |A| n ulp )
 *>
 *>          3: U expressed both as a dense orthogonal matrix and
 *>             as a product of Housholder transformations:
-*>             RESULT(1) = | I - VU' | / ( n ulp )
+*>             RESULT(1) = | I - V U**T | / ( n ulp )
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -183,7 +185,7 @@
 *> \verbatim
 *>          TAU is REAL array, dimension (N)
 *>          If ITYPE >= 2, then TAU(j) is the scalar factor of
-*>          v(j) v(j)' in the Householder transformation H(j) of
+*>          v(j) v(j)**T in the Householder transformation H(j) of
 *>          the product  U = H(1)...H(n-2)
 *>          If ITYPE < 2, then TAU is not referenced.
 *> \endverbatim
@@ -303,7 +305,7 @@
 *
       IF( ITYPE.EQ.1 ) THEN
 *
-*        ITYPE=1: error = A - U S U'
+*        ITYPE=1: error = A - U S U**T
 *
          CALL SLASET( 'Full', N, N, ZERO, ZERO, WORK, N )
          CALL SCOPY( LAP, AP, 1, WORK, 1 )
@@ -322,7 +324,7 @@
 *
       ELSE IF( ITYPE.EQ.2 ) THEN
 *
-*        ITYPE=2: error = V S V' - A
+*        ITYPE=2: error = V S V**T - A
 *
          CALL SLASET( 'Full', N, N, ZERO, ZERO, WORK, N )
 *
@@ -389,7 +391,7 @@
 *
       ELSE IF( ITYPE.EQ.3 ) THEN
 *
-*        ITYPE=3: error = U V' - I
+*        ITYPE=3: error = U V**T - I
 *
          IF( N.LT.2 )
      $      RETURN
@@ -420,7 +422,7 @@
 *
 *     Do Test 2
 *
-*     Compute  UU' - I
+*     Compute  U U**T - I
 *
       IF( ITYPE.EQ.1 ) THEN
          CALL SGEMM( 'N', 'C', N, N, N, ONE, U, LDU, U, LDU, ZERO, WORK,
diff --git a/lapack-netlib/TESTING/EIG/ssyt21.f b/lapack-netlib/TESTING/EIG/ssyt21.f
index a7add3418..fc7ca6a2a 100644
--- a/lapack-netlib/TESTING/EIG/ssyt21.f
+++ b/lapack-netlib/TESTING/EIG/ssyt21.f
@@ -28,9 +28,9 @@
 *>
 *> SSYT21 generally checks a decomposition of the form
 *>
-*>    A = U S U'
+*>    A = U S U**T
 *>
-*> where ' means transpose, A is symmetric, U is orthogonal, and S is
+*> where **T means transpose, A is symmetric, U is orthogonal, and S is
 *> diagonal (if KBAND=0) or symmetric tridiagonal (if KBAND=1).
 *>
 *> If ITYPE=1, then U is represented as a dense matrix; otherwise U is
@@ -41,18 +41,19 @@
 *>
 *> Specifically, if ITYPE=1, then:
 *>
-*>    RESULT(1) = | A - U S U' | / ( |A| n ulp ) *andC>    RESULT(2) = | I - UU' | / ( n ulp )
+*>    RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and
+*>    RESULT(2) = | I - U U**T | / ( n ulp )
 *>
 *> If ITYPE=2, then:
 *>
-*>    RESULT(1) = | A - V S V' | / ( |A| n ulp )
+*>    RESULT(1) = | A - V S V**T | / ( |A| n ulp )
 *>
 *> If ITYPE=3, then:
 *>
-*>    RESULT(1) = | I - VU' | / ( n ulp )
+*>    RESULT(1) = | I - V U**T | / ( n ulp )
 *>
 *> For ITYPE > 1, the transformation U is expressed as a product
-*> V = H(1)...H(n-2),  where H(j) = I  -  tau(j) v(j) v(j)' and each
+*> V = H(1)...H(n-2),  where H(j) = I  -  tau(j) v(j) v(j)**T and each
 *> vector v(j) has its first j elements 0 and the remaining n-j elements
 *> stored in V(j+1:n,j).
 *> \endverbatim
@@ -65,14 +66,15 @@
 *>          ITYPE is INTEGER
 *>          Specifies the type of tests to be performed.
 *>          1: U expressed as a dense orthogonal matrix:
-*>             RESULT(1) = | A - U S U' | / ( |A| n ulp )   *andC>             RESULT(2) = | I - UU' | / ( n ulp )
+*>             RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and
+*>             RESULT(2) = | I - U U**T | / ( n ulp )
 *>
 *>          2: U expressed as a product V of Housholder transformations:
-*>             RESULT(1) = | A - V S V' | / ( |A| n ulp )
+*>             RESULT(1) = | A - V S V**T | / ( |A| n ulp )
 *>
 *>          3: U expressed both as a dense orthogonal matrix and
 *>             as a product of Housholder transformations:
-*>             RESULT(1) = | I - VU' | / ( n ulp )
+*>             RESULT(1) = | I - V U**T | / ( n ulp )
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -170,7 +172,7 @@
 *> \verbatim
 *>          TAU is REAL array, dimension (N)
 *>          If ITYPE >= 2, then TAU(j) is the scalar factor of
-*>          v(j) v(j)' in the Householder transformation H(j) of
+*>          v(j) v(j)**T in the Householder transformation H(j) of
 *>          the product  U = H(1)...H(n-2)
 *>          If ITYPE < 2, then TAU is not referenced.
 *> \endverbatim
@@ -283,7 +285,7 @@
 *
       IF( ITYPE.EQ.1 ) THEN
 *
-*        ITYPE=1: error = A - U S U'
+*        ITYPE=1: error = A - U S U**T
 *
          CALL SLASET( 'Full', N, N, ZERO, ZERO, WORK, N )
          CALL SLACPY( CUPLO, N, N, A, LDA, WORK, N )
@@ -302,7 +304,7 @@
 *
       ELSE IF( ITYPE.EQ.2 ) THEN
 *
-*        ITYPE=2: error = V S V' - A
+*        ITYPE=2: error = V S V**T - A
 *
          CALL SLASET( 'Full', N, N, ZERO, ZERO, WORK, N )
 *
@@ -359,7 +361,7 @@
 *
       ELSE IF( ITYPE.EQ.3 ) THEN
 *
-*        ITYPE=3: error = U V' - I
+*        ITYPE=3: error = U V**T - I
 *
          IF( N.LT.2 )
      $      RETURN
@@ -395,7 +397,7 @@
 *
 *     Do Test 2
 *
-*     Compute  UU' - I
+*     Compute  U U**T - I
 *
       IF( ITYPE.EQ.1 ) THEN
          CALL SGEMM( 'N', 'C', N, N, N, ONE, U, LDU, U, LDU, ZERO, WORK,
diff --git a/lapack-netlib/TESTING/EIG/ssyt22.f b/lapack-netlib/TESTING/EIG/ssyt22.f
index 3b748ec7f..38fc3e555 100644
--- a/lapack-netlib/TESTING/EIG/ssyt22.f
+++ b/lapack-netlib/TESTING/EIG/ssyt22.f
@@ -41,7 +41,8 @@
 *>
 *>      Specifically, if ITYPE=1, then:
 *>
-*>              RESULT(1) = | U' A U - S | / ( |A| m ulp ) *andC>              RESULT(2) = | I - U'U | / ( m ulp )
+*>              RESULT(1) = | U**T A U - S | / ( |A| m ulp ) and
+*>              RESULT(2) = | I - U**T U | / ( m ulp )
 *> \endverbatim
 *
 *  Arguments:
@@ -51,7 +52,8 @@
 *>  ITYPE   INTEGER
 *>          Specifies the type of tests to be performed.
 *>          1: U expressed as a dense orthogonal matrix:
-*>             RESULT(1) = | A - U S U' | / ( |A| n ulp )   *andC>             RESULT(2) = | I - UU' | / ( n ulp )
+*>             RESULT(1) = | A - U S U**T | / ( |A| n ulp ) and
+*>             RESULT(2) = | I - U U**T | / ( n ulp )
 *>
 *>  UPLO    CHARACTER
 *>          If UPLO='U', the upper triangle of A will be used and the
@@ -122,7 +124,7 @@
 *>
 *>  TAU     REAL array, dimension (N)
 *>          If ITYPE >= 2, then TAU(j) is the scalar factor of
-*>          v(j) v(j)' in the Householder transformation H(j) of
+*>          v(j) v(j)**T in the Householder transformation H(j) of
 *>          the product  U = H(1)...H(n-2)
 *>          If ITYPE < 2, then TAU is not referenced.
 *>          Not modified.
@@ -207,7 +209,7 @@
 *
 *     Compute error matrix:
 *
-*     ITYPE=1: error = U' A U - S
+*     ITYPE=1: error = U**T A U - S
 *
       CALL SSYMM( 'L', UPLO, N, M, ONE, A, LDA, U, LDU, ZERO, WORK, N )
       NN = N*N
@@ -240,7 +242,7 @@
 *
 *     Do Test 2
 *
-*     Compute  U'U - I
+*     Compute  U**T U - I
 *
       IF( ITYPE.EQ.1 )
      $   CALL SORT01( 'Columns', N, M, U, LDU, WORK, 2*N*N,
diff --git a/lapack-netlib/TESTING/EIG/zbdt05.f b/lapack-netlib/TESTING/EIG/zbdt05.f
index 7a493292a..bbf0208b7 100644
--- a/lapack-netlib/TESTING/EIG/zbdt05.f
+++ b/lapack-netlib/TESTING/EIG/zbdt05.f
@@ -52,6 +52,7 @@
 *> \verbatim
 *>          A is COMPLEX*16 array, dimension (LDA,N)
 *>          The m by n matrix A.
+*> \endverbatim
 *>
 *> \param[in] LDA
 *> \verbatim
diff --git a/lapack-netlib/TESTING/EIG/zchkst.f b/lapack-netlib/TESTING/EIG/zchkst.f
index 4a8636ad9..cd45e98e1 100644
--- a/lapack-netlib/TESTING/EIG/zchkst.f
+++ b/lapack-netlib/TESTING/EIG/zchkst.f
@@ -167,7 +167,7 @@
 *>                                              ZSTEMR('V', 'I')
 *>
 *> Tests 29 through 34 are disable at present because ZSTEMR
-*> does not handle partial specturm requests.
+*> does not handle partial spectrum requests.
 *>
 *> (29)    | S - Z D Z* | / ( |S| n ulp )    ZSTEMR('V', 'I')
 *>
diff --git a/lapack-netlib/TESTING/EIG/zchkst2stg.f b/lapack-netlib/TESTING/EIG/zchkst2stg.f
index cd952bc37..167e5f359 100644
--- a/lapack-netlib/TESTING/EIG/zchkst2stg.f
+++ b/lapack-netlib/TESTING/EIG/zchkst2stg.f
@@ -188,7 +188,7 @@
 *>                                              ZSTEMR('V', 'I')
 *>
 *> Tests 29 through 34 are disable at present because ZSTEMR
-*> does not handle partial specturm requests.
+*> does not handle partial spectrum requests.
 *>
 *> (29)    | S - Z D Z* | / ( |S| n ulp )    ZSTEMR('V', 'I')
 *>
diff --git a/lapack-netlib/TESTING/EIG/zdrgev3.f b/lapack-netlib/TESTING/EIG/zdrgev3.f
index 62ddf2b56..11e8562d7 100644
--- a/lapack-netlib/TESTING/EIG/zdrgev3.f
+++ b/lapack-netlib/TESTING/EIG/zdrgev3.f
@@ -389,7 +389,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date Febuary 2015
+*> \date February 2015
 *
 *> \ingroup complex16_eig
 *
diff --git a/lapack-netlib/TESTING/EIG/zdrgsx.f b/lapack-netlib/TESTING/EIG/zdrgsx.f
index 51a7d773f..f5821e520 100644
--- a/lapack-netlib/TESTING/EIG/zdrgsx.f
+++ b/lapack-netlib/TESTING/EIG/zdrgsx.f
@@ -738,7 +738,7 @@
       CALL ZLACPY( 'Full', MPLUSN, MPLUSN, AI, LDA, A, LDA )
       CALL ZLACPY( 'Full', MPLUSN, MPLUSN, BI, LDA, B, LDA )
 *
-*     Compute the Schur factorization while swaping the
+*     Compute the Schur factorization while swapping the
 *     m-by-m (1,1)-blocks with n-by-n (2,2)-blocks.
 *
       CALL ZGGESX( 'V', 'V', 'S', ZLCTSX, 'B', MPLUSN, AI, LDA, BI, LDA,
diff --git a/lapack-netlib/TESTING/EIG/zdrvbd.f b/lapack-netlib/TESTING/EIG/zdrvbd.f
index 4bdbdfe2e..105e9dff7 100644
--- a/lapack-netlib/TESTING/EIG/zdrvbd.f
+++ b/lapack-netlib/TESTING/EIG/zdrvbd.f
@@ -33,8 +33,9 @@
 *>
 *> \verbatim
 *>
-*> ZDRVBD checks the singular value decomposition (SVD) driver ZGESVD
-*> and ZGESDD.
+*> ZDRVBD checks the singular value decomposition (SVD) driver ZGESVD,
+*> ZGESDD, ZGESVJ, ZGEJSV, ZGESVDX, and ZGESVDQ.
+*>
 *> ZGESVD and ZGESDD factors A = U diag(S) VT, where U and VT are
 *> unitary and diag(S) is diagonal with the entries of the array S on
 *> its diagonal. The entries of S are the singular values, nonnegative
@@ -73,81 +74,92 @@
 *>
 *> Test for ZGESDD:
 *>
-*> (1)   | A - U diag(S) VT | / ( |A| max(M,N) ulp )
+*> (8)   | A - U diag(S) VT | / ( |A| max(M,N) ulp )
 *>
-*> (2)   | I - U'U | / ( M ulp )
+*> (9)   | I - U'U | / ( M ulp )
 *>
-*> (3)   | I - VT VT' | / ( N ulp )
+*> (10)  | I - VT VT' | / ( N ulp )
 *>
-*> (4)   S contains MNMIN nonnegative values in decreasing order.
+*> (11)  S contains MNMIN nonnegative values in decreasing order.
 *>       (Return 0 if true, 1/ULP if false.)
 *>
-*> (5)   | U - Upartial | / ( M ulp ) where Upartial is a partially
+*> (12)  | U - Upartial | / ( M ulp ) where Upartial is a partially
 *>       computed U.
 *>
-*> (6)   | VT - VTpartial | / ( N ulp ) where VTpartial is a partially
+*> (13)  | VT - VTpartial | / ( N ulp ) where VTpartial is a partially
 *>       computed VT.
 *>
-*> (7)   | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the
+*> (14)  | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the
 *>       vector of singular values from the partial SVD
 *>
+*> Test for ZGESVDQ:
+*>
+*> (36)  | A - U diag(S) VT | / ( |A| max(M,N) ulp )
+*>
+*> (37)  | I - U'U | / ( M ulp )
+*>
+*> (38)  | I - VT VT' | / ( N ulp )
+*>
+*> (39)  S contains MNMIN nonnegative values in decreasing order.
+*>       (Return 0 if true, 1/ULP if false.)
+*>
 *> Test for ZGESVJ:
 *>
-*> (1)   | A - U diag(S) VT | / ( |A| max(M,N) ulp )
+*> (15)  | A - U diag(S) VT | / ( |A| max(M,N) ulp )
 *>
-*> (2)   | I - U'U | / ( M ulp )
+*> (16)  | I - U'U | / ( M ulp )
 *>
-*> (3)   | I - VT VT' | / ( N ulp )
+*> (17)  | I - VT VT' | / ( N ulp )
 *>
-*> (4)   S contains MNMIN nonnegative values in decreasing order.
+*> (18)  S contains MNMIN nonnegative values in decreasing order.
 *>       (Return 0 if true, 1/ULP if false.)
 *>
 *> Test for ZGEJSV:
 *>
-*> (1)   | A - U diag(S) VT | / ( |A| max(M,N) ulp )
+*> (19)  | A - U diag(S) VT | / ( |A| max(M,N) ulp )
 *>
-*> (2)   | I - U'U | / ( M ulp )
+*> (20)  | I - U'U | / ( M ulp )
 *>
-*> (3)   | I - VT VT' | / ( N ulp )
+*> (21)  | I - VT VT' | / ( N ulp )
 *>
-*> (4)   S contains MNMIN nonnegative values in decreasing order.
+*> (22)  S contains MNMIN nonnegative values in decreasing order.
 *>        (Return 0 if true, 1/ULP if false.)
 *>
 *> Test for ZGESVDX( 'V', 'V', 'A' )/ZGESVDX( 'N', 'N', 'A' )
 *>
-*> (1)   | A - U diag(S) VT | / ( |A| max(M,N) ulp )
+*> (23)  | A - U diag(S) VT | / ( |A| max(M,N) ulp )
 *>
-*> (2)   | I - U'U | / ( M ulp )
+*> (24)  | I - U'U | / ( M ulp )
 *>
-*> (3)   | I - VT VT' | / ( N ulp )
+*> (25)  | I - VT VT' | / ( N ulp )
 *>
-*> (4)   S contains MNMIN nonnegative values in decreasing order.
+*> (26)  S contains MNMIN nonnegative values in decreasing order.
 *>       (Return 0 if true, 1/ULP if false.)
 *>
-*> (5)   | U - Upartial | / ( M ulp ) where Upartial is a partially
+*> (27)  | U - Upartial | / ( M ulp ) where Upartial is a partially
 *>       computed U.
 *>
-*> (6)   | VT - VTpartial | / ( N ulp ) where VTpartial is a partially
+*> (28)  | VT - VTpartial | / ( N ulp ) where VTpartial is a partially
 *>       computed VT.
 *>
-*> (7)   | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the
+*> (29)  | S - Spartial | / ( MNMIN ulp |S| ) where Spartial is the
 *>       vector of singular values from the partial SVD
 *>
 *> Test for ZGESVDX( 'V', 'V', 'I' )
 *>
-*> (8)   | U' A VT''' - diag(S) | / ( |A| max(M,N) ulp )
+*> (30)  | U' A VT''' - diag(S) | / ( |A| max(M,N) ulp )
 *>
-*> (9)   | I - U'U | / ( M ulp )
+*> (31)  | I - U'U | / ( M ulp )
 *>
-*> (10)  | I - VT VT' | / ( N ulp )
+*> (32)  | I - VT VT' | / ( N ulp )
 *>
 *> Test for ZGESVDX( 'V', 'V', 'V' )
 *>
-*> (11)   | U' A VT''' - diag(S) | / ( |A| max(M,N) ulp )
+*> (33)   | U' A VT''' - diag(S) | / ( |A| max(M,N) ulp )
 *>
-*> (12)   | I - U'U | / ( M ulp )
+*> (34)   | I - U'U | / ( M ulp )
 *>
-*> (13)   | I - VT VT' | / ( N ulp )
+*> (35)   | I - VT VT' | / ( N ulp )
 *>
 *> The "sizes" are specified by the arrays MM(1:NSIZES) and
 *> NN(1:NSIZES); the value of each element pair (MM(j),NN(j))
@@ -393,6 +405,8 @@
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     June 2016
+*
+      IMPLICIT NONE
 *
 *     .. Scalar Arguments ..
       INTEGER            INFO, LDA, LDU, LDVT, LWORK, NOUNIT, NSIZES,
@@ -411,7 +425,7 @@
 *  =====================================================================
 *
 *     .. Parameters ..
-      DOUBLE PRECISION  ZERO, ONE, TWO, HALF
+      DOUBLE PRECISION   ZERO, ONE, TWO, HALF
       PARAMETER          ( ZERO = 0.0D0, ONE = 1.0D0, TWO = 2.0D0,
      $                   HALF = 0.5D0 )
       COMPLEX*16         CZERO, CONE
@@ -431,10 +445,13 @@
       DOUBLE PRECISION   ANORM, DIF, DIV, OVFL, RTUNFL, ULP, ULPINV,
      $                   UNFL, VL, VU
 *     ..
+*     .. Local Scalars for ZGESVDQ ..
+      INTEGER            LIWORK, NUMRANK
+*     ..
 *     .. Local Arrays ..
       CHARACTER          CJOB( 4 ), CJOBR( 3 ), CJOBV( 2 )
       INTEGER            IOLDSD( 4 ), ISEED2( 4 )
-      DOUBLE PRECISION   RESULT( 35 )
+      DOUBLE PRECISION   RESULT( 39 )
 *     ..
 *     .. External Functions ..
       DOUBLE PRECISION   DLAMCH, DLARND
@@ -442,8 +459,8 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALASVM, XERBLA, ZBDT01, ZBDT05, ZGESDD,
-     $                   ZGESVD, ZGESVJ, ZGEJSV, ZGESVDX, ZLACPY,
-     $                   ZLASET, ZLATMS, ZUNT01, ZUNT03
+     $                   ZGESVD, ZGESVDQ, ZGESVJ, ZGEJSV, ZGESVDX,
+     $                   ZLACPY, ZLASET, ZLATMS, ZUNT01, ZUNT03
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          ABS, DBLE, MAX, MIN
@@ -836,10 +853,65 @@
   120             CONTINUE
                   RESULT( 14 ) = MAX( RESULT( 14 ), DIF )
   130          CONTINUE
-
 *
-*              Test ZGESVJ: Factorize A
-*              Note: ZGESVJ does not work for M < N
+*              Test ZGESVDQ
+*              Note: ZGESVDQ only works for M >= N
+*
+               RESULT( 36 ) = ZERO
+               RESULT( 37 ) = ZERO
+               RESULT( 38 ) = ZERO
+               RESULT( 39 ) = ZERO
+*
+               IF( M.GE.N ) THEN
+                  IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N )
+                  LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3
+                  LSWORK = MIN( LSWORK, LWORK )
+                  LSWORK = MAX( LSWORK, 1 )
+                  IF( IWSPC.EQ.4 )
+     $               LSWORK = LWORK
+*
+                  CALL ZLACPY( 'F', M, N, ASAV, LDA, A, LDA )
+                  SRNAMT = 'ZGESVDQ'
+*
+                  LRWORK = MAX(2, M, 5*N)
+                  LIWORK = MAX( N, 1 )
+                  CALL ZGESVDQ( 'H', 'N', 'N', 'A', 'A', 
+     $                          M, N, A, LDA, SSAV, USAV, LDU,
+     $                          VTSAV, LDVT, NUMRANK, IWORK, LIWORK,
+     $                          WORK, LWORK, RWORK, LRWORK, IINFO )
+*
+                  IF( IINFO.NE.0 ) THEN
+                     WRITE( NOUNIT, FMT = 9995 )'ZGESVDQ', IINFO, M, N,
+     $               JTYPE, LSWORK, IOLDSD
+                     INFO = ABS( IINFO )
+                     RETURN
+                  END IF
+*
+*                 Do tests 36--39
+*
+                  CALL ZBDT01( M, N, 0, ASAV, LDA, USAV, LDU, SSAV, E,
+     $                         VTSAV, LDVT, WORK, RWORK, RESULT( 36 ) )
+                  IF( M.NE.0 .AND. N.NE.0 ) THEN
+                     CALL ZUNT01( 'Columns', M, M, USAV, LDU, WORK,
+     $                            LWORK, RWORK, RESULT( 37 ) )
+                     CALL ZUNT01( 'Rows', N, N, VTSAV, LDVT, WORK,
+     $                            LWORK, RWORK, RESULT( 38 ) )
+                  END IF
+                  RESULT( 39 ) = ZERO
+                  DO 199 I = 1, MNMIN - 1
+                     IF( SSAV( I ).LT.SSAV( I+1 ) )
+     $                  RESULT( 39 ) = ULPINV
+                     IF( SSAV( I ).LT.ZERO )
+     $                  RESULT( 39 ) = ULPINV
+  199             CONTINUE
+                  IF( MNMIN.GE.1 ) THEN
+                     IF( SSAV( MNMIN ).LT.ZERO )
+     $                  RESULT( 39 ) = ULPINV
+                  END IF
+               END IF
+*
+*              Test ZGESVJ
+*              Note: ZGESVJ only works for M >= N
 *
                RESULT( 15 ) = ZERO
                RESULT( 16 ) = ZERO
@@ -847,13 +919,13 @@
                RESULT( 18 ) = ZERO
 *
                IF( M.GE.N ) THEN
-               IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N )
-               LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3
-               LSWORK = MIN( LSWORK, LWORK )
-               LSWORK = MAX( LSWORK, 1 )
-               LRWORK = MAX(6,N)
-               IF( IWSPC.EQ.4 )
-     $            LSWORK = LWORK
+                  IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N )
+                  LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3
+                  LSWORK = MIN( LSWORK, LWORK )
+                  LSWORK = MAX( LSWORK, 1 )
+                  LRWORK = MAX(6,N)
+                  IF( IWSPC.EQ.4 )
+     $               LSWORK = LWORK
 *
                   CALL ZLACPY( 'F', M, N, ASAV, LDA, USAV, LDA )
                   SRNAMT = 'ZGESVJ'
@@ -861,8 +933,7 @@
      &                        0, A, LDVT, WORK, LWORK, RWORK,
      &                        LRWORK, IINFO )
 *
-*                 ZGESVJ retuns V not VT, so we transpose to use the same
-*                 test suite.
+*                 ZGESVJ returns V not VH
 *
                   DO J=1,N
                      DO I=1,N
@@ -900,21 +971,21 @@
                   END IF
                END IF
 *
-*              Test ZGEJSV: Factorize A
-*              Note: ZGEJSV does not work for M < N
+*              Test ZGEJSV
+*              Note: ZGEJSV only works for M >= N
 *
                RESULT( 19 ) = ZERO
                RESULT( 20 ) = ZERO
                RESULT( 21 ) = ZERO
                RESULT( 22 ) = ZERO
                IF( M.GE.N ) THEN
-               IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N )
-               LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3
-               LSWORK = MIN( LSWORK, LWORK )
-               LSWORK = MAX( LSWORK, 1 )
-               IF( IWSPC.EQ.4 )
-     $            LSWORK = LWORK
-               LRWORK = MAX( 7, N + 2*M)
+                  IWTMP = 2*MNMIN*MNMIN + 2*MNMIN + MAX( M, N )
+                  LSWORK = IWTMP + ( IWSPC-1 )*( LWORK-IWTMP ) / 3
+                  LSWORK = MIN( LSWORK, LWORK )
+                  LSWORK = MAX( LSWORK, 1 )
+                  IF( IWSPC.EQ.4 )
+     $               LSWORK = LWORK
+                  LRWORK = MAX( 7, N + 2*M)
 *
                   CALL ZLACPY( 'F', M, N, ASAV, LDA, VTSAV, LDA )
                   SRNAMT = 'ZGEJSV'
@@ -923,8 +994,7 @@
      &                   WORK, LWORK, RWORK,
      &                   LRWORK, IWORK, IINFO )
 *
-*                 ZGEJSV retuns V not VT, so we transpose to use the same
-*                 test suite.
+*                 ZGEJSV returns V not VH
 *
                   DO 133 J=1,N
                      DO 132 I=1,N
@@ -933,7 +1003,7 @@
   133             END DO
 *
                   IF( IINFO.NE.0 ) THEN
-                     WRITE( NOUNIT, FMT = 9995 )'GESVJ', IINFO, M, N,
+                     WRITE( NOUNIT, FMT = 9995 )'GEJSV', IINFO, M, N,
      $               JTYPE, LSWORK, IOLDSD
                      INFO = ABS( IINFO )
                      RETURN
@@ -1160,7 +1230,7 @@
 *
                NTEST = 0
                NFAIL = 0
-               DO 190 J = 1, 35
+               DO 190 J = 1, 39
                   IF( RESULT( J ).GE.ZERO )
      $               NTEST = NTEST + 1
                   IF( RESULT( J ).GE.THRESH )
@@ -1175,7 +1245,7 @@
                   NTESTF = 2
                END IF
 *
-               DO 200 J = 1, 35
+               DO 200 J = 1, 39
                   IF( RESULT( J ).GE.THRESH ) THEN
                      WRITE( NOUNIT, FMT = 9997 )M, N, JTYPE, IWSPC,
      $                  IOLDSD, J, RESULT( J )
@@ -1251,6 +1321,12 @@
      $      / '33 = | U**T A VT**T - diag(S) | / ( |A| max(M,N) ulp )',
      $      / '34 = | I - U**T U | / ( M ulp ) ',
      $      / '35 = | I - VT VT**T | / ( N ulp ) ',
+     $      ' ZGESVDQ(H,N,N,A,A',
+     $      / '36 = | A - U diag(S) VT | / ( |A| max(M,N) ulp ) ',
+     $      / '37 = | I - U**T U | / ( M ulp ) ',
+     $      / '38 = | I - VT VT**T | / ( N ulp ) ',
+     $      / '39 = 0 if S contains min(M,N) nonnegative values in',
+     $      ' decreasing order, else 1/ulp',
      $      / / )
  9997 FORMAT( ' M=', I5, ', N=', I5, ', type ', I1, ', IWS=', I1,
      $      ', seed=', 4( I4, ',' ), ' test(', I2, ')=', G11.4 )
diff --git a/lapack-netlib/TESTING/EIG/zerred.f b/lapack-netlib/TESTING/EIG/zerred.f
index 00bfbf261..013dc16c5 100644
--- a/lapack-netlib/TESTING/EIG/zerred.f
+++ b/lapack-netlib/TESTING/EIG/zerred.f
@@ -36,6 +36,8 @@
 *>       ZGEJSV   compute SVD of an M-by-N matrix A where M >= N
 *>       ZGESVDX  compute SVD of an M-by-N matrix A(by bisection
 *>                and inverse iteration)
+*>       ZGESVDQ  compute SVD of an M-by-N matrix A(with a 
+*>                QR-Preconditioned )
 *> \endverbatim
 *
 *  Arguments:
@@ -101,7 +103,7 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CHKXER, ZGEES, ZGEESX, ZGEEV, ZGEEVX, ZGESVJ,
-     $                   ZGESDD, ZGESVD
+     $                   ZGESDD, ZGESVD, ZGESVDX, ZGESVQ
 *     ..
 *     .. External Functions ..
       LOGICAL            LSAMEN, ZSLECT
@@ -495,6 +497,61 @@
          ELSE
             WRITE( NOUT, FMT = 9998 )
          END IF
+*
+*        Test ZGESVDQ
+*
+         SRNAMT = 'ZGESVDQ'
+         INFOT = 1
+         CALL ZGESVDQ( 'X', 'P', 'T', 'A', 'A', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL ZGESVDQ( 'A', 'X', 'T', 'A', 'A', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL ZGESVDQ( 'A', 'P', 'X', 'A', 'A', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 4
+         CALL ZGESVDQ( 'A', 'P', 'T', 'X', 'A', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL ZGESVDQ( 'A', 'P', 'T', 'A', 'X', 0, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 6
+         CALL ZGESVDQ( 'A', 'P', 'T', 'A', 'A', -1, 0, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 7
+         CALL ZGESVDQ( 'A', 'P', 'T', 'A', 'A', 0, 1, A, 1, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 9
+         CALL ZGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 0, S, U,
+     $                 0, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 12
+         CALL ZGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U,
+     $                 -1, VT, 0, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 14
+         CALL ZGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U,
+     $                 1, VT, -1, NS, IW, 1, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK )
+         INFOT = 17
+         CALL ZGESVDQ( 'A', 'P', 'T', 'A', 'A', 1, 1, A, 1, S, U,
+     $                 1, VT, 1, NS, IW, -5, W, 1, RW, 1, INFO )
+         CALL CHKXER( 'ZGESVDQ', INFOT, NOUT, LERR, OK )
+         NT = 11
+         IF( OK ) THEN
+            WRITE( NOUT, FMT = 9999 )SRNAMT( 1:LEN_TRIM( SRNAMT ) ),
+     $           NT
+         ELSE
+            WRITE( NOUT, FMT = 9998 )
+         END IF
       END IF
 *
 *     Print a summary line.
diff --git a/lapack-netlib/TESTING/EIG/zget51.f b/lapack-netlib/TESTING/EIG/zget51.f
index 96b1dfae4..e019127a3 100644
--- a/lapack-netlib/TESTING/EIG/zget51.f
+++ b/lapack-netlib/TESTING/EIG/zget51.f
@@ -29,12 +29,13 @@
 *>
 *>      ZGET51  generally checks a decomposition of the form
 *>
-*>              A = U B VC>
-*>      where * means conjugate transpose and U and V are unitary.
+*>              A = U B V**H
+*>
+*>      where **H means conjugate transpose and U and V are unitary.
 *>
 *>      Specifically, if ITYPE=1
 *>
-*>              RESULT = | A - U B V* | / ( |A| n ulp )
+*>              RESULT = | A - U B V**H | / ( |A| n ulp )
 *>
 *>      If ITYPE=2, then:
 *>
@@ -42,7 +43,7 @@
 *>
 *>      If ITYPE=3, then:
 *>
-*>              RESULT = | I - UU* | / ( n ulp )
+*>              RESULT = | I - U U**H | / ( n ulp )
 *> \endverbatim
 *
 *  Arguments:
@@ -52,9 +53,9 @@
 *> \verbatim
 *>          ITYPE is INTEGER
 *>          Specifies the type of tests to be performed.
-*>          =1: RESULT = | A - U B V* | / ( |A| n ulp )
+*>          =1: RESULT = | A - U B V**H | / ( |A| n ulp )
 *>          =2: RESULT = | A - B | / ( |A| n ulp )
-*>          =3: RESULT = | I - UU* | / ( n ulp )
+*>          =3: RESULT = | I - U U**H | / ( n ulp )
 *> \endverbatim
 *>
 *> \param[in] N
@@ -218,7 +219,7 @@
 *
          IF( ITYPE.EQ.1 ) THEN
 *
-*           ITYPE=1: Compute W = A - UBV'
+*           ITYPE=1: Compute W = A - U B V**H
 *
             CALL ZLACPY( ' ', N, N, A, LDA, WORK, N )
             CALL ZGEMM( 'N', 'N', N, N, N, CONE, U, LDU, B, LDB, CZERO,
@@ -259,7 +260,7 @@
 *
 *        Tests not scaled by norm(A)
 *
-*        ITYPE=3: Compute  UU' - I
+*        ITYPE=3: Compute  U U**H - I
 *
          CALL ZGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO,
      $               WORK, N )
diff --git a/lapack-netlib/TESTING/EIG/zhbt21.f b/lapack-netlib/TESTING/EIG/zhbt21.f
index 4cd8ed9f7..68125854c 100644
--- a/lapack-netlib/TESTING/EIG/zhbt21.f
+++ b/lapack-netlib/TESTING/EIG/zhbt21.f
@@ -28,14 +28,16 @@
 *>
 *> ZHBT21  generally checks a decomposition of the form
 *>
-*>         A = U S UC>
-*> where * means conjugate transpose, A is hermitian banded, U is
+*>         A = U S U**H
+*>
+*> where **H means conjugate transpose, A is hermitian banded, U is
 *> unitary, and S is diagonal (if KS=0) or symmetric
 *> tridiagonal (if KS=1).
 *>
 *> Specifically:
 *>
-*>         RESULT(1) = | A - U S U* | / ( |A| n ulp ) *andC>         RESULT(2) = | I - UU* | / ( n ulp )
+*>         RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and
+*>         RESULT(2) = | I - U U**H | / ( n ulp )
 *> \endverbatim
 *
 *  Arguments:
@@ -220,7 +222,7 @@
 *
       ANORM = MAX( ZLANHB( '1', CUPLO, N, IKA, A, LDA, RWORK ), UNFL )
 *
-*     Compute error matrix:    Error = A - U S U*
+*     Compute error matrix:    Error = A - U S U**H
 *
 *     Copy A from SB to SP storage format.
 *
@@ -271,7 +273,7 @@
 *
 *     Do Test 2
 *
-*     Compute  UU* - I
+*     Compute  U U**H - I
 *
       CALL ZGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO, WORK,
      $            N )
diff --git a/lapack-netlib/TESTING/EIG/zhet21.f b/lapack-netlib/TESTING/EIG/zhet21.f
index f6cb2d70a..cb854a850 100644
--- a/lapack-netlib/TESTING/EIG/zhet21.f
+++ b/lapack-netlib/TESTING/EIG/zhet21.f
@@ -29,8 +29,9 @@
 *>
 *> ZHET21 generally checks a decomposition of the form
 *>
-*>    A = U S UC>
-*> where * means conjugate transpose, A is hermitian, U is unitary, and
+*>    A = U S U**H
+*>
+*> where **H means conjugate transpose, A is hermitian, U is unitary, and
 *> S is diagonal (if KBAND=0) or (real) symmetric tridiagonal (if
 *> KBAND=1).
 *>
@@ -42,18 +43,19 @@
 *>
 *> Specifically, if ITYPE=1, then:
 *>
-*>    RESULT(1) = | A - U S U* | / ( |A| n ulp ) *andC>    RESULT(2) = | I - UU* | / ( n ulp )
+*>    RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and
+*>    RESULT(2) = | I - U U**H | / ( n ulp )
 *>
 *> If ITYPE=2, then:
 *>
-*>    RESULT(1) = | A - V S V* | / ( |A| n ulp )
+*>    RESULT(1) = | A - V S V**H | / ( |A| n ulp )
 *>
 *> If ITYPE=3, then:
 *>
-*>    RESULT(1) = | I - UV* | / ( n ulp )
+*>    RESULT(1) = | I - U V**H | / ( n ulp )
 *>
 *> For ITYPE > 1, the transformation U is expressed as a product
-*> V = H(1)...H(n-2),  where H(j) = I  -  tau(j) v(j) v(j)C> and each
+*> V = H(1)...H(n-2),  where H(j) = I  -  tau(j) v(j) v(j)**H and each
 *> vector v(j) has its first j elements 0 and the remaining n-j elements
 *> stored in V(j+1:n,j).
 *> \endverbatim
@@ -66,14 +68,15 @@
 *>          ITYPE is INTEGER
 *>          Specifies the type of tests to be performed.
 *>          1: U expressed as a dense unitary matrix:
-*>             RESULT(1) = | A - U S U* | / ( |A| n ulp )   *andC>             RESULT(2) = | I - UU* | / ( n ulp )
+*>             RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and
+*>             RESULT(2) = | I - U U**H | / ( n ulp )
 *>
 *>          2: U expressed as a product V of Housholder transformations:
-*>             RESULT(1) = | A - V S V* | / ( |A| n ulp )
+*>             RESULT(1) = | A - V S V**H | / ( |A| n ulp )
 *>
 *>          3: U expressed both as a dense unitary matrix and
 *>             as a product of Housholder transformations:
-*>             RESULT(1) = | I - UV* | / ( n ulp )
+*>             RESULT(1) = | I - U V**H | / ( n ulp )
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -171,7 +174,7 @@
 *> \verbatim
 *>          TAU is COMPLEX*16 array, dimension (N)
 *>          If ITYPE >= 2, then TAU(j) is the scalar factor of
-*>          v(j) v(j)* in the Householder transformation H(j) of
+*>          v(j) v(j)**H in the Householder transformation H(j) of
 *>          the product  U = H(1)...H(n-2)
 *>          If ITYPE < 2, then TAU is not referenced.
 *> \endverbatim
@@ -294,7 +297,7 @@
 *
       IF( ITYPE.EQ.1 ) THEN
 *
-*        ITYPE=1: error = A - U S U*
+*        ITYPE=1: error = A - U S U**H
 *
          CALL ZLASET( 'Full', N, N, CZERO, CZERO, WORK, N )
          CALL ZLACPY( CUPLO, N, N, A, LDA, WORK, N )
@@ -304,7 +307,6 @@
    10    CONTINUE
 *
          IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN
-CMK            DO 20 J = 1, N - 1
             DO 20 J = 2, N - 1
                CALL ZHER2( CUPLO, N, -DCMPLX( E( J ) ), U( 1, J ), 1,
      $                     U( 1, J-1 ), 1, WORK, N )
@@ -314,7 +316,7 @@ CMK            DO 20 J = 1, N - 1
 *
       ELSE IF( ITYPE.EQ.2 ) THEN
 *
-*        ITYPE=2: error = V S V* - A
+*        ITYPE=2: error = V S V**H - A
 *
          CALL ZLASET( 'Full', N, N, CZERO, CZERO, WORK, N )
 *
@@ -371,7 +373,7 @@ CMK            DO 20 J = 1, N - 1
 *
       ELSE IF( ITYPE.EQ.3 ) THEN
 *
-*        ITYPE=3: error = U V* - I
+*        ITYPE=3: error = U V**H - I
 *
          IF( N.LT.2 )
      $      RETURN
@@ -407,7 +409,7 @@ CMK            DO 20 J = 1, N - 1
 *
 *     Do Test 2
 *
-*     Compute  UU* - I
+*     Compute  U U**H - I
 *
       IF( ITYPE.EQ.1 ) THEN
          CALL ZGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO,
diff --git a/lapack-netlib/TESTING/EIG/zhet22.f b/lapack-netlib/TESTING/EIG/zhet22.f
index 7237f43f7..8ef73aef3 100644
--- a/lapack-netlib/TESTING/EIG/zhet22.f
+++ b/lapack-netlib/TESTING/EIG/zhet22.f
@@ -42,7 +42,8 @@
 *>
 *>      Specifically, if ITYPE=1, then:
 *>
-*>              RESULT(1) = | U' A U - S | / ( |A| m ulp ) *andC>              RESULT(2) = | I - U'U | / ( m ulp )
+*>              RESULT(1) = | U**H A U - S | / ( |A| m ulp ) and
+*>              RESULT(2) = | I - U**H U | / ( m ulp )
 *> \endverbatim
 *
 *  Arguments:
@@ -52,7 +53,8 @@
 *>  ITYPE   INTEGER
 *>          Specifies the type of tests to be performed.
 *>          1: U expressed as a dense orthogonal matrix:
-*>             RESULT(1) = | A - U S U' | / ( |A| n ulp )   *andC>             RESULT(2) = | I - UU' | / ( n ulp )
+*>             RESULT(1) = | A - U S U**H | / ( |A| n ulp )   *and
+*>             RESULT(2) = | I - U U**H | / ( n ulp )
 *>
 *>  UPLO    CHARACTER
 *>          If UPLO='U', the upper triangle of A will be used and the
@@ -122,7 +124,7 @@
 *>
 *>  TAU     COMPLEX*16 array, dimension (N)
 *>          If ITYPE >= 2, then TAU(j) is the scalar factor of
-*>          v(j) v(j)' in the Householder transformation H(j) of
+*>          v(j) v(j)**H in the Householder transformation H(j) of
 *>          the product  U = H(1)...H(n-2)
 *>          If ITYPE < 2, then TAU is not referenced.
 *>          Not modified.
@@ -215,7 +217,7 @@
 *
 *     Compute error matrix:
 *
-*     ITYPE=1: error = U' A U - S
+*     ITYPE=1: error = U**H A U - S
 *
       CALL ZHEMM( 'L', UPLO, N, M, CONE, A, LDA, U, LDU, CZERO, WORK,
      $            N )
@@ -249,7 +251,7 @@
 *
 *     Do Test 2
 *
-*     Compute  U'U - I
+*     Compute  U**H U - I
 *
       IF( ITYPE.EQ.1 )
      $   CALL ZUNT01( 'Columns', N, M, U, LDU, WORK, 2*N*N, RWORK,
diff --git a/lapack-netlib/TESTING/EIG/zhpt21.f b/lapack-netlib/TESTING/EIG/zhpt21.f
index ef9e4418d..825d387c7 100644
--- a/lapack-netlib/TESTING/EIG/zhpt21.f
+++ b/lapack-netlib/TESTING/EIG/zhpt21.f
@@ -29,8 +29,9 @@
 *>
 *> ZHPT21  generally checks a decomposition of the form
 *>
-*>         A = U S UC>
-*> where * means conjugate transpose, A is hermitian, U is
+*>         A = U S U**H
+*>
+*> where **H means conjugate transpose, A is hermitian, U is
 *> unitary, and S is diagonal (if KBAND=0) or (real) symmetric
 *> tridiagonal (if KBAND=1).  If ITYPE=1, then U is represented as
 *> a dense matrix, otherwise the U is expressed as a product of
@@ -41,15 +42,16 @@
 *>
 *> Specifically, if ITYPE=1, then:
 *>
-*>         RESULT(1) = | A - U S U* | / ( |A| n ulp ) *andC>         RESULT(2) = | I - UU* | / ( n ulp )
+*>         RESULT(1) = | A - U S U**H | / ( |A| n ulp ) and
+*>         RESULT(2) = | I - U U**H | / ( n ulp )
 *>
 *> If ITYPE=2, then:
 *>
-*>         RESULT(1) = | A - V S V* | / ( |A| n ulp )
+*>         RESULT(1) = | A - V S V**H | / ( |A| n ulp )
 *>
 *> If ITYPE=3, then:
 *>
-*>         RESULT(1) = | I - UV* | / ( n ulp )
+*>         RESULT(1) = | I - U V**H | / ( n ulp )
 *>
 *> Packed storage means that, for example, if UPLO='U', then the columns
 *> of the upper triangle of A are stored one after another, so that
@@ -70,14 +72,16 @@
 *>
 *>    If UPLO='U', then  V = H(n-1)...H(1),  where
 *>
-*>        H(j) = I  -  tau(j) v(j) v(j)C>
+*>        H(j) = I  -  tau(j) v(j) v(j)**H
+*>
 *>    and the first j-1 elements of v(j) are stored in V(1:j-1,j+1),
 *>    (i.e., VP( j*(j+1)/2 + 1 : j*(j+1)/2 + j-1 ) ),
 *>    the j-th element is 1, and the last n-j elements are 0.
 *>
 *>    If UPLO='L', then  V = H(1)...H(n-1),  where
 *>
-*>        H(j) = I  -  tau(j) v(j) v(j)C>
+*>        H(j) = I  -  tau(j) v(j) v(j)**H
+*>
 *>    and the first j elements of v(j) are 0, the (j+1)-st is 1, and the
 *>    (j+2)-nd through n-th elements are stored in V(j+2:n,j) (i.e.,
 *>    in VP( (2*n-j)*(j-1)/2 + j+2 : (2*n-j)*(j-1)/2 + n ) .)
@@ -91,14 +95,15 @@
 *>          ITYPE is INTEGER
 *>          Specifies the type of tests to be performed.
 *>          1: U expressed as a dense unitary matrix:
-*>             RESULT(1) = | A - U S U* | / ( |A| n ulp )   *andC>             RESULT(2) = | I - UU* | / ( n ulp )
+*>             RESULT(1) = | A - U S U**H | / ( |A| n ulp )   and
+*>             RESULT(2) = | I - U U**H | / ( n ulp )
 *>
 *>          2: U expressed as a product V of Housholder transformations:
-*>             RESULT(1) = | A - V S V* | / ( |A| n ulp )
+*>             RESULT(1) = | A - V S V**H | / ( |A| n ulp )
 *>
 *>          3: U expressed both as a dense unitary matrix and
 *>             as a product of Housholder transformations:
-*>             RESULT(1) = | I - UV* | / ( n ulp )
+*>             RESULT(1) = | I - U V**H | / ( n ulp )
 *> \endverbatim
 *>
 *> \param[in] UPLO
@@ -181,7 +186,7 @@
 *> \verbatim
 *>          TAU is COMPLEX*16 array, dimension (N)
 *>          If ITYPE >= 2, then TAU(j) is the scalar factor of
-*>          v(j) v(j)* in the Householder transformation H(j) of
+*>          v(j) v(j)**H in the Householder transformation H(j) of
 *>          the product  U = H(1)...H(n-2)
 *>          If ITYPE < 2, then TAU is not referenced.
 *> \endverbatim
@@ -313,7 +318,7 @@
 *
       IF( ITYPE.EQ.1 ) THEN
 *
-*        ITYPE=1: error = A - U S U*
+*        ITYPE=1: error = A - U S U**H
 *
          CALL ZLASET( 'Full', N, N, CZERO, CZERO, WORK, N )
          CALL ZCOPY( LAP, AP, 1, WORK, 1 )
@@ -323,7 +328,6 @@
    10    CONTINUE
 *
          IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN
-CMK            DO 20 J = 1, N - 1
             DO 20 J = 2, N - 1
                CALL ZHPR2( CUPLO, N, -DCMPLX( E( J ) ), U( 1, J ), 1,
      $                     U( 1, J-1 ), 1, WORK )
@@ -333,7 +337,7 @@ CMK            DO 20 J = 1, N - 1
 *
       ELSE IF( ITYPE.EQ.2 ) THEN
 *
-*        ITYPE=2: error = V S V* - A
+*        ITYPE=2: error = V S V**H - A
 *
          CALL ZLASET( 'Full', N, N, CZERO, CZERO, WORK, N )
 *
@@ -401,7 +405,7 @@ CMK            DO 20 J = 1, N - 1
 *
       ELSE IF( ITYPE.EQ.3 ) THEN
 *
-*        ITYPE=3: error = U V* - I
+*        ITYPE=3: error = U V**H - I
 *
          IF( N.LT.2 )
      $      RETURN
@@ -432,7 +436,7 @@ CMK            DO 20 J = 1, N - 1
 *
 *     Do Test 2
 *
-*     Compute  UU* - I
+*     Compute  U U**H - I
 *
       IF( ITYPE.EQ.1 ) THEN
          CALL ZGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO,
diff --git a/lapack-netlib/TESTING/EIG/zstt21.f b/lapack-netlib/TESTING/EIG/zstt21.f
index ad1fe5529..f2e32a12e 100644
--- a/lapack-netlib/TESTING/EIG/zstt21.f
+++ b/lapack-netlib/TESTING/EIG/zstt21.f
@@ -28,14 +28,15 @@
 *>
 *> ZSTT21  checks a decomposition of the form
 *>
-*>    A = U S UC>
-*> where * means conjugate transpose, A is real symmetric tridiagonal,
+*>    A = U S U**H
+*>
+*> where **H means conjugate transpose, A is real symmetric tridiagonal,
 *> U is unitary, and S is real and diagonal (if KBAND=0) or symmetric
 *> tridiagonal (if KBAND=1).  Two tests are performed:
 *>
-*>    RESULT(1) = | A - U S U* | / ( |A| n ulp )
+*>    RESULT(1) = | A - U S U**H | / ( |A| n ulp )
 *>
-*>    RESULT(2) = | I - UU* | / ( n ulp )
+*>    RESULT(2) = | I - U U**H | / ( n ulp )
 *> \endverbatim
 *
 *  Arguments:
@@ -228,7 +229,7 @@
 *
 *     Do Test 2
 *
-*     Compute  UU* - I
+*     Compute  U U**H - I
 *
       CALL ZGEMM( 'N', 'C', N, N, N, CONE, U, LDU, U, LDU, CZERO, WORK,
      $            N )
diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt
index 50ba8fc28..c941d3577 100644
--- a/lapack-netlib/TESTING/LIN/CMakeLists.txt
+++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt
@@ -39,7 +39,8 @@ set(SLINTST schkaa.f
    strt02.f strt03.f strt05.f strt06.f
    sgennd.f sqrt04.f sqrt05.f schkqrt.f serrqrt.f schkqrtp.f serrqrtp.f
    schklqt.f schklqtp.f schktsqr.f
-   serrlqt.f serrlqtp.f serrtsqr.f stsqr01.f slqt04.f slqt05.f)
+   serrlqt.f serrlqtp.f serrtsqr.f stsqr01.f slqt04.f slqt05.f
+   schkorhr_col.f serrorhr_col.f sorhr_col01.f)
 
 if(USE_XBLAS)
   list(APPEND SLINTST sdrvgbx.f sdrvgex.f sdrvsyx.f sdrvpox.f
@@ -94,7 +95,8 @@ set(CLINTST cchkaa.f
    sget06.f cgennd.f
    cqrt04.f cqrt05.f cchkqrt.f cerrqrt.f cchkqrtp.f cerrqrtp.f
    cchklqt.f cchklqtp.f cchktsqr.f
-   cerrlqt.f cerrlqtp.f cerrtsqr.f ctsqr01.f clqt04.f clqt05.f)
+   cerrlqt.f cerrlqtp.f cerrtsqr.f ctsqr01.f clqt04.f clqt05.f
+   cchkunhr_col.f cerrunhr_col.f cunhr_col01.f)
 
 if(USE_XBLAS)
   list(APPEND CLINTST cdrvgbx.f cdrvgex.f cdrvhex.f cdrvsyx.f cdrvpox.f
@@ -139,7 +141,8 @@ set(DLINTST dchkaa.f
    dgennd.f
    dqrt04.f dqrt05.f dchkqrt.f derrqrt.f dchkqrtp.f derrqrtp.f
    dchklq.f dchklqt.f dchklqtp.f dchktsqr.f
-   derrlqt.f derrlqtp.f derrtsqr.f dtsqr01.f dlqt04.f dlqt05.f)
+   derrlqt.f derrlqtp.f derrtsqr.f dtsqr01.f dlqt04.f dlqt05.f
+   dchkorhr_col.f derrorhr_col.f dorhr_col01.f)
 
 if(USE_XBLAS)
   list(APPEND DLINTST ddrvgbx.f ddrvgex.f ddrvsyx.f ddrvpox.f
@@ -194,7 +197,8 @@ set(ZLINTST zchkaa.f
    dget06.f zgennd.f
    zqrt04.f zqrt05.f zchkqrt.f zerrqrt.f zchkqrtp.f zerrqrtp.f
    zchklqt.f zchklqtp.f zchktsqr.f
-   zerrlqt.f zerrlqtp.f zerrtsqr.f ztsqr01.f zlqt04.f zlqt05.f)
+   zerrlqt.f zerrlqtp.f zerrtsqr.f ztsqr01.f zlqt04.f zlqt05.f
+   zchkunhr_col.f zerrunhr_col.f zunhr_col01.f)
 
 if(USE_XBLAS)
   list(APPEND ZLINTST zdrvgbx.f zdrvgex.f zdrvhex.f zdrvsyx.f zdrvpox.f
diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile
index 1a332f70b..6e790aa93 100644
--- a/lapack-netlib/TESTING/LIN/Makefile
+++ b/lapack-netlib/TESTING/LIN/Makefile
@@ -1,5 +1,3 @@
-include ../../make.inc
-
 #######################################################################
 #  This makefile creates the test programs for the linear equation
 #  routines in LAPACK.  The test files are grouped as follows:
@@ -33,10 +31,8 @@ include ../../make.inc
 #
 #######################################################################
 
-ifneq ($(strip $(VARLIB)),)
-    LAPACKLIB := $(VARLIB) ../../$(LAPACKLIB)
-endif
-
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc
 
 ALINTST = \
    aladhd.o alaerh.o alaesm.o alahd.o alareq.o \
@@ -77,7 +73,8 @@ SLINTST = schkaa.o \
    strt02.o strt03.o strt05.o strt06.o \
    sgennd.o sqrt04.o sqrt05.o schkqrt.o serrqrt.o schkqrtp.o serrqrtp.o \
    schklqt.o schklqtp.o schktsqr.o \
-   serrlqt.o serrlqtp.o serrtsqr.o stsqr01.o slqt04.o slqt05.o
+   serrlqt.o serrlqtp.o serrtsqr.o stsqr01.o slqt04.o slqt05.o \
+   schkorhr_col.o serrorhr_col.o sorhr_col01.o
 
 ifdef USEXBLAS
 SLINTST += sdrvgbx.o sdrvgex.o sdrvsyx.o sdrvpox.o \
@@ -125,7 +122,8 @@ CLINTST = cchkaa.o \
    sget06.o cgennd.o \
    cqrt04.o cqrt05.o cchkqrt.o cerrqrt.o cchkqrtp.o cerrqrtp.o \
    cchklqt.o cchklqtp.o cchktsqr.o \
-   cerrlqt.o cerrlqtp.o cerrtsqr.o ctsqr01.o clqt04.o clqt05.o
+   cerrlqt.o cerrlqtp.o cerrtsqr.o ctsqr01.o clqt04.o clqt05.o \
+   cchkunhr_col.o cerrunhr_col.o cunhr_col01.o
 
 ifdef USEXBLAS
 CLINTST += cdrvgbx.o cdrvgex.o cdrvhex.o cdrvsyx.o cdrvpox.o \
@@ -168,7 +166,8 @@ DLINTST = dchkaa.o \
    dgennd.o \
    dqrt04.o dqrt05.o dchkqrt.o derrqrt.o dchkqrtp.o derrqrtp.o \
    dchklq.o dchklqt.o dchklqtp.o dchktsqr.o \
-   derrlqt.o derrlqtp.o derrtsqr.o dtsqr01.o dlqt04.o dlqt05.o
+   derrlqt.o derrlqtp.o derrtsqr.o dtsqr01.o dlqt04.o dlqt05.o \
+   dchkorhr_col.o derrorhr_col.o dorhr_col01.o
 
 ifdef USEXBLAS
 DLINTST += ddrvgbx.o ddrvgex.o ddrvsyx.o ddrvpox.o \
@@ -215,7 +214,8 @@ ZLINTST = zchkaa.o \
    dget06.o zgennd.o \
    zqrt04.o zqrt05.o zchkqrt.o zerrqrt.o zchkqrtp.o zerrqrtp.o \
    zchklqt.o zchklqtp.o zchktsqr.o \
-   zerrlqt.o zerrlqtp.o zerrtsqr.o ztsqr01.o zlqt04.o zlqt05.o
+   zerrlqt.o zerrlqtp.o zerrtsqr.o ztsqr01.o zlqt04.o zlqt05.o \
+   zchkunhr_col.o zerrunhr_col.o zunhr_col01.o
 
 ifdef USEXBLAS
 ZLINTST += zdrvgbx.o zdrvgex.o zdrvhex.o zdrvsyx.o zdrvpox.o \
@@ -254,47 +254,50 @@ ZLINTSTRFP = zchkrfp.o zdrvrfp.o zdrvrf1.o zdrvrf2.o zdrvrf3.o zdrvrf4.o zerrrfp
 	zlatb4.o zlaipd.o zlarhs.o zsbmv.o zget04.o zpot01.o zpot03.o zpot02.o \
 	chkxer.o xerbla.o alaerh.o aladhd.o alahd.o alasvm.o
 
+.PHONY: all
 all: single double complex complex16 proto-single proto-double proto-complex proto-complex16
 
+.PHONY: single double complex complex16
 single: xlintsts
 double: xlintstd
 complex: xlintstc
 complex16: xlintstz
 
+.PHONY: proto-single proto-double proto-complex proto-complex16
 proto-single: xlintstrfs
 proto-double: xlintstds xlintstrfd
 proto-complex: xlintstrfc
 proto-complex16: xlintstzc xlintstrfz
 
-xlintsts: $(ALINTST) $(SLINTST) $(SCLNTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+xlintsts: $(ALINTST) $(SLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstc: $(ALINTST) $(CLINTST) $(SCLNTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+xlintstc: $(ALINTST) $(CLINTST) $(SCLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstd: $(ALINTST) $(DLINTST) $(DZLNTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+xlintstd: $(ALINTST) $(DLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstz: $(ALINTST) $(ZLINTST) $(DZLNTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+xlintstz: $(ALINTST) $(ZLINTST) $(DZLNTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(XBLASLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstds: $(DSLINTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+xlintstds: $(DSLINTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstzc: $(ZCLINTST) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+xlintstzc: $(ZCLINTST) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstrfs: $(SLINTSTRFP) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+xlintstrfs: $(SLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstrfd: $(DLINTSTRFP) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+xlintstrfd: $(DLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstrfc: $(CLINTSTRFP) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+xlintstrfc: $(CLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
-xlintstrfz: $(ZLINTSTRFP) ../../$(TMGLIB) ../../$(LAPACKLIB) $(BLASLIB)
-	$(LOADER) $(LOADOPTS) -o $@ $^
+xlintstrfz: $(ZLINTSTRFP) $(TMGLIB) $(VARLIB) ../$(LAPACKLIB) $(BLASLIB)
+	$(LOADER) $(FFLAGS) $(LDFLAGS) -o $@ $^
 
 $(ALINTST): $(FRC)
 $(SCLNTST): $(FRC)
@@ -307,6 +310,7 @@ $(ZLINTST): $(FRC)
 FRC:
 	@FRC=$(FRC)
 
+.PHONY: clean cleanobj cleanexe
 clean: cleanobj cleanexe
 cleanobj:
 	rm -f *.o
@@ -314,15 +318,12 @@ cleanexe:
 	rm -f xlintst*
 
 schkaa.o: schkaa.f
-	$(FORTRAN) $(DRVOPTS) -c -o $@ $<
+	$(FC) $(FFLAGS_DRV) -c -o $@ $<
 dchkaa.o: dchkaa.f
-	$(FORTRAN) $(DRVOPTS) -c -o $@ $<
+	$(FC) $(FFLAGS_DRV) -c -o $@ $<
 cchkaa.o: cchkaa.f
-	$(FORTRAN) $(DRVOPTS) -c -o $@ $<
+	$(FC) $(FFLAGS_DRV) -c -o $@ $<
 zchkaa.o: zchkaa.f
-	$(FORTRAN) $(DRVOPTS) -c -o $@ $<
-
-.f.o:
-	$(FORTRAN) $(OPTS) -c -o $@ $<
+	$(FC) $(FFLAGS_DRV) -c -o $@ $<
 
 .NOTPARALLEL:
diff --git a/lapack-netlib/TESTING/LIN/cchkaa.f b/lapack-netlib/TESTING/LIN/cchkaa.f
index d8d5060c3..d36770be7 100644
--- a/lapack-netlib/TESTING/LIN/cchkaa.f
+++ b/lapack-netlib/TESTING/LIN/cchkaa.f
@@ -74,6 +74,8 @@
 *> CEQ
 *> CQT
 *> CQX
+*> CTS
+*> CHH
 *> \endverbatim
 *
 *  Parameters:
@@ -108,14 +110,14 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date November 2017
+*> \date November 2019
 *
 *> \ingroup complex_lin
 *
 *  =====================================================================
       PROGRAM CCHKAA
 *
-*  -- LAPACK test routine (version 3.8.0) --
+*  -- LAPACK test routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
 *     November 2017
@@ -165,15 +167,16 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALAREQ, CCHKEQ, CCHKGB, CCHKGE, CCHKGT, CCHKHE,
-     $                   CCHKHE_ROOK, CCHKHE_RK, CCHKHE_AA, CCHKLQ,
-     $                   CCHKPB,CCHKPO, CCHKPS, CCHKPP, CCHKPT, CCHKQ3,
-     $                   CCHKQL, CCHKQR, CCHKRQ, CCHKSP, CCHKSY,
-     $                   CCHKSY_ROOK, CCHKSY_RK, CCHKSY_AA, CCHKTB, 
-     $                   CCHKTP, CCHKTR, CCHKTZ, CDRVGB, CDRVGE, CDRVGT,
-     $                   CDRVHE, CDRVHE_ROOK, CDRVHE_RK, CDRVHE_AA, 
-     $                   CDRVHP, CDRVLS, CDRVPB, CDRVPO, CDRVPP, CDRVPT,
-     $                   CDRVSP, CDRVSY, CDRVSY_ROOK, CDRVSY_RK,
-     $                   CDRVSY_AA, ILAVER, CCHKQRT, CCHKQRTP
+     $                   CCHKHE_ROOK, CCHKHE_RK, CCHKHE_AA, CCHKHP,
+     $                   CCHKLQ, CCHKUNHR_COL, CCHKPB, CCHKPO, CCHKPS,
+     $                   CCHKPP, CCHKPT, CCHKQ3, CCHKQL, CCHKQR, CCHKRQ,
+     $                   CCHKSP, CCHKSY, CCHKSY_ROOK, CCHKSY_RK,
+     $                   CCHKSY_AA, CCHKTB,  CCHKTP, CCHKTR, CCHKTZ,
+     $                   CDRVGB, CDRVGE, CDRVGT, CDRVHE, CDRVHE_ROOK,
+     $                   CDRVHE_RK, CDRVHE_AA, CDRVHP, CDRVLS, CDRVPB,
+     $                   CDRVPO, CDRVPP, CDRVPT, CDRVSP, CDRVSY,
+     $                   CDRVSY_ROOK, CDRVSY_RK, CDRVSY_AA, ILAVER,
+     $                   CCHKQRT, CCHKQRTP
 *     ..
 *     .. Scalars in Common ..
       LOGICAL            LERR, OK
@@ -678,7 +681,7 @@
 *
 *        HK:  Hermitian indefinite matrices,
 *             with bounded Bunch-Kaufman (rook) pivoting algorithm,
-*             differnet matrix storage format than HR path version.
+*             different matrix storage format than HR path version.
 *
          NTYPES = 10
          CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
@@ -838,7 +841,7 @@
 *
 *        SK:  symmetric indefinite matrices,
 *             with bounded Bunch-Kaufman (rook) pivoting algorithm,
-*             differnet matrix storage format than SR path version.
+*             different matrix storage format than SR path version.
 *
          NTYPES = 11
          CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
@@ -1165,6 +1168,17 @@
          ELSE
             WRITE( NOUT, FMT = 9989 )PATH
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN
+*
+*        HH:  Householder reconstruction for tall-skinny matrices
+*
+         IF( TSTCHK ) THEN
+            CALL CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+     $                         NBVAL, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 ) PATH
+         END IF
 *
       ELSE
 *
diff --git a/lapack-netlib/TESTING/LIN/cchkunhr_col.f b/lapack-netlib/TESTING/LIN/cchkunhr_col.f
new file mode 100644
index 000000000..00077ddd9
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/cchkunhr_col.f
@@ -0,0 +1,239 @@
+*> \brief \b CCHKUNHR_COL
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+*                                NBVAL, NOUT )
+*
+*       .. Scalar Arguments ..
+*       LOGICAL            TSTERR
+*       INTEGER            NM, NN, NNB, NOUT
+*       REAL               THRESH
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            MVAL( * ), NBVAL( * ), NVAL( * )
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CCHKUNHR_COL tests CUNHR_COL using CLATSQR and CGEMQRT. Therefore, CLATSQR
+*> (used in CGEQR) and CGEMQRT (used in CGEMQR) have to be tested
+*> before this test.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] THRESH
+*> \verbatim
+*>          THRESH is REAL
+*>          The threshold value for the test ratios.  A result is
+*>          included in the output file if RESULT >= THRESH.  To have
+*>          every test ratio printed, use THRESH = 0.
+*> \endverbatim
+*>
+*> \param[in] TSTERR
+*> \verbatim
+*>          TSTERR is LOGICAL
+*>          Flag that indicates whether error exits are to be tested.
+*> \endverbatim
+*>
+*> \param[in] NM
+*> \verbatim
+*>          NM is INTEGER
+*>          The number of values of M contained in the vector MVAL.
+*> \endverbatim
+*>
+*> \param[in] MVAL
+*> \verbatim
+*>          MVAL is INTEGER array, dimension (NM)
+*>          The values of the matrix row dimension M.
+*> \endverbatim
+*>
+*> \param[in] NN
+*> \verbatim
+*>          NN is INTEGER
+*>          The number of values of N contained in the vector NVAL.
+*> \endverbatim
+*>
+*> \param[in] NVAL
+*> \verbatim
+*>          NVAL is INTEGER array, dimension (NN)
+*>          The values of the matrix column dimension N.
+*> \endverbatim
+*>
+*> \param[in] NNB
+*> \verbatim
+*>          NNB is INTEGER
+*>          The number of values of NB contained in the vector NBVAL.
+*> \endverbatim
+*>
+*> \param[in] NBVAL
+*> \verbatim
+*>          NBVAL is INTEGER array, dimension (NBVAL)
+*>          The values of the blocksize NB.
+*> \endverbatim
+*>
+*> \param[in] NOUT
+*> \verbatim
+*>          NOUT is INTEGER
+*>          The unit number for output.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup complex_lin
+*
+*  =====================================================================
+      SUBROUTINE CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+     $                         NBVAL, NOUT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine (version 3.7.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     December 2016
+*
+*     .. Scalar Arguments ..
+      LOGICAL            TSTERR
+      INTEGER            NM, NN, NNB, NOUT
+      REAL               THRESH
+*     ..
+*     .. Array Arguments ..
+      INTEGER            MVAL( * ), NBVAL( * ), NVAL( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NTESTS
+      PARAMETER          ( NTESTS = 6 )
+*     ..
+*     .. Local Scalars ..
+      CHARACTER(LEN=3)   PATH
+      INTEGER            I, IMB1, INB1, INB2, J, T, M, N, MB1, NB1,
+     $                   NB2, NFAIL, NERRS, NRUN
+*
+*     .. Local Arrays ..
+      REAL               RESULT( NTESTS )
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ALAHD, ALASUM, CERRUNHR_COL, CUNHR_COL01
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC  MAX, MIN
+*     ..
+*     .. Scalars in Common ..
+      LOGICAL            LERR, OK
+      CHARACTER(LEN=32)  SRNAMT
+      INTEGER            INFOT, NUNIT
+*     ..
+*     .. Common blocks ..
+      COMMON             / INFOC / INFOT, NUNIT, OK, LERR
+      COMMON             / SRNAMC / SRNAMT
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize constants
+*
+      PATH( 1: 1 ) = 'C'
+      PATH( 2: 3 ) = 'HH'
+      NRUN = 0
+      NFAIL = 0
+      NERRS = 0
+*
+*     Test the error exits
+*
+      IF( TSTERR ) CALL CERRUNHR_COL( PATH, NOUT )
+      INFOT = 0
+*
+*     Do for each value of M in MVAL.
+*
+      DO I = 1, NM
+         M = MVAL( I )
+*
+*        Do for each value of N in NVAL.
+*
+         DO J = 1, NN
+            N = NVAL( J )
+*
+*           Only for M >= N
+*
+            IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN
+*
+*              Do for each possible value of MB1
+*
+               DO IMB1 = 1, NNB
+                  MB1 = NBVAL( IMB1 )
+*
+*                 Only for MB1 > N
+*
+                  IF ( MB1.GT.N ) THEN
+*
+*                    Do for each possible value of NB1
+*
+                     DO INB1 = 1, NNB
+                        NB1 = NBVAL( INB1 )
+*
+*                       Do for each possible value of NB2
+*
+                        DO INB2 = 1, NNB
+                           NB2 = NBVAL( INB2 )
+*
+                           IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN
+*
+*                             Test CUNHR_COL
+*
+                              CALL CUNHR_COL01( M, N, MB1, NB1, NB2,
+     $                                          RESULT )
+*
+*                             Print information about the tests that did
+*                             not pass the threshold.
+*
+                              DO T = 1, NTESTS
+                                 IF( RESULT( T ).GE.THRESH ) THEN
+                                    IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                              CALL ALAHD( NOUT, PATH )
+                                    WRITE( NOUT, FMT = 9999 ) M, N, MB1,
+     $                                     NB1, NB2, T, RESULT( T )
+                                    NFAIL = NFAIL + 1
+                                 END IF
+                              END DO
+                              NRUN = NRUN + NTESTS
+                           END IF
+                        END DO
+                     END DO
+                  END IF
+                END DO
+            END IF
+         END DO
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5,
+     $        ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 )
+      RETURN
+*
+*     End of CCHKUNHR_COL
+*
+      END
\ No newline at end of file
diff --git a/lapack-netlib/TESTING/LIN/cdrvls.f b/lapack-netlib/TESTING/LIN/cdrvls.f
index 2c2d9abb8..d24e3885b 100644
--- a/lapack-netlib/TESTING/LIN/cdrvls.f
+++ b/lapack-netlib/TESTING/LIN/cdrvls.f
@@ -237,13 +237,13 @@
       REAL               EPS, NORMA, NORMB, RCOND
 *     ..
 *     .. Local Arrays ..
-      INTEGER            ISEED( 4 ), ISEEDY( 4 ), IWQ
-      REAL               RESULT( NTESTS ), RWQ
-      COMPLEX            WQ
+      INTEGER            ISEED( 4 ), ISEEDY( 4 ), IWQ( 1 )
+      REAL               RESULT( NTESTS ), RWQ( 1 )
+      COMPLEX            WQ( 1 )
 *     ..
 *     .. Allocatable Arrays ..
       COMPLEX, ALLOCATABLE :: WORK (:)
-      REAL, ALLOCATABLE :: RWORK (:)
+      REAL, ALLOCATABLE :: RWORK (:), WORK2 (:)
       INTEGER, ALLOCATABLE :: IWORK (:)
 *     ..
 *     .. External Functions ..
@@ -363,32 +363,32 @@
 *                             Compute workspace needed for CGELS
                               CALL CGELS( TRANS, M, N, NRHS, A, LDA,
      $                                    B, LDB, WQ, -1, INFO )
-                              LWORK_CGELS = INT( WQ )
+                              LWORK_CGELS = INT( WQ( 1 ) )
 *                             Compute workspace needed for CGETSLS
                               CALL CGETSLS( TRANS, M, N, NRHS, A, LDA,
      $                                      B, LDB, WQ, -1, INFO )
-                              LWORK_CGETSLS = INT( WQ )
+                              LWORK_CGETSLS = INT( WQ( 1 ) )
                            ENDDO
                         END IF
 *                       Compute workspace needed for CGELSY
                         CALL CGELSY( M, N, NRHS, A, LDA, B, LDB,
      $                               IWQ, RCOND, CRANK, WQ, -1, RWORK,
      $                               INFO )
-                        LWORK_CGELSY = INT( WQ )
+                        LWORK_CGELSY = INT( WQ( 1 ) )
                         LRWORK_CGELSY = 2*N
 *                       Compute workspace needed for CGELSS
                         CALL CGELSS( M, N, NRHS, A, LDA, B, LDB, S,
      $                               RCOND, CRANK, WQ, -1, RWORK, INFO )
-                        LWORK_CGELSS = INT( WQ )
+                        LWORK_CGELSS = INT( WQ( 1 ) )
                         LRWORK_CGELSS = 5*MNMIN
 *                       Compute workspace needed for CGELSD
                         CALL CGELSD( M, N, NRHS, A, LDA, B, LDB, S,
      $                               RCOND, CRANK, WQ, -1, RWQ, IWQ,
      $                               INFO )
-                        LWORK_CGELSD = INT( WQ )
-                        LRWORK_CGELSD = INT( RWQ )
+                        LWORK_CGELSD = INT( WQ( 1 ) )
+                        LRWORK_CGELSD = INT( RWQ ( 1 ) )
 *                       Compute LIWORK workspace needed for CGELSY and CGELSD
-                        LIWORK = MAX( LIWORK, N, IWQ )
+                        LIWORK = MAX( LIWORK, N, IWQ ( 1 ) )
 *                       Compute LRWORK workspace needed for CGELSY, CGELSS and CGELSD
                         LRWORK = MAX( LRWORK, LRWORK_CGELSY,
      $                                LRWORK_CGELSS, LRWORK_CGELSD )
@@ -408,6 +408,7 @@
       ALLOCATE( WORK( LWORK ) )
       ALLOCATE( IWORK( LIWORK ) )
       ALLOCATE( RWORK( LRWORK ) )
+      ALLOCATE( WORK2( 2 * LWORK ) )
 *
       DO 140 IM = 1, NM
          M = MVAL( IM )
@@ -596,7 +597,7 @@
      $                           CALL CLACPY( 'Full', NROWS, NRHS,
      $                                        COPYB, LDB, C, LDB )
                               CALL CQRT16( TRANS, M, N, NRHS, COPYA,
-     $                                     LDA, B, LDB, C, LDB, WORK,
+     $                                     LDA, B, LDB, C, LDB, WORK2,
      $                                     RESULT( 15 ) )
 *
                               IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR.
diff --git a/lapack-netlib/TESTING/LIN/cdrvsy_rk.f b/lapack-netlib/TESTING/LIN/cdrvsy_rk.f
index ae313c243..d3ed8c0a9 100644
--- a/lapack-netlib/TESTING/LIN/cdrvsy_rk.f
+++ b/lapack-netlib/TESTING/LIN/cdrvsy_rk.f
@@ -98,8 +98,9 @@
 *> \param[out] E
 *> \verbatim
 *>          E is COMPLEX array, dimension (NMAX)
-*> \param[out] AINV
+*> \endverbatim
 *>
+*> \param[out] AINV
 *> \verbatim
 *>          AINV is COMPLEX array, dimension (NMAX*NMAX)
 *> \endverbatim
diff --git a/lapack-netlib/TESTING/LIN/cerrunhr_col.f b/lapack-netlib/TESTING/LIN/cerrunhr_col.f
new file mode 100644
index 000000000..8fd58a683
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/cerrunhr_col.f
@@ -0,0 +1,164 @@
+*> \brief \b CERRUNHR_COL
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CERRUNHR_COL( PATH, NUNIT )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER*3        PATH
+*       INTEGER            NUNIT
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CERRUNHR_COL tests the error exits for CUNHR_COL that does
+*> Householder reconstruction from the ouput of tall-skinny
+*> factorization CLATSQR.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] PATH
+*> \verbatim
+*>          PATH is CHARACTER*3
+*>          The LAPACK path name for the routines to be tested.
+*> \endverbatim
+*>
+*> \param[in] NUNIT
+*> \verbatim
+*>          NUNIT is INTEGER
+*>          The unit number for output.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup complex_lin
+*
+*  =====================================================================
+      SUBROUTINE CERRUNHR_COL( PATH, NUNIT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      CHARACTER(LEN=3)   PATH
+      INTEGER            NUNIT
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NMAX
+      PARAMETER          ( NMAX = 2 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, INFO, J
+*     ..
+*     .. Local Arrays ..
+      COMPLEX            A( NMAX, NMAX ), T( NMAX, NMAX ), D(NMAX)
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ALAESM, CHKXER, CUNHR_COL
+*     ..
+*     .. Scalars in Common ..
+      LOGICAL            LERR, OK
+      CHARACTER(LEN=32)  SRNAMT
+      INTEGER            INFOT, NOUT
+*     ..
+*     .. Common blocks ..
+      COMMON             / INFOC / INFOT, NOUT, OK, LERR
+      COMMON             / SRNAMC / SRNAMT
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          REAL, CMPLX
+*     ..
+*     .. Executable Statements ..
+*
+      NOUT = NUNIT
+      WRITE( NOUT, FMT = * )
+*
+*     Set the variables to innocuous values.
+*
+      DO J = 1, NMAX
+         DO I = 1, NMAX
+            A( I, J ) = CMPLX( 1.E+0 / REAL( I+J ) )
+            T( I, J ) = CMPLX( 1.E+0 / REAL( I+J ) )
+         END DO
+         D( J ) = ( 0.E+0, 0.E+0 )
+      END DO
+      OK = .TRUE.
+*
+*     Error exits for Householder reconstruction
+*
+*     CUNHR_COL
+*
+      SRNAMT = 'CUNHR_COL'
+*
+      INFOT = 1
+      CALL CUNHR_COL( -1, 0, 1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      INFOT = 2
+      CALL CUNHR_COL( 0, -1, 1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK )
+      CALL CUNHR_COL( 1, 2, 1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      INFOT = 3
+      CALL CUNHR_COL( 0, 0, -1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL CUNHR_COL( 0, 0, 0, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      INFOT = 5
+      CALL CUNHR_COL( 0, 0, 1, A, -1, T, 1, D, INFO )
+      CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL CUNHR_COL( 0, 0, 1, A, 0, T, 1, D, INFO )
+      CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL CUNHR_COL( 2, 0, 1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      INFOT = 7
+      CALL CUNHR_COL( 0, 0, 1, A, 1, T, -1, D, INFO )
+      CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL CUNHR_COL( 0, 0, 1, A, 1, T, 0, D, INFO )
+      CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL CUNHR_COL( 4, 3, 2, A, 4, T, 1, D, INFO )
+      CALL CHKXER( 'CUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+*     Print a summary line.
+*
+      CALL ALAESM( PATH, OK, NOUT )
+*
+      RETURN
+*
+*     End of CERRUNHR_COL
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/cerrvx.f b/lapack-netlib/TESTING/LIN/cerrvx.f
index d2d3d2a85..7f929f07f 100644
--- a/lapack-netlib/TESTING/LIN/cerrvx.f
+++ b/lapack-netlib/TESTING/LIN/cerrvx.f
@@ -739,7 +739,7 @@
      $                         W, 1, INFO )
          CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
-         CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 2, IP, IP, B, 1,
+         CALL CHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
      $                         W, 1, INFO )
          CALL CHKXER( 'CHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 7
@@ -769,7 +769,7 @@
      $                         W, 1, INFO )
          CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
-         CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 2, IP, IP, B, 1,
+         CALL CSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
      $                         W, 1, INFO )
          CALL CHKXER( 'CSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 7
diff --git a/lapack-netlib/TESTING/LIN/clahilb.f b/lapack-netlib/TESTING/LIN/clahilb.f
index f88491a0d..c54884b9f 100644
--- a/lapack-netlib/TESTING/LIN/clahilb.f
+++ b/lapack-netlib/TESTING/LIN/clahilb.f
@@ -164,7 +164,7 @@
       INTEGER NMAX_EXACT, NMAX_APPROX, SIZE_D
       PARAMETER (NMAX_EXACT = 6, NMAX_APPROX = 11, SIZE_D = 8)
 *
-*     d's are generated from random permuation of those eight elements.
+*     d's are generated from random permutation of those eight elements.
       COMPLEX D1(8), D2(8), INVD1(8), INVD2(8)
       DATA D1 /(-1,0),(0,1),(-1,-1),(0,-1),(1,0),(-1,1),(1,1),(1,-1)/
       DATA D2 /(-1,0),(0,-1),(-1,1),(0,1),(1,0),(-1,-1),(1,-1),(1,1)/
diff --git a/lapack-netlib/TESTING/LIN/ctsqr01.f b/lapack-netlib/TESTING/LIN/ctsqr01.f
index a3bd9ebc9..6d788ba41 100644
--- a/lapack-netlib/TESTING/LIN/ctsqr01.f
+++ b/lapack-netlib/TESTING/LIN/ctsqr01.f
@@ -114,7 +114,7 @@
 *     ..
 *     .. Local Arrays ..
       INTEGER            ISEED( 4 )
-      COMPLEX            TQUERY( 5 ), WORKQUERY
+      COMPLEX            TQUERY( 5 ), WORKQUERY( 1 )
 *     ..
 *     .. External Functions ..
       REAL     SLAMCH, CLANGE, CLANSY
@@ -173,22 +173,22 @@
 *
       CALL CGEQR( M, N, AF, M, TQUERY, -1, WORKQUERY, -1, INFO )
       TSIZE = INT( TQUERY( 1 ) )
-      LWORK = INT( WORKQUERY )
+      LWORK = INT( WORKQUERY( 1 ) )
       CALL CGEMQR( 'L', 'N', M, M, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL CGEMQR( 'L', 'N', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL CGEMQR( 'L', 'C', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL CGEMQR( 'R', 'N', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL CGEMQR( 'R', 'C', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       ALLOCATE ( T( TSIZE ) )
       ALLOCATE ( WORK( LWORK ) )
       srnamt = 'CGEQR'
@@ -316,22 +316,22 @@
       ELSE
       CALL CGELQ( M, N, AF, M, TQUERY, -1, WORKQUERY, -1, INFO )
       TSIZE = INT( TQUERY( 1 ) )
-      LWORK = INT( WORKQUERY )
+      LWORK = INT( WORKQUERY( 1 ) )
       CALL CGEMLQ( 'R', 'N', N, N, K, AF, M, TQUERY, TSIZE, Q, N,
      $              WORKQUERY, -1, INFO )
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL CGEMLQ( 'L', 'N', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL CGEMLQ( 'L', 'C', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL CGEMLQ( 'R', 'N', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL CGEMLQ( 'R', 'C', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       ALLOCATE ( T( TSIZE ) )
       ALLOCATE ( WORK( LWORK ) )
       srnamt = 'CGELQ'
diff --git a/lapack-netlib/TESTING/LIN/cunhr_col01.f b/lapack-netlib/TESTING/LIN/cunhr_col01.f
new file mode 100644
index 000000000..d760caba5
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/cunhr_col01.f
@@ -0,0 +1,390 @@
+*> \brief \b CUNHR_COL01
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CUNHR_COL01( M, N, MB1, NB1, NB2, RESULT )
+*
+*       .. Scalar Arguments ..
+*       INTEGER           M, N, MB1, NB1, NB2
+*       .. Return values ..
+*       REAL              RESULT(6)
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CUNHR_COL01 tests CUNHR_COL using CLATSQR, CGEMQRT and CUNGTSQR.
+*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part CGEMQR), CUNGTSQR
+*> have to be tested before this test.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          Number of rows in test matrix.
+*> \endverbatim
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          Number of columns in test matrix.
+*> \endverbatim
+*> \param[in] MB1
+*> \verbatim
+*>          MB1 is INTEGER
+*>          Number of row in row block in an input test matrix.
+*> \endverbatim
+*>
+*> \param[in] NB1
+*> \verbatim
+*>          NB1 is INTEGER
+*>          Number of columns in column block an input test matrix.
+*> \endverbatim
+*>
+*> \param[in] NB2
+*> \verbatim
+*>          NB2 is INTEGER
+*>          Number of columns in column block in an output test matrix.
+*> \endverbatim
+*>
+*> \param[out] RESULT
+*> \verbatim
+*>          RESULT is REAL array, dimension (6)
+*>          Results of each of the six tests below.
+*>          ( C is a M-by-N random matrix, D is a N-by-M random matrix )
+*>
+*>          RESULT(1) = | A - Q * R | / (eps * m * |A|)
+*>          RESULT(2) = | I - (Q**H) * Q | / (eps * m )
+*>          RESULT(3) = | Q * C - Q * C | / (eps * m * |C|)
+*>          RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|)
+*>          RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|)
+*>          RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup complex16_lin
+*
+*  =====================================================================
+      SUBROUTINE CUNHR_COL01( M, N, MB1, NB1, NB2, RESULT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER           M, N, MB1, NB1, NB2
+*     .. Return values ..
+      REAL              RESULT(6)
+*
+*  =====================================================================
+*
+*     ..
+*     .. Local allocatable arrays
+      COMPLEX, ALLOCATABLE ::  A(:,:), AF(:,:), Q(:,:), R(:,:),
+     $                   WORK( : ), T1(:,:), T2(:,:), DIAG(:),
+     $                   C(:,:), CF(:,:), D(:,:), DF(:,:)
+      REAL, ALLOCATABLE :: RWORK(:)
+*
+*     .. Parameters ..
+      REAL               ZERO
+      PARAMETER          ( ZERO = 0.0E+0 )
+      COMPLEX            CONE, CZERO
+      PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ),
+     $                     CZERO = ( 0.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            TESTZEROS
+      INTEGER            INFO, I, J, K, L, LWORK, NB1_UB, NB2_UB, NRB
+      REAL               ANORM, EPS, RESID, CNORM, DNORM
+*     ..
+*     .. Local Arrays ..
+      INTEGER            ISEED( 4 )
+      COMPLEX            WORKQUERY( 1 )
+*     ..
+*     .. External Functions ..
+      REAL               SLAMCH, CLANGE, CLANSY
+      EXTERNAL           SLAMCH, CLANGE, CLANSY
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CLACPY, CLARNV, CLASET, CLATSQR, CUNHR_COL,
+     $                   CUNGTSQR, CSCAL, CGEMM, CGEMQRT, CHERK
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          CEILING, REAL, MAX, MIN
+*     ..
+*     .. Scalars in Common ..
+      CHARACTER(LEN=32)  SRNAMT
+*     ..
+*     .. Common blocks ..
+      COMMON             / SRMNAMC / SRNAMT
+*     ..
+*     .. Data statements ..
+      DATA ISEED / 1988, 1989, 1990, 1991 /
+*
+*     TEST MATRICES WITH HALF OF MATRIX BEING ZEROS
+*
+      TESTZEROS = .FALSE.
+*
+      EPS = SLAMCH( 'Epsilon' )
+      K = MIN( M, N )
+      L = MAX( M, N, 1)
+*
+*     Dynamically allocate local arrays
+*
+      ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L),
+     $           C(M,N), CF(M,N),
+     $           D(N,M), DF(N,M) )
+*
+*     Put random numbers into A and copy to AF
+*
+      DO J = 1, N
+         CALL CLARNV( 2, ISEED, M, A( 1, J ) )
+      END DO
+      IF( TESTZEROS ) THEN
+         IF( M.GE.4 ) THEN
+            DO J = 1, N
+               CALL CLARNV( 2, ISEED, M/2, A( M/4, J ) )
+            END DO
+         END IF
+      END IF
+      CALL CLACPY( 'Full', M, N, A, M, AF, M )
+*
+*     Number of row blocks in CLATSQR
+*
+      NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) )
+*
+      ALLOCATE ( T1( NB1, N * NRB ) )
+      ALLOCATE ( T2( NB2, N ) )
+      ALLOCATE ( DIAG( N ) )
+*
+*     Begin determine LWORK for the array WORK and allocate memory.
+*
+*     CLATSQR requires NB1 to be bounded by N.
+*
+      NB1_UB = MIN( NB1, N)
+*
+*     CGEMQRT requires NB2 to be bounded by N.
+*
+      NB2_UB = MIN( NB2, N)
+*
+      CALL CLATSQR( M, N, MB1, NB1_UB, AF, M, T1, NB1,
+     $              WORKQUERY, -1, INFO )
+      LWORK = INT( WORKQUERY( 1 ) )
+      CALL CUNGTSQR( M, N, MB1, NB1, AF, M, T1, NB1, WORKQUERY, -1,
+     $               INFO )
+
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
+*
+*     In CGEMQRT, WORK is N*NB2_UB if SIDE = 'L',
+*                or  M*NB2_UB if SIDE = 'R'.
+*
+      LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M )
+*
+      ALLOCATE ( WORK( LWORK ) )
+*
+*     End allocate memory for WORK.
+*
+*
+*     Begin Householder reconstruction routines
+*
+*     Factor the matrix A in the array AF.
+*
+      SRNAMT = 'CLATSQR'
+      CALL CLATSQR( M, N, MB1, NB1_UB, AF, M, T1, NB1, WORK, LWORK,
+     $              INFO )
+*
+*     Copy the factor R into the array R.
+*
+      SRNAMT = 'CLACPY'
+      CALL CLACPY( 'U', M, N, AF, M, R, M )
+*
+*     Reconstruct the orthogonal matrix Q.
+*
+      SRNAMT = 'CUNGTSQR'
+      CALL CUNGTSQR( M, N, MB1, NB1, AF, M, T1, NB1, WORK, LWORK,
+     $               INFO )
+*
+*     Perform the Householder reconstruction, the result is stored
+*     the arrays AF and T2.
+*
+      SRNAMT = 'CUNHR_COL'
+      CALL CUNHR_COL( M, N, NB2, AF, M, T2, NB2, DIAG, INFO )
+*
+*     Compute the factor R_hr corresponding to the Householder
+*     reconstructed Q_hr and place it in the upper triangle of AF to
+*     match the Q storage format in CGEQRT. R_hr = R_tsqr * S,
+*     this means changing the sign of I-th row of the matrix R_tsqr
+*     according to sign of of I-th diagonal element DIAG(I) of the
+*     matrix S.
+*
+      SRNAMT = 'CLACPY'
+      CALL CLACPY( 'U', M, N, R, M, AF, M )
+*
+      DO I = 1, N
+         IF( DIAG( I ).EQ.-CONE ) THEN
+            CALL CSCAL( N+1-I, -CONE, AF( I, I ), M )
+         END IF
+      END DO
+*
+*     End Householder reconstruction routines.
+*
+*
+*     Generate the m-by-m matrix Q
+*
+      CALL CLASET( 'Full', M, M, CZERO, CONE, Q, M )
+*
+      SRNAMT = 'CGEMQRT'
+      CALL CGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M,
+     $              WORK, INFO )
+*
+*     Copy R
+*
+      CALL CLASET( 'Full', M, N, CZERO, CZERO, R, M )
+*
+      CALL CLACPY( 'Upper', M, N, AF, M, R, M )
+*
+*     TEST 1
+*     Compute |R - (Q**H)*A| / ( eps * m * |A| ) and store in RESULT(1)
+*
+      CALL CGEMM( 'C', 'N', M, N, M, -CONE, Q, M, A, M, CONE, R, M )
+*
+      ANORM = CLANGE( '1', M, N, A, M, RWORK )
+      RESID = CLANGE( '1', M, N, R, M, RWORK )
+      IF( ANORM.GT.ZERO ) THEN
+         RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM )
+      ELSE
+         RESULT( 1 ) = ZERO
+      END IF
+*
+*     TEST 2
+*     Compute |I - (Q**H)*Q| / ( eps * m ) and store in RESULT(2)
+*
+      CALL CLASET( 'Full', M, M, CZERO, CONE, R, M )
+      CALL CHERK( 'U', 'C', M, M, -CONE, Q, M, CONE, R, M )
+      RESID = CLANSY( '1', 'Upper', M, R, M, RWORK )
+      RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) )
+*
+*     Generate random m-by-n matrix C
+*
+      DO J = 1, N
+         CALL CLARNV( 2, ISEED, M, C( 1, J ) )
+      END DO
+      CNORM = CLANGE( '1', M, N, C, M, RWORK )
+      CALL CLACPY( 'Full', M, N, C, M, CF, M )
+*
+*     Apply Q to C as Q*C = CF
+*
+      SRNAMT = 'CGEMQRT'
+      CALL CGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
+     $               WORK, INFO )
+*
+*     TEST 3
+*     Compute |CF - Q*C| / ( eps *  m * |C| )
+*
+      CALL CGEMM( 'N', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M )
+      RESID = CLANGE( '1', M, N, CF, M, RWORK )
+      IF( CNORM.GT.ZERO ) THEN
+         RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
+      ELSE
+         RESULT( 3 ) = ZERO
+      END IF
+*
+*     Copy C into CF again
+*
+      CALL CLACPY( 'Full', M, N, C, M, CF, M )
+*
+*     Apply Q to C as (Q**H)*C = CF
+*
+      SRNAMT = 'CGEMQRT'
+      CALL CGEMQRT( 'L', 'C', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
+     $               WORK, INFO )
+*
+*     TEST 4
+*     Compute |CF - (Q**H)*C| / ( eps * m * |C|)
+*
+      CALL CGEMM( 'C', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M )
+      RESID = CLANGE( '1', M, N, CF, M, RWORK )
+      IF( CNORM.GT.ZERO ) THEN
+         RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
+      ELSE
+         RESULT( 4 ) = ZERO
+      END IF
+*
+*     Generate random n-by-m matrix D and a copy DF
+*
+      DO J = 1, M
+         CALL CLARNV( 2, ISEED, N, D( 1, J ) )
+      END DO
+      DNORM = CLANGE( '1', N, M, D, N, RWORK )
+      CALL CLACPY( 'Full', N, M, D, N, DF, N )
+*
+*     Apply Q to D as D*Q = DF
+*
+      SRNAMT = 'CGEMQRT'
+      CALL CGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N,
+     $               WORK, INFO )
+*
+*     TEST 5
+*     Compute |DF - D*Q| / ( eps * m * |D| )
+*
+      CALL CGEMM( 'N', 'N', N, M, M, -CONE, D, N, Q, M, CONE, DF, N )
+      RESID = CLANGE( '1', N, M, DF, N, RWORK )
+      IF( DNORM.GT.ZERO ) THEN
+         RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM )
+      ELSE
+         RESULT( 5 ) = ZERO
+      END IF
+*
+*     Copy D into DF again
+*
+      CALL CLACPY( 'Full', N, M, D, N, DF, N )
+*
+*     Apply Q to D as D*QT = DF
+*
+      SRNAMT = 'CGEMQRT'
+      CALL CGEMQRT( 'R', 'C', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N,
+     $               WORK, INFO )
+*
+*     TEST 6
+*     Compute |DF - D*(Q**H)| / ( eps * m * |D| )
+*
+      CALL CGEMM( 'N', 'C', N, M, M, -CONE, D, N, Q, M, CONE, DF, N )
+      RESID = CLANGE( '1', N, M, DF, N, RWORK )
+      IF( DNORM.GT.ZERO ) THEN
+         RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM )
+      ELSE
+         RESULT( 6 ) = ZERO
+      END IF
+*
+*     Deallocate all arrays
+*
+      DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG,
+     $             C, D, CF, DF )
+*
+      RETURN
+*
+*     End of CUNHR_COL01
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/dchkaa.f b/lapack-netlib/TESTING/LIN/dchkaa.f
index c5fd7afda..03575c4d1 100644
--- a/lapack-netlib/TESTING/LIN/dchkaa.f
+++ b/lapack-netlib/TESTING/LIN/dchkaa.f
@@ -68,6 +68,10 @@
 *> DEQ
 *> DQT
 *> DQX
+*> DTQ
+*> DXQ
+*> DTS
+*> DHH
 *> \endverbatim
 *
 *  Parameters:
@@ -102,17 +106,17 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date April 2012
+*> \date November 2019
 *
 *> \ingroup double_lin
 *
 *  =====================================================================
       PROGRAM DCHKAA
 *
-*  -- LAPACK test routine (version 3.8.0) --
+*  -- LAPACK test routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     April 2012
+*     Novemebr 2019
 *
 *  =====================================================================
 *
@@ -159,15 +163,14 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALAREQ, DCHKEQ, DCHKGB, DCHKGE, DCHKGT, DCHKLQ,
-     $                   DCHKPB, DCHKPO, DCHKPS, DCHKPP, DCHKPT, DCHKQ3,
-     $                   DCHKQL, DCHKQR, DCHKRQ, DCHKSP, DCHKSY,
-     $                   DCHKSY_ROOK, DCHKSY_RK, DCHKSY_AA, DCHKTB,
-     $                   DCHKTP, DCHKTR, DCHKTZ, DDRVGB, DDRVGE,
-     $                   DDRVGT, DDRVLS, DDRVPB, DDRVPO, DDRVPP,
-     $                   DDRVPT, DDRVSP, DDRVSY, DDRVSY_ROOK, DDRVSY_RK,
-     $                   DDRVSY_AA, ILAVER, DCHKQRT,
-     $                   DCHKQRTP, DCHKLQTP, DCHKTSQR, DCHKLQT
-
+     $                   DCHKORHR_COL, DCHKPB, DCHKPO, DCHKPS, DCHKPP,
+     $                   DCHKPT, DCHKQ3, DCHKQL, DCHKQR, DCHKRQ, DCHKSP,
+     $                   DCHKSY, DCHKSY_ROOK, DCHKSY_RK, DCHKSY_AA,
+     $                   DCHKTB, DCHKTP, DCHKTR, DCHKTZ, DDRVGB, DDRVGE,
+     $                   DDRVGT, DDRVLS, DDRVPB, DDRVPO, DDRVPP, DDRVPT,
+     $                   DDRVSP, DDRVSY, DDRVSY_ROOK, DDRVSY_RK,
+     $                   DDRVSY_AA, ILAVER, DCHKLQTP, DCHKQRT, DCHKQRTP,
+     $                   DCHKLQT,DCHKTSQR
 *     ..
 *     .. Scalars in Common ..
       LOGICAL            LERR, OK
@@ -1007,8 +1010,20 @@
          ELSE
             WRITE( NOUT, FMT = 9989 )PATH
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN
+*
+*        HH:  Householder reconstruction for tall-skinny matrices
+*
+         IF( TSTCHK ) THEN
+            CALL DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+     $                         NBVAL, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 ) PATH
+         END IF
 *
       ELSE
+
 *
          WRITE( NOUT, FMT = 9990 )PATH
       END IF
diff --git a/lapack-netlib/TESTING/LIN/dchkorhr_col.f b/lapack-netlib/TESTING/LIN/dchkorhr_col.f
new file mode 100644
index 000000000..3b3e421eb
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/dchkorhr_col.f
@@ -0,0 +1,239 @@
+*> \brief \b DCHKORHR_COL
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+*                                NBVAL, NOUT )
+*
+*       .. Scalar Arguments ..
+*       LOGICAL            TSTERR
+*       INTEGER            NM, NN, NNB, NOUT
+*       DOUBLE PRECISION   THRESH
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            MVAL( * ), NBVAL( * ), NVAL( * )
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DCHKORHR_COL tests DORHR_COL using DLATSQR and DGEMQRT. Therefore, DLATSQR
+*> (used in DGEQR) and DGEMQRT (used in DGEMQR) have to be tested
+*> before this test.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] THRESH
+*> \verbatim
+*>          THRESH is DOUBLE PRECISION
+*>          The threshold value for the test ratios.  A result is
+*>          included in the output file if RESULT >= THRESH.  To have
+*>          every test ratio printed, use THRESH = 0.
+*> \endverbatim
+*>
+*> \param[in] TSTERR
+*> \verbatim
+*>          TSTERR is LOGICAL
+*>          Flag that indicates whether error exits are to be tested.
+*> \endverbatim
+*>
+*> \param[in] NM
+*> \verbatim
+*>          NM is INTEGER
+*>          The number of values of M contained in the vector MVAL.
+*> \endverbatim
+*>
+*> \param[in] MVAL
+*> \verbatim
+*>          MVAL is INTEGER array, dimension (NM)
+*>          The values of the matrix row dimension M.
+*> \endverbatim
+*>
+*> \param[in] NN
+*> \verbatim
+*>          NN is INTEGER
+*>          The number of values of N contained in the vector NVAL.
+*> \endverbatim
+*>
+*> \param[in] NVAL
+*> \verbatim
+*>          NVAL is INTEGER array, dimension (NN)
+*>          The values of the matrix column dimension N.
+*> \endverbatim
+*>
+*> \param[in] NNB
+*> \verbatim
+*>          NNB is INTEGER
+*>          The number of values of NB contained in the vector NBVAL.
+*> \endverbatim
+*>
+*> \param[in] NBVAL
+*> \verbatim
+*>          NBVAL is INTEGER array, dimension (NBVAL)
+*>          The values of the blocksize NB.
+*> \endverbatim
+*>
+*> \param[in] NOUT
+*> \verbatim
+*>          NOUT is INTEGER
+*>          The unit number for output.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup double_lin
+*
+*  =====================================================================
+      SUBROUTINE DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+     $                         NBVAL, NOUT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine (version 3.7.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     December 2016
+*
+*     .. Scalar Arguments ..
+      LOGICAL            TSTERR
+      INTEGER            NM, NN, NNB, NOUT
+      DOUBLE PRECISION   THRESH
+*     ..
+*     .. Array Arguments ..
+      INTEGER            MVAL( * ), NBVAL( * ), NVAL( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NTESTS
+      PARAMETER          ( NTESTS = 6 )
+*     ..
+*     .. Local Scalars ..
+      CHARACTER(LEN=3)   PATH
+      INTEGER            I, IMB1, INB1, INB2, J, T, M, N, MB1, NB1,
+     $                   NB2, NFAIL, NERRS, NRUN
+*
+*     .. Local Arrays ..
+      DOUBLE PRECISION   RESULT( NTESTS )
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ALAHD, ALASUM, DERRORHR_COL, DORHR_COL01
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC  MAX, MIN
+*     ..
+*     .. Scalars in Common ..
+      LOGICAL            LERR, OK
+      CHARACTER(LEN=32)  SRNAMT
+      INTEGER            INFOT, NUNIT
+*     ..
+*     .. Common blocks ..
+      COMMON             / INFOC / INFOT, NUNIT, OK, LERR
+      COMMON             / SRNAMC / SRNAMT
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize constants
+*
+      PATH( 1: 1 ) = 'D'
+      PATH( 2: 3 ) = 'HH'
+      NRUN = 0
+      NFAIL = 0
+      NERRS = 0
+*
+*     Test the error exits
+*
+      IF( TSTERR ) CALL DERRORHR_COL( PATH, NOUT )
+      INFOT = 0
+*
+*     Do for each value of M in MVAL.
+*
+      DO I = 1, NM
+         M = MVAL( I )
+*
+*        Do for each value of N in NVAL.
+*
+         DO J = 1, NN
+            N = NVAL( J )
+*
+*           Only for M >= N
+*
+            IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN
+*
+*              Do for each possible value of MB1
+*
+               DO IMB1 = 1, NNB
+                  MB1 = NBVAL( IMB1 )
+*
+*                 Only for MB1 > N
+*
+                  IF ( MB1.GT.N ) THEN
+*
+*                    Do for each possible value of NB1
+*
+                     DO INB1 = 1, NNB
+                        NB1 = NBVAL( INB1 )
+*
+*                       Do for each possible value of NB2
+*
+                        DO INB2 = 1, NNB
+                           NB2 = NBVAL( INB2 )
+*
+                           IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN
+*
+*                             Test DORHR_COL
+*
+                              CALL DORHR_COL01( M, N, MB1, NB1, NB2,
+     $                                          RESULT )
+*
+*                             Print information about the tests that did
+*                             not pass the threshold.
+*
+                              DO T = 1, NTESTS
+                                 IF( RESULT( T ).GE.THRESH ) THEN
+                                    IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                              CALL ALAHD( NOUT, PATH )
+                                    WRITE( NOUT, FMT = 9999 ) M, N, MB1,
+     $                                     NB1, NB2, T, RESULT( T )
+                                    NFAIL = NFAIL + 1
+                                 END IF
+                              END DO
+                              NRUN = NRUN + NTESTS
+                           END IF
+                        END DO
+                     END DO
+                  END IF
+                END DO
+            END IF
+         END DO
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5,
+     $        ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 )
+      RETURN
+*
+*     End of DCHKORHR_COL
+*
+      END
\ No newline at end of file
diff --git a/lapack-netlib/TESTING/LIN/ddrvls.f b/lapack-netlib/TESTING/LIN/ddrvls.f
index 2f4975553..adfd71e09 100644
--- a/lapack-netlib/TESTING/LIN/ddrvls.f
+++ b/lapack-netlib/TESTING/LIN/ddrvls.f
@@ -233,8 +233,8 @@
       DOUBLE PRECISION   EPS, NORMA, NORMB, RCOND
 *     ..
 *     .. Local Arrays ..
-      INTEGER            ISEED( 4 ), ISEEDY( 4 ), IWQ
-      DOUBLE PRECISION   RESULT( NTESTS ), WQ
+      INTEGER            ISEED( 4 ), ISEEDY( 4 ), IWQ( 1 )
+      DOUBLE PRECISION   RESULT( NTESTS ), WQ( 1 )
 *     ..
 *     .. Allocatable Arrays ..
       DOUBLE PRECISION, ALLOCATABLE :: WORK (:)
@@ -359,27 +359,27 @@
 *                             Compute workspace needed for DGELS
                               CALL DGELS( TRANS, M, N, NRHS, A, LDA,
      $                                    B, LDB, WQ, -1, INFO )
-                              LWORK_DGELS = INT ( WQ )
+                              LWORK_DGELS = INT ( WQ ( 1 ) )
 *                             Compute workspace needed for DGETSLS
                               CALL DGETSLS( TRANS, M, N, NRHS, A, LDA,
      $                                      B, LDB, WQ, -1, INFO )
-                              LWORK_DGETSLS = INT( WQ )
+                              LWORK_DGETSLS = INT( WQ ( 1 ) )
                            ENDDO
                         END IF
 *                       Compute workspace needed for DGELSY
                         CALL DGELSY( M, N, NRHS, A, LDA, B, LDB, IWQ,
      $                               RCOND, CRANK, WQ, -1, INFO )
-                        LWORK_DGELSY = INT( WQ )
+                        LWORK_DGELSY = INT( WQ ( 1 ) )
 *                       Compute workspace needed for DGELSS
                         CALL DGELSS( M, N, NRHS, A, LDA, B, LDB, S,
      $                               RCOND, CRANK, WQ, -1 , INFO )
-                        LWORK_DGELSS = INT( WQ )
+                        LWORK_DGELSS = INT( WQ ( 1 ) )
 *                       Compute workspace needed for DGELSD
                         CALL DGELSD( M, N, NRHS, A, LDA, B, LDB, S,
      $                               RCOND, CRANK, WQ, -1, IWQ, INFO )
-                        LWORK_DGELSD = INT( WQ )
+                        LWORK_DGELSD = INT( WQ ( 1 ) )
 *                       Compute LIWORK workspace needed for DGELSY and DGELSD
-                        LIWORK = MAX( LIWORK, N, IWQ )
+                        LIWORK = MAX( LIWORK, N, IWQ( 1 ) )
 *                       Compute LWORK workspace needed for all functions
                         LWORK = MAX( LWORK, LWORK_DGELS, LWORK_DGETSLS,
      $                               LWORK_DGELSY, LWORK_DGELSS,
diff --git a/lapack-netlib/TESTING/LIN/derrorhr_col.f b/lapack-netlib/TESTING/LIN/derrorhr_col.f
new file mode 100644
index 000000000..6d545bc91
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/derrorhr_col.f
@@ -0,0 +1,164 @@
+*> \brief \b DERRORHR_COL
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DERRORHR_COL( PATH, NUNIT )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER*3        PATH
+*       INTEGER            NUNIT
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DERRORHR_COL tests the error exits for DORHR_COL that does
+*> Householder reconstruction from the ouput of tall-skinny
+*> factorization DLATSQR.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] PATH
+*> \verbatim
+*>          PATH is CHARACTER*3
+*>          The LAPACK path name for the routines to be tested.
+*> \endverbatim
+*>
+*> \param[in] NUNIT
+*> \verbatim
+*>          NUNIT is INTEGER
+*>          The unit number for output.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup double_lin
+*
+*  =====================================================================
+      SUBROUTINE DERRORHR_COL( PATH, NUNIT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      CHARACTER(LEN=3)   PATH
+      INTEGER            NUNIT
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NMAX
+      PARAMETER          ( NMAX = 2 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, INFO, J
+*     ..
+*     .. Local Arrays ..
+      DOUBLE PRECISION   A( NMAX, NMAX ), T( NMAX, NMAX ), D(NMAX)
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ALAESM, CHKXER, DORHR_COL
+*     ..
+*     .. Scalars in Common ..
+      LOGICAL            LERR, OK
+      CHARACTER(LEN=32)  SRNAMT
+      INTEGER            INFOT, NOUT
+*     ..
+*     .. Common blocks ..
+      COMMON             / INFOC / INFOT, NOUT, OK, LERR
+      COMMON             / SRNAMC / SRNAMT
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DBLE
+*     ..
+*     .. Executable Statements ..
+*
+      NOUT = NUNIT
+      WRITE( NOUT, FMT = * )
+*
+*     Set the variables to innocuous values.
+*
+      DO J = 1, NMAX
+         DO I = 1, NMAX
+            A( I, J ) = 1.D+0 / DBLE( I+J )
+            T( I, J ) = 1.D+0 / DBLE( I+J )
+         END DO
+         D( J ) = 0.D+0
+      END DO
+      OK = .TRUE.
+*
+*     Error exits for Householder reconstruction
+*
+*     DORHR_COL
+*
+      SRNAMT = 'DORHR_COL'
+*
+      INFOT = 1
+      CALL DORHR_COL( -1, 0, 1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      INFOT = 2
+      CALL DORHR_COL( 0, -1, 1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK )
+      CALL DORHR_COL( 1, 2, 1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      INFOT = 3
+      CALL DORHR_COL( 0, 0, -1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL DORHR_COL( 0, 0, 0, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      INFOT = 5
+      CALL DORHR_COL( 0, 0, 1, A, -1, T, 1, D, INFO )
+      CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL DORHR_COL( 0, 0, 1, A, 0, T, 1, D, INFO )
+      CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL DORHR_COL( 2, 0, 1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      INFOT = 7
+      CALL DORHR_COL( 0, 0, 1, A, 1, T, -1, D, INFO )
+      CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL DORHR_COL( 0, 0, 1, A, 1, T, 0, D, INFO )
+      CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL DORHR_COL( 4, 3, 2, A, 4, T, 1, D, INFO )
+      CALL CHKXER( 'DORHR_COL', INFOT, NOUT, LERR, OK )
+*
+*     Print a summary line.
+*
+      CALL ALAESM( PATH, OK, NOUT )
+*
+      RETURN
+*
+*     End of DERRORHR_COL
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/derrvx.f b/lapack-netlib/TESTING/LIN/derrvx.f
index 3a4a6b7fc..fd1d038a6 100644
--- a/lapack-netlib/TESTING/LIN/derrvx.f
+++ b/lapack-netlib/TESTING/LIN/derrvx.f
@@ -740,7 +740,7 @@
      $                         W, 1, INFO )
          CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
-         CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 2, IP, IP, B, 1,
+         CALL DSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
      $                         W, 1, INFO )
          CALL CHKXER( 'DSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 7
diff --git a/lapack-netlib/TESTING/LIN/dorhr_col01.f b/lapack-netlib/TESTING/LIN/dorhr_col01.f
new file mode 100644
index 000000000..3e48de37f
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/dorhr_col01.f
@@ -0,0 +1,386 @@
+*> \brief \b DORHR_COL01
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DORHR_COL01( M, N, MB1, NB1, NB2, RESULT )
+*
+*       .. Scalar Arguments ..
+*       INTEGER           M, N, MB1, NB1, NB2
+*       .. Return values ..
+*       DOUBLE PRECISION  RESULT(6)
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DORHR_COL01 tests DORHR_COL using DLATSQR, DGEMQRT and DORGTSQR.
+*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part DGEMQR), DORGTSQR
+*> have to be tested before this test.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          Number of rows in test matrix.
+*> \endverbatim
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          Number of columns in test matrix.
+*> \endverbatim
+*> \param[in] MB1
+*> \verbatim
+*>          MB1 is INTEGER
+*>          Number of row in row block in an input test matrix.
+*> \endverbatim
+*>
+*> \param[in] NB1
+*> \verbatim
+*>          NB1 is INTEGER
+*>          Number of columns in column block an input test matrix.
+*> \endverbatim
+*>
+*> \param[in] NB2
+*> \verbatim
+*>          NB2 is INTEGER
+*>          Number of columns in column block in an output test matrix.
+*> \endverbatim
+*>
+*> \param[out] RESULT
+*> \verbatim
+*>          RESULT is DOUBLE PRECISION array, dimension (6)
+*>          Results of each of the six tests below.
+*>          ( C is a M-by-N random matrix, D is a N-by-M random matrix )
+*>
+*>          RESULT(1) = | A - Q * R | / (eps * m * |A|)
+*>          RESULT(2) = | I - (Q**H) * Q | / (eps * m )
+*>          RESULT(3) = | Q * C - Q * C | / (eps * m * |C|)
+*>          RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|)
+*>          RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|)
+*>          RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup single_lin
+*
+*  =====================================================================
+      SUBROUTINE DORHR_COL01( M, N, MB1, NB1, NB2, RESULT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER           M, N, MB1, NB1, NB2
+*     .. Return values ..
+      DOUBLE PRECISION  RESULT(6)
+*
+*  =====================================================================
+*
+*     ..
+*     .. Local allocatable arrays
+      DOUBLE PRECISION, ALLOCATABLE ::  A(:,:), AF(:,:), Q(:,:), R(:,:),
+     $                   RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:),
+     $                   C(:,:), CF(:,:), D(:,:), DF(:,:)
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE, ZERO
+      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            TESTZEROS
+      INTEGER            INFO, I, J, K, L, LWORK, NB1_UB, NB2_UB, NRB
+      DOUBLE PRECISION   ANORM, EPS, RESID, CNORM, DNORM
+*     ..
+*     .. Local Arrays ..
+      INTEGER            ISEED( 4 )
+      DOUBLE PRECISION   WORKQUERY( 1 )
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION   DLAMCH, DLANGE, DLANSY
+      EXTERNAL           DLAMCH, DLANGE, DLANSY
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DLACPY, DLARNV, DLASET, DLATSQR, DORHR_COL,
+     $                   DORGTSQR, DSCAL, DGEMM, DGEMQRT, DSYRK
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          CEILING, DBLE, MAX, MIN
+*     ..
+*     .. Scalars in Common ..
+      CHARACTER(LEN=32)  SRNAMT
+*     ..
+*     .. Common blocks ..
+      COMMON             / SRMNAMC / SRNAMT
+*     ..
+*     .. Data statements ..
+      DATA ISEED / 1988, 1989, 1990, 1991 /
+*
+*     TEST MATRICES WITH HALF OF MATRIX BEING ZEROS
+*
+      TESTZEROS = .FALSE.
+*
+      EPS = DLAMCH( 'Epsilon' )
+      K = MIN( M, N )
+      L = MAX( M, N, 1)
+*
+*     Dynamically allocate local arrays
+*
+      ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L),
+     $           C(M,N), CF(M,N),
+     $           D(N,M), DF(N,M) )
+*
+*     Put random numbers into A and copy to AF
+*
+      DO J = 1, N
+         CALL DLARNV( 2, ISEED, M, A( 1, J ) )
+      END DO
+      IF( TESTZEROS ) THEN
+         IF( M.GE.4 ) THEN
+            DO J = 1, N
+               CALL DLARNV( 2, ISEED, M/2, A( M/4, J ) )
+            END DO
+         END IF
+      END IF
+      CALL DLACPY( 'Full', M, N, A, M, AF, M )
+*
+*     Number of row blocks in DLATSQR
+*
+      NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) )
+*
+      ALLOCATE ( T1( NB1, N * NRB ) )
+      ALLOCATE ( T2( NB2, N ) )
+      ALLOCATE ( DIAG( N ) )
+*
+*     Begin determine LWORK for the array WORK and allocate memory.
+*
+*     DLATSQR requires NB1 to be bounded by N.
+*
+      NB1_UB = MIN( NB1, N)
+*
+*     DGEMQRT requires NB2 to be bounded by N.
+*
+      NB2_UB = MIN( NB2, N)
+*
+      CALL DLATSQR( M, N, MB1, NB1_UB, AF, M, T1, NB1,
+     $              WORKQUERY, -1, INFO )
+      LWORK = INT( WORKQUERY( 1 ) )
+      CALL DORGTSQR( M, N, MB1, NB1, AF, M, T1, NB1, WORKQUERY, -1,
+     $               INFO )
+
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
+*
+*     In DGEMQRT, WORK is N*NB2_UB if SIDE = 'L',
+*                or  M*NB2_UB if SIDE = 'R'.
+*
+      LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M )
+*
+      ALLOCATE ( WORK( LWORK ) )
+*
+*     End allocate memory for WORK.
+*
+*
+*     Begin Householder reconstruction routines
+*
+*     Factor the matrix A in the array AF.
+*
+      SRNAMT = 'DLATSQR'
+      CALL DLATSQR( M, N, MB1, NB1_UB, AF, M, T1, NB1, WORK, LWORK,
+     $              INFO )
+*
+*     Copy the factor R into the array R.
+*
+      SRNAMT = 'DLACPY'
+      CALL DLACPY( 'U', N, N, AF, M, R, M )
+*
+*     Reconstruct the orthogonal matrix Q.
+*
+      SRNAMT = 'DORGTSQR'
+      CALL DORGTSQR( M, N, MB1, NB1, AF, M, T1, NB1, WORK, LWORK,
+     $               INFO )
+*
+*     Perform the Householder reconstruction, the result is stored
+*     the arrays AF and T2.
+*
+      SRNAMT = 'DORHR_COL'
+      CALL DORHR_COL( M, N, NB2, AF, M, T2, NB2, DIAG, INFO )
+*
+*     Compute the factor R_hr corresponding to the Householder
+*     reconstructed Q_hr and place it in the upper triangle of AF to
+*     match the Q storage format in DGEQRT. R_hr = R_tsqr * S,
+*     this means changing the sign of I-th row of the matrix R_tsqr
+*     according to sign of of I-th diagonal element DIAG(I) of the
+*     matrix S.
+*
+      SRNAMT = 'DLACPY'
+      CALL DLACPY( 'U', N, N, R, M, AF, M )
+*
+      DO I = 1, N
+         IF( DIAG( I ).EQ.-ONE ) THEN
+            CALL DSCAL( N+1-I, -ONE, AF( I, I ), M )
+         END IF
+      END DO
+*
+*     End Householder reconstruction routines.
+*
+*
+*     Generate the m-by-m matrix Q
+*
+      CALL DLASET( 'Full', M, M, ZERO, ONE, Q, M )
+*
+      SRNAMT = 'DGEMQRT'
+      CALL DGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M,
+     $              WORK, INFO )
+*
+*     Copy R
+*
+      CALL DLASET( 'Full', M, N, ZERO, ZERO, R, M )
+*
+      CALL DLACPY( 'Upper', M, N, AF, M, R, M )
+*
+*     TEST 1
+*     Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1)
+*
+      CALL DGEMM( 'T', 'N', M, N, M, -ONE, Q, M, A, M, ONE, R, M )
+*
+      ANORM = DLANGE( '1', M, N, A, M, RWORK )
+      RESID = DLANGE( '1', M, N, R, M, RWORK )
+      IF( ANORM.GT.ZERO ) THEN
+         RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM )
+      ELSE
+         RESULT( 1 ) = ZERO
+      END IF
+*
+*     TEST 2
+*     Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2)
+*
+      CALL DLASET( 'Full', M, M, ZERO, ONE, R, M )
+      CALL DSYRK( 'U', 'T', M, M, -ONE, Q, M, ONE, R, M )
+      RESID = DLANSY( '1', 'Upper', M, R, M, RWORK )
+      RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) )
+*
+*     Generate random m-by-n matrix C
+*
+      DO J = 1, N
+         CALL DLARNV( 2, ISEED, M, C( 1, J ) )
+      END DO
+      CNORM = DLANGE( '1', M, N, C, M, RWORK )
+      CALL DLACPY( 'Full', M, N, C, M, CF, M )
+*
+*     Apply Q to C as Q*C = CF
+*
+      SRNAMT = 'DGEMQRT'
+      CALL DGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
+     $               WORK, INFO )
+*
+*     TEST 3
+*     Compute |CF - Q*C| / ( eps *  m * |C| )
+*
+      CALL DGEMM( 'N', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M )
+      RESID = DLANGE( '1', M, N, CF, M, RWORK )
+      IF( CNORM.GT.ZERO ) THEN
+         RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
+      ELSE
+         RESULT( 3 ) = ZERO
+      END IF
+*
+*     Copy C into CF again
+*
+      CALL DLACPY( 'Full', M, N, C, M, CF, M )
+*
+*     Apply Q to C as (Q**T)*C = CF
+*
+      SRNAMT = 'DGEMQRT'
+      CALL DGEMQRT( 'L', 'T', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
+     $               WORK, INFO )
+*
+*     TEST 4
+*     Compute |CF - (Q**T)*C| / ( eps * m * |C|)
+*
+      CALL DGEMM( 'T', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M )
+      RESID = DLANGE( '1', M, N, CF, M, RWORK )
+      IF( CNORM.GT.ZERO ) THEN
+         RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
+      ELSE
+         RESULT( 4 ) = ZERO
+      END IF
+*
+*     Generate random n-by-m matrix D and a copy DF
+*
+      DO J = 1, M
+         CALL DLARNV( 2, ISEED, N, D( 1, J ) )
+      END DO
+      DNORM = DLANGE( '1', N, M, D, N, RWORK )
+      CALL DLACPY( 'Full', N, M, D, N, DF, N )
+*
+*     Apply Q to D as D*Q = DF
+*
+      SRNAMT = 'DGEMQRT'
+      CALL DGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N,
+     $               WORK, INFO )
+*
+*     TEST 5
+*     Compute |DF - D*Q| / ( eps * m * |D| )
+*
+      CALL DGEMM( 'N', 'N', N, M, M, -ONE, D, N, Q, M, ONE, DF, N )
+      RESID = DLANGE( '1', N, M, DF, N, RWORK )
+      IF( DNORM.GT.ZERO ) THEN
+         RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM )
+      ELSE
+         RESULT( 5 ) = ZERO
+      END IF
+*
+*     Copy D into DF again
+*
+      CALL DLACPY( 'Full', N, M, D, N, DF, N )
+*
+*     Apply Q to D as D*QT = DF
+*
+      SRNAMT = 'DGEMQRT'
+      CALL DGEMQRT( 'R', 'T', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N,
+     $               WORK, INFO )
+*
+*     TEST 6
+*     Compute |DF - D*(Q**T)| / ( eps * m * |D| )
+*
+      CALL DGEMM( 'N', 'T', N, M, M, -ONE, D, N, Q, M, ONE, DF, N )
+      RESID = DLANGE( '1', N, M, DF, N, RWORK )
+      IF( DNORM.GT.ZERO ) THEN
+         RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM )
+      ELSE
+         RESULT( 6 ) = ZERO
+      END IF
+*
+*     Deallocate all arrays
+*
+      DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG,
+     $             C, D, CF, DF )
+*
+      RETURN
+*
+*     End of DORHR_COL01
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/dtsqr01.f b/lapack-netlib/TESTING/LIN/dtsqr01.f
index 7a50009cc..25bf58a81 100644
--- a/lapack-netlib/TESTING/LIN/dtsqr01.f
+++ b/lapack-netlib/TESTING/LIN/dtsqr01.f
@@ -115,7 +115,7 @@
 *     ..
 *     .. Local Arrays ..
       INTEGER            ISEED( 4 )
-      DOUBLE PRECISION   TQUERY( 5 ), WORKQUERY
+      DOUBLE PRECISION   TQUERY( 5 ), WORKQUERY( 1 )
 *     ..
 *     .. External Functions ..
       DOUBLE PRECISION DLAMCH, DLANGE, DLANSY
@@ -174,22 +174,22 @@
 *
       CALL DGEQR( M, N, AF, M, TQUERY, -1, WORKQUERY, -1, INFO )
       TSIZE = INT( TQUERY( 1 ) )
-      LWORK = INT( WORKQUERY )
+      LWORK = INT( WORKQUERY( 1 ) )
       CALL DGEMQR( 'L', 'N', M, M, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL DGEMQR( 'L', 'N', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL DGEMQR( 'L', 'T', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL DGEMQR( 'R', 'N', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL DGEMQR( 'R', 'T', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       ALLOCATE ( T( TSIZE ) )
       ALLOCATE ( WORK( LWORK ) )
       srnamt = 'DGEQR'
@@ -317,22 +317,22 @@
       ELSE
       CALL DGELQ( M, N, AF, M, TQUERY, -1, WORKQUERY, -1, INFO )
       TSIZE = INT( TQUERY( 1 ) )
-      LWORK = INT( WORKQUERY )
+      LWORK = INT( WORKQUERY( 1 ) )
       CALL DGEMLQ( 'R', 'N', N, N, K, AF, M, TQUERY, TSIZE, Q, N,
      $              WORKQUERY, -1, INFO )
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL DGEMLQ( 'L', 'N', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL DGEMLQ( 'L', 'T', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL DGEMLQ( 'R', 'N', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL DGEMLQ( 'R', 'T', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       ALLOCATE ( T( TSIZE ) )
       ALLOCATE ( WORK( LWORK ) )
       srnamt = 'DGELQ'
diff --git a/lapack-netlib/TESTING/LIN/schkaa.f b/lapack-netlib/TESTING/LIN/schkaa.f
index 33b109aa7..a9c13e442 100644
--- a/lapack-netlib/TESTING/LIN/schkaa.f
+++ b/lapack-netlib/TESTING/LIN/schkaa.f
@@ -68,6 +68,8 @@
 *> SEQ
 *> SQT
 *> SQX
+*> STS
+*> SHH
 *> \endverbatim
 *
 *  Parameters:
@@ -102,17 +104,17 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date April 2012
+*> \date November 2019
 *
 *> \ingroup single_lin
 *
 *  =====================================================================
       PROGRAM SCHKAA
 *
-*  -- LAPACK test routine (version 3.8.0) --
+*  -- LAPACK test routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     April 2012
+*     November 2019
 *
 *  =====================================================================
 *
@@ -159,13 +161,13 @@
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           ALAREQ, SCHKEQ, SCHKGB, SCHKGE, SCHKGT, SCHKLQ,
-     $                   SCHKPB, SCHKPO, SCHKPS, SCHKPP, SCHKPT, SCHKQ3,
-     $                   SCHKQL, SCHKQR, SCHKRQ, SCHKSP, SCHKSY,
-     $                   SCHKSY_ROOK, SCHKSY_RK, SCHKSY_AA, SCHKTB,
-     $                   SCHKTP, SCHKTR, SCHKTZ, SDRVGB, SDRVGE, SDRVGT,
-     $                   SDRVLS, SDRVPB, SDRVPO, SDRVPP, SDRVPT, SDRVSP,
-     $                   SDRVSY, SDRVSY_ROOK, SDRVSY_RK, SDRVSY_AA,
-     $                   ILAVER, SCHKLQTP, SCHKQRT, SCHKQRTP,
+     $                   SCHKORHR_COL, SCHKPB, SCHKPO, SCHKPS, SCHKPP,
+     $                   SCHKPT, SCHKQ3, SCHKQL, SCHKQR, SCHKRQ, SCHKSP,
+     $                   SCHKSY, SCHKSY_ROOK, SCHKSY_RK, SCHKSY_AA,
+     $                   SCHKTB, SCHKTP, SCHKTR, SCHKTZ, SDRVGB, SDRVGE,
+     $                   SDRVGT, SDRVLS, SDRVPB, SDRVPO, SDRVPP, SDRVPT,
+     $                   SDRVSP, SDRVSY, SDRVSY_ROOK, SDRVSY_RK,
+     $                   SDRVSY_AA, ILAVER, SCHKLQTP, SCHKQRT, SCHKQRTP,
      $                   SCHKLQT, SCHKTSQR
 *     ..
 *     .. Scalars in Common ..
@@ -673,7 +675,7 @@
 *
 *        SK:  symmetric indefinite matrices,
 *             with bounded Bunch-Kaufman (rook) pivoting algorithm,
-*             differnet matrix storage format than SR path version.
+*             different matrix storage format than SR path version.
 *
          NTYPES = 10
          CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
@@ -1004,6 +1006,17 @@
          ELSE
             WRITE( NOUT, FMT = 9989 )PATH
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN
+*
+*        HH:  Householder reconstruction for tall-skinny matrices
+*
+         IF( TSTCHK ) THEN
+            CALL SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+     $                         NBVAL, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 ) PATH
+         END IF
 *
       ELSE
 *
diff --git a/lapack-netlib/TESTING/LIN/schkorhr_col.f b/lapack-netlib/TESTING/LIN/schkorhr_col.f
new file mode 100644
index 000000000..cf6d2d323
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/schkorhr_col.f
@@ -0,0 +1,239 @@
+*> \brief \b SCHKORHR_COL
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+*                                NBVAL, NOUT )
+*
+*       .. Scalar Arguments ..
+*       LOGICAL            TSTERR
+*       INTEGER            NM, NN, NNB, NOUT
+*       REAL               THRESH
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            MVAL( * ), NBVAL( * ), NVAL( * )
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SCHKORHR_COL tests SORHR_COL using SLATSQR, SGEMQRT and SORGTSQR.
+*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part SGEMQR), SORGTSQR
+*> have to be tested before this test.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] THRESH
+*> \verbatim
+*>          THRESH is REAL
+*>          The threshold value for the test ratios.  A result is
+*>          included in the output file if RESULT >= THRESH.  To have
+*>          every test ratio printed, use THRESH = 0.
+*> \endverbatim
+*>
+*> \param[in] TSTERR
+*> \verbatim
+*>          TSTERR is LOGICAL
+*>          Flag that indicates whether error exits are to be tested.
+*> \endverbatim
+*>
+*> \param[in] NM
+*> \verbatim
+*>          NM is INTEGER
+*>          The number of values of M contained in the vector MVAL.
+*> \endverbatim
+*>
+*> \param[in] MVAL
+*> \verbatim
+*>          MVAL is INTEGER array, dimension (NM)
+*>          The values of the matrix row dimension M.
+*> \endverbatim
+*>
+*> \param[in] NN
+*> \verbatim
+*>          NN is INTEGER
+*>          The number of values of N contained in the vector NVAL.
+*> \endverbatim
+*>
+*> \param[in] NVAL
+*> \verbatim
+*>          NVAL is INTEGER array, dimension (NN)
+*>          The values of the matrix column dimension N.
+*> \endverbatim
+*>
+*> \param[in] NNB
+*> \verbatim
+*>          NNB is INTEGER
+*>          The number of values of NB contained in the vector NBVAL.
+*> \endverbatim
+*>
+*> \param[in] NBVAL
+*> \verbatim
+*>          NBVAL is INTEGER array, dimension (NBVAL)
+*>          The values of the blocksize NB.
+*> \endverbatim
+*>
+*> \param[in] NOUT
+*> \verbatim
+*>          NOUT is INTEGER
+*>          The unit number for output.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup sigle_lin
+*
+*  =====================================================================
+      SUBROUTINE SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+     $                         NBVAL, NOUT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     June 2019
+*
+*     .. Scalar Arguments ..
+      LOGICAL            TSTERR
+      INTEGER            NM, NN, NNB, NOUT
+      REAL               THRESH
+*     ..
+*     .. Array Arguments ..
+      INTEGER            MVAL( * ), NBVAL( * ), NVAL( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NTESTS
+      PARAMETER          ( NTESTS = 6 )
+*     ..
+*     .. Local Scalars ..
+      CHARACTER(LEN=3)   PATH
+      INTEGER            I, IMB1, INB1, INB2, J, T, M, N, MB1, NB1,
+     $                   NB2, NFAIL, NERRS, NRUN
+*
+*     .. Local Arrays ..
+      REAL               RESULT( NTESTS )
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ALAHD, ALASUM, SERRORHR_COL, SORHR_COL01
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          MAX, MIN
+*     ..
+*     .. Scalars in Common ..
+      LOGICAL            LERR, OK
+      CHARACTER(LEN=32)  SRNAMT
+      INTEGER            INFOT, NUNIT
+*     ..
+*     .. Common blocks ..
+      COMMON             / INFOC / INFOT, NUNIT, OK, LERR
+      COMMON             / SRNAMC / SRNAMT
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize constants
+*
+      PATH( 1: 1 ) = 'S'
+      PATH( 2: 3 ) = 'HH'
+      NRUN = 0
+      NFAIL = 0
+      NERRS = 0
+*
+*     Test the error exits
+*
+      IF( TSTERR ) CALL SERRORHR_COL( PATH, NOUT )
+      INFOT = 0
+*
+*     Do for each value of M in MVAL.
+*
+      DO I = 1, NM
+         M = MVAL( I )
+*
+*        Do for each value of N in NVAL.
+*
+         DO J = 1, NN
+            N = NVAL( J )
+*
+*           Only for M >= N
+*
+            IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN
+*
+*              Do for each possible value of MB1
+*
+               DO IMB1 = 1, NNB
+                  MB1 = NBVAL( IMB1 )
+*
+*                 Only for MB1 > N
+*
+                  IF ( MB1.GT.N ) THEN
+*
+*                    Do for each possible value of NB1
+*
+                     DO INB1 = 1, NNB
+                        NB1 = NBVAL( INB1 )
+*
+*                       Do for each possible value of NB2
+*
+                        DO INB2 = 1, NNB
+                           NB2 = NBVAL( INB2 )
+*
+                           IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN
+*
+*                             Test SORHR_COL
+*
+                              CALL SORHR_COL01( M, N, MB1, NB1, NB2,
+     $                                          RESULT )
+*
+*                             Print information about the tests that did
+*                             not pass the threshold.
+*
+                              DO T = 1, NTESTS
+                                 IF( RESULT( T ).GE.THRESH ) THEN
+                                    IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                              CALL ALAHD( NOUT, PATH )
+                                    WRITE( NOUT, FMT = 9999 ) M, N, MB1,
+     $                                     NB1, NB2, T, RESULT( T )
+                                    NFAIL = NFAIL + 1
+                                 END IF
+                              END DO
+                              NRUN = NRUN + NTESTS
+                           END IF
+                        END DO
+                     END DO
+                  END IF
+                END DO
+            END IF
+         END DO
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5,
+     $        ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 )
+      RETURN
+*
+*     End of SCHKORHR_COL
+*
+      END
\ No newline at end of file
diff --git a/lapack-netlib/TESTING/LIN/sdrvls.f b/lapack-netlib/TESTING/LIN/sdrvls.f
index 2cf3439b5..649ca558c 100644
--- a/lapack-netlib/TESTING/LIN/sdrvls.f
+++ b/lapack-netlib/TESTING/LIN/sdrvls.f
@@ -233,8 +233,8 @@
       REAL               EPS, NORMA, NORMB, RCOND
 *     ..
 *     .. Local Arrays ..
-      INTEGER            ISEED( 4 ), ISEEDY( 4 ), IWQ
-      REAL               RESULT( NTESTS ), WQ
+      INTEGER            ISEED( 4 ), ISEEDY( 4 ), IWQ( 1 )
+      REAL               RESULT( NTESTS ), WQ( 1 )
 *     ..
 *     .. Allocatable Arrays ..
       REAL, ALLOCATABLE :: WORK (:)
@@ -358,28 +358,28 @@
 *
 *                             Compute workspace needed for SGELS
                               CALL SGELS( TRANS, M, N, NRHS, A, LDA,
-     $                                    B, LDB, WQ, -1, INFO )
-                              LWORK_SGELS = INT ( WQ )
+     $                                    B, LDB, WQ( 1 ), -1, INFO )
+                              LWORK_SGELS = INT ( WQ( 1 ) )
 *                             Compute workspace needed for SGETSLS
                               CALL SGETSLS( TRANS, M, N, NRHS, A, LDA,
-     $                                      B, LDB, WQ, -1, INFO )
-                              LWORK_SGETSLS = INT( WQ )
+     $                                      B, LDB, WQ( 1 ), -1, INFO )
+                              LWORK_SGETSLS = INT( WQ( 1 ) )
                            ENDDO
                         END IF
 *                       Compute workspace needed for SGELSY
                         CALL SGELSY( M, N, NRHS, A, LDA, B, LDB, IWQ,
      $                               RCOND, CRANK, WQ, -1, INFO )
-                        LWORK_SGELSY = INT( WQ )
+                        LWORK_SGELSY = INT( WQ( 1 ) )
 *                       Compute workspace needed for SGELSS
                         CALL SGELSS( M, N, NRHS, A, LDA, B, LDB, S,
      $                               RCOND, CRANK, WQ, -1 , INFO )
-                        LWORK_SGELSS = INT( WQ )
+                        LWORK_SGELSS = INT( WQ( 1 ) )
 *                       Compute workspace needed for SGELSD
                         CALL SGELSD( M, N, NRHS, A, LDA, B, LDB, S,
      $                               RCOND, CRANK, WQ, -1, IWQ, INFO )
-                        LWORK_SGELSD = INT( WQ )
+                        LWORK_SGELSD = INT( WQ( 1 ) )
 *                       Compute LIWORK workspace needed for SGELSY and SGELSD
-                        LIWORK = MAX( LIWORK, N, IWQ )
+                        LIWORK = MAX( LIWORK, N, IWQ( 1 ) )
 *                       Compute LWORK workspace needed for all functions
                         LWORK = MAX( LWORK, LWORK_SGELS, LWORK_SGETSLS,
      $                               LWORK_SGELSY, LWORK_SGELSS,
diff --git a/lapack-netlib/TESTING/LIN/serrorhr_col.f b/lapack-netlib/TESTING/LIN/serrorhr_col.f
new file mode 100644
index 000000000..e8d81a99c
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/serrorhr_col.f
@@ -0,0 +1,164 @@
+*> \brief \b SERRORHR_COL
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SERRORHR_COL( PATH, NUNIT )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER*3        PATH
+*       INTEGER            NUNIT
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SERRORHR_COL tests the error exits for SORHR_COL that does
+*> Householder reconstruction from the ouput of tall-skinny
+*> factorization SLATSQR.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] PATH
+*> \verbatim
+*>          PATH is CHARACTER*3
+*>          The LAPACK path name for the routines to be tested.
+*> \endverbatim
+*>
+*> \param[in] NUNIT
+*> \verbatim
+*>          NUNIT is INTEGER
+*>          The unit number for output.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup singlr_lin
+*
+*  =====================================================================
+      SUBROUTINE SERRORHR_COL( PATH, NUNIT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      CHARACTER(LEN=3)   PATH
+      INTEGER            NUNIT
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NMAX
+      PARAMETER          ( NMAX = 2 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, INFO, J
+*     ..
+*     .. Local Arrays ..
+      REAL               A( NMAX, NMAX ), T( NMAX, NMAX ), D(NMAX)
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ALAESM, CHKXER, SORHR_COL
+*     ..
+*     .. Scalars in Common ..
+      LOGICAL            LERR, OK
+      CHARACTER(LEN=32)  SRNAMT
+      INTEGER            INFOT, NOUT
+*     ..
+*     .. Common blocks ..
+      COMMON             / INFOC / INFOT, NOUT, OK, LERR
+      COMMON             / SRNAMC / SRNAMT
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          REAL
+*     ..
+*     .. Executable Statements ..
+*
+      NOUT = NUNIT
+      WRITE( NOUT, FMT = * )
+*
+*     Set the variables to innocuous values.
+*
+      DO J = 1, NMAX
+         DO I = 1, NMAX
+            A( I, J ) = 1.E+0 / REAL( I+J )
+            T( I, J ) = 1.E+0 / REAL( I+J )
+         END DO
+         D( J ) = 0.E+0
+      END DO
+      OK = .TRUE.
+*
+*     Error exits for Householder reconstruction
+*
+*     SORHR_COL
+*
+      SRNAMT = 'SORHR_COL'
+*
+      INFOT = 1
+      CALL SORHR_COL( -1, 0, 1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      INFOT = 2
+      CALL SORHR_COL( 0, -1, 1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK )
+      CALL SORHR_COL( 1, 2, 1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      INFOT = 3
+      CALL SORHR_COL( 0, 0, -1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL SORHR_COL( 0, 0, 0, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      INFOT = 5
+      CALL SORHR_COL( 0, 0, 1, A, -1, T, 1, D, INFO )
+      CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL SORHR_COL( 0, 0, 1, A, 0, T, 1, D, INFO )
+      CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL SORHR_COL( 2, 0, 1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      INFOT = 7
+      CALL SORHR_COL( 0, 0, 1, A, 1, T, -1, D, INFO )
+      CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL SORHR_COL( 0, 0, 1, A, 1, T, 0, D, INFO )
+      CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL SORHR_COL( 4, 3, 2, A, 4, T, 1, D, INFO )
+      CALL CHKXER( 'SORHR_COL', INFOT, NOUT, LERR, OK )
+*
+*     Print a summary line.
+*
+      CALL ALAESM( PATH, OK, NOUT )
+*
+      RETURN
+*
+*     End of SERRORHR_COL
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/serrvx.f b/lapack-netlib/TESTING/LIN/serrvx.f
index a63ed38d7..910bff1e5 100644
--- a/lapack-netlib/TESTING/LIN/serrvx.f
+++ b/lapack-netlib/TESTING/LIN/serrvx.f
@@ -735,7 +735,7 @@
      $                         W, 1, INFO )
          CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
-         CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 2, IP, IP, B, 1,
+         CALL SSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
      $                         W, 1, INFO )
          CALL CHKXER( 'SSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 7
diff --git a/lapack-netlib/TESTING/LIN/sorhr_col01.f b/lapack-netlib/TESTING/LIN/sorhr_col01.f
new file mode 100644
index 000000000..02429041b
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/sorhr_col01.f
@@ -0,0 +1,386 @@
+*> \brief \b SORHR_COL01
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT)
+*
+*       .. Scalar Arguments ..
+*       INTEGER           M, N, MB1, NB1, NB2
+*       .. Return values ..
+*       REAL             RESULT(6)
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SORHR_COL01 tests SORHR_COL using SLATSQR, SGEMQRT and SORGTSQR.
+*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part SGEMQR), SORGTSQR
+*> have to be tested before this test.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          Number of rows in test matrix.
+*> \endverbatim
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          Number of columns in test matrix.
+*> \endverbatim
+*> \param[in] MB1
+*> \verbatim
+*>          MB1 is INTEGER
+*>          Number of row in row block in an input test matrix.
+*> \endverbatim
+*>
+*> \param[in] NB1
+*> \verbatim
+*>          NB1 is INTEGER
+*>          Number of columns in column block an input test matrix.
+*> \endverbatim
+*>
+*> \param[in] NB2
+*> \verbatim
+*>          NB2 is INTEGER
+*>          Number of columns in column block in an output test matrix.
+*> \endverbatim
+*>
+*> \param[out] RESULT
+*> \verbatim
+*>          RESULT is REAL array, dimension (6)
+*>          Results of each of the six tests below.
+*>          ( C is a M-by-N random matrix, D is a N-by-M random matrix )
+*>
+*>          RESULT(1) = | A - Q * R | / (eps * m * |A|)
+*>          RESULT(2) = | I - (Q**H) * Q | / (eps * m )
+*>          RESULT(3) = | Q * C - Q * C | / (eps * m * |C|)
+*>          RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|)
+*>          RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|)
+*>          RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup single_lin
+*
+*  =====================================================================
+      SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER           M, N, MB1, NB1, NB2
+*     .. Return values ..
+      REAL              RESULT(6)
+*
+*  =====================================================================
+*
+*     ..
+*     .. Local allocatable arrays
+      REAL, ALLOCATABLE ::  A(:,:), AF(:,:), Q(:,:), R(:,:),
+     $                   RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:),
+     $                   C(:,:), CF(:,:), D(:,:), DF(:,:)
+*
+*     .. Parameters ..
+      REAL               ONE, ZERO
+      PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            TESTZEROS
+      INTEGER            INFO, I, J, K, L, LWORK, NB1_UB, NB2_UB, NRB
+      REAL               ANORM, EPS, RESID, CNORM, DNORM
+*     ..
+*     .. Local Arrays ..
+      INTEGER            ISEED( 4 )
+      REAL               WORKQUERY( 1 )
+*     ..
+*     .. External Functions ..
+      REAL               SLAMCH, SLANGE, SLANSY
+      EXTERNAL           SLAMCH, SLANGE, SLANSY
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           SLACPY, SLARNV, SLASET, SLATSQR, SORHR_COL,
+     $                   SORGTSQR, SSCAL, SGEMM, SGEMQRT, SSYRK
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          CEILING, MAX, MIN, REAL
+*     ..
+*     .. Scalars in Common ..
+      CHARACTER(LEN=32)  SRNAMT
+*     ..
+*     .. Common blocks ..
+      COMMON             / SRMNAMC / SRNAMT
+*     ..
+*     .. Data statements ..
+      DATA ISEED / 1988, 1989, 1990, 1991 /
+*
+*     TEST MATRICES WITH HALF OF MATRIX BEING ZEROS
+*
+      TESTZEROS = .FALSE.
+*
+      EPS = SLAMCH( 'Epsilon' )
+      K = MIN( M, N )
+      L = MAX( M, N, 1)
+*
+*     Dynamically allocate local arrays
+*
+      ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L),
+     $           C(M,N), CF(M,N),
+     $           D(N,M), DF(N,M) )
+*
+*     Put random numbers into A and copy to AF
+*
+      DO J = 1, N
+         CALL SLARNV( 2, ISEED, M, A( 1, J ) )
+      END DO
+      IF( TESTZEROS ) THEN
+         IF( M.GE.4 ) THEN
+            DO J = 1, N
+               CALL SLARNV( 2, ISEED, M/2, A( M/4, J ) )
+            END DO
+         END IF
+      END IF
+      CALL SLACPY( 'Full', M, N, A, M, AF, M )
+*
+*     Number of row blocks in SLATSQR
+*
+      NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) )
+*
+      ALLOCATE ( T1( NB1, N * NRB ) )
+      ALLOCATE ( T2( NB2, N ) )
+      ALLOCATE ( DIAG( N ) )
+*
+*     Begin determine LWORK for the array WORK and allocate memory.
+*
+*     SLATSQR requires NB1 to be bounded by N.
+*
+      NB1_UB = MIN( NB1, N)
+*
+*     SGEMQRT requires NB2 to be bounded by N.
+*
+      NB2_UB = MIN( NB2, N)
+*
+      CALL SLATSQR( M, N, MB1, NB1_UB, AF, M, T1, NB1,
+     $              WORKQUERY, -1, INFO )
+      LWORK = INT( WORKQUERY( 1 ) )
+      CALL SORGTSQR( M, N, MB1, NB1, AF, M, T1, NB1, WORKQUERY, -1,
+     $               INFO )
+
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
+*
+*     In SGEMQRT, WORK is N*NB2_UB if SIDE = 'L',
+*                or  M*NB2_UB if SIDE = 'R'.
+*
+      LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M )
+*
+      ALLOCATE ( WORK( LWORK ) )
+*
+*     End allocate memory for WORK.
+*
+*
+*     Begin Householder reconstruction routines
+*
+*     Factor the matrix A in the array AF.
+*
+      SRNAMT = 'SLATSQR'
+      CALL SLATSQR( M, N, MB1, NB1_UB, AF, M, T1, NB1, WORK, LWORK,
+     $              INFO )
+*
+*     Copy the factor R into the array R.
+*
+      SRNAMT = 'SLACPY'
+      CALL SLACPY( 'U', N, N, AF, M, R, M )
+*
+*     Reconstruct the orthogonal matrix Q.
+*
+      SRNAMT = 'SORGTSQR'
+      CALL SORGTSQR( M, N, MB1, NB1, AF, M, T1, NB1, WORK, LWORK,
+     $               INFO )
+*
+*     Perform the Householder reconstruction, the result is stored
+*     the arrays AF and T2.
+*
+      SRNAMT = 'SORHR_COL'
+      CALL SORHR_COL( M, N, NB2, AF, M, T2, NB2, DIAG, INFO )
+*
+*     Compute the factor R_hr corresponding to the Householder
+*     reconstructed Q_hr and place it in the upper triangle of AF to
+*     match the Q storage format in DGEQRT. R_hr = R_tsqr * S,
+*     this means changing the sign of I-th row of the matrix R_tsqr
+*     according to sign of of I-th diagonal element DIAG(I) of the
+*     matrix S.
+*
+      SRNAMT = 'SLACPY'
+      CALL SLACPY( 'U', N, N, R, M, AF, M )
+*
+      DO I = 1, N
+         IF( DIAG( I ).EQ.-ONE ) THEN
+            CALL SSCAL( N+1-I, -ONE, AF( I, I ), M )
+         END IF
+      END DO
+*
+*     End Householder reconstruction routines.
+*
+*
+*     Generate the m-by-m matrix Q
+*
+      CALL SLASET( 'Full', M, M, ZERO, ONE, Q, M )
+*
+      SRNAMT = 'SGEMQRT'
+      CALL SGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M,
+     $              WORK, INFO )
+*
+*     Copy R
+*
+      CALL SLASET( 'Full', M, N, ZERO, ZERO, R, M )
+*
+      CALL SLACPY( 'Upper', M, N, AF, M, R, M )
+*
+*     TEST 1
+*     Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1)
+*
+      CALL SGEMM( 'T', 'N', M, N, M, -ONE, Q, M, A, M, ONE, R, M )
+*
+      ANORM = SLANGE( '1', M, N, A, M, RWORK )
+      RESID = SLANGE( '1', M, N, R, M, RWORK )
+      IF( ANORM.GT.ZERO ) THEN
+         RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM )
+      ELSE
+         RESULT( 1 ) = ZERO
+      END IF
+*
+*     TEST 2
+*     Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2)
+*
+      CALL SLASET( 'Full', M, M, ZERO, ONE, R, M )
+      CALL SSYRK( 'U', 'T', M, M, -ONE, Q, M, ONE, R, M )
+      RESID = SLANSY( '1', 'Upper', M, R, M, RWORK )
+      RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) )
+*
+*     Generate random m-by-n matrix C
+*
+      DO J = 1, N
+         CALL SLARNV( 2, ISEED, M, C( 1, J ) )
+      END DO
+      CNORM = SLANGE( '1', M, N, C, M, RWORK )
+      CALL SLACPY( 'Full', M, N, C, M, CF, M )
+*
+*     Apply Q to C as Q*C = CF
+*
+      SRNAMT = 'SGEMQRT'
+      CALL SGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
+     $               WORK, INFO )
+*
+*     TEST 3
+*     Compute |CF - Q*C| / ( eps *  m * |C| )
+*
+      CALL SGEMM( 'N', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M )
+      RESID = SLANGE( '1', M, N, CF, M, RWORK )
+      IF( CNORM.GT.ZERO ) THEN
+         RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
+      ELSE
+         RESULT( 3 ) = ZERO
+      END IF
+*
+*     Copy C into CF again
+*
+      CALL SLACPY( 'Full', M, N, C, M, CF, M )
+*
+*     Apply Q to C as (Q**T)*C = CF
+*
+      SRNAMT = 'SGEMQRT'
+      CALL SGEMQRT( 'L', 'T', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
+     $               WORK, INFO )
+*
+*     TEST 4
+*     Compute |CF - (Q**T)*C| / ( eps * m * |C|)
+*
+      CALL SGEMM( 'T', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M )
+      RESID = SLANGE( '1', M, N, CF, M, RWORK )
+      IF( CNORM.GT.ZERO ) THEN
+         RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
+      ELSE
+         RESULT( 4 ) = ZERO
+      END IF
+*
+*     Generate random n-by-m matrix D and a copy DF
+*
+      DO J = 1, M
+         CALL SLARNV( 2, ISEED, N, D( 1, J ) )
+      END DO
+      DNORM = SLANGE( '1', N, M, D, N, RWORK )
+      CALL SLACPY( 'Full', N, M, D, N, DF, N )
+*
+*     Apply Q to D as D*Q = DF
+*
+      SRNAMT = 'SGEMQRT'
+      CALL SGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N,
+     $               WORK, INFO )
+*
+*     TEST 5
+*     Compute |DF - D*Q| / ( eps * m * |D| )
+*
+      CALL SGEMM( 'N', 'N', N, M, M, -ONE, D, N, Q, M, ONE, DF, N )
+      RESID = SLANGE( '1', N, M, DF, N, RWORK )
+      IF( DNORM.GT.ZERO ) THEN
+         RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM )
+      ELSE
+         RESULT( 5 ) = ZERO
+      END IF
+*
+*     Copy D into DF again
+*
+      CALL SLACPY( 'Full', N, M, D, N, DF, N )
+*
+*     Apply Q to D as D*QT = DF
+*
+      SRNAMT = 'SGEMQRT'
+      CALL SGEMQRT( 'R', 'T', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N,
+     $               WORK, INFO )
+*
+*     TEST 6
+*     Compute |DF - D*(Q**T)| / ( eps * m * |D| )
+*
+      CALL SGEMM( 'N', 'T', N, M, M, -ONE, D, N, Q, M, ONE, DF, N )
+      RESID = SLANGE( '1', N, M, DF, N, RWORK )
+      IF( DNORM.GT.ZERO ) THEN
+         RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM )
+      ELSE
+         RESULT( 6 ) = ZERO
+      END IF
+*
+*     Deallocate all arrays
+*
+      DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG,
+     $             C, D, CF, DF )
+*
+      RETURN
+*
+*     End of SORHR_COL01
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/stsqr01.f b/lapack-netlib/TESTING/LIN/stsqr01.f
index b661d61f4..8eb69eae7 100644
--- a/lapack-netlib/TESTING/LIN/stsqr01.f
+++ b/lapack-netlib/TESTING/LIN/stsqr01.f
@@ -115,7 +115,7 @@
 *     ..
 *     .. Local Arrays ..
       INTEGER            ISEED( 4 )
-      REAL               TQUERY( 5 ), WORKQUERY
+      REAL               TQUERY( 5 ), WORKQUERY( 1 )
 *     ..
 *     .. External Functions ..
       REAL     SLAMCH, SLANGE, SLANSY
@@ -174,22 +174,22 @@
 *
       CALL SGEQR( M, N, AF, M, TQUERY, -1, WORKQUERY, -1, INFO )
       TSIZE = INT( TQUERY( 1 ) )
-      LWORK = INT( WORKQUERY )
+      LWORK = INT( WORKQUERY( 1 ) )
       CALL SGEMQR( 'L', 'N', M, M, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL SGEMQR( 'L', 'N', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL SGEMQR( 'L', 'T', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL SGEMQR( 'R', 'N', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL SGEMQR( 'R', 'T', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       ALLOCATE ( T( TSIZE ) )
       ALLOCATE ( WORK( LWORK ) )
       srnamt = 'SGEQR'
@@ -317,22 +317,22 @@
       ELSE
       CALL SGELQ( M, N, AF, M, TQUERY, -1, WORKQUERY, -1, INFO )
       TSIZE = INT( TQUERY( 1 ) )
-      LWORK = INT( WORKQUERY )
+      LWORK = INT( WORKQUERY( 1 ))
       CALL SGEMLQ( 'R', 'N', N, N, K, AF, M, TQUERY, TSIZE, Q, N,
      $              WORKQUERY, -1, INFO )
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL SGEMLQ( 'L', 'N', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL SGEMLQ( 'L', 'T', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL SGEMLQ( 'R', 'N', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL SGEMLQ( 'R', 'T', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       ALLOCATE ( T( TSIZE ) )
       ALLOCATE ( WORK( LWORK ) )
       srnamt = 'SGELQ'
diff --git a/lapack-netlib/TESTING/LIN/zchkaa.f b/lapack-netlib/TESTING/LIN/zchkaa.f
index d2be2525d..30d2a084a 100644
--- a/lapack-netlib/TESTING/LIN/zchkaa.f
+++ b/lapack-netlib/TESTING/LIN/zchkaa.f
@@ -74,6 +74,8 @@
 *> ZEQ
 *> ZQT
 *> ZQX
+*> ZTS
+*> ZHH
 *> \endverbatim
 *
 *  Parameters:
@@ -108,17 +110,17 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \date November 2017
+*> \date November 2019
 *
 *> \ingroup complex16_lin
 *
 *  =====================================================================
       PROGRAM ZCHKAA
 *
-*  -- LAPACK test routine (version 3.8.0) --
+*  -- LAPACK test routine (version 3.9.0) --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
 *  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
-*     November 2017
+*     November 2019
 *
 *  =====================================================================
 *
@@ -166,16 +168,16 @@
 *     .. External Subroutines ..
       EXTERNAL           ALAREQ, ZCHKEQ, ZCHKGB, ZCHKGE, ZCHKGT, ZCHKHE,
      $                   ZCHKHE_ROOK, ZCHKHE_RK, ZCHKHE_AA, ZCHKHP,
-     $                   ZCHKLQ, ZCHKPB, ZCHKPO, ZCHKPS, ZCHKPP, ZCHKPT,
-     $                   ZCHKQ3, ZCHKQL, ZCHKQR, ZCHKRQ, ZCHKSP, ZCHKSY,
-     $                   ZCHKSY_ROOK, ZCHKSY_RK, ZCHKSY_AA, ZCHKTB,
-     $                   ZCHKTP, ZCHKTR, ZCHKTZ, ZDRVGB, ZDRVGE, ZDRVGT,
-     $                   ZDRVHE, ZDRVHE_ROOK, ZDRVHE_RK, ZDRVHE_AA,
-     $                   ZDRVHE_AA_2STAGE, ZDRVHP, ZDRVLS, ZDRVPB, 
-     $                   ZDRVPO, ZDRVPP, ZDRVPT, ZDRVSP, ZDRVSY,
-     $                   ZDRVSY_ROOK, ZDRVSY_RK, ZDRVSY_AA,
-     $                   ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT, ZCHKQRTP,
-     $                   ZCHKLQT, ZCHKLQTP, ZCHKTSQR
+     $                   ZCHKLQ, ZCHKUNHR_COL, ZCHKPB, ZCHKPO, ZCHKPS,
+     $                   ZCHKPP, ZCHKPT, ZCHKQ3, ZCHKQL, ZCHKQR, ZCHKRQ,
+     $                   ZCHKSP, ZCHKSY, ZCHKSY_ROOK, ZCHKSY_RK,
+     $                   ZCHKSY_AA, ZCHKTB, ZCHKTP, ZCHKTR, ZCHKTZ,
+     $                   ZDRVGB, ZDRVGE, ZDRVGT, ZDRVHE, ZDRVHE_ROOK,
+     $                   ZDRVHE_RK, ZDRVHE_AA, ZDRVHE_AA_2STAGE, ZDRVHP,
+     $                   ZDRVLS, ZDRVPB,  ZDRVPO, ZDRVPP, ZDRVPT,
+     $                   ZDRVSP, ZDRVSY, ZDRVSY_ROOK, ZDRVSY_RK,
+     $                   ZDRVSY_AA, ZDRVSY_AA_2STAGE, ILAVER, ZCHKQRT,
+     $                   ZCHKQRTP, ZCHKLQT, ZCHKLQTP, ZCHKTSQR
 *     ..
 *     .. Scalars in Common ..
       LOGICAL            LERR, OK
@@ -679,7 +681,7 @@
 *
 *        HK:  Hermitian indefinite matrices,
 *             with bounded Bunch-Kaufman (rook) pivoting algorithm,
-*             differnet matrix storage format than HR path version.
+*             different matrix storage format than HR path version.
 *
          NTYPES = 10
          CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
@@ -839,7 +841,7 @@
 *
 *        SK:  symmetric indefinite matrices,
 *             with bounded Bunch-Kaufman (rook) pivoting algorithm,
-*             differnet matrix storage format than SR path version.
+*             different matrix storage format than SR path version.
 *
          NTYPES = 11
          CALL ALAREQ( PATH, NMATS, DOTYPE, NTYPES, NIN, NOUT )
@@ -1201,6 +1203,17 @@
          ELSE
             WRITE( NOUT, FMT = 9989 )PATH
          END IF
+*
+      ELSE IF( LSAMEN( 2, C2, 'HH' ) ) THEN
+*
+*        HH:  Householder reconstruction for tall-skinny matrices
+*
+         IF( TSTCHK ) THEN
+            CALL ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+     $                         NBVAL, NOUT )
+         ELSE
+            WRITE( NOUT, FMT = 9989 ) PATH
+         END IF
 *
       ELSE
 *
diff --git a/lapack-netlib/TESTING/LIN/zchkunhr_col.f b/lapack-netlib/TESTING/LIN/zchkunhr_col.f
new file mode 100644
index 000000000..ef8f8bcc4
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/zchkunhr_col.f
@@ -0,0 +1,239 @@
+*> \brief \b ZCHKUNHR_COL
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+*                                NBVAL, NOUT )
+*
+*       .. Scalar Arguments ..
+*       LOGICAL            TSTERR
+*       INTEGER            NM, NN, NNB, NOUT
+*       DOUBLE PRECISION   THRESH
+*       ..
+*       .. Array Arguments ..
+*       INTEGER            MVAL( * ), NBVAL( * ), NVAL( * )
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZCHKUNHR_COL tests ZUNHR_COL using ZLATSQR and ZGEMQRT. Therefore, ZLATSQR
+*> (used in ZGEQR) and ZGEMQRT (used in ZGEMQR) have to be tested
+*> before this test.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] THRESH
+*> \verbatim
+*>          THRESH is DOUBLE PRECISION
+*>          The threshold value for the test ratios.  A result is
+*>          included in the output file if RESULT >= THRESH.  To have
+*>          every test ratio printed, use THRESH = 0.
+*> \endverbatim
+*>
+*> \param[in] TSTERR
+*> \verbatim
+*>          TSTERR is LOGICAL
+*>          Flag that indicates whether error exits are to be tested.
+*> \endverbatim
+*>
+*> \param[in] NM
+*> \verbatim
+*>          NM is INTEGER
+*>          The number of values of M contained in the vector MVAL.
+*> \endverbatim
+*>
+*> \param[in] MVAL
+*> \verbatim
+*>          MVAL is INTEGER array, dimension (NM)
+*>          The values of the matrix row dimension M.
+*> \endverbatim
+*>
+*> \param[in] NN
+*> \verbatim
+*>          NN is INTEGER
+*>          The number of values of N contained in the vector NVAL.
+*> \endverbatim
+*>
+*> \param[in] NVAL
+*> \verbatim
+*>          NVAL is INTEGER array, dimension (NN)
+*>          The values of the matrix column dimension N.
+*> \endverbatim
+*>
+*> \param[in] NNB
+*> \verbatim
+*>          NNB is INTEGER
+*>          The number of values of NB contained in the vector NBVAL.
+*> \endverbatim
+*>
+*> \param[in] NBVAL
+*> \verbatim
+*>          NBVAL is INTEGER array, dimension (NBVAL)
+*>          The values of the blocksize NB.
+*> \endverbatim
+*>
+*> \param[in] NOUT
+*> \verbatim
+*>          NOUT is INTEGER
+*>          The unit number for output.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup complex16_lin
+*
+*  =====================================================================
+      SUBROUTINE ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB,
+     $                         NBVAL, NOUT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine (version 3.7.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     December 2016
+*
+*     .. Scalar Arguments ..
+      LOGICAL            TSTERR
+      INTEGER            NM, NN, NNB, NOUT
+      DOUBLE PRECISION   THRESH
+*     ..
+*     .. Array Arguments ..
+      INTEGER            MVAL( * ), NBVAL( * ), NVAL( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NTESTS
+      PARAMETER          ( NTESTS = 6 )
+*     ..
+*     .. Local Scalars ..
+      CHARACTER(LEN=3)   PATH
+      INTEGER            I, IMB1, INB1, INB2, J, T, M, N, MB1, NB1,
+     $                   NB2, NFAIL, NERRS, NRUN
+*
+*     .. Local Arrays ..
+      DOUBLE PRECISION   RESULT( NTESTS )
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ALAHD, ALASUM, ZERRUNHR_COL, ZUNHR_COL01
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC  MAX, MIN
+*     ..
+*     .. Scalars in Common ..
+      LOGICAL            LERR, OK
+      CHARACTER(LEN=32)  SRNAMT
+      INTEGER            INFOT, NUNIT
+*     ..
+*     .. Common blocks ..
+      COMMON             / INFOC / INFOT, NUNIT, OK, LERR
+      COMMON             / SRNAMC / SRNAMT
+*     ..
+*     .. Executable Statements ..
+*
+*     Initialize constants
+*
+      PATH( 1: 1 ) = 'Z'
+      PATH( 2: 3 ) = 'HH'
+      NRUN = 0
+      NFAIL = 0
+      NERRS = 0
+*
+*     Test the error exits
+*
+      IF( TSTERR ) CALL ZERRUNHR_COL( PATH, NOUT )
+      INFOT = 0
+*
+*     Do for each value of M in MVAL.
+*
+      DO I = 1, NM
+         M = MVAL( I )
+*
+*        Do for each value of N in NVAL.
+*
+         DO J = 1, NN
+            N = NVAL( J )
+*
+*           Only for M >= N
+*
+            IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN
+*
+*              Do for each possible value of MB1
+*
+               DO IMB1 = 1, NNB
+                  MB1 = NBVAL( IMB1 )
+*
+*                 Only for MB1 > N
+*
+                  IF ( MB1.GT.N ) THEN
+*
+*                    Do for each possible value of NB1
+*
+                     DO INB1 = 1, NNB
+                        NB1 = NBVAL( INB1 )
+*
+*                       Do for each possible value of NB2
+*
+                        DO INB2 = 1, NNB
+                           NB2 = NBVAL( INB2 )
+*
+                           IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN
+*
+*                             Test ZUNHR_COL
+*
+                              CALL ZUNHR_COL01( M, N, MB1, NB1, NB2,
+     $                                          RESULT )
+*
+*                             Print information about the tests that did
+*                             not pass the threshold.
+*
+                              DO T = 1, NTESTS
+                                 IF( RESULT( T ).GE.THRESH ) THEN
+                                    IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 )
+     $                              CALL ALAHD( NOUT, PATH )
+                                    WRITE( NOUT, FMT = 9999 ) M, N, MB1,
+     $                                     NB1, NB2, T, RESULT( T )
+                                    NFAIL = NFAIL + 1
+                                 END IF
+                              END DO
+                              NRUN = NRUN + NTESTS
+                           END IF
+                        END DO
+                     END DO
+                  END IF
+                END DO
+            END IF
+         END DO
+      END DO
+*
+*     Print a summary of the results.
+*
+      CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS )
+*
+ 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5,
+     $        ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 )
+      RETURN
+*
+*     End of ZCHKUNHR_COL
+*
+      END
\ No newline at end of file
diff --git a/lapack-netlib/TESTING/LIN/zdrvhe_rk.f b/lapack-netlib/TESTING/LIN/zdrvhe_rk.f
index 93c3fe61d..355260aad 100644
--- a/lapack-netlib/TESTING/LIN/zdrvhe_rk.f
+++ b/lapack-netlib/TESTING/LIN/zdrvhe_rk.f
@@ -98,6 +98,7 @@
 *> \param[out] E
 *> \verbatim
 *>          E is COMPLEX*16 array, dimension (NMAX)
+*> \endverbatim
 *>
 *> \param[out] AINV
 *> \verbatim
diff --git a/lapack-netlib/TESTING/LIN/zdrvls.f b/lapack-netlib/TESTING/LIN/zdrvls.f
index 681852bc2..4587c5686 100644
--- a/lapack-netlib/TESTING/LIN/zdrvls.f
+++ b/lapack-netlib/TESTING/LIN/zdrvls.f
@@ -237,13 +237,13 @@
       DOUBLE PRECISION   EPS, NORMA, NORMB, RCOND
 *     ..
 *     .. Local Arrays ..
-      INTEGER            ISEED( 4 ), ISEEDY( 4 ), IWQ
-      DOUBLE PRECISION   RESULT( NTESTS ), RWQ
-      COMPLEX*16         WQ
+      INTEGER            ISEED( 4 ), ISEEDY( 4 ), IWQ( 1 )
+      DOUBLE PRECISION   RESULT( NTESTS ), RWQ( 1 )
+      COMPLEX*16         WQ( 1 )
 *     ..
 *     .. Allocatable Arrays ..
       COMPLEX*16, ALLOCATABLE :: WORK (:)
-      DOUBLE PRECISION, ALLOCATABLE :: RWORK (:)
+      DOUBLE PRECISION, ALLOCATABLE :: RWORK (:), WORK2 (:)
       INTEGER, ALLOCATABLE :: IWORK (:)
 *     ..
 *     .. External Functions ..
@@ -363,32 +363,32 @@
 *                             Compute workspace needed for ZGELS
                               CALL ZGELS( TRANS, M, N, NRHS, A, LDA,
      $                                    B, LDB, WQ, -1, INFO )
-                              LWORK_ZGELS = INT ( WQ )
+                              LWORK_ZGELS = INT ( WQ( 1 ) )
 *                             Compute workspace needed for ZGETSLS
                               CALL ZGETSLS( TRANS, M, N, NRHS, A, LDA,
      $                                      B, LDB, WQ, -1, INFO )
-                              LWORK_ZGETSLS = INT( WQ )
+                              LWORK_ZGETSLS = INT( WQ( 1 ) )
                            ENDDO
                         END IF
 *                       Compute workspace needed for ZGELSY
                         CALL ZGELSY( M, N, NRHS, A, LDA, B, LDB, IWQ,
      $                               RCOND, CRANK, WQ, -1, RWORK, INFO )
-                        LWORK_ZGELSY = INT( WQ )
+                        LWORK_ZGELSY = INT( WQ( 1 ) )
                         LRWORK_ZGELSY = 2*N
 *                       Compute workspace needed for ZGELSS
                         CALL ZGELSS( M, N, NRHS, A, LDA, B, LDB, S,
      $                               RCOND, CRANK, WQ, -1 , RWORK,
      $                               INFO )
-                        LWORK_ZGELSS = INT( WQ )
+                        LWORK_ZGELSS = INT( WQ( 1 ) )
                         LRWORK_ZGELSS = 5*MNMIN
 *                       Compute workspace needed for ZGELSD
                         CALL ZGELSD( M, N, NRHS, A, LDA, B, LDB, S,
      $                               RCOND, CRANK, WQ, -1, RWQ, IWQ,
      $                               INFO )
-                        LWORK_ZGELSD = INT( WQ )
-                        LRWORK_ZGELSD = INT( RWQ )
+                        LWORK_ZGELSD = INT( WQ( 1 ) )
+                        LRWORK_ZGELSD = INT( RWQ ( 1 ) )
 *                       Compute LIWORK workspace needed for ZGELSY and ZGELSD
-                        LIWORK = MAX( LIWORK, N, IWQ )
+                        LIWORK = MAX( LIWORK, N, IWQ( 1 ) )
 *                       Compute LRWORK workspace needed for ZGELSY, ZGELSS and ZGELSD
                         LRWORK = MAX( LRWORK, LRWORK_ZGELSY,
      $                                LRWORK_ZGELSS, LRWORK_ZGELSD )
@@ -406,6 +406,7 @@
       LWLSY = LWORK
 *
       ALLOCATE( WORK( LWORK ) )
+      ALLOCATE( WORK2( 2 * LWORK ) )
       ALLOCATE( IWORK( LIWORK ) )
       ALLOCATE( RWORK( LRWORK ) )
 *
@@ -596,7 +597,7 @@
      $                           CALL ZLACPY( 'Full', NROWS, NRHS,
      $                                        COPYB, LDB, C, LDB )
                               CALL ZQRT16( TRANS, M, N, NRHS, COPYA,
-     $                                     LDA, B, LDB, C, LDB, WORK,
+     $                                     LDA, B, LDB, C, LDB, WORK2,
      $                                     RESULT( 15 ) )
 *
                               IF( ( ITRAN.EQ.1 .AND. M.GE.N ) .OR.
diff --git a/lapack-netlib/TESTING/LIN/zerrunhr_col.f b/lapack-netlib/TESTING/LIN/zerrunhr_col.f
new file mode 100644
index 000000000..4fb62734d
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/zerrunhr_col.f
@@ -0,0 +1,164 @@
+*> \brief \b ZERRUNHR_COL
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZERRUNHR_COL( PATH, NUNIT )
+*
+*       .. Scalar Arguments ..
+*       CHARACTER*3        PATH
+*       INTEGER            NUNIT
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZERRUNHR_COL tests the error exits for ZUNHR_COL that does
+*> Householder reconstruction from the ouput of tall-skinny
+*> factorization ZLATSQR.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] PATH
+*> \verbatim
+*>          PATH is CHARACTER*3
+*>          The LAPACK path name for the routines to be tested.
+*> \endverbatim
+*>
+*> \param[in] NUNIT
+*> \verbatim
+*>          NUNIT is INTEGER
+*>          The unit number for output.
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup complex16_lin
+*
+*  =====================================================================
+      SUBROUTINE ZERRUNHR_COL( PATH, NUNIT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      CHARACTER(LEN=3)   PATH
+      INTEGER            NUNIT
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      INTEGER            NMAX
+      PARAMETER          ( NMAX = 2 )
+*     ..
+*     .. Local Scalars ..
+      INTEGER            I, INFO, J
+*     ..
+*     .. Local Arrays ..
+      COMPLEX*16         A( NMAX, NMAX ), T( NMAX, NMAX ), D(NMAX)
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ALAESM, CHKXER, ZUNHR_COL
+*     ..
+*     .. Scalars in Common ..
+      LOGICAL            LERR, OK
+      CHARACTER(LEN=32)  SRNAMT
+      INTEGER            INFOT, NOUT
+*     ..
+*     .. Common blocks ..
+      COMMON             / INFOC / INFOT, NOUT, OK, LERR
+      COMMON             / SRNAMC / SRNAMT
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          DBLE, DCMPLX
+*     ..
+*     .. Executable Statements ..
+*
+      NOUT = NUNIT
+      WRITE( NOUT, FMT = * )
+*
+*     Set the variables to innocuous values.
+*
+      DO J = 1, NMAX
+         DO I = 1, NMAX
+            A( I, J ) = DCMPLX( 1.D+0 / DBLE( I+J ) )
+            T( I, J ) = DCMPLX( 1.D+0 / DBLE( I+J ) )
+         END DO
+         D( J ) = ( 0.D+0, 0.D+0 )
+      END DO
+      OK = .TRUE.
+*
+*     Error exits for Householder reconstruction
+*
+*     ZUNHR_COL
+*
+      SRNAMT = 'ZUNHR_COL'
+*
+      INFOT = 1
+      CALL ZUNHR_COL( -1, 0, 1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      INFOT = 2
+      CALL ZUNHR_COL( 0, -1, 1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK )
+      CALL ZUNHR_COL( 1, 2, 1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      INFOT = 3
+      CALL ZUNHR_COL( 0, 0, -1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL ZUNHR_COL( 0, 0, 0, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      INFOT = 5
+      CALL ZUNHR_COL( 0, 0, 1, A, -1, T, 1, D, INFO )
+      CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL ZUNHR_COL( 0, 0, 1, A, 0, T, 1, D, INFO )
+      CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL ZUNHR_COL( 2, 0, 1, A, 1, T, 1, D, INFO )
+      CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      INFOT = 7
+      CALL ZUNHR_COL( 0, 0, 1, A, 1, T, -1, D, INFO )
+      CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL ZUNHR_COL( 0, 0, 1, A, 1, T, 0, D, INFO )
+      CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+      CALL ZUNHR_COL( 4, 3, 2, A, 4, T, 1, D, INFO )
+      CALL CHKXER( 'ZUNHR_COL', INFOT, NOUT, LERR, OK )
+*
+*     Print a summary line.
+*
+      CALL ALAESM( PATH, OK, NOUT )
+*
+      RETURN
+*
+*     End of ZERRUNHR_COL
+*
+      END
diff --git a/lapack-netlib/TESTING/LIN/zerrvx.f b/lapack-netlib/TESTING/LIN/zerrvx.f
index 29ba744ed..7759384e6 100644
--- a/lapack-netlib/TESTING/LIN/zerrvx.f
+++ b/lapack-netlib/TESTING/LIN/zerrvx.f
@@ -94,7 +94,7 @@
      $                   ZHPSV, ZHPSVX, ZPBSV, ZPBSVX, ZPOSV, ZPOSVX,
      $                   ZPPSV, ZPPSVX, ZPTSV, ZPTSVX, ZSPSV, ZSPSVX,
      $                   ZSYSV, ZSYSV_AA, ZSYSV_RK, ZSYSV_ROOK,
-     $                   ZSYSVX, ZSYSV_AA_2STAGE
+     $                   ZSYSVX, ZHESV_AA_2STAGE
 *     ..
 *     .. Scalars in Common ..
       LOGICAL            LERR, OK
@@ -721,7 +721,7 @@
 *
       ELSE IF( LSAMEN( 2, C2, 'H2' ) ) THEN
 *
-*        CHESV_AASEN_2STAGE
+*        ZHESV_AASEN_2STAGE
 *
          SRNAMT = 'ZHESV_AA_2STAGE'
          INFOT = 1
@@ -741,7 +741,7 @@
      $                         W, 1, INFO )
          CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 11
-         CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 2, IP, IP, B, 1,
+         CALL ZHESV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
      $                         W, 1, INFO )
          CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
          INFOT = 7
@@ -749,6 +749,36 @@
      $                         W, 1, INFO )
          CALL CHKXER( 'ZHESV_AA_2STAGE', INFOT, NOUT, LERR, OK )
 *
+      ELSE IF( LSAMEN( 2, C2, 'S2' ) ) THEN
+*
+*        ZSYSV_AASEN_2STAGE
+*
+         SRNAMT = 'ZSYSV_AA_2STAGE'
+         INFOT = 1
+         CALL ZSYSV_AA_2STAGE( '/', 0, 0, A, 1, A, 1, IP, IP, B, 1,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+         INFOT = 2
+         CALL ZSYSV_AA_2STAGE( 'U', -1, 0, A, 1, A, 1, IP, IP, B, 1,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+         INFOT = 3
+         CALL ZSYSV_AA_2STAGE( 'U', 0, -1, A, 1, A, 1, IP, IP, B, 1,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+         INFOT = 5
+         CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 1, A, 1, IP, IP, B, 1,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+         INFOT = 11
+         CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 8, IP, IP, B, 1,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+         INFOT = 7
+         CALL ZSYSV_AA_2STAGE( 'U', 2, 1, A, 2, A, 1, IP, IP, B, 2,
+     $                         W, 1, INFO )
+         CALL CHKXER( 'ZSYSV_AA_2STAGE', INFOT, NOUT, LERR, OK )
+**
       ELSE IF( LSAMEN( 2, C2, 'HP' ) ) THEN
 *
 *        ZHPSV
diff --git a/lapack-netlib/TESTING/LIN/zlahilb.f b/lapack-netlib/TESTING/LIN/zlahilb.f
index a6dc79b20..ba83af825 100644
--- a/lapack-netlib/TESTING/LIN/zlahilb.f
+++ b/lapack-netlib/TESTING/LIN/zlahilb.f
@@ -164,7 +164,7 @@
       INTEGER NMAX_EXACT, NMAX_APPROX, SIZE_D
       PARAMETER (NMAX_EXACT = 6, NMAX_APPROX = 11, SIZE_D = 8)
 *
-*     d's are generated from random permuation of those eight elements.
+*     d's are generated from random permutation of those eight elements.
       COMPLEX*16 d1(8), d2(8), invd1(8), invd2(8)
       DATA D1 /(-1,0),(0,1),(-1,-1),(0,-1),(1,0),(-1,1),(1,1),(1,-1)/
       DATA D2 /(-1,0),(0,-1),(-1,1),(0,1),(1,0),(-1,-1),(1,-1),(1,1)/
diff --git a/lapack-netlib/TESTING/LIN/ztsqr01.f b/lapack-netlib/TESTING/LIN/ztsqr01.f
index 094473888..81d7fdb44 100644
--- a/lapack-netlib/TESTING/LIN/ztsqr01.f
+++ b/lapack-netlib/TESTING/LIN/ztsqr01.f
@@ -114,7 +114,7 @@
 *     ..
 *     .. Local Arrays ..
       INTEGER            ISEED( 4 )
-      COMPLEX*16         TQUERY( 5 ), WORKQUERY
+      COMPLEX*16         TQUERY( 5 ), WORKQUERY( 1 )
 *     ..
 *     .. External Functions ..
       DOUBLE PRECISION DLAMCH, ZLANGE, ZLANSY
@@ -173,22 +173,22 @@
 *
       CALL ZGEQR( M, N, AF, M, TQUERY, -1, WORKQUERY, -1, INFO )
       TSIZE = INT( TQUERY( 1 ) )
-      LWORK = INT( WORKQUERY )
+      LWORK = INT( WORKQUERY( 1 ) )
       CALL ZGEMQR( 'L', 'N', M, M, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL ZGEMQR( 'L', 'N', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL ZGEMQR( 'L', 'C', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL ZGEMQR( 'R', 'N', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL ZGEMQR( 'R', 'C', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       ALLOCATE ( T( TSIZE ) )
       ALLOCATE ( WORK( LWORK ) )
       srnamt = 'ZGEQR'
@@ -316,22 +316,22 @@
       ELSE
       CALL ZGELQ( M, N, AF, M, TQUERY, -1, WORKQUERY, -1, INFO )
       TSIZE = INT( TQUERY( 1 ) )
-      LWORK = INT( WORKQUERY )
+      LWORK = INT( WORKQUERY( 1 ) )
       CALL ZGEMLQ( 'R', 'N', N, N, K, AF, M, TQUERY, TSIZE, Q, N,
      $              WORKQUERY, -1, INFO )
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL ZGEMLQ( 'L', 'N', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL ZGEMLQ( 'L', 'C', N, M, K, AF, M, TQUERY, TSIZE, DF, N,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL ZGEMLQ( 'R', 'N', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       CALL ZGEMLQ( 'R', 'C', M, N, K, AF, M, TQUERY, TSIZE, CF, M,
      $             WORKQUERY, -1, INFO)
-      LWORK = MAX( LWORK, INT( WORKQUERY ) )
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
       ALLOCATE ( T( TSIZE ) )
       ALLOCATE ( WORK( LWORK ) )
       srnamt = 'ZGELQ'
diff --git a/lapack-netlib/TESTING/LIN/zunhr_col01.f b/lapack-netlib/TESTING/LIN/zunhr_col01.f
new file mode 100644
index 000000000..9fb3bf352
--- /dev/null
+++ b/lapack-netlib/TESTING/LIN/zunhr_col01.f
@@ -0,0 +1,390 @@
+*> \brief \b ZUNHR_COL01
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE ZUNHR_COL01( M, N, MB1, NB1, NB2, RESULT )
+*
+*       .. Scalar Arguments ..
+*       INTEGER           M, N, MB1, NB1, NB2
+*       .. Return values ..
+*       DOUBLE PRECISION  RESULT(6)
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> ZUNHR_COL01 tests ZUNHR_COL using ZLATSQR, ZGEMQRT and ZUNGTSQR.
+*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part ZGEMQR), ZUNGTSQR
+*> have to be tested before this test.
+*>
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          Number of rows in test matrix.
+*> \endverbatim
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          Number of columns in test matrix.
+*> \endverbatim
+*> \param[in] MB1
+*> \verbatim
+*>          MB1 is INTEGER
+*>          Number of row in row block in an input test matrix.
+*> \endverbatim
+*>
+*> \param[in] NB1
+*> \verbatim
+*>          NB1 is INTEGER
+*>          Number of columns in column block an input test matrix.
+*> \endverbatim
+*>
+*> \param[in] NB2
+*> \verbatim
+*>          NB2 is INTEGER
+*>          Number of columns in column block in an output test matrix.
+*> \endverbatim
+*>
+*> \param[out] RESULT
+*> \verbatim
+*>          RESULT is DOUBLE PRECISION array, dimension (6)
+*>          Results of each of the six tests below.
+*>          ( C is a M-by-N random matrix, D is a N-by-M random matrix )
+*>
+*>          RESULT(1) = | A - Q * R | / (eps * m * |A|)
+*>          RESULT(2) = | I - (Q**H) * Q | / (eps * m )
+*>          RESULT(3) = | Q * C - Q * C | / (eps * m * |C|)
+*>          RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|)
+*>          RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|)
+*>          RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|)
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \date November 2019
+*
+*> \ingroup complex16_lin
+*
+*  =====================================================================
+      SUBROUTINE ZUNHR_COL01( M, N, MB1, NB1, NB2, RESULT )
+      IMPLICIT NONE
+*
+*  -- LAPACK test routine (version 3.9.0) --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*     November 2019
+*
+*     .. Scalar Arguments ..
+      INTEGER           M, N, MB1, NB1, NB2
+*     .. Return values ..
+      DOUBLE PRECISION  RESULT(6)
+*
+*  =====================================================================
+*
+*     ..
+*     .. Local allocatable arrays
+      COMPLEX*16, ALLOCATABLE ::  A(:,:), AF(:,:), Q(:,:), R(:,:),
+     $                   WORK( : ), T1(:,:), T2(:,:), DIAG(:),
+     $                   C(:,:), CF(:,:), D(:,:), DF(:,:)
+      DOUBLE PRECISION, ALLOCATABLE :: RWORK(:)
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ZERO
+      PARAMETER          ( ZERO = 0.0D+0 )
+      COMPLEX*16         CONE, CZERO
+      PARAMETER          ( CONE = ( 1.0D+0, 0.0D+0 ),
+     $                     CZERO = ( 0.0D+0, 0.0D+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            TESTZEROS
+      INTEGER            INFO, I, J, K, L, LWORK, NB1_UB, NB2_UB, NRB
+      DOUBLE PRECISION   ANORM, EPS, RESID, CNORM, DNORM
+*     ..
+*     .. Local Arrays ..
+      INTEGER            ISEED( 4 )
+      COMPLEX*16         WORKQUERY( 1 )
+*     ..
+*     .. External Functions ..
+      DOUBLE PRECISION   DLAMCH, ZLANGE, ZLANSY
+      EXTERNAL           DLAMCH, ZLANGE, ZLANSY
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           ZLACPY, ZLARNV, ZLASET, ZLATSQR, ZUNHR_COL,
+     $                   ZUNGTSQR, ZSCAL, ZGEMM, ZGEMQRT, ZHERK
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          CEILING, DBLE, MAX, MIN
+*     ..
+*     .. Scalars in Common ..
+      CHARACTER(LEN=32)  SRNAMT
+*     ..
+*     .. Common blocks ..
+      COMMON             / SRMNAMC / SRNAMT
+*     ..
+*     .. Data statements ..
+      DATA ISEED / 1988, 1989, 1990, 1991 /
+*
+*     TEST MATRICES WITH HALF OF MATRIX BEING ZEROS
+*
+      TESTZEROS = .FALSE.
+*
+      EPS = DLAMCH( 'Epsilon' )
+      K = MIN( M, N )
+      L = MAX( M, N, 1)
+*
+*     Dynamically allocate local arrays
+*
+      ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L),
+     $           C(M,N), CF(M,N),
+     $           D(N,M), DF(N,M) )
+*
+*     Put random numbers into A and copy to AF
+*
+      DO J = 1, N
+         CALL ZLARNV( 2, ISEED, M, A( 1, J ) )
+      END DO
+      IF( TESTZEROS ) THEN
+         IF( M.GE.4 ) THEN
+            DO J = 1, N
+               CALL ZLARNV( 2, ISEED, M/2, A( M/4, J ) )
+            END DO
+         END IF
+      END IF
+      CALL ZLACPY( 'Full', M, N, A, M, AF, M )
+*
+*     Number of row blocks in ZLATSQR
+*
+      NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) )
+*
+      ALLOCATE ( T1( NB1, N * NRB ) )
+      ALLOCATE ( T2( NB2, N ) )
+      ALLOCATE ( DIAG( N ) )
+*
+*     Begin determine LWORK for the array WORK and allocate memory.
+*
+*     ZLATSQR requires NB1 to be bounded by N.
+*
+      NB1_UB = MIN( NB1, N)
+*
+*     ZGEMQRT requires NB2 to be bounded by N.
+*
+      NB2_UB = MIN( NB2, N)
+*
+      CALL ZLATSQR( M, N, MB1, NB1_UB, AF, M, T1, NB1,
+     $              WORKQUERY, -1, INFO )
+      LWORK = INT( WORKQUERY( 1 ) )
+      CALL ZUNGTSQR( M, N, MB1, NB1, AF, M, T1, NB1, WORKQUERY, -1,
+     $               INFO )
+
+      LWORK = MAX( LWORK, INT( WORKQUERY( 1 ) ) )
+*
+*     In ZGEMQRT, WORK is N*NB2_UB if SIDE = 'L',
+*                or  M*NB2_UB if SIDE = 'R'.
+*
+      LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M )
+*
+      ALLOCATE ( WORK( LWORK ) )
+*
+*     End allocate memory for WORK.
+*
+*
+*     Begin Householder reconstruction routines
+*
+*     Factor the matrix A in the array AF.
+*
+      SRNAMT = 'ZLATSQR'
+      CALL ZLATSQR( M, N, MB1, NB1_UB, AF, M, T1, NB1, WORK, LWORK,
+     $              INFO )
+*
+*     Copy the factor R into the array R.
+*
+      SRNAMT = 'ZLACPY'
+      CALL ZLACPY( 'U', M, N, AF, M, R, M )
+*
+*     Reconstruct the orthogonal matrix Q.
+*
+      SRNAMT = 'ZUNGTSQR'
+      CALL ZUNGTSQR( M, N, MB1, NB1, AF, M, T1, NB1, WORK, LWORK,
+     $               INFO )
+*
+*     Perform the Householder reconstruction, the result is stored
+*     the arrays AF and T2.
+*
+      SRNAMT = 'ZUNHR_COL'
+      CALL ZUNHR_COL( M, N, NB2, AF, M, T2, NB2, DIAG, INFO )
+*
+*     Compute the factor R_hr corresponding to the Householder
+*     reconstructed Q_hr and place it in the upper triangle of AF to
+*     match the Q storage format in ZGEQRT. R_hr = R_tsqr * S,
+*     this means changing the sign of I-th row of the matrix R_tsqr
+*     according to sign of of I-th diagonal element DIAG(I) of the
+*     matrix S.
+*
+      SRNAMT = 'ZLACPY'
+      CALL ZLACPY( 'U', M, N, R, M, AF, M )
+*
+      DO I = 1, N
+         IF( DIAG( I ).EQ.-CONE ) THEN
+            CALL ZSCAL( N+1-I, -CONE, AF( I, I ), M )
+         END IF
+      END DO
+*
+*     End Householder reconstruction routines.
+*
+*
+*     Generate the m-by-m matrix Q
+*
+      CALL ZLASET( 'Full', M, M, CZERO, CONE, Q, M )
+*
+      SRNAMT = 'ZGEMQRT'
+      CALL ZGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M,
+     $              WORK, INFO )
+*
+*     Copy R
+*
+      CALL ZLASET( 'Full', M, N, CZERO, CZERO, R, M )
+*
+      CALL ZLACPY( 'Upper', M, N, AF, M, R, M )
+*
+*     TEST 1
+*     Compute |R - (Q**H)*A| / ( eps * m * |A| ) and store in RESULT(1)
+*
+      CALL ZGEMM( 'C', 'N', M, N, M, -CONE, Q, M, A, M, CONE, R, M )
+*
+      ANORM = ZLANGE( '1', M, N, A, M, RWORK )
+      RESID = ZLANGE( '1', M, N, R, M, RWORK )
+      IF( ANORM.GT.ZERO ) THEN
+         RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM )
+      ELSE
+         RESULT( 1 ) = ZERO
+      END IF
+*
+*     TEST 2
+*     Compute |I - (Q**H)*Q| / ( eps * m ) and store in RESULT(2)
+*
+      CALL ZLASET( 'Full', M, M, CZERO, CONE, R, M )
+      CALL ZHERK( 'U', 'C', M, M, -CONE, Q, M, CONE, R, M )
+      RESID = ZLANSY( '1', 'Upper', M, R, M, RWORK )
+      RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) )
+*
+*     Generate random m-by-n matrix C
+*
+      DO J = 1, N
+         CALL ZLARNV( 2, ISEED, M, C( 1, J ) )
+      END DO
+      CNORM = ZLANGE( '1', M, N, C, M, RWORK )
+      CALL ZLACPY( 'Full', M, N, C, M, CF, M )
+*
+*     Apply Q to C as Q*C = CF
+*
+      SRNAMT = 'ZGEMQRT'
+      CALL ZGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
+     $               WORK, INFO )
+*
+*     TEST 3
+*     Compute |CF - Q*C| / ( eps *  m * |C| )
+*
+      CALL ZGEMM( 'N', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M )
+      RESID = ZLANGE( '1', M, N, CF, M, RWORK )
+      IF( CNORM.GT.ZERO ) THEN
+         RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
+      ELSE
+         RESULT( 3 ) = ZERO
+      END IF
+*
+*     Copy C into CF again
+*
+      CALL ZLACPY( 'Full', M, N, C, M, CF, M )
+*
+*     Apply Q to C as (Q**H)*C = CF
+*
+      SRNAMT = 'ZGEMQRT'
+      CALL ZGEMQRT( 'L', 'C', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M,
+     $               WORK, INFO )
+*
+*     TEST 4
+*     Compute |CF - (Q**H)*C| / ( eps * m * |C|)
+*
+      CALL ZGEMM( 'C', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M )
+      RESID = ZLANGE( '1', M, N, CF, M, RWORK )
+      IF( CNORM.GT.ZERO ) THEN
+         RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM )
+      ELSE
+         RESULT( 4 ) = ZERO
+      END IF
+*
+*     Generate random n-by-m matrix D and a copy DF
+*
+      DO J = 1, M
+         CALL ZLARNV( 2, ISEED, N, D( 1, J ) )
+      END DO
+      DNORM = ZLANGE( '1', N, M, D, N, RWORK )
+      CALL ZLACPY( 'Full', N, M, D, N, DF, N )
+*
+*     Apply Q to D as D*Q = DF
+*
+      SRNAMT = 'ZGEMQRT'
+      CALL ZGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N,
+     $               WORK, INFO )
+*
+*     TEST 5
+*     Compute |DF - D*Q| / ( eps * m * |D| )
+*
+      CALL ZGEMM( 'N', 'N', N, M, M, -CONE, D, N, Q, M, CONE, DF, N )
+      RESID = ZLANGE( '1', N, M, DF, N, RWORK )
+      IF( DNORM.GT.ZERO ) THEN
+         RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM )
+      ELSE
+         RESULT( 5 ) = ZERO
+      END IF
+*
+*     Copy D into DF again
+*
+      CALL ZLACPY( 'Full', N, M, D, N, DF, N )
+*
+*     Apply Q to D as D*QT = DF
+*
+      SRNAMT = 'ZGEMQRT'
+      CALL ZGEMQRT( 'R', 'C', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N,
+     $               WORK, INFO )
+*
+*     TEST 6
+*     Compute |DF - D*(Q**H)| / ( eps * m * |D| )
+*
+      CALL ZGEMM( 'N', 'C', N, M, M, -CONE, D, N, Q, M, CONE, DF, N )
+      RESID = ZLANGE( '1', N, M, DF, N, RWORK )
+      IF( DNORM.GT.ZERO ) THEN
+         RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM )
+      ELSE
+         RESULT( 6 ) = ZERO
+      END IF
+*
+*     Deallocate all arrays
+*
+      DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG,
+     $             C, D, CF, DF )
+*
+      RETURN
+*
+*     End of ZUNHR_COL01
+*
+      END
diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile
index a1d784fa5..87432fd04 100644
--- a/lapack-netlib/TESTING/MATGEN/Makefile
+++ b/lapack-netlib/TESTING/MATGEN/Makefile
@@ -1,5 +1,3 @@
-include ../../make.inc
-
 #######################################################################
 #  This is the makefile to create a library of the test matrix
 #  generators used in LAPACK.  The files are organized as follows:
@@ -32,6 +30,9 @@ include ../../make.inc
 #
 #######################################################################
 
+TOPSRCDIR = ../..
+include $(TOPSRCDIR)/make.inc
+
 SCATGEN = slatm1.o slatm7.o slaran.o slarnd.o
 
 SMATGEN = slatms.o slatme.o slatmr.o slatmt.o \
@@ -52,32 +53,32 @@ ZMATGEN = zlatms.o zlatme.o zlatmr.o zlatmt.o \
    zlagge.o zlaghe.o zlagsy.o zlakf2.o zlarge.o zlaror.o zlarot.o \
    zlatm1.o zlarnd.o zlatm2.o zlatm3.o zlatm5.o zlatm6.o zlahilb.o
 
-all: ../../$(TMGLIB)
+.PHONY: all
+all: $(TMGLIB)
 
 ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \
 	$(DZATGEN)
 
-.PHONY: ../../$(TMGLIB)
-
-../../$(TMGLIB): $(ALLOBJ)
-	$(ARCH) $(ARCHFLAGS) $@ $^
+$(TMGLIB): $(ALLOBJ)
+	$(AR) $(ARFLAGS) $@ $^
 	$(RANLIB) $@
 
+.PHONY: single complex double complex16
 single: $(SMATGEN) $(SCATGEN)
-	$(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^
-	$(RANLIB) ../../$(TMGLIB)
+	$(AR) $(ARFLAGS) $(TMGLIB) $^
+	$(RANLIB) $(TMGLIB)
 
 complex: $(CMATGEN) $(SCATGEN)
-	$(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^
-	$(RANLIB) ../../$(TMGLIB)
+	$(AR) $(ARFLAGS) $(TMGLIB) $^
+	$(RANLIB) $(TMGLIB)
 
 double: $(DMATGEN) $(DZATGEN)
-	$(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^
-	$(RANLIB) ../../$(TMGLIB)
+	$(AR) $(ARFLAGS) $(TMGLIB) $^
+	$(RANLIB) $(TMGLIB)
 
 complex16: $(ZMATGEN) $(DZATGEN)
-	$(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^
-	$(RANLIB) ../../$(TMGLIB)
+	$(AR) $(ARFLAGS) $(TMGLIB) $^
+	$(RANLIB) $(TMGLIB)
 
 $(SCATGEN): $(FRC)
 $(SMATGEN): $(FRC)
@@ -89,14 +90,12 @@ $(ZMATGEN): $(FRC)
 FRC:
 	@FRC=$(FRC)
 
-clean: cleanobj #cleanlib
+.PHONY: clean cleanobj cleanlib
+clean: cleanobj cleanlib
 cleanobj:
 	rm -f *.o
 cleanlib:
-	rm -f ../../$(TMGLIB)
-
-.f.o:
-	$(FORTRAN) $(OPTS) -c -o $@ $<
+	rm -f $(TMGLIB)
 
-slaran.o: slaran.f ; $(FORTRAN) $(NOOPT) -c -o $@ $<
-dlaran.o: dlaran.f ; $(FORTRAN) $(NOOPT) -c -o $@ $<
+slaran.o: slaran.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $<
+dlaran.o: dlaran.f ; $(FC) $(FFLAGS_NOOPT) -c -o $@ $<
diff --git a/lapack-netlib/TESTING/MATGEN/clahilb.f b/lapack-netlib/TESTING/MATGEN/clahilb.f
index 13902872c..f4481fc78 100644
--- a/lapack-netlib/TESTING/MATGEN/clahilb.f
+++ b/lapack-netlib/TESTING/MATGEN/clahilb.f
@@ -164,7 +164,7 @@
       INTEGER NMAX_EXACT, NMAX_APPROX, SIZE_D
       PARAMETER (NMAX_EXACT = 6, NMAX_APPROX = 11, SIZE_D = 8)
 *
-*     d's are generated from random permuation of those eight elements.
+*     d's are generated from random permutation of those eight elements.
       COMPLEX D1(8), D2(8), INVD1(8), INVD2(8)
       DATA D1 /(-1,0),(0,1),(-1,-1),(0,-1),(1,0),(-1,1),(1,1),(1,-1)/
       DATA D2 /(-1,0),(0,-1),(-1,1),(0,1),(1,0),(-1,-1),(1,-1),(1,1)/
diff --git a/lapack-netlib/TESTING/MATGEN/clatm2.f b/lapack-netlib/TESTING/MATGEN/clatm2.f
index 01221e0cc..5bd6b9dc8 100644
--- a/lapack-netlib/TESTING/MATGEN/clatm2.f
+++ b/lapack-netlib/TESTING/MATGEN/clatm2.f
@@ -186,7 +186,7 @@
 *>          SPARSE is REAL
 *>           Value between 0. and 1.
 *>           On entry specifies the sparsity of the matrix
-*>           if sparse matix is to be generated.
+*>           if sparse matrix is to be generated.
 *>           SPARSE should lie between 0 and 1.
 *>           A uniform ( 0, 1 ) random number x is generated and
 *>           compared to SPARSE; if x is larger the matrix entry
diff --git a/lapack-netlib/TESTING/MATGEN/clatm3.f b/lapack-netlib/TESTING/MATGEN/clatm3.f
index 3e07f3ec0..42b453553 100644
--- a/lapack-netlib/TESTING/MATGEN/clatm3.f
+++ b/lapack-netlib/TESTING/MATGEN/clatm3.f
@@ -202,7 +202,7 @@
 *> \verbatim
 *>          SPARSE is REAL between 0. and 1.
 *>           On entry specifies the sparsity of the matrix
-*>           if sparse matix is to be generated.
+*>           if sparse matrix is to be generated.
 *>           SPARSE should lie between 0 and 1.
 *>           A uniform ( 0, 1 ) random number x is generated and
 *>           compared to SPARSE; if x is larger the matrix entry
diff --git a/lapack-netlib/TESTING/MATGEN/clatmr.f b/lapack-netlib/TESTING/MATGEN/clatmr.f
index 11d29a3d0..e80c4a514 100644
--- a/lapack-netlib/TESTING/MATGEN/clatmr.f
+++ b/lapack-netlib/TESTING/MATGEN/clatmr.f
@@ -316,20 +316,6 @@
 *>           Not referenced if PIVTNG = 'N'. Not modified.
 *> \endverbatim
 *>
-*> \param[in] SPARSE
-*> \verbatim
-*>          SPARSE is REAL
-*>           On entry specifies the sparsity of the matrix if a sparse
-*>           matrix is to be generated. SPARSE should lie between
-*>           0 and 1. To generate a sparse matrix, for each matrix entry
-*>           a uniform ( 0, 1 ) random number x is generated and
-*>           compared to SPARSE; if x is larger the matrix entry
-*>           is unchanged and if x is smaller the entry is set
-*>           to zero. Thus on the average a fraction SPARSE of the
-*>           entries will be set to zero.
-*>           Not modified.
-*> \endverbatim
-*>
 *> \param[in] KL
 *> \verbatim
 *>          KL is INTEGER
@@ -350,6 +336,20 @@
 *>           Not modified.
 *> \endverbatim
 *>
+*> \param[in] SPARSE
+*> \verbatim
+*>          SPARSE is REAL
+*>           On entry specifies the sparsity of the matrix if a sparse
+*>           matrix is to be generated. SPARSE should lie between
+*>           0 and 1. To generate a sparse matrix, for each matrix entry
+*>           a uniform ( 0, 1 ) random number x is generated and
+*>           compared to SPARSE; if x is larger the matrix entry
+*>           is unchanged and if x is smaller the entry is set
+*>           to zero. Thus on the average a fraction SPARSE of the
+*>           entries will be set to zero.
+*>           Not modified.
+*> \endverbatim
+*>
 *> \param[in] ANORM
 *> \verbatim
 *>          ANORM is REAL
@@ -416,7 +416,7 @@
 *>           If PACK='C' or 'R', LDA must be at least 1.
 *>           If PACK='B', or 'Q', LDA must be MIN ( KU+1, N )
 *>           If PACK='Z', LDA must be at least KUU+KLL+1, where
-*>           KUU = MIN ( KU, N-1 ) and KLL = MIN ( KL, N-1 )
+*>           KUU = MIN ( KU, N-1 ) and KLL = MIN ( KL, M-1 )
 *>           Not modified.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/MATGEN/dlatm2.f b/lapack-netlib/TESTING/MATGEN/dlatm2.f
index 446f5a801..d7a6d19f3 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatm2.f
+++ b/lapack-netlib/TESTING/MATGEN/dlatm2.f
@@ -182,7 +182,7 @@
 *> \verbatim
 *>          SPARSE is DOUBLE PRECISION between 0. and 1.
 *>           On entry specifies the sparsity of the matrix
-*>           if sparse matix is to be generated.
+*>           if sparse matrix is to be generated.
 *>           SPARSE should lie between 0 and 1.
 *>           A uniform ( 0, 1 ) random number x is generated and
 *>           compared to SPARSE; if x is larger the matrix entry
diff --git a/lapack-netlib/TESTING/MATGEN/dlatm3.f b/lapack-netlib/TESTING/MATGEN/dlatm3.f
index cf6da10f8..15f5ac080 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatm3.f
+++ b/lapack-netlib/TESTING/MATGEN/dlatm3.f
@@ -199,7 +199,7 @@
 *> \verbatim
 *>          SPARSE is DOUBLE PRECISION between 0. and 1.
 *>           On entry specifies the sparsity of the matrix
-*>           if sparse matix is to be generated.
+*>           if sparse matrix is to be generated.
 *>           SPARSE should lie between 0 and 1.
 *>           A uniform ( 0, 1 ) random number x is generated and
 *>           compared to SPARSE; if x is larger the matrix entry
diff --git a/lapack-netlib/TESTING/MATGEN/dlatmr.f b/lapack-netlib/TESTING/MATGEN/dlatmr.f
index e7ea41907..a914481f7 100644
--- a/lapack-netlib/TESTING/MATGEN/dlatmr.f
+++ b/lapack-netlib/TESTING/MATGEN/dlatmr.f
@@ -303,20 +303,6 @@
 *>           Not referenced if PIVTNG = 'N'. Not modified.
 *> \endverbatim
 *>
-*> \param[in] SPARSE
-*> \verbatim
-*>          SPARSE is DOUBLE PRECISION
-*>           On entry specifies the sparsity of the matrix if a sparse
-*>           matrix is to be generated. SPARSE should lie between
-*>           0 and 1. To generate a sparse matrix, for each matrix entry
-*>           a uniform ( 0, 1 ) random number x is generated and
-*>           compared to SPARSE; if x is larger the matrix entry
-*>           is unchanged and if x is smaller the entry is set
-*>           to zero. Thus on the average a fraction SPARSE of the
-*>           entries will be set to zero.
-*>           Not modified.
-*> \endverbatim
-*>
 *> \param[in] KL
 *> \verbatim
 *>          KL is INTEGER
@@ -337,6 +323,20 @@
 *>           Not modified.
 *> \endverbatim
 *>
+*> \param[in] SPARSE
+*> \verbatim
+*>          SPARSE is DOUBLE PRECISION
+*>           On entry specifies the sparsity of the matrix if a sparse
+*>           matrix is to be generated. SPARSE should lie between
+*>           0 and 1. To generate a sparse matrix, for each matrix entry
+*>           a uniform ( 0, 1 ) random number x is generated and
+*>           compared to SPARSE; if x is larger the matrix entry
+*>           is unchanged and if x is smaller the entry is set
+*>           to zero. Thus on the average a fraction SPARSE of the
+*>           entries will be set to zero.
+*>           Not modified.
+*> \endverbatim
+*>
 *> \param[in] ANORM
 *> \verbatim
 *>          ANORM is DOUBLE PRECISION
@@ -398,7 +398,7 @@
 *>           If PACK='C' or 'R', LDA must be at least 1.
 *>           If PACK='B', or 'Q', LDA must be MIN ( KU+1, N )
 *>           If PACK='Z', LDA must be at least KUU+KLL+1, where
-*>           KUU = MIN ( KU, N-1 ) and KLL = MIN ( KL, N-1 )
+*>           KUU = MIN ( KU, N-1 ) and KLL = MIN ( KL, M-1 )
 *>           Not modified.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/MATGEN/slatm2.f b/lapack-netlib/TESTING/MATGEN/slatm2.f
index fc7e78126..2473f1f44 100644
--- a/lapack-netlib/TESTING/MATGEN/slatm2.f
+++ b/lapack-netlib/TESTING/MATGEN/slatm2.f
@@ -182,7 +182,7 @@
 *> \verbatim
 *>          SPARSE is REAL between 0. and 1.
 *>           On entry specifies the sparsity of the matrix
-*>           if sparse matix is to be generated.
+*>           if sparse matrix is to be generated.
 *>           SPARSE should lie between 0 and 1.
 *>           A uniform ( 0, 1 ) random number x is generated and
 *>           compared to SPARSE; if x is larger the matrix entry
diff --git a/lapack-netlib/TESTING/MATGEN/slatm3.f b/lapack-netlib/TESTING/MATGEN/slatm3.f
index e61c954bd..18c2c07d5 100644
--- a/lapack-netlib/TESTING/MATGEN/slatm3.f
+++ b/lapack-netlib/TESTING/MATGEN/slatm3.f
@@ -199,7 +199,7 @@
 *> \verbatim
 *>          SPARSE is REAL between 0. and 1.
 *>           On entry specifies the sparsity of the matrix
-*>           if sparse matix is to be generated.
+*>           if sparse matrix is to be generated.
 *>           SPARSE should lie between 0 and 1.
 *>           A uniform ( 0, 1 ) random number x is generated and
 *>           compared to SPARSE; if x is larger the matrix entry
diff --git a/lapack-netlib/TESTING/MATGEN/slatmr.f b/lapack-netlib/TESTING/MATGEN/slatmr.f
index e4705994a..c2cedd21c 100644
--- a/lapack-netlib/TESTING/MATGEN/slatmr.f
+++ b/lapack-netlib/TESTING/MATGEN/slatmr.f
@@ -303,20 +303,6 @@
 *>           Not referenced if PIVTNG = 'N'. Not modified.
 *> \endverbatim
 *>
-*> \param[in] SPARSE
-*> \verbatim
-*>          SPARSE is REAL
-*>           On entry specifies the sparsity of the matrix if a sparse
-*>           matrix is to be generated. SPARSE should lie between
-*>           0 and 1. To generate a sparse matrix, for each matrix entry
-*>           a uniform ( 0, 1 ) random number x is generated and
-*>           compared to SPARSE; if x is larger the matrix entry
-*>           is unchanged and if x is smaller the entry is set
-*>           to zero. Thus on the average a fraction SPARSE of the
-*>           entries will be set to zero.
-*>           Not modified.
-*> \endverbatim
-*>
 *> \param[in] KL
 *> \verbatim
 *>          KL is INTEGER
@@ -337,6 +323,20 @@
 *>           Not modified.
 *> \endverbatim
 *>
+*> \param[in] SPARSE
+*> \verbatim
+*>          SPARSE is REAL
+*>           On entry specifies the sparsity of the matrix if a sparse
+*>           matrix is to be generated. SPARSE should lie between
+*>           0 and 1. To generate a sparse matrix, for each matrix entry
+*>           a uniform ( 0, 1 ) random number x is generated and
+*>           compared to SPARSE; if x is larger the matrix entry
+*>           is unchanged and if x is smaller the entry is set
+*>           to zero. Thus on the average a fraction SPARSE of the
+*>           entries will be set to zero.
+*>           Not modified.
+*> \endverbatim
+*>
 *> \param[in] ANORM
 *> \verbatim
 *>          ANORM is REAL
@@ -398,7 +398,7 @@
 *>           If PACK='C' or 'R', LDA must be at least 1.
 *>           If PACK='B', or 'Q', LDA must be MIN ( KU+1, N )
 *>           If PACK='Z', LDA must be at least KUU+KLL+1, where
-*>           KUU = MIN ( KU, N-1 ) and KLL = MIN ( KL, N-1 )
+*>           KUU = MIN ( KU, N-1 ) and KLL = MIN ( KL, M-1 )
 *>           Not modified.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/MATGEN/zlahilb.f b/lapack-netlib/TESTING/MATGEN/zlahilb.f
index 43057931d..e5a317821 100644
--- a/lapack-netlib/TESTING/MATGEN/zlahilb.f
+++ b/lapack-netlib/TESTING/MATGEN/zlahilb.f
@@ -164,7 +164,7 @@
       INTEGER NMAX_EXACT, NMAX_APPROX, SIZE_D
       PARAMETER (NMAX_EXACT = 6, NMAX_APPROX = 11, SIZE_D = 8)
 *
-*     d's are generated from random permuation of those eight elements.
+*     d's are generated from random permutation of those eight elements.
       COMPLEX*16 d1(8), d2(8), invd1(8), invd2(8)
       DATA D1 /(-1,0),(0,1),(-1,-1),(0,-1),(1,0),(-1,1),(1,1),(1,-1)/
       DATA D2 /(-1,0),(0,-1),(-1,1),(0,1),(1,0),(-1,-1),(1,-1),(1,1)/
diff --git a/lapack-netlib/TESTING/MATGEN/zlatm2.f b/lapack-netlib/TESTING/MATGEN/zlatm2.f
index 2de69eeca..ea93431e7 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatm2.f
+++ b/lapack-netlib/TESTING/MATGEN/zlatm2.f
@@ -185,7 +185,7 @@
 *> \verbatim
 *>          SPARSE is DOUBLE PRECISION between 0. and 1.
 *>           On entry specifies the sparsity of the matrix
-*>           if sparse matix is to be generated.
+*>           if sparse matrix is to be generated.
 *>           SPARSE should lie between 0 and 1.
 *>           A uniform ( 0, 1 ) random number x is generated and
 *>           compared to SPARSE; if x is larger the matrix entry
diff --git a/lapack-netlib/TESTING/MATGEN/zlatm3.f b/lapack-netlib/TESTING/MATGEN/zlatm3.f
index 42d58c853..25d6233f3 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatm3.f
+++ b/lapack-netlib/TESTING/MATGEN/zlatm3.f
@@ -202,7 +202,7 @@
 *> \verbatim
 *>          SPARSE is DOUBLE PRECISION between 0. and 1.
 *>           On entry specifies the sparsity of the matrix
-*>           if sparse matix is to be generated.
+*>           if sparse matrix is to be generated.
 *>           SPARSE should lie between 0 and 1.
 *>           A uniform ( 0, 1 ) random number x is generated and
 *>           compared to SPARSE; if x is larger the matrix entry
diff --git a/lapack-netlib/TESTING/MATGEN/zlatmr.f b/lapack-netlib/TESTING/MATGEN/zlatmr.f
index 6685a3570..56285e1f4 100644
--- a/lapack-netlib/TESTING/MATGEN/zlatmr.f
+++ b/lapack-netlib/TESTING/MATGEN/zlatmr.f
@@ -316,20 +316,6 @@
 *>           Not referenced if PIVTNG = 'N'. Not modified.
 *> \endverbatim
 *>
-*> \param[in] SPARSE
-*> \verbatim
-*>          SPARSE is DOUBLE PRECISION
-*>           On entry specifies the sparsity of the matrix if a sparse
-*>           matrix is to be generated. SPARSE should lie between
-*>           0 and 1. To generate a sparse matrix, for each matrix entry
-*>           a uniform ( 0, 1 ) random number x is generated and
-*>           compared to SPARSE; if x is larger the matrix entry
-*>           is unchanged and if x is smaller the entry is set
-*>           to zero. Thus on the average a fraction SPARSE of the
-*>           entries will be set to zero.
-*>           Not modified.
-*> \endverbatim
-*>
 *> \param[in] KL
 *> \verbatim
 *>          KL is INTEGER
@@ -350,6 +336,20 @@
 *>           Not modified.
 *> \endverbatim
 *>
+*> \param[in] SPARSE
+*> \verbatim
+*>          SPARSE is DOUBLE PRECISION
+*>           On entry specifies the sparsity of the matrix if a sparse
+*>           matrix is to be generated. SPARSE should lie between
+*>           0 and 1. To generate a sparse matrix, for each matrix entry
+*>           a uniform ( 0, 1 ) random number x is generated and
+*>           compared to SPARSE; if x is larger the matrix entry
+*>           is unchanged and if x is smaller the entry is set
+*>           to zero. Thus on the average a fraction SPARSE of the
+*>           entries will be set to zero.
+*>           Not modified.
+*> \endverbatim
+*>
 *> \param[in] ANORM
 *> \verbatim
 *>          ANORM is DOUBLE PRECISION
@@ -416,7 +416,7 @@
 *>           If PACK='C' or 'R', LDA must be at least 1.
 *>           If PACK='B', or 'Q', LDA must be MIN ( KU+1, N )
 *>           If PACK='Z', LDA must be at least KUU+KLL+1, where
-*>           KUU = MIN ( KU, N-1 ) and KLL = MIN ( KL, N-1 )
+*>           KUU = MIN ( KU, N-1 ) and KLL = MIN ( KL, M-1 )
 *>           Not modified.
 *> \endverbatim
 *>
diff --git a/lapack-netlib/TESTING/Makefile b/lapack-netlib/TESTING/Makefile
index 8b883c0fa..bdea2bfaa 100644
--- a/lapack-netlib/TESTING/Makefile
+++ b/lapack-netlib/TESTING/Makefile
@@ -34,8 +34,10 @@
 #
 #######################################################################
 
-include ../make.inc
+TOPSRCDIR = ..
+include $(TOPSRCDIR)/make.inc
 
+.PHONY: all
 all: single complex double complex16 singleproto doubleproto complexproto complex16proto
 
 SEIGTST= snep.out \
@@ -139,10 +141,13 @@ ZLINTST= ztest.out
 
 ZLINTSTPROTO= zctest.out ztest_rfp.out
 
+.PHONY: single complex double complex16
 single:         $(SLINTST) $(SEIGTST)
 complex:        $(CLINTST) $(CEIGTST)
 double:         $(DLINTST) $(DEIGTST)
 complex16:      $(ZLINTST) $(ZEIGTST)
+
+.PHONY: singleproto complexproto doubleproto complex16proto
 singleproto:    $(SLINTSTPROTO)
 complexproto:   $(CLINTSTPROTO)
 doubleproto:    $(DLINTSTPROTO)
@@ -153,61 +158,61 @@ complex16proto: $(ZLINTSTPROTO)
 
 stest.out: stest.in LIN/xlintsts
 	@echo Testing REAL LAPACK linear equation routines
-	./LIN/xlintsts < $< > $@ 2>&1
+	./LIN/xlintsts < stest.in > $@ 2>&1
 #
 # ======== COMPLEX LIN TESTS ==========================
 
 ctest.out: ctest.in LIN/xlintstc
 	@echo Testing COMPLEX LAPACK linear equation routines
-	./LIN/xlintstc < $< > $@ 2>&1
+	./LIN/xlintstc < ctest.in > $@ 2>&1
 #
 # ======== DOUBLE LIN TESTS ===========================
 
 dtest.out: dtest.in LIN/xlintstd
 	@echo Testing DOUBLE PRECISION LAPACK linear equation routines
-	./LIN/xlintstd < $< > $@ 2>&1
+	./LIN/xlintstd < dtest.in > $@ 2>&1
 #
 # ======== COMPLEX16 LIN TESTS ========================
 
 ztest.out: ztest.in LIN/xlintstz
 	@echo Testing COMPLEX16 LAPACK linear equation routines
-	./LIN/xlintstz < $< > $@ 2>&1
+	./LIN/xlintstz < ztest.in > $@ 2>&1
 #
 # ======== SINGLE-DOUBLE PROTO LIN TESTS ==============
 
 dstest.out: dstest.in LIN/xlintstds
 	@echo Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines
-	./LIN/xlintstds < $< > $@ 2>&1
+	./LIN/xlintstds < dstest.in > $@ 2>&1
 #
 # ======== COMPLEX-COMPLEX16 LIN TESTS ========================
 
 zctest.out: zctest.in LIN/xlintstzc
 	@echo Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines
-	./LIN/xlintstzc < $< > $@ 2>&1
+	./LIN/xlintstzc < zctest.in > $@ 2>&1
 #
 # ======== SINGLE RFP LIN TESTS ========================
 
 stest_rfp.out: stest_rfp.in LIN/xlintstrfs
 	@echo Testing REAL LAPACK RFP prototype linear equation routines
-	./LIN/xlintstrfs < $< > $@ 2>&1
+	./LIN/xlintstrfs < stest_rfp.in > $@ 2>&1
 #
 # ======== COMPLEX16 RFP LIN TESTS ========================
 
 dtest_rfp.out: dtest_rfp.in LIN/xlintstrfd
 	@echo Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines
-	./LIN/xlintstrfd < $< > $@ 2>&1
+	./LIN/xlintstrfd < dtest_rfp.in > $@ 2>&1
 #
 # ======== COMPLEX16 RFP LIN TESTS ========================
 
 ctest_rfp.out: ctest_rfp.in LIN/xlintstrfc
 	@echo Testing COMPLEX LAPACK RFP prototype linear equation routines
-	./LIN/xlintstrfc < $< > $@ 2>&1
+	./LIN/xlintstrfc < ctest_rfp.in > $@ 2>&1
 #
 # ======== COMPLEX16 RFP LIN TESTS ========================
 
 ztest_rfp.out: ztest_rfp.in LIN/xlintstrfz
 	@echo Testing COMPLEX16 LAPACK RFP prototype linear equation routines
-	./LIN/xlintstrfz < $< > $@ 2>&1
+	./LIN/xlintstrfz < ztest_rfp.in > $@ 2>&1
 #
 #
 # ======== SINGLE EIG TESTS ===========================
@@ -215,329 +220,329 @@ ztest_rfp.out: ztest_rfp.in LIN/xlintstrfz
 
 snep.out: nep.in EIG/xeigtsts
 	@echo NEP: Testing Nonsymmetric Eigenvalue Problem routines
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < nep.in > $@ 2>&1
 
 ssep.out: sep.in EIG/xeigtsts
 	@echo SEP: Testing Symmetric Eigenvalue Problem routines
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < sep.in > $@ 2>&1
 
 sse2.out: se2.in EIG/xeigtsts
 	@echo SEP: Testing Symmetric Eigenvalue Problem routines
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < se2.in > $@ 2>&1
 
 ssvd.out: svd.in EIG/xeigtsts
 	@echo SVD: Testing Singular Value Decomposition routines
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < svd.in > $@ 2>&1
 
 sec.out: sec.in EIG/xeigtsts
 	@echo SEC: Testing REAL Eigen Condition Routines
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < sec.in > $@ 2>&1
 
 sed.out: sed.in EIG/xeigtsts
 	@echo SEV: Testing REAL Nonsymmetric Eigenvalue Driver
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < sed.in > $@ 2>&1
 
 sgg.out: sgg.in EIG/xeigtsts
 	@echo SGG: Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < sgg.in > $@ 2>&1
 
 sgd.out: sgd.in EIG/xeigtsts
 	@echo SGD: Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < sgd.in > $@ 2>&1
 
 ssb.out: ssb.in EIG/xeigtsts
 	@echo SSB: Testing REAL Symmetric Eigenvalue Problem routines
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < ssb.in > $@ 2>&1
 
 ssg.out: ssg.in EIG/xeigtsts
 	@echo SSG: Testing REAL Symmetric Generalized Eigenvalue Problem routines
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < ssg.in > $@ 2>&1
 
 sbal.out: sbal.in EIG/xeigtsts
 	@echo SGEBAL: Testing the balancing of a REAL general matrix
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < sbal.in > $@ 2>&1
 
 sbak.out: sbak.in EIG/xeigtsts
 	@echo SGEBAK: Testing the back transformation of a REAL balanced matrix
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < sbak.in > $@ 2>&1
 
 sgbal.out: sgbal.in EIG/xeigtsts
 	@echo SGGBAL: Testing the balancing of a pair of REAL general matrices
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < sgbal.in > $@ 2>&1
 
 sgbak.out: sgbak.in EIG/xeigtsts
 	@echo SGGBAK: Testing the back transformation of a pair of REAL balanced matrices
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < sgbak.in > $@ 2>&1
 
 sbb.out: sbb.in EIG/xeigtsts
 	@echo SBB: Testing banded Singular Value Decomposition routines
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < sbb.in > $@ 2>&1
 
 sglm.out: glm.in EIG/xeigtsts
 	@echo GLM: Testing Generalized Linear Regression Model routines
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < glm.in > $@ 2>&1
 
 sgqr.out: gqr.in EIG/xeigtsts
 	@echo GQR: Testing Generalized QR and RQ factorization routines
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < gqr.in > $@ 2>&1
 
 sgsv.out: gsv.in EIG/xeigtsts
 	@echo GSV: Testing Generalized Singular Value Decomposition routines
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < gsv.in > $@ 2>&1
 
 scsd.out: csd.in EIG/xeigtsts
 	@echo CSD: Testing CS Decomposition routines
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < csd.in > $@ 2>&1
 
 slse.out: lse.in EIG/xeigtsts
 	@echo LSE: Testing Constrained Linear Least Squares routines
-	./EIG/xeigtsts < $< > $@ 2>&1
+	./EIG/xeigtsts < lse.in > $@ 2>&1
 #
 # ======== COMPLEX EIG TESTS ===========================
 
 cnep.out: nep.in EIG/xeigtstc
 	@echo NEP: Testing Nonsymmetric Eigenvalue Problem routines
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < nep.in > $@ 2>&1
 
 csep.out: sep.in EIG/xeigtstc
 	@echo SEP: Testing Symmetric Eigenvalue Problem routines
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < sep.in > $@ 2>&1
 
 cse2.out: se2.in EIG/xeigtstc
 	@echo SEP: Testing Symmetric Eigenvalue Problem routines
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < se2.in > $@ 2>&1
 
 csvd.out: svd.in EIG/xeigtstc
 	@echo SVD: Testing Singular Value Decomposition routines
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < svd.in > $@ 2>&1
 
 cec.out: cec.in EIG/xeigtstc
 	@echo CEC: Testing COMPLEX Eigen Condition Routines
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < cec.in > $@ 2>&1
 
 ced.out: ced.in EIG/xeigtstc
 	@echo CES: Testing COMPLEX Nonsymmetric Schur Form Driver
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < ced.in > $@ 2>&1
 
 cgg.out: cgg.in EIG/xeigtstc
 	@echo CGG: Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < cgg.in > $@ 2>&1
 
 cgd.out: cgd.in EIG/xeigtstc
 	@echo CGD: Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < cgd.in > $@ 2>&1
 
 csb.out: csb.in EIG/xeigtstc
 	@echo CHB: Testing Hermitian Eigenvalue Problem routines
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < csb.in > $@ 2>&1
 
 csg.out: csg.in EIG/xeigtstc
 	@echo CSG: Testing Symmetric Generalized Eigenvalue Problem routines
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < csg.in > $@ 2>&1
 
 cbal.out: cbal.in EIG/xeigtstc
 	@echo CGEBAL: Testing the balancing of a COMPLEX general matrix
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < cbal.in > $@ 2>&1
 
 cbak.out: cbak.in EIG/xeigtstc
 	@echo CGEBAK: Testing the back transformation of a COMPLEX balanced matrix
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < cbak.in > $@ 2>&1
 
 cgbal.out: cgbal.in EIG/xeigtstc
 	@echo CGGBAL: Testing the balancing of a pair of COMPLEX general matrices
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < cgbal.in > $@ 2>&1
 
 cgbak.out: cgbak.in EIG/xeigtstc
 	@echo CGGBAK: Testing the back transformation of a pair of COMPLEX balanced matrices
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < cgbak.in > $@ 2>&1
 
 cbb.out: cbb.in EIG/xeigtstc
 	@echo CBB: Testing banded Singular Value Decomposition routines
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < cbb.in > $@ 2>&1
 
 cglm.out: glm.in EIG/xeigtstc
 	@echo GLM: Testing Generalized Linear Regression Model routines
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < glm.in > $@ 2>&1
 
 cgqr.out: gqr.in EIG/xeigtstc
 	@echo GQR: Testing Generalized QR and RQ factorization routines
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < gqr.in > $@ 2>&1
 
 cgsv.out: gsv.in EIG/xeigtstc
 	@echo GSV: Testing Generalized Singular Value Decomposition routines
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < gsv.in > $@ 2>&1
 
 ccsd.out: csd.in EIG/xeigtstc
 	@echo CSD: Testing CS Decomposition routines
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < csd.in > $@ 2>&1
 
 clse.out: lse.in EIG/xeigtstc
 	@echo LSE: Testing Constrained Linear Least Squares routines
-	./EIG/xeigtstc < $< > $@ 2>&1
+	./EIG/xeigtstc < lse.in > $@ 2>&1
 #
 # ======== DOUBLE EIG TESTS ===========================
 
 dnep.out: nep.in EIG/xeigtstd
 	@echo NEP: Testing Nonsymmetric Eigenvalue Problem routines
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < nep.in > $@ 2>&1
 
 dsep.out: sep.in EIG/xeigtstd
 	@echo SEP: Testing Symmetric Eigenvalue Problem routines
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < sep.in > $@ 2>&1
 
 dse2.out: se2.in EIG/xeigtstd
 	@echo SEP: Testing Symmetric Eigenvalue Problem routines
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < se2.in > $@ 2>&1
 
 dsvd.out: svd.in EIG/xeigtstd
 	@echo SVD: Testing Singular Value Decomposition routines
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < svd.in > $@ 2>&1
 
 dec.out: dec.in EIG/xeigtstd
 	@echo DEC: Testing DOUBLE PRECISION Eigen Condition Routines
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < dec.in > $@ 2>&1
 
 ded.out: ded.in EIG/xeigtstd
 	@echo DEV: Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < ded.in > $@ 2>&1
 
 dgg.out: dgg.in EIG/xeigtstd
 	@echo DGG: Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < dgg.in > $@ 2>&1
 
 dgd.out: dgd.in EIG/xeigtstd
 	@echo DGD: Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < dgd.in > $@ 2>&1
 
 dsb.out: dsb.in EIG/xeigtstd
 	@echo DSB: Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < dsb.in > $@ 2>&1
 
 dsg.out: dsg.in EIG/xeigtstd
 	@echo DSG: Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < dsg.in > $@ 2>&1
 
 dbal.out: dbal.in EIG/xeigtstd
 	@echo DGEBAL: Testing the balancing of a DOUBLE PRECISION general matrix
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < dbal.in > $@ 2>&1
 
 dbak.out: dbak.in EIG/xeigtstd
 	@echo DGEBAK: Testing the back transformation of a DOUBLE PRECISION balanced matrix
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < dbak.in > $@ 2>&1
 
 dgbal.out: dgbal.in EIG/xeigtstd
 	@echo DGGBAL: Testing the balancing of a pair of DOUBLE PRECISION general matrices
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < dgbal.in > $@ 2>&1
 
 dgbak.out: dgbak.in EIG/xeigtstd
 	@echo DGGBAK: Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < dgbak.in > $@ 2>&1
 
 dbb.out: dbb.in EIG/xeigtstd
 	@echo DBB: Testing banded Singular Value Decomposition routines
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < dbb.in > $@ 2>&1
 
 dglm.out: glm.in EIG/xeigtstd
 	@echo GLM: Testing Generalized Linear Regression Model routines
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < glm.in > $@ 2>&1
 
 dgqr.out: gqr.in EIG/xeigtstd
 	@echo GQR: Testing Generalized QR and RQ factorization routines
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < gqr.in > $@ 2>&1
 
 dgsv.out: gsv.in EIG/xeigtstd
 	@echo GSV: Testing Generalized Singular Value Decomposition routines
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < gsv.in > $@ 2>&1
 
 dcsd.out: csd.in EIG/xeigtstd
 	@echo CSD: Testing CS Decomposition routines
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < csd.in > $@ 2>&1
 
 dlse.out: lse.in EIG/xeigtstd
 	@echo LSE: Testing Constrained Linear Least Squares routines
-	./EIG/xeigtstd < $< > $@ 2>&1
+	./EIG/xeigtstd < lse.in > $@ 2>&1
 #
 # ======== COMPLEX16 EIG TESTS ===========================
 
 znep.out: nep.in EIG/xeigtstz
 	@echo NEP: Testing Nonsymmetric Eigenvalue Problem routines
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < nep.in > $@ 2>&1
 
 zsep.out: sep.in EIG/xeigtstz
 	@echo SEP: Testing Symmetric Eigenvalue Problem routines
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < sep.in > $@ 2>&1
 
 zse2.out: se2.in EIG/xeigtstz
 	@echo SEP: Testing Symmetric Eigenvalue Problem routines
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < se2.in > $@ 2>&1
 
 zsvd.out: svd.in EIG/xeigtstz
 	@echo SVD: Testing Singular Value Decomposition routines
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < svd.in > $@ 2>&1
 
 zec.out: zec.in EIG/xeigtstz
 	@echo ZEC: Testing COMPLEX16 Eigen Condition Routines
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < zec.in > $@ 2>&1
 
 zed.out: zed.in EIG/xeigtstz
 	@echo ZES: Testing COMPLEX16 Nonsymmetric Schur Form Driver
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < zed.in > $@ 2>&1
 
 zgg.out: zgg.in EIG/xeigtstz
 	@echo ZGG: Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < zgg.in > $@ 2>&1
 
 zgd.out: zgd.in EIG/xeigtstz
 	@echo ZGD: Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < zgd.in > $@ 2>&1
 
 zsb.out: zsb.in EIG/xeigtstz
 	@echo ZHB: Testing Hermitian Eigenvalue Problem routines
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < zsb.in > $@ 2>&1
 
 zsg.out: zsg.in EIG/xeigtstz
 	@echo ZSG: Testing Symmetric Generalized Eigenvalue Problem routines
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < zsg.in > $@ 2>&1
 
 zbal.out: zbal.in EIG/xeigtstz
 	@echo ZGEBAL: Testing the balancing of a COMPLEX16 general matrix
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < zbal.in > $@ 2>&1
 
 zbak.out: zbak.in EIG/xeigtstz
 	@echo ZGEBAK: Testing the back transformation of a COMPLEX16 balanced matrix
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < zbak.in > $@ 2>&1
 
 zgbal.out: zgbal.in EIG/xeigtstz
 	@echo ZGGBAL: Testing the balancing of a pair of COMPLEX general matrices
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < zgbal.in > $@ 2>&1
 
 zgbak.out: zgbak.in EIG/xeigtstz
 	@echo ZGGBAK: Testing the back transformation of a pair of COMPLEX16 balanced matrices
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < zgbak.in > $@ 2>&1
 
 zbb.out: zbb.in EIG/xeigtstz
 	@echo ZBB: Testing banded Singular Value Decomposition routines
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < zbb.in > $@ 2>&1
 
 zglm.out: glm.in EIG/xeigtstz
 	@echo GLM: Testing Generalized Linear Regression Model routines
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < glm.in > $@ 2>&1
 
 zgqr.out: gqr.in EIG/xeigtstz
 	@echo GQR: Testing Generalized QR and RQ factorization routines
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < gqr.in > $@ 2>&1
 
 zgsv.out: gsv.in EIG/xeigtstz
 	@echo GSV: Testing Generalized Singular Value Decomposition routines
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < gsv.in > $@ 2>&1
 
 zcsd.out: csd.in EIG/xeigtstz
 	@echo CSD: Testing CS Decomposition routines
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < csd.in > $@ 2>&1
 
 zlse.out: lse.in EIG/xeigtstz
 	@echo LSE: Testing Constrained Linear Least Squares routines
-	./EIG/xeigtstz < $< > $@ 2>&1
+	./EIG/xeigtstz < lse.in > $@ 2>&1
 # ==============================================================================
 
 LIN/xlintsts: $(FRCLIN) $(FRC)
@@ -582,6 +587,7 @@ EIG/xeigtstd: $(FRCEIG) $(FRC)
 EIG/xeigtstz: $(FRCEIG) $(FRC)
 	$(MAKE) -C EIG xeigtstz
 
+.PHONY: clean cleantest
 clean: cleantest
 cleantest:
 	rm -f *.out core
diff --git a/lapack-netlib/TESTING/ctest.in b/lapack-netlib/TESTING/ctest.in
index 2f3853a03..a3588b4a1 100644
--- a/lapack-netlib/TESTING/ctest.in
+++ b/lapack-netlib/TESTING/ctest.in
@@ -50,3 +50,4 @@ CQX
 CXQ
 CTQ
 CTS
+CHH
diff --git a/lapack-netlib/TESTING/dtest.in b/lapack-netlib/TESTING/dtest.in
index a7a16ee41..29bb8b92e 100644
--- a/lapack-netlib/TESTING/dtest.in
+++ b/lapack-netlib/TESTING/dtest.in
@@ -44,3 +44,4 @@ DQX
 DXQ
 DTQ
 DTS
+DHH
diff --git a/lapack-netlib/TESTING/stest.in b/lapack-netlib/TESTING/stest.in
index d32047047..27ac30040 100644
--- a/lapack-netlib/TESTING/stest.in
+++ b/lapack-netlib/TESTING/stest.in
@@ -44,3 +44,4 @@ SQX
 SXQ
 STQ
 STS
+SHH
diff --git a/lapack-netlib/TESTING/ztest.in b/lapack-netlib/TESTING/ztest.in
index 520253941..58da33d60 100644
--- a/lapack-netlib/TESTING/ztest.in
+++ b/lapack-netlib/TESTING/ztest.in
@@ -50,3 +50,4 @@ ZQX
 ZXQ
 ZTQ
 ZTS
+ZHH
diff --git a/lapack-netlib/appveyor.yml b/lapack-netlib/appveyor.yml
deleted file mode 100644
index 7fc3fbdd7..000000000
--- a/lapack-netlib/appveyor.yml
+++ /dev/null
@@ -1,64 +0,0 @@
-# Windows testing.
-# Syntax for this file:
-# http://www.appveyor.com/docs/appveyor-yml
-
-shallow_clone: true
-
-platform: x64
-
-cache:
-  - x86_64-4.9.2-release-win32-seh-rt_v4-rev4.7z
-  - i686-4.9.2-release-win32-dwarf-rt_v4-rev4.7z
-
-environment:
-  CTEST_OUTPUT_ON_FAILURE: 1
-  matrix:
-  - MINGW_DIR: mingw64
-    MINGW_URL: https://sourceforge.net/projects/mingw-w64/files/Toolchains%20targetting%20Win64/Personal%20Builds/mingw-builds/4.9.2/threads-win32/seh/x86_64-4.9.2-release-win32-seh-rt_v4-rev4.7z/download
-    MINGW_ARCHIVE: x86_64-4.9.2-release-win32-seh-rt_v4-rev4.7z
-  - MINGW_DIR: mingw32
-    MINGW_URL: https://sourceforge.net/projects/mingw-w64/files/Toolchains%20targetting%20Win32/Personal%20Builds/mingw-builds/4.9.2/threads-win32/dwarf/i686-4.9.2-release-win32-dwarf-rt_v4-rev4.7z/download
-    MINGW_ARCHIVE: i686-4.9.2-release-win32-dwarf-rt_v4-rev4.7z
-
-install:
-  - if not exist "%MINGW_ARCHIVE%" appveyor DownloadFile "%MINGW_URL%" -FileName "%MINGW_ARCHIVE%"
-  - 7z x -y "%MINGW_ARCHIVE%" > nul
-  # CMake refuses to generate MinGW Makefiles if sh.exe is in the Path
-  - ps: Get-Command sh.exe -All | Remove-Item
-
-build_script:
-  - echo "NUMBER_OF_PROCESSORS=%NUMBER_OF_PROCESSORS%"
-  - set PATH=%CD%\%MINGW_DIR%\bin;%PATH%
-  - g++ --version
-  - mingw32-make --version
-  - cmake --version
-  - if "%APPVEYOR_REPO_TAG%"=="true" (set CMAKE_BUILD_TYPE=Release) else (set CMAKE_BUILD_TYPE=Debug)
-  - set SRC_DIR=%CD%
-  - echo %SRC_DIR%
-  - set BLD_DIR=%SRC_DIR%\..\lapack-appveyor-bld
-  - set INST_DIR=%SRC_DIR%\..\lapack-appveyor-install
-  - mkdir -p %BLD_DIR%
-  - cd %BLD_DIR%
-    # See issue #17 on github dashboard.  Once resolved, use -DCBLAS=ON
-    #  - cmake -DCMAKE_INSTALL_PREFIX=${INST_DIR} -DLAPACKE=ON ${SRC_DIR}
-  - cmake
-    -G "MinGW Makefiles"
-    -DBUILDNAME:STRING="appveyor-%MINGW_DIR%-%APPVEYOR_REPO_BRANCH%"
-    -DCMAKE_BUILD_TYPE=%CMAKE_BUILD_TYPE%
-    -DCMAKE_INSTALL_PREFIX=%INST_DIR%
-    -DCBLAS:BOOL=ON
-    -DLAPACKE:BOOL=ON
-    -DBUILD_TESTING=ON
-    -DLAPACKE_WITH_TMG:BOOL=ON
-    %SRC_DIR%
-  - mingw32-make -j%NUMBER_OF_PROCESSORS%
-
-test_script:
-  - ctest -D ExperimentalStart
-  - ctest -D ExperimentalConfigure
-  - ctest -D ExperimentalBuild -j%NUMBER_OF_PROCESSORS%
-  - ctest -D ExperimentalTest --schedule-random -j%NUMBER_OF_PROCESSORS% --output-on-failure --timeout 100 -E "CBLAS\-.*cblat1"
-  - ctest -D ExperimentalSubmit
-
-after_test:
-  - mingw32-make install -j%NUMBER_OF_PROCESSORS%
diff --git a/lapack-netlib/lapack_build.cmake b/lapack-netlib/lapack_build.cmake
index 68744cc4c..39878cb24 100644
--- a/lapack-netlib/lapack_build.cmake
+++ b/lapack-netlib/lapack_build.cmake
@@ -69,7 +69,8 @@ find_program(HOSTNAME NAMES hostname)
 find_program(UNAME NAMES uname)
 
 # Get the build name and hostname
-exec_program(${HOSTNAME} ARGS OUTPUT_VARIABLE hostname)
+execute_process(${HOSTNAME}
+  OUTPUT_VARIABLE hostname)
 string(REGEX REPLACE "[/\\\\+<> #]" "-" hostname "${hostname}")
 
 message("HOSTNAME: ${hostname}")
@@ -83,7 +84,8 @@ find_package(Git REQUIRED)
 set(CTEST_GIT_COMMAND     ${GIT_EXECUTABLE})
 set(CTEST_UPDATE_COMMAND  ${GIT_EXECUTABLE})
 macro(getuname name flag)
-  exec_program("${UNAME}" ARGS "${flag}" OUTPUT_VARIABLE "${name}")
+  execute_process(COMMAND "${UNAME}" "${flag}"
+    OUTPUT_VARIABLE "${name}")
   string(REGEX REPLACE "[/\\\\+<> #]" "-" "${name}" "${${name}}")
   string(REGEX REPLACE "^(......|.....|....|...|..|.).*" "\\1" "${name}" "${${name}}")
 endmacro()
@@ -167,7 +169,7 @@ endif()
 # dashboard then set this variable to the directory
 # the dashboard should be in
 make_directory("${CTEST_DASHBOARD_ROOT}")
-# these are the the name of the source and binary directory on disk.
+# these are the names of the source and binary directory on disk.
 # They will be appended to DASHBOARD_ROOT
 set(CTEST_SOURCE_DIRECTORY  "${CTEST_DASHBOARD_ROOT}/${CTEST_DIR_NAME}")
 set(CTEST_BINARY_DIRECTORY  "${CTEST_SOURCE_DIRECTORY}-${CTEST_BUILD_NAME}")
diff --git a/lapack-netlib/lapack_testing.py b/lapack-netlib/lapack_testing.py
index 5d07e1e87..5582744a0 100755
--- a/lapack-netlib/lapack_testing.py
+++ b/lapack-netlib/lapack_testing.py
@@ -12,8 +12,8 @@ import os, sys, math
 import getopt
 # Arguments
 try:
-    opts, args = getopt.getopt(sys.argv[1:], "hd:srep:t:n",
-                               ["help", "dir", "short", "run", "error","prec=","test=","number"])
+    opts, args = getopt.getopt(sys.argv[1:], "hd:b:srep:t:n",
+                               ["help", "dir", "bin", "short", "run", "error","prec=","test=","number"])
 
 except getopt.error as msg:
     print(msg)
@@ -29,14 +29,13 @@ only_numbers=0
 test_dir='TESTING'
 bin_dir='bin/Release'
 
-abs_bin_dir=os.path.normpath(os.path.join(os.getcwd(),bin_dir))
-
 for o, a in opts:
     if o in ("-h", "--help"):
         print(sys.argv[0]+" [-h|--help] [-d dir |--dir dir] [-s |--short] [-r |--run] [-e |--error] [-p p |--prec p] [-t test |--test test] [-n | --number]")
         print("     - h is to print this message")
         print("     - r is to use to run the LAPACK tests then analyse the output (.out files). By default, the script will not run all the LAPACK tests")
         print("     - d [dir] is to indicate where is the LAPACK testing directory (.out files). By default, the script will use .")
+        print("     - b [bin] is to indicate where is the LAPACK binary files are located. By default, the script will use .")
         print(" LEVEL OF OUTPUT")
         print("     - x is to print a detailed summary")
         print("     - e is to print only the error summary")
@@ -75,6 +74,8 @@ for o, a in opts:
             just_errors = 1
         if o in ( '-p', '--prec' ):
             prec = a
+        if o in ( '-b', '--bin' ):
+            bin_dir = a
         if o in ( '-d', '--dir' ):
             test_dir = a
         if o in ( '-t', '--test' ):
@@ -85,6 +86,8 @@ for o, a in opts:
 
 # process options
 
+abs_bin_dir=os.path.normpath(os.path.join(os.getcwd(),bin_dir))
+
 os.chdir(test_dir)
 
 execution=1
@@ -114,10 +117,7 @@ def run_summary_test( f, cmdline, short_summary):
             pipe = open(cmdline,'r')
             r=0
     else:
-        if os.name != 'nt':
-            cmdline='./' + cmdline
-        else :
-            cmdline=abs_bin_dir+os.path.sep+cmdline
+        cmdline = os.path.join(abs_bin_dir, cmdline)
 
         outfile=cmdline.split()[4]
         #pipe = open(outfile,'w')
diff --git a/lapack-netlib/make.inc.example b/lapack-netlib/make.inc.example
index d780c3a23..57fd51ebe 100644
--- a/lapack-netlib/make.inc.example
+++ b/lapack-netlib/make.inc.example
@@ -8,10 +8,10 @@ SHELL = /bin/sh
 
 #  CC is the C compiler, normally invoked with options CFLAGS.
 #
-CC     = gcc
+CC = gcc
 CFLAGS = -O3
 
-#  Modify the FORTRAN and OPTS definitions to refer to the compiler
+#  Modify the FC and FFLAGS definitions to the desired compiler
 #  and desired compiler options for your machine.  NOOPT refers to
 #  the compiler options desired when NO OPTIMIZATION is selected.
 #
@@ -19,23 +19,21 @@ CFLAGS = -O3
 #  and handle these quantities appropriately. As a consequence, one
 #  should not compile LAPACK with flags such as -ffpe-trap=overflow.
 #
-FORTRAN = gfortran
-OPTS    = -O2 -frecursive
-DRVOPTS = $(OPTS)
-NOOPT   = -O0 -frecursive
+FC = gfortran
+FFLAGS = -O2 -frecursive
+FFLAGS_DRV = $(FFLAGS)
+FFLAGS_NOOPT = -O0 -frecursive
 
-#  Define LOADER and LOADOPTS to refer to the loader and desired
-#  load options for your machine.
+#  Define LDFLAGS to the desired linker options for your machine.
 #
-LOADER   = gfortran
-LOADOPTS =
+LDFLAGS =
 
 #  The archiver and the flag(s) to use when building an archive
 #  (library).  If your system has no ranlib, set RANLIB = echo.
 #
-ARCH      = ar
-ARCHFLAGS = cr
-RANLIB    = ranlib
+AR = ar
+ARFLAGS = cr
+RANLIB = ranlib
 
 #  Timer for the SECOND and DSECND routines
 #
@@ -78,8 +76,8 @@ TIMER = INT_ETIME
 #  machine-specific, optimized BLAS library should be used whenever
 #  possible.)
 #
-BLASLIB      = ../../librefblas.a
-CBLASLIB     = ../../libcblas.a
-LAPACKLIB    = liblapack.a
-TMGLIB       = libtmglib.a
-LAPACKELIB   = liblapacke.a
+BLASLIB      = $(TOPSRCDIR)/librefblas.a
+CBLASLIB     = $(TOPSRCDIR)/libcblas.a
+LAPACKLIB    = $(TOPSRCDIR)/liblapack.a
+TMGLIB       = $(TOPSRCDIR)/libtmglib.a
+LAPACKELIB   = $(TOPSRCDIR)/liblapacke.a
diff --git a/lapack-netlib/meson.build b/lapack-netlib/meson.build
new file mode 100644
index 000000000..b1e9c6bc1
--- /dev/null
+++ b/lapack-netlib/meson.build
@@ -0,0 +1,28 @@
+# cd build
+# meson --buildtype release --prefix=$HOME/.local/lapack ..
+# ninja
+# ninja install
+
+project('LAPACK', 'fortran',
+  default_options : ['default_library=static', 'libdir=lib/'],
+  version : '3.8.0')
+
+subdir('BLAS/SRC')
+subdir('SRC')
+
+prec = get_option('realkind')
+
+
+if prec == 'd'
+  bsrc = DBLAS1 + DBLAS2 + DBLAS3
+  lsrc = DZLAUX + DSLASRC
+elif prec == 's'
+  bsrc = SBLAS1 + SBLAS2 + SBLAS3
+  lsrc = SCLAUX + SLASRC
+endif
+
+blas = library('blas', bsrc,
+  install : true)
+
+lapack = library('lapack', lsrc, ALLAUX,
+  install : true)
diff --git a/lapack-netlib/meson_options.txt b/lapack-netlib/meson_options.txt
new file mode 100644
index 000000000..b378e3329
--- /dev/null
+++ b/lapack-netlib/meson_options.txt
@@ -0,0 +1,3 @@
+option('realkind', type : 'string', value : 'd', 
+  description : 's: real32  d: real64  c: complex32  z: complex64')
+

From 52de4cc8fdf976e0e09f81904e4f427b4dc64015 Mon Sep 17 00:00:00 2001
From: chenxuqiang <chenxuqiang3@hisilicon.com>
Date: Wed, 1 Jan 2020 21:50:45 -0500
Subject: [PATCH 166/210] kernel/arm64/dgemm_beta.S: add beta == zero branch

added beta == zero branch, and no need to load C matrix.

Signed by: Xuqiang Chen <chenxuqiang3@hisilicon.com>
---
 kernel/arm64/dgemm_beta.S | 69 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

diff --git a/kernel/arm64/dgemm_beta.S b/kernel/arm64/dgemm_beta.S
index 1ce452212..20011c343 100644
--- a/kernel/arm64/dgemm_beta.S
+++ b/kernel/arm64/dgemm_beta.S
@@ -80,6 +80,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	add	sp, sp, #(11*16)
 .endm
 
+.macro INIT_ZERO
+	fmul v0.2d, v0.2d, betaV0
+	fmul v1.2d, v1.2d, betaV0
+	fmul v2.2d, v2.2d, betaV0
+	fmul v3.2d, v3.2d, betaV0
+	fmul v4.2d, v4.2d, betaV0
+	fmul v5.2d, v5.2d, betaV0
+	fmul v6.2d, v6.2d, betaV0
+	fmul v7.2d, v7.2d, betaV0
+.endm
+
 /**************************************************************************************
 * End of macro definitions
 **************************************************************************************/
@@ -97,6 +108,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	cmp  N, #0
 	ble	.Lgemm_beta_L999
 
+	fcmp BETA, #0.0
+	beq .Lgemm_beta_zero_01
+
 .Lgemm_beta_01:
 
 	lsl LDC, LDC, #3
@@ -180,4 +194,59 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 	RESTORE_REGS
 	ret
 
+.Lgemm_beta_zero_01:
+	INIT_ZERO
+	lsl LDC, LDC, #3
+
+	.align 5
+.Lgemm_beta_zero_02:
+	mov A01, C00
+	add C00, C00, LDC
+
+	asr I, M, #4
+	cmp I, #0
+	ble .Lgemm_beta_zero_04
+
+	add A02, A01, #64
+
+	.align 5
+.Lgemm_beta_zero_03:
+	
+	st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [A01]
+	add A01, A01, calc_size
+	st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [A02]
+	add A02, A02, calc_size
+	
+	subs I, I, #1
+	bne .Lgemm_beta_zero_03
+
+	.align 5
+.Lgemm_beta_zero_04:
+
+	and I, M, #15
+	cmp I, #0
+	ble .Lgemm_beta_zero_06
+
+	.align 5
+.Lgemm_beta_zero_05:
+
+	str beta0, [A01]
+	add A01, A01, #8
+
+	subs I, I, #1
+	bne .Lgemm_beta_zero_05
+
+	.align 5
+.Lgemm_beta_zero_06:
+
+	subs N, N, #1
+	bne .Lgemm_beta_zero_02
+
+	.align 5
+.Lgemm_beta_zero_L999:
+
+	mov x0, #0
+	RESTORE_REGS
+	ret
+
 	EPILOGUE

From 893e6e57c46c5f2768468def5c8a77723c82df4a Mon Sep 17 00:00:00 2001
From: shengyang <shengyang1@huawei.com>
Date: Fri, 3 Jan 2020 10:03:33 +0800
Subject: [PATCH 167/210] 	modified:   ctest/din3 ctest/sin3

---
 ctest/din3 | 2 +-
 ctest/sin3 | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/ctest/din3 b/ctest/din3
index 23fedfe32..9919774ac 100644
--- a/ctest/din3
+++ b/ctest/din3
@@ -5,7 +5,7 @@ T        LOGICAL FLAG, T TO STOP ON FAILURES.
 T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 16.0     THRESHOLD VALUE OF TEST RATIO
-6                 NUMBER OF VALUES OF N
+7                 NUMBER OF VALUES OF N
 1 2 3 5 7 9 35    VALUES OF N
 3                 NUMBER OF VALUES OF ALPHA
 0.0 1.0 0.7       VALUES OF ALPHA
diff --git a/ctest/sin3 b/ctest/sin3
index 644083f22..b74206b70 100644
--- a/ctest/sin3
+++ b/ctest/sin3
@@ -5,7 +5,7 @@ T        LOGICAL FLAG, T TO STOP ON FAILURES.
 T        LOGICAL FLAG, T TO TEST ERROR EXITS.
 2        0 TO TEST COLUMN-MAJOR, 1 TO TEST ROW-MAJOR, 2 TO TEST BOTH
 16.0     THRESHOLD VALUE OF TEST RATIO
-6                 NUMBER OF VALUES OF N
+7                 NUMBER OF VALUES OF N
 0 1 2 3 5 9 35    VALUES OF N
 3                 NUMBER OF VALUES OF ALPHA
 0.0 1.0 0.7       VALUES OF ALPHA

From 2ea2bd99c7ba038d5366e8309359d32f0d5f6cd7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 3 Jan 2020 11:10:00 +0100
Subject: [PATCH 168/210] Apply LAPACKE fix for eigenvector transposition in
 symmetric eigensolvers

from Reference-LAPACK PR 330
---
 lapack-netlib/LAPACKE/src/lapacke_cheev_work.c         | 6 +++++-
 lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c | 6 +++++-
 lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c        | 7 +++++--
 lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c         | 6 +++++-
 lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c | 6 +++++-
 lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c        | 6 +++++-
 lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c         | 6 +++++-
 lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c | 6 +++++-
 lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c        | 6 +++++-
 lapack-netlib/LAPACKE/src/lapacke_zheev_work.c         | 6 +++++-
 lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c | 6 +++++-
 lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c        | 6 +++++-
 12 files changed, 60 insertions(+), 13 deletions(-)

diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c
index f505dfab0..aa78e678e 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c
@@ -78,7 +78,11 @@ lapack_int LAPACKE_cheev_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        if ( jobz == 'V') {
+            LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        } else {
+            LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        }
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c
index e9e6a5d1d..d26c84785 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c
@@ -79,7 +79,11 @@ lapack_int LAPACKE_cheevd_2stage_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        if ( jobz == 'V') {
+            LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        } else {
+            LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); 
+        }
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c
index 4c5f352a8..e8f212efb 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c
@@ -79,8 +79,11 @@ lapack_int LAPACKE_cheevd_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
-
+        if ( jobz == 'V') {
+            LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        } else { 
+            LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        }
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c
index 5a416ff45..f696c608f 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c
@@ -72,7 +72,11 @@ lapack_int LAPACKE_dsyev_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        if ( jobz == 'V') {
+            LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        } else {
+            LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        }
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c
index 90d8ce8dc..6f9c02f6a 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c
@@ -76,7 +76,11 @@ lapack_int LAPACKE_dsyevd_2stage_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        if ( jobz == 'V') {
+            LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        } else {
+            LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        }
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c
index fff476445..81ba2acb3 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c
@@ -76,7 +76,11 @@ lapack_int LAPACKE_dsyevd_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        if ( jobz == 'V') {
+            LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        } else {
+            LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        }
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c
index 6a2f8fce3..abd62ddf3 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c
@@ -72,7 +72,11 @@ lapack_int LAPACKE_ssyev_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        if ( jobz == 'V') {
+            LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        } else {
+            LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        }
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c
index 9394f822f..d9fe47599 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c
@@ -76,7 +76,11 @@ lapack_int LAPACKE_ssyevd_2stage_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        if ( jobz == 'V') {
+            LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        } else {
+            LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); 
+        }
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c
index 12d9e84e6..bfbf49aee 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c
@@ -76,7 +76,11 @@ lapack_int LAPACKE_ssyevd_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        if ( jobz == 'V') {
+            LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        } else {
+            LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        }
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c
index ce278b272..d4e93aed2 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c
@@ -78,7 +78,11 @@ lapack_int LAPACKE_zheev_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        if ( jobz == 'V') {
+            LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        } else {
+            LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        }
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c
index bf2e2c828..fb33c3e2a 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c
@@ -79,7 +79,11 @@ lapack_int LAPACKE_zheevd_2stage_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        if ( jobz == 'V') {
+            LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        } else {
+            LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        }
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:
diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c
index f09cfe49d..5af2a1269 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c
@@ -79,7 +79,11 @@ lapack_int LAPACKE_zheevd_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        if ( jobz == 'V') {
+            LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
+        } else { 
+            LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
+        }
         /* Release memory and exit */
         LAPACKE_free( a_t );
 exit_level_0:

From eb3c9f1db94543367cb32a6d656635dd3b99d38c Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 6 Jan 2020 12:07:02 +0800
Subject: [PATCH 169/210] optimize AVX2 SGEMM

---
 kernel/x86_64/sgemm_kernel_8x4_haswell.c | 490 +++++++++++++++++++++++
 1 file changed, 490 insertions(+)
 create mode 100644 kernel/x86_64/sgemm_kernel_8x4_haswell.c

diff --git a/kernel/x86_64/sgemm_kernel_8x4_haswell.c b/kernel/x86_64/sgemm_kernel_8x4_haswell.c
new file mode 100644
index 000000000..87d9aa394
--- /dev/null
+++ b/kernel/x86_64/sgemm_kernel_8x4_haswell.c
@@ -0,0 +1,490 @@
+/* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store, %6 = &alpha, %7 = b_pref */
+/* r11 = m_counter, r12 = k << 2(const), r13 = k_skip << 2, r14 = b_head_pos(const), r15 for assisting prefetch */
+
+//recommended settings: GEMM_P = 320, GEMM_Q = 320.
+
+#ifdef TRMMKERNEL
+  #define mult_alpha(acc,alpha,...) "vmulps "#acc","#alpha","#acc";"
+#else
+  #define mult_alpha(acc,alpha,...) "vfmadd213ps ("#__VA_ARGS__"),"#alpha","#acc";"
+#endif
+
+#if defined TRMMKERNEL && !defined LEFT
+  #ifdef TRANSA
+    #define HEAD_SET_OFFSET(ndim) {}
+    #define TAIL_SET_OFFSET(ndim) {off+=ndim;}
+  #else
+    #define HEAD_SET_OFFSET(ndim) {off+=(ndim>4?4:ndim);}
+    #define TAIL_SET_OFFSET(ndim) {off+=(ndim>4?(ndim-4):0);}
+  #endif
+#else
+  #define HEAD_SET_OFFSET(ndim) {}
+  #define TAIL_SET_OFFSET(ndim) {}
+#endif
+
+#if defined TRMMKERNEL && defined LEFT
+  #ifdef TRANSA
+    #define init_update_kskip(val) "subq $"#val",%%r13;"
+    #define save_update_kskip(val) ""
+  #else
+    #define init_update_kskip(val) ""
+    #define save_update_kskip(val) "addq $"#val",%%r13;"
+  #endif
+#else
+  #define init_update_kskip(val) ""
+  #define save_update_kskip(val) ""
+#endif
+
+#ifdef TRMMKERNEL
+  #define init_set_k "movq %%r12,%4; subq %%r13,%4;"
+  #if LEFT != TRANSA
+    #define INIT_SET_KSKIP "movq %9,%%r13; salq $2,%%r13;"
+    #define init_set_pointers(a_copy,b_copy) "leaq (%0,%%r13,"#a_copy"),%0; leaq (%1,%%r13,"#b_copy"),%1;"
+    #define save_set_pointers(a_copy,b_copy) ""
+  #else
+    #define INIT_SET_KSKIP "movq %4,%%r13; subq %9,%%r13; salq $2,%%r13;"
+    #define init_set_pointers(a_copy,b_copy) ""
+    #define save_set_pointers(a_copy,b_copy) "leaq (%0,%%r13,"#a_copy"),%0; leaq (%1,%%r13,"#b_copy"),%1;"
+  #endif
+#else
+  #define INIT_SET_KSKIP "xorq %%r13,%%r13;"
+  #define init_set_k "movq %%r12,%4;"
+  #define init_set_pointers(a_copy,b_copy) ""
+  #define save_set_pointers(a_copy,b_copy) ""
+#endif
+#define init_set_pa_pb_n12(mdim) init_set_pointers(mdim,4)
+#define init_set_pa_pb_n8(mdim) init_set_pointers(mdim,4)
+#define init_set_pa_pb_n4(mdim) init_set_pointers(mdim,4)
+#define init_set_pa_pb_n2(mdim) init_set_pointers(mdim,2)
+#define init_set_pa_pb_n1(mdim) init_set_pointers(mdim,1)
+#define save_set_pa_pb_n12(mdim) save_set_pointers(mdim,4)
+#define save_set_pa_pb_n8(mdim) save_set_pointers(mdim,4)
+#define save_set_pa_pb_n4(mdim) save_set_pointers(mdim,4)
+#define save_set_pa_pb_n2(mdim) save_set_pointers(mdim,2)
+#define save_set_pa_pb_n1(mdim) save_set_pointers(mdim,1)
+
+#if defined TRMMKERNEL && !defined LEFT && defined TRANSA
+  #define kernel_kstart_n8(mdim) \
+    KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 "subq $16,%4;"
+  #define kernel_kstart_n12(mdim) \
+    KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4\
+    KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 "subq $32,%4;"
+#else
+  #define kernel_kstart_n8(mdim) ""
+  #define kernel_kstart_n12(mdim) ""
+#endif
+#define kernel_kstart_n4(mdim) ""
+#define kernel_kstart_n2(mdim) ""
+#define kernel_kstart_n1(mdim) ""
+
+/* m = 8 *//* ymm0 for alpha, ymm1-ymm3 for temporary use, ymm4-ymm15 for accumulators */
+#define KERNEL_k1m8n1 \
+    "vmovups (%0),%%ymm1; addq $32,%0;"\
+    "vbroadcastss (%1),%%ymm2; vfmadd231ps %%ymm1,%%ymm2,%%ymm4;"\
+    "addq $4,%1;"
+#define KERNEL_h_k1m8n2 \
+    "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2; addq $32,%0;"\
+    "vbroadcastsd (%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm4; vfmadd231ps %%ymm2,%%ymm3,%%ymm5;"
+#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $8,%1;"
+#define KERNEL_h_k1m8n4 \
+    KERNEL_h_k1m8n2 "vbroadcastsd 8(%1),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,%%ymm6; vfmadd231ps %%ymm2,%%ymm3,%%ymm7;"
+#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;"
+#define unit_kernel_k1m8n4(c1,c2,c3,c4,boff1,boff2,...) \
+    "vbroadcastsd "#boff1"("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c1"; vfmadd231ps %%ymm2,%%ymm3,"#c2";"\
+    "vbroadcastsd "#boff2"("#__VA_ARGS__"),%%ymm3; vfmadd231ps %%ymm1,%%ymm3,"#c3"; vfmadd231ps %%ymm2,%%ymm3,"#c4";"
+#define KERNEL_h_k1m8n8 KERNEL_h_k1m8n4 unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,8,%1,%%r12,4)
+#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%1;"
+#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n8 unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,8,%1,%%r12,8)
+#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%1;"
+#define KERNEL_k2m8n1 KERNEL_k1m8n1 KERNEL_k1m8n1
+#define KERNEL_k2m8n2 KERNEL_k1m8n2 KERNEL_k1m8n2
+#define KERNEL_k2m8n4 KERNEL_k1m8n4 KERNEL_k1m8n4
+#define KERNEL_k2m8n8 KERNEL_k1m8n8 KERNEL_k1m8n8
+#define KERNEL_k2m8n12 \
+    "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\
+    unit_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,0,8,%1)\
+    unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,0,8,%1,%%r12,4)\
+    unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,0,8,%1,%%r12,8)\
+    "vmovsldup 32(%0),%%ymm1; vmovshdup 32(%0),%%ymm2; prefetcht0 512(%0); addq $64,%0;"\
+    unit_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,16,24,%1)\
+    unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,16,24,%1,%%r12,4)\
+    unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,16,24,%1,%%r12,8) "addq $32,%1;"
+#if defined TRMMKERNEL && !defined LEFT && !defined TRANSA
+  #define unit_kernel_endn4_k1m8n8(offa1,offb1,offb2) \
+    "vmovsldup "#offa1"(%0),%%ymm1; vmovshdup "#offa1"(%0),%%ymm2;"\
+    unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,offb1,offb2,%1,%%r12,4)
+  #define unit_kernel_endn4_k1m8n12(offa1,offb1,offb2) \
+    "vmovsldup "#offa1"(%0),%%ymm1; vmovshdup "#offa1"(%0),%%ymm2;"\
+    unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,offb1,offb2,%1,%%r12,8)
+  #define unit_kernel_endn8_k1m8n12(offa1,offb1,offb2) unit_kernel_endn4_k1m8n8(offa1,offb1,offb2)\
+    unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,offb1,offb2,%1,%%r12,8)
+  #define kernel_kend_m8n8 \
+    unit_kernel_endn4_k1m8n8(0,0,8) unit_kernel_endn4_k1m8n8(32,16,24)\
+    unit_kernel_endn4_k1m8n8(64,32,40) unit_kernel_endn4_k1m8n8(96,48,56)
+  #define kernel_kend_m8n12 \
+    unit_kernel_endn8_k1m8n12(0,0,8) unit_kernel_endn8_k1m8n12(32,16,24)\
+    unit_kernel_endn8_k1m8n12(64,32,40) unit_kernel_endn8_k1m8n12(96,48,56)\
+    unit_kernel_endn4_k1m8n12(128,64,72) unit_kernel_endn4_k1m8n12(160,80,88)\
+    unit_kernel_endn4_k1m8n12(192,96,104) unit_kernel_endn4_k1m8n12(224,112,120)
+#else
+  #define kernel_kend_m8n8 ""
+  #define kernel_kend_m8n12 ""
+#endif
+#define kernel_kend_m8n4 ""
+#define kernel_kend_m8n2 ""
+#define kernel_kend_m8n1 ""
+#define INIT_m8n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
+#define INIT_m8n2 INIT_m8n1 "vpxor %%ymm5,%%ymm5,%%ymm5;"
+#define INIT_m8n4 INIT_m8n2 "vpxor %%ymm6,%%ymm6,%%ymm6;vpxor %%ymm7,%%ymm7,%%ymm7;"
+#define unit_init_m8n4(c1,c2,c3,c4) \
+    "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
+#define INIT_m8n8  INIT_m8n4 unit_init_m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11)
+#define INIT_m8n12 INIT_m8n8 unit_init_m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15)
+#define SAVE_m8n1 mult_alpha(%%ymm4,%%ymm0,%2) "vmovups %%ymm4,(%2);"
+#define unit_save_m8n2(c1,c2) \
+    "vunpcklps "#c2","#c1",%%ymm2; vunpckhps "#c2","#c1",%%ymm3; vunpcklpd %%ymm3,%%ymm2,"#c1"; vunpckhpd %%ymm3,%%ymm2,"#c2";"\
+    mult_alpha(c1,%%ymm0,%5) "vmovups "#c1",(%5);"\
+    mult_alpha(c2,%%ymm0,%5,%3,1) "vmovups "#c2",(%5,%3,1);"\
+    "leaq (%5,%3,2),%5;"
+#define SAVE_m8n2 "movq %2,%5;" unit_save_m8n2(%%ymm4,%%ymm5)
+#define SAVE_m8n4  SAVE_m8n2  unit_save_m8n2(%%ymm6,%%ymm7)
+#define SAVE_m8n8  SAVE_m8n4  unit_save_m8n2(%%ymm8,%%ymm9)   unit_save_m8n2(%%ymm10,%%ymm11)
+#define SAVE_m8n12 SAVE_m8n8  unit_save_m8n2(%%ymm12,%%ymm13) unit_save_m8n2(%%ymm14,%%ymm15)
+#define COMPUTE_m8(ndim) \
+    init_update_kskip(32) INIT_m8n##ndim\
+    init_set_k "movq %%r14,%1;" init_set_pa_pb_n##ndim(8) "movq %2,%5; movq $0,%%r15;"\
+    kernel_kstart_n##ndim(8)\
+    "cmpq $64,%4; jb "#ndim"882f;"\
+    #ndim"881:\n\t"\
+    "cmpq $62,%%r15; movq $62,%%r15; cmoveq %3,%%r15;"\
+    KERNEL_k2m8n##ndim KERNEL_k2m8n##ndim\
+    "prefetcht1 (%5); subq $31,%5;"\
+    KERNEL_k2m8n##ndim KERNEL_k2m8n##ndim\
+    "addq %%r15,%5; prefetcht1 (%7); addq $16,%7;"\
+    "subq $32,%4; cmpq $64,%4; jnb "#ndim"881b;"\
+    "movq %2,%5;"\
+    #ndim"882:\n\t"\
+    "testq %4,%4; jz "#ndim"883f;"\
+    "prefetcht0 (%5); prefetcht0 31(%5);"\
+    KERNEL_k1m8n##ndim\
+    "prefetcht0 (%5,%3,4); prefetcht0 31(%5,%3,4); addq %3,%5;"\
+    "subq $4,%4; jmp "#ndim"882b;"\
+    #ndim"883:\n\t"\
+    kernel_kend_m8n##ndim "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\
+    save_set_pa_pb_n##ndim(8) SAVE_m8n##ndim "addq $32,%2;" save_update_kskip(32)
+
+/* m = 4 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */
+#define KERNEL_k1m4n1 \
+    "vmovups (%0),%%xmm1; addq $16,%0;"\
+    "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
+    "addq $4,%1;"
+#define KERNEL_h_k1m4n2 \
+    "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2; addq $16,%0;"\
+    "vmovddup (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;"
+#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $8,%1;"
+#define KERNEL_h_k1m4n4 \
+    KERNEL_h_k1m4n2 "vmovddup 8(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;"
+#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;"
+#define unit_kernel_k1m4n4(c1,c2,c3,c4,offb1,offb2,...) \
+    "vmovddup "#offb1"("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c1"; vfmadd231ps %%xmm2,%%xmm3,"#c2";"\
+    "vmovddup "#offb2"("#__VA_ARGS__"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,"#c3"; vfmadd231ps %%xmm2,%%xmm3,"#c4";"
+#define KERNEL_h_k1m4n8 KERNEL_h_k1m4n4 unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,0,8,%1,%%r12,4)
+#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;"
+#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,0,8,%1,%%r12,8)
+#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;"
+#if defined TRMMKERNEL && !defined LEFT && !defined TRANSA
+  #define unit_kernel_endn4_k1m4n8(offa1,offb1,offb2) \
+    "vmovsldup "#offa1"(%0),%%xmm1; vmovshdup "#offa1"(%0),%%xmm2;"\
+    unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,offb1,offb2,%1,%%r12,4)
+  #define unit_kernel_endn4_k1m4n12(offa1,offb1,offb2) \
+    "vmovsldup "#offa1"(%0),%%xmm1; vmovshdup "#offa1"(%0),%%xmm2;"\
+    unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,offb1,offb2,%1,%%r12,8)
+  #define unit_kernel_endn8_k1m4n12(offa1,offb1,offb2) unit_kernel_endn4_k1m4n8(offa1,offb1,offb2)\
+    unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,offb1,offb2,%1,%%r12,8)
+  #define kernel_kend_m4n8 \
+    unit_kernel_endn4_k1m4n8(0,0,8) unit_kernel_endn4_k1m4n8(16,16,24)\
+    unit_kernel_endn4_k1m4n8(32,32,40) unit_kernel_endn4_k1m4n8(48,48,56)
+  #define kernel_kend_m4n12 \
+    unit_kernel_endn8_k1m4n12(0,0,8) unit_kernel_endn8_k1m4n12(16,16,24)\
+    unit_kernel_endn8_k1m4n12(32,32,40) unit_kernel_endn8_k1m4n12(48,48,56)\
+    unit_kernel_endn4_k1m4n12(64,64,72) unit_kernel_endn4_k1m4n12(80,80,88)\
+    unit_kernel_endn4_k1m4n12(96,96,104) unit_kernel_endn4_k1m4n12(112,112,120)
+#else
+  #define kernel_kend_m4n8 ""
+  #define kernel_kend_m4n12 ""
+#endif
+#define kernel_kend_m4n4 ""
+#define kernel_kend_m4n2 ""
+#define kernel_kend_m4n1 ""
+#define INIT_m4n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
+#define INIT_m4n2 INIT_m4n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
+#define INIT_m4n4 INIT_m4n2 "vpxor %%xmm6,%%xmm6,%%xmm6;vpxor %%xmm7,%%xmm7,%%xmm7;"
+#define unit_init_m4n4(c1,c2,c3,c4) \
+    "vpxor "#c1","#c1","#c1";vpxor "#c2","#c2","#c2";vpxor "#c3","#c3","#c3";vpxor "#c4","#c4","#c4";"
+#define INIT_m4n8  INIT_m4n4 unit_init_m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11)
+#define INIT_m4n12 INIT_m4n8 unit_init_m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15)
+#define SAVE_m4n1 \
+    mult_alpha(%%xmm4,%%xmm0,%2) "vmovups %%xmm4,(%2);"
+#define unit_save_m4n2(c1,c2) \
+    "vunpcklps "#c2","#c1",%%xmm2; vunpckhps "#c2","#c1",%%xmm3; vunpcklpd %%xmm3,%%xmm2,"#c1"; vunpckhpd %%xmm3,%%xmm2,"#c2";"\
+    mult_alpha(c1,%%xmm0,%5) "vmovups "#c1",(%5);"\
+    mult_alpha(c2,%%xmm0,%5,%3,1) "vmovups "#c2",(%5,%3,1);"\
+    "leaq (%5,%3,2),%5;"
+#define SAVE_m4n2 "movq %2,%5;" unit_save_m4n2(%%xmm4,%%xmm5)
+#define SAVE_m4n4  SAVE_m4n2  unit_save_m4n2(%%xmm6,%%xmm7)
+#define SAVE_m4n8  SAVE_m4n4  unit_save_m4n2(%%xmm8,%%xmm9)   unit_save_m4n2(%%xmm10,%%xmm11)
+#define SAVE_m4n12 SAVE_m4n8  unit_save_m4n2(%%xmm12,%%xmm13) unit_save_m4n2(%%xmm14,%%xmm15)
+#define COMPUTE_m4(ndim) \
+    init_update_kskip(16) INIT_m4n##ndim\
+    init_set_k "movq %%r14,%1;" init_set_pa_pb_n##ndim(4)\
+    kernel_kstart_n##ndim(4)\
+    #ndim"442:\n\t"\
+    "testq %4,%4; jz "#ndim"443f;"\
+    KERNEL_k1m4n##ndim\
+    "subq $4,%4; jmp "#ndim"442b;"\
+    #ndim"443:\n\t"\
+    kernel_kend_m4n##ndim save_set_pa_pb_n##ndim(4) SAVE_m4n##ndim "addq $16,%2;" save_update_kskip(16)
+
+/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */
+#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
+#define KERNEL_k1m2n1 \
+    "vmovsd (%0),%%xmm1; addq $8,%0;"\
+    "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
+    "addq $4,%1;"
+#ifdef TRMMKERNEL
+ #define SAVE_m2n1 "vmulps %%xmm4,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);"
+#else
+ #define SAVE_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);"
+#endif
+#define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
+#define KERNEL_k1m2n2 \
+    "vmovsd (%0),%%xmm1; addq $8,%0;"\
+    "vbroadcastss  (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
+    "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\
+    "addq $8,%1;"
+#ifdef TRMMKERNEL
+  #define SAVE_m2n2 SAVE_m2n1 "vmulps %%xmm5,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);"
+#else
+  #define SAVE_m2n2 SAVE_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);"
+#endif
+#define INIT_m2n4  INIT_m2n2
+#define INIT_m2n8  INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;"
+#define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;"
+#define KERNEL_k1m2n4 \
+    "vmovups (%1),%%xmm3; addq $16,%1;"\
+    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
+    "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\
+    "addq $8,%0;"
+#define KERNEL_k1m2n8 \
+    "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\
+    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\
+    "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\
+    "addq $8,%0;"
+#define KERNEL_k1m2n12 \
+    "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\
+    "vbroadcastss  (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\
+    "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\
+    "addq $8,%0;"
+#if defined TRMMKERNEL && !defined LEFT && !defined TRANSA
+  #define unit_kernel_endn4_k1m2n8(aoff1,aoff2,boff) \
+    "vmovups "#boff"(%1,%%r12,4),%%xmm3;"\
+    "vbroadcastss "#aoff1"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm6;"\
+    "vbroadcastss "#aoff2"(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm7;"
+  #define unit_kernel_endn4_k1m2n12(aoff1,aoff2,boff) \
+    "vmovups "#boff"(%1,%%r12,8),%%xmm3;"\
+    "vbroadcastss "#aoff1"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm8;"\
+    "vbroadcastss "#aoff2"(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm9;"
+  #define unit_kernel_endn8_k1m2n12(aoff1,aoff2,boff) \
+    "vmovups "#boff"(%1,%%r12,4),%%xmm3; vmovups "#boff"(%1,%%r12,8),%%xmm2;"\
+    "vbroadcastss "#aoff1"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm6; vfmadd231ps %%xmm2,%%xmm1,%%xmm8;"\
+    "vbroadcastss "#aoff2"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm7; vfmadd231ps %%xmm2,%%xmm1,%%xmm9;"
+  #define kernel_kend_m2n8 \
+    unit_kernel_endn4_k1m2n8(0,4,0) unit_kernel_endn4_k1m2n8(8,12,16)\
+    unit_kernel_endn4_k1m2n8(16,20,32) unit_kernel_endn4_k1m2n8(24,28,48)
+  #define kernel_kend_m2n12 \
+    unit_kernel_endn8_k1m2n12(0,4,0) unit_kernel_endn8_k1m2n12(8,12,16)\
+    unit_kernel_endn8_k1m2n12(16,20,32) unit_kernel_endn8_k1m2n12(24,28,48)\
+    unit_kernel_endn4_k1m2n12(32,36,64) unit_kernel_endn4_k1m2n12(40,44,80)\
+    unit_kernel_endn4_k1m2n12(48,52,96) unit_kernel_endn4_k1m2n12(56,60,112)
+#else
+  #define kernel_kend_m2n8 ""
+  #define kernel_kend_m2n12 ""
+#endif
+#define kernel_kend_m2n4 ""
+#define kernel_kend_m2n2 ""
+#define kernel_kend_m2n1 ""
+#ifdef TRMMKERNEL
+  #define unit_save_m2n4(c1,c2) \
+    "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\
+    "vmulps %%xmm1,%%xmm0,%%xmm1; vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\
+    "vmulps %%xmm2,%%xmm0,%%xmm2; vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"
+#else
+  #define unit_save_m2n4(c1,c2) \
+    "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\
+    "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\
+    "vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"\
+    "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\
+    "vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"
+#endif
+#define SAVE_m2n4 "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5)
+#define SAVE_m2n8   SAVE_m2n4    unit_save_m2n4(%%xmm6,%%xmm7)
+#define SAVE_m2n12  SAVE_m2n8   unit_save_m2n4(%%xmm8,%%xmm9)
+#define COMPUTE_m2(ndim) \
+    init_update_kskip(8) INIT_m2n##ndim\
+    init_set_k "movq %%r14,%1;" init_set_pa_pb_n##ndim(2)\
+    kernel_kstart_n##ndim(2)\
+    #ndim"222:\n\t"\
+    "testq %4,%4; jz "#ndim"223f;"\
+    KERNEL_k1m2n##ndim\
+    "subq $4,%4; jmp "#ndim"222b;"\
+    #ndim"223:\n\t"\
+    kernel_kend_m2n##ndim save_set_pa_pb_n##ndim(2) SAVE_m2n##ndim "addq $8,%2;" save_update_kskip(8)
+
+/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */
+#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
+#define KERNEL_k1m1n1 \
+    "vmovss (%1),%%xmm3; addq $4,%1;"\
+    "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\
+    "addq $4,%0;"
+#ifdef TRMMKERNEL
+  #define SAVE_m1n1 "vmulss %%xmm4,%%xmm0,%%xmm4; vmovss %%xmm4,(%2);"
+#else
+  #define SAVE_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);"
+#endif
+#define INIT_m1n2 INIT_m1n1
+#define KERNEL_k1m1n2 \
+    "vmovsd (%1),%%xmm3; addq $8,%1;"\
+    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
+    "addq $4,%0;"
+#ifdef TRMMKERNEL
+  #define SAVE_m1n2 \
+    "vmulps %%xmm4,%%xmm0,%%xmm4; vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);"
+#else
+  #define SAVE_m1n2 \
+    "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\
+    "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);"
+#endif
+#define INIT_m1n4  INIT_m1n2
+#define INIT_m1n8  INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;"
+#define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;"
+#define KERNEL_k1m1n4 \
+    "vmovups (%1),%%xmm3; addq $16,%1;"\
+    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
+    "addq $4,%0;"
+#define KERNEL_k1m1n8 \
+    "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; addq $16,%1;"\
+    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\
+    "addq $4,%0;"
+#define KERNEL_k1m1n12 \
+    "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\
+    "vbroadcastss  (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\
+    "addq $4,%0;"
+#if defined TRMMKERNEL && !defined LEFT && !defined TRANSA
+  #define unit_kernel_endn4_k1m1n8(aoff,boff) \
+    "vmovups "#boff"(%1,%%r12,4),%%xmm3;"\
+    "vbroadcastss "#aoff"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5;"
+  #define unit_kernel_endn4_k1m1n12(aoff,boff) \
+    "vmovups "#boff"(%1,%%r12,8),%%xmm3;"\
+    "vbroadcastss "#aoff"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm6;"
+  #define unit_kernel_endn8_k1m1n12(aoff,boff) \
+    "vmovups "#boff"(%1,%%r12,4),%%xmm3; vmovups "#boff"(%1,%%r12,8),%%xmm2;"\
+    "vbroadcastss "#aoff"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"
+  #define kernel_kend_m1n8 \
+    unit_kernel_endn4_k1m1n8(0,0) unit_kernel_endn4_k1m1n8(4,16)\
+    unit_kernel_endn4_k1m1n8(8,32) unit_kernel_endn4_k1m1n8(12,48)
+  #define kernel_kend_m1n12 \
+    unit_kernel_endn8_k1m1n12(0,0) unit_kernel_endn8_k1m1n12(4,16)\
+    unit_kernel_endn8_k1m1n12(8,32) unit_kernel_endn8_k1m1n12(12,48)\
+    unit_kernel_endn4_k1m1n12(16,64) unit_kernel_endn4_k1m1n12(20,80)\
+    unit_kernel_endn4_k1m1n12(24,96) unit_kernel_endn4_k1m1n12(28,112)
+#else
+  #define kernel_kend_m1n8 ""
+  #define kernel_kend_m1n12 ""
+#endif
+#define kernel_kend_m1n4 ""
+#define kernel_kend_m1n2 ""
+#define kernel_kend_m1n1 ""
+#ifdef TRMMKERNEL
+  #define unit_save_m1n4(c1) \
+    "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\
+    "vmulps %%xmm2,%%xmm0,%%xmm2; vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\
+    "vmulps %%xmm1,%%xmm0,%%xmm1; vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"
+#else
+  #define unit_save_m1n4(c1) \
+    "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\
+    "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\
+    "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\
+    "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\
+    "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"
+#endif
+#define SAVE_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4)
+#define SAVE_m1n8  SAVE_m1n4    unit_save_m1n4(%%xmm5)
+#define SAVE_m1n12 SAVE_m1n8    unit_save_m1n4(%%xmm6)
+#define COMPUTE_m1(ndim) \
+    init_update_kskip(4) INIT_m1n##ndim\
+    init_set_k "movq %%r14,%1;" init_set_pa_pb_n##ndim(1)\
+    kernel_kstart_n##ndim(1)\
+    #ndim"112:\n\t"\
+    "testq %4,%4; jz "#ndim"113f;"\
+    KERNEL_k1m1n##ndim\
+    "subq $4,%4; jmp "#ndim"112b;"\
+    #ndim"113:\n\t"\
+    kernel_kend_m1n##ndim save_set_pa_pb_n##ndim(1) SAVE_m1n##ndim "addq $4,%2;" save_update_kskip(4)
+
+#define COMPUTE(ndim) {\
+    HEAD_SET_OFFSET(ndim) next_b = b_pointer + ndim * K;\
+    __asm__ __volatile__(\
+    "vbroadcastss (%6),%%ymm0;"\
+    "movq %4,%%r12; salq $2,%%r12; movq %1,%%r14; movq %8,%%r11;" INIT_SET_KSKIP\
+    "cmpq $8,%%r11;jb 33101"#ndim"f;"\
+    "33109"#ndim":\n\t"\
+    COMPUTE_m8(ndim)\
+    "subq $8,%%r11;cmpq $8,%%r11;jnb 33109"#ndim"b;"\
+    "33101"#ndim":\n\t"\
+    "cmpq $4,%%r11;jb 33103"#ndim"f;"\
+    COMPUTE_m4(ndim)\
+    "subq $4,%%r11;"\
+    "33103"#ndim":\n\t"\
+    "cmpq $2,%%r11;jb 33104"#ndim"f;"\
+    COMPUTE_m2(ndim)\
+    "subq $2,%%r11;"\
+    "33104"#ndim":\n\t"\
+    "testq %%r11,%%r11;jz 33105"#ndim"f;"\
+    COMPUTE_m1(ndim)\
+    "33105"#ndim":\n\t"\
+    "movq %%r12,%4; sarq $2,%4; movq %%r14,%1; vzeroupper;"\
+    :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(const_val),"+r"(next_b)\
+    :"m"(M),"m"(off):"r11","r12","r13","r14","r15",\
+    "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15","cc","memory");\
+    TAIL_SET_OFFSET(ndim) a_pointer -= M * K; b_pointer += ndim * K; c_pointer += (LDC * ndim - M);\
+}
+
+#include "common.h"
+#include <stdint.h>
+int __attribute__ ((noinline))
+CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC
+#ifdef TRMMKERNEL
+,BLASLONG offset
+#endif
+){
+    if(m==0||n==0||k==0||alpha==0.0) return 0;
+    int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float);
+    float constval = alpha;
+    float *const_val=&constval;
+    int64_t M = (int64_t)m, K = (int64_t)k, off = 0;
+#ifdef TRMMKERNEL
+  #ifdef LEFT
+    off = offset;
+  #else
+    off = -offset;
+  #endif
+#endif
+    BLASLONG n_count = n;
+    float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B;
+    for(;n_count>11;n_count-=12) COMPUTE(12)
+    for(;n_count>7;n_count-=8) COMPUTE(8)
+    for(;n_count>3;n_count-=4) COMPUTE(4)
+    for(;n_count>1;n_count-=2) COMPUTE(2)
+    if(n_count>0) COMPUTE(1)
+    return 0;
+}

From b73bf01378d179b30fc515714d504ffeeb5360cd Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 6 Jan 2020 12:09:14 +0800
Subject: [PATCH 170/210] optimize AVX2 SGEMM

---
 kernel/x86_64/KERNEL.HASWELL | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL
index 9e30c12f2..d24b7f3b3 100644
--- a/kernel/x86_64/KERNEL.HASWELL
+++ b/kernel/x86_64/KERNEL.HASWELL
@@ -31,11 +31,11 @@ DAXPYKERNEL = daxpy.c
 CAXPYKERNEL = caxpy.c
 ZAXPYKERNEL = zaxpy.c
 
-STRMMKERNEL    =  sgemm_kernel_16x4_haswell.S
-SGEMMKERNEL    =  sgemm_kernel_16x4_haswell.S
+STRMMKERNEL    =  sgemm_kernel_8x4_haswell.c
+SGEMMKERNEL    =  sgemm_kernel_8x4_haswell.c
 SGEMM_BETA     =  sgemm_beta_skylakex.c
-SGEMMINCOPY    =  ../generic/gemm_ncopy_16.c
-SGEMMITCOPY    =  ../generic/gemm_tcopy_16.c
+SGEMMINCOPY    =  ../generic/gemm_ncopy_8.c
+SGEMMITCOPY    =  ../generic/gemm_tcopy_8.c
 SGEMMONCOPY    =  sgemm_ncopy_4_skylakex.c
 SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
 SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)

From 92b10212de6972c808ebeccfe9fac0a82012e94e Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 6 Jan 2020 12:11:21 +0800
Subject: [PATCH 171/210] optimize AVX2 SGEMM

---
 kernel/x86_64/KERNEL.ZEN | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/kernel/x86_64/KERNEL.ZEN b/kernel/x86_64/KERNEL.ZEN
index 98cd38dfa..7cec2e5ed 100644
--- a/kernel/x86_64/KERNEL.ZEN
+++ b/kernel/x86_64/KERNEL.ZEN
@@ -30,10 +30,10 @@ DAXPYKERNEL = daxpy.c
 CAXPYKERNEL = caxpy.c
 ZAXPYKERNEL = zaxpy.c
 
-STRMMKERNEL    =  sgemm_kernel_16x4_haswell.S
-SGEMMKERNEL    =  sgemm_kernel_16x4_haswell.S
-SGEMMINCOPY    =  ../generic/gemm_ncopy_16.c
-SGEMMITCOPY    =  ../generic/gemm_tcopy_16.c
+STRMMKERNEL    =  sgemm_kernel_8x4_haswell.c
+SGEMMKERNEL    =  sgemm_kernel_8x4_haswell.c
+SGEMMINCOPY    =  ../generic/gemm_ncopy_8.c
+SGEMMITCOPY    =  ../generic/gemm_tcopy_8.c
 SGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
 SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
 SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)

From b7b408a12018c2ffde0c595bb588d464389b74e3 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 6 Jan 2020 12:16:09 +0800
Subject: [PATCH 172/210] optimize AVX2 SGEMM

---
 param.h | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/param.h b/param.h
index d03e60fcb..70c5945ae 100644
--- a/param.h
+++ b/param.h
@@ -625,7 +625,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #else
 
-#define SGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_M 8
 #define DGEMM_DEFAULT_UNROLL_M 4
 #define QGEMM_DEFAULT_UNROLL_M 2
 #define CGEMM_DEFAULT_UNROLL_M 8
@@ -666,7 +666,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #else
 
-#define SGEMM_DEFAULT_P 768
+#define SGEMM_DEFAULT_P 320
 #define DGEMM_DEFAULT_P 512
 #define CGEMM_DEFAULT_P 256
 #define ZGEMM_DEFAULT_P 192
@@ -675,7 +675,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_Q 320
 #define DGEMM_DEFAULT_Q 128
 #else
-#define SGEMM_DEFAULT_Q 384
+#define SGEMM_DEFAULT_Q 320
 #define DGEMM_DEFAULT_Q 256
 #endif
 #define CGEMM_DEFAULT_Q 256
@@ -1528,7 +1528,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #else
 
-#define SGEMM_DEFAULT_UNROLL_M 16
+#define SGEMM_DEFAULT_UNROLL_M 8
 #define DGEMM_DEFAULT_UNROLL_M 4
 #define QGEMM_DEFAULT_UNROLL_M 2
 #define CGEMM_DEFAULT_UNROLL_M 8
@@ -1569,7 +1569,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #else
 
-#define SGEMM_DEFAULT_P 768
+#define SGEMM_DEFAULT_P 320
 #define DGEMM_DEFAULT_P 512
 #define CGEMM_DEFAULT_P 256
 #define ZGEMM_DEFAULT_P 192
@@ -1578,7 +1578,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SGEMM_DEFAULT_Q 320
 #define DGEMM_DEFAULT_Q 128
 #else
-#define SGEMM_DEFAULT_Q 384
+#define SGEMM_DEFAULT_Q 320
 #define DGEMM_DEFAULT_Q 256
 #endif
 #define CGEMM_DEFAULT_Q 256

From 9f5cdc49d4b757618265bef98ad6bd354ad03012 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 6 Jan 2020 12:28:43 +0800
Subject: [PATCH 173/210] Update CONTRIBUTORS.md

---
 CONTRIBUTORS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 9829c31f9..df497c1d2 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -179,3 +179,4 @@ In chronological order:
   * [2019-11-12] AVX512 CGEMM & ZGEMM kernels
   * [2019-12-23] optimize AVX2 CGEMM and ZGEMM
   * [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels
+  * [2020-01-07] optimize AVX2 SGEMM and STRMM

From 9dc9b7b95ec2a97f00f0e920906cc1f672938a11 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 6 Jan 2020 20:11:36 +0800
Subject: [PATCH 174/210] Update sgemm_kernel_8x4_haswell.c

---
 kernel/x86_64/sgemm_kernel_8x4_haswell.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/sgemm_kernel_8x4_haswell.c b/kernel/x86_64/sgemm_kernel_8x4_haswell.c
index 87d9aa394..9b3ba7632 100644
--- a/kernel/x86_64/sgemm_kernel_8x4_haswell.c
+++ b/kernel/x86_64/sgemm_kernel_8x4_haswell.c
@@ -37,7 +37,7 @@
 
 #ifdef TRMMKERNEL
   #define init_set_k "movq %%r12,%4; subq %%r13,%4;"
-  #if LEFT != TRANSA
+  #if (defined LEFT && !defined TRANSA) || (!defined LEFT && defined TRANSA)
     #define INIT_SET_KSKIP "movq %9,%%r13; salq $2,%%r13;"
     #define init_set_pointers(a_copy,b_copy) "leaq (%0,%%r13,"#a_copy"),%0; leaq (%1,%%r13,"#b_copy"),%1;"
     #define save_set_pointers(a_copy,b_copy) ""

From bd4c032f52fb6bd1b8f6352baf23836c00842f05 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Tue, 7 Jan 2020 11:22:46 +0800
Subject: [PATCH 175/210] Update sgemm_kernel_8x4_haswell.c

---
 kernel/x86_64/sgemm_kernel_8x4_haswell.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/kernel/x86_64/sgemm_kernel_8x4_haswell.c b/kernel/x86_64/sgemm_kernel_8x4_haswell.c
index 9b3ba7632..2b8aa9862 100644
--- a/kernel/x86_64/sgemm_kernel_8x4_haswell.c
+++ b/kernel/x86_64/sgemm_kernel_8x4_haswell.c
@@ -9,7 +9,7 @@
   #define mult_alpha(acc,alpha,...) "vfmadd213ps ("#__VA_ARGS__"),"#alpha","#acc";"
 #endif
 
-#if defined TRMMKERNEL && !defined LEFT
+#if defined(TRMMKERNEL) && !defined(LEFT)
   #ifdef TRANSA
     #define HEAD_SET_OFFSET(ndim) {}
     #define TAIL_SET_OFFSET(ndim) {off+=ndim;}
@@ -22,7 +22,7 @@
   #define TAIL_SET_OFFSET(ndim) {}
 #endif
 
-#if defined TRMMKERNEL && defined LEFT
+#if defined(TRMMKERNEL) && defined(LEFT)
   #ifdef TRANSA
     #define init_update_kskip(val) "subq $"#val",%%r13;"
     #define save_update_kskip(val) ""
@@ -37,7 +37,7 @@
 
 #ifdef TRMMKERNEL
   #define init_set_k "movq %%r12,%4; subq %%r13,%4;"
-  #if (defined LEFT && !defined TRANSA) || (!defined LEFT && defined TRANSA)
+  #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
     #define INIT_SET_KSKIP "movq %9,%%r13; salq $2,%%r13;"
     #define init_set_pointers(a_copy,b_copy) "leaq (%0,%%r13,"#a_copy"),%0; leaq (%1,%%r13,"#b_copy"),%1;"
     #define save_set_pointers(a_copy,b_copy) ""
@@ -63,7 +63,7 @@
 #define save_set_pa_pb_n2(mdim) save_set_pointers(mdim,2)
 #define save_set_pa_pb_n1(mdim) save_set_pointers(mdim,1)
 
-#if defined TRMMKERNEL && !defined LEFT && defined TRANSA
+#if defined(TRMMKERNEL) && !defined(LEFT) && defined(TRANSA)
   #define kernel_kstart_n8(mdim) \
     KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 "subq $16,%4;"
   #define kernel_kstart_n12(mdim) \
@@ -109,7 +109,7 @@
     unit_kernel_k1m8n4(%%ymm4,%%ymm5,%%ymm6,%%ymm7,16,24,%1)\
     unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,16,24,%1,%%r12,4)\
     unit_kernel_k1m8n4(%%ymm12,%%ymm13,%%ymm14,%%ymm15,16,24,%1,%%r12,8) "addq $32,%1;"
-#if defined TRMMKERNEL && !defined LEFT && !defined TRANSA
+#if defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)
   #define unit_kernel_endn4_k1m8n8(offa1,offb1,offb2) \
     "vmovsldup "#offa1"(%0),%%ymm1; vmovshdup "#offa1"(%0),%%ymm2;"\
     unit_kernel_k1m8n4(%%ymm8,%%ymm9,%%ymm10,%%ymm11,offb1,offb2,%1,%%r12,4)
@@ -192,7 +192,7 @@
 #define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%1;"
 #define KERNEL_h_k1m4n12 KERNEL_h_k1m4n8 unit_kernel_k1m4n4(%%xmm12,%%xmm13,%%xmm14,%%xmm15,0,8,%1,%%r12,8)
 #define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%1;"
-#if defined TRMMKERNEL && !defined LEFT && !defined TRANSA
+#if defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)
   #define unit_kernel_endn4_k1m4n8(offa1,offb1,offb2) \
     "vmovsldup "#offa1"(%0),%%xmm1; vmovshdup "#offa1"(%0),%%xmm2;"\
     unit_kernel_k1m4n4(%%xmm8,%%xmm9,%%xmm10,%%xmm11,offb1,offb2,%1,%%r12,4)
@@ -285,7 +285,7 @@
     "vbroadcastss  (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\
     "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\
     "addq $8,%0;"
-#if defined TRMMKERNEL && !defined LEFT && !defined TRANSA
+#if defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)
   #define unit_kernel_endn4_k1m2n8(aoff1,aoff2,boff) \
     "vmovups "#boff"(%1,%%r12,4),%%xmm3;"\
     "vbroadcastss "#aoff1"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm6;"\
@@ -379,7 +379,7 @@
     "vmovups (%1),%%xmm3; vmovups (%1,%%r12,4),%%xmm2; vmovups (%1,%%r12,8),%%xmm1; addq $16,%1;"\
     "vbroadcastss  (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\
     "addq $4,%0;"
-#if defined TRMMKERNEL && !defined LEFT && !defined TRANSA
+#if defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)
   #define unit_kernel_endn4_k1m1n8(aoff,boff) \
     "vmovups "#boff"(%1,%%r12,4),%%xmm3;"\
     "vbroadcastss "#aoff"(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5;"

From 3a100b2797b62c1fc1341a668accda12137807da Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Thu, 9 Jan 2020 13:48:41 +0800
Subject: [PATCH 176/210] Update KERNEL.SKYLAKEX

---
 kernel/x86_64/KERNEL.SKYLAKEX | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index d5d32d1b3..0e6275748 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -1,7 +1,7 @@
 include $(KERNELDIR)/KERNEL.HASWELL
 
 SGEMMKERNEL    =  sgemm_kernel_16x4_skylakex_2.c
-
+STRMMKERNEL    =  sgemm_kernel_16x4_haswell.S
 SGEMMINCOPY    =  ../generic/gemm_ncopy_16.c
 SGEMMITCOPY    =  sgemm_tcopy_16_skylakex.c
 SGEMMONCOPY    =  sgemm_ncopy_4_skylakex.c

From daa4310db5c8abf82233810c240eacd640e78206 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 12 Jan 2020 22:00:50 +0100
Subject: [PATCH 177/210] Install new lapack.h

new file in LAPACK 3.9.0, split off from lapacke.h
---
 Makefile.install | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Makefile.install b/Makefile.install
index 8070b4729..e01d866c9 100644
--- a/Makefile.install
+++ b/Makefile.install
@@ -51,6 +51,7 @@ endif
 ifneq ($(OSNAME), AIX)
 ifndef NO_LAPACKE
 	@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
+	@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
 	@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
 	@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
 	@-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"
@@ -100,6 +101,7 @@ else
 #install on AIX has different options syntax
 ifndef NO_LAPACKE
 	@echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
+	@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h"
 	@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h"
 	@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h"
 	@-installbsd -c -m 644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h"

From 1c675670081422b8a3d7f0998dfd7d1454c0d2bd Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 13 Jan 2020 16:26:03 +0800
Subject: [PATCH 178/210] improve skylakex paralleled sgemm performance

---
 param.h | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/param.h b/param.h
index 70c5945ae..3baae31cf 100644
--- a/param.h
+++ b/param.h
@@ -1690,18 +1690,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #else
 
-#define SGEMM_DEFAULT_P 768
+#define SGEMM_DEFAULT_P 640
 #define DGEMM_DEFAULT_P 384
 #define CGEMM_DEFAULT_P 384
 #define ZGEMM_DEFAULT_P 256
 
-#ifdef WINDOWS_ABI
-#define SGEMM_DEFAULT_Q 192
-#define DGEMM_DEFAULT_Q 168
-#else
-#define SGEMM_DEFAULT_Q 192
+#define SGEMM_DEFAULT_Q 320
 #define DGEMM_DEFAULT_Q 168
-#endif
 #define CGEMM_DEFAULT_Q 192
 #define ZGEMM_DEFAULT_Q 128
 

From feaafbedd347871b3f25a018e6655fa9af6d141c Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 13 Jan 2020 16:28:41 +0800
Subject: [PATCH 179/210] make skylakex sgemm code more friendly for readers

BTW some kernels were adjusted to improve performance
---
 kernel/x86_64/sgemm_direct_skylakex.c        | 467 ++++++++++++
 kernel/x86_64/sgemm_kernel_16x4_skylakex.c   | 465 +-----------
 kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 715 +++----------------
 3 files changed, 576 insertions(+), 1071 deletions(-)
 create mode 100644 kernel/x86_64/sgemm_direct_skylakex.c

diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c
new file mode 100644
index 000000000..4f9af6e57
--- /dev/null
+++ b/kernel/x86_64/sgemm_direct_skylakex.c
@@ -0,0 +1,467 @@
+
+/* the direct sgemm code written by Arjan van der Ven */
+#include <immintrin.h>
+
+/*
+ * "Direct sgemm" code. This code operates directly on the inputs and outputs
+ * of the sgemm call, avoiding the copies, memory realignments and threading,
+ * and only supports alpha = 1 and beta = 0.
+ * This is a common case and provides value for relatively small matrixes.
+ * For larger matrixes the "regular" sgemm code is superior, there the cost of
+ * copying/shuffling the B matrix really pays off.
+ */
+
+
+
+#define DECLARE_RESULT_512(N,M) __m512 result##N##M = _mm512_setzero_ps()
+#define BROADCAST_LOAD_A_512(N,M) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k  + strideA * (i+M)]))
+#define LOAD_B_512(N,M)  __m512 Bval##N = _mm512_loadu_ps(&B[strideB * k + j + (N*16)])
+#define MATMUL_512(N,M)  result##N##M = _mm512_fmadd_ps(Aval##M, Bval##N , result##N##M)
+#define STORE_512(N,M) _mm512_storeu_ps(&R[(i+M) * strideR + j+(N*16)], result##N##M)
+
+
+#define DECLARE_RESULT_256(N,M) __m256 result##N##M = _mm256_setzero_ps()
+#define BROADCAST_LOAD_A_256(N,M) __m256 Aval##M = _mm256_broadcastss_ps(_mm_load_ss(&A[k  + strideA * (i+M)]))
+#define LOAD_B_256(N,M)  __m256 Bval##N = _mm256_loadu_ps(&B[strideB * k + j + (N*8)])
+#define MATMUL_256(N,M)  result##N##M = _mm256_fmadd_ps(Aval##M, Bval##N , result##N##M)
+#define STORE_256(N,M) _mm256_storeu_ps(&R[(i+M) * strideR + j+(N*8)], result##N##M)
+
+#define DECLARE_RESULT_128(N,M) __m128 result##N##M = _mm_setzero_ps()
+#define BROADCAST_LOAD_A_128(N,M) __m128 Aval##M = _mm_broadcastss_ps(_mm_load_ss(&A[k  + strideA * (i+M)]))
+#define LOAD_B_128(N,M)  __m128 Bval##N = _mm_loadu_ps(&B[strideB * k + j + (N*4)])
+#define MATMUL_128(N,M)  result##N##M = _mm_fmadd_ps(Aval##M, Bval##N , result##N##M)
+#define STORE_128(N,M) _mm_storeu_ps(&R[(i+M) * strideR + j+(N*4)], result##N##M)
+
+#define DECLARE_RESULT_SCALAR(N,M) float result##N##M = 0;
+#define BROADCAST_LOAD_A_SCALAR(N,M) float Aval##M = A[k + strideA * (i + M)];
+#define LOAD_B_SCALAR(N,M)  float Bval##N  = B[k * strideB + j + N];
+#define MATMUL_SCALAR(N,M) result##N##M +=  Aval##M * Bval##N;
+#define STORE_SCALAR(N,M)  R[(i+M) * strideR + j + N] = result##N##M;
+
+int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
+{
+	unsigned long long mnk = M * N * K;
+	/* large matrixes -> not performant */
+	if (mnk >= 28 * 512 * 512)
+		return 0;
+
+	/*
+	 * if the B matrix is not a nice multiple if 4 we get many unaligned accesses,
+	 * and the regular sgemm copy/realignment of data pays off much quicker
+	 */
+	if ((N & 3) != 0 && (mnk >= 8 * 512 * 512))
+		return 0;
+
+#ifdef SMP
+	/* if we can run multithreaded, the threading changes the based threshold */
+	if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1)
+		return 0;
+#endif
+
+	return 1;
+}
+
+
+
+void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
+{
+	int i, j, k;
+
+        int m4 = M & ~3;
+	int m2 = M & ~1;
+
+	int n64 = N & ~63;
+	int n32 = N & ~31;
+	int n16 = N & ~15;
+	int n8 = N & ~7;
+	int n4 = N & ~3;
+	int n2 = N & ~1;
+
+	i = 0;
+
+	for (i = 0; i < m4; i+=4) {
+
+		for (j = 0; j < n64; j+= 64) {
+			k = 0;
+			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);    			DECLARE_RESULT_512(2, 0);    DECLARE_RESULT_512(3, 0);
+			DECLARE_RESULT_512(0, 1);    DECLARE_RESULT_512(1, 1);    			DECLARE_RESULT_512(2, 1);    DECLARE_RESULT_512(3, 1);
+			DECLARE_RESULT_512(0, 2);    DECLARE_RESULT_512(1, 2);    			DECLARE_RESULT_512(2, 2);    DECLARE_RESULT_512(3, 2);
+			DECLARE_RESULT_512(0, 3);    DECLARE_RESULT_512(1, 3);    			DECLARE_RESULT_512(2, 3);    DECLARE_RESULT_512(3, 3);
+
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+				BROADCAST_LOAD_A_512(x, 1);
+				BROADCAST_LOAD_A_512(x, 2);
+				BROADCAST_LOAD_A_512(x, 3);
+
+				LOAD_B_512(0, x);		LOAD_B_512(1, x);			LOAD_B_512(2, x);		LOAD_B_512(3, x);
+
+				MATMUL_512(0, 0);		MATMUL_512(1, 0);			MATMUL_512(2, 0);		MATMUL_512(3, 0);
+				MATMUL_512(0, 1);		MATMUL_512(1, 1);			MATMUL_512(2, 1);		MATMUL_512(3, 1);
+				MATMUL_512(0, 2);		MATMUL_512(1, 2);			MATMUL_512(2, 2);		MATMUL_512(3, 2);
+				MATMUL_512(0, 3);		MATMUL_512(1, 3);			MATMUL_512(2, 3);		MATMUL_512(3, 3);
+			}
+			STORE_512(0, 0);		STORE_512(1, 0);			STORE_512(2, 0);		STORE_512(3, 0);
+			STORE_512(0, 1);		STORE_512(1, 1);			STORE_512(2, 1);		STORE_512(3, 1);
+			STORE_512(0, 2);		STORE_512(1, 2);			STORE_512(2, 2);		STORE_512(3, 2);
+			STORE_512(0, 3);		STORE_512(1, 3);			STORE_512(2, 3);		STORE_512(3, 3);
+		}
+
+		for (; j < n32; j+= 32) {
+			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);
+			DECLARE_RESULT_512(0, 1);    DECLARE_RESULT_512(1, 1);
+			DECLARE_RESULT_512(0, 2);    DECLARE_RESULT_512(1, 2);
+			DECLARE_RESULT_512(0, 3);    DECLARE_RESULT_512(1, 3);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+				BROADCAST_LOAD_A_512(x, 1);
+				BROADCAST_LOAD_A_512(x, 2);
+				BROADCAST_LOAD_A_512(x, 3);
+
+				LOAD_B_512(0, x);		LOAD_B_512(1, x);
+
+				MATMUL_512(0, 0);		MATMUL_512(1, 0);
+				MATMUL_512(0, 1);		MATMUL_512(1, 1);
+				MATMUL_512(0, 2);		MATMUL_512(1, 2);
+				MATMUL_512(0, 3);		MATMUL_512(1, 3);
+			}
+			STORE_512(0, 0);		STORE_512(1, 0);
+			STORE_512(0, 1);		STORE_512(1, 1);
+			STORE_512(0, 2);		STORE_512(1, 2);
+			STORE_512(0, 3);		STORE_512(1, 3);
+		}
+
+		for (; j < n16; j+= 16) {
+			DECLARE_RESULT_512(0, 0);
+			DECLARE_RESULT_512(0, 1);
+			DECLARE_RESULT_512(0, 2);
+			DECLARE_RESULT_512(0, 3);
+
+		 	for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+				BROADCAST_LOAD_A_512(x, 1);
+				BROADCAST_LOAD_A_512(x, 2);
+				BROADCAST_LOAD_A_512(x, 3);
+
+				LOAD_B_512(0, x);
+
+				MATMUL_512(0, 0);
+				MATMUL_512(0, 1);
+				MATMUL_512(0, 2);
+				MATMUL_512(0, 3);
+			}
+			STORE_512(0, 0);
+			STORE_512(0, 1);
+			STORE_512(0, 2);
+			STORE_512(0, 3);
+		}
+
+		for (; j < n8; j+= 8) {
+			DECLARE_RESULT_256(0, 0);
+			DECLARE_RESULT_256(0, 1);
+			DECLARE_RESULT_256(0, 2);
+			DECLARE_RESULT_256(0, 3);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_256(x, 0);
+				BROADCAST_LOAD_A_256(x, 1);
+				BROADCAST_LOAD_A_256(x, 2);
+				BROADCAST_LOAD_A_256(x, 3);
+
+				LOAD_B_256(0, x);
+
+				MATMUL_256(0, 0);
+				MATMUL_256(0, 1);
+				MATMUL_256(0, 2);
+				MATMUL_256(0, 3);
+			}
+			STORE_256(0, 0);
+			STORE_256(0, 1);
+			STORE_256(0, 2);
+			STORE_256(0, 3);
+		}
+
+		for (; j < n4; j+= 4) {
+			DECLARE_RESULT_128(0, 0);
+			DECLARE_RESULT_128(0, 1);
+			DECLARE_RESULT_128(0, 2);
+			DECLARE_RESULT_128(0, 3);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_128(x, 0);
+				BROADCAST_LOAD_A_128(x, 1);
+				BROADCAST_LOAD_A_128(x, 2);
+				BROADCAST_LOAD_A_128(x, 3);
+
+				LOAD_B_128(0, x);
+
+				MATMUL_128(0, 0);
+				MATMUL_128(0, 1);
+				MATMUL_128(0, 2);
+				MATMUL_128(0, 3);
+			}
+			STORE_128(0, 0);
+			STORE_128(0, 1);
+			STORE_128(0, 2);
+			STORE_128(0, 3);
+		}
+
+		for (; j < n2; j+= 2) {
+			DECLARE_RESULT_SCALAR(0, 0);	DECLARE_RESULT_SCALAR(1, 0);
+			DECLARE_RESULT_SCALAR(0, 1);	DECLARE_RESULT_SCALAR(1, 1);
+			DECLARE_RESULT_SCALAR(0, 2);	DECLARE_RESULT_SCALAR(1, 2);
+			DECLARE_RESULT_SCALAR(0, 3);	DECLARE_RESULT_SCALAR(1, 3);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_SCALAR(x, 0);
+				BROADCAST_LOAD_A_SCALAR(x, 1);
+				BROADCAST_LOAD_A_SCALAR(x, 2);
+				BROADCAST_LOAD_A_SCALAR(x, 3);
+
+				LOAD_B_SCALAR(0, x);	LOAD_B_SCALAR(1, x);
+
+				MATMUL_SCALAR(0, 0);	MATMUL_SCALAR(1, 0);
+				MATMUL_SCALAR(0, 1);	MATMUL_SCALAR(1, 1);
+				MATMUL_SCALAR(0, 2);	MATMUL_SCALAR(1, 2);
+				MATMUL_SCALAR(0, 3);	MATMUL_SCALAR(1, 3);
+			}
+			STORE_SCALAR(0, 0);	STORE_SCALAR(1, 0);
+			STORE_SCALAR(0, 1);	STORE_SCALAR(1, 1);
+			STORE_SCALAR(0, 2);	STORE_SCALAR(1, 2);
+			STORE_SCALAR(0, 3);	STORE_SCALAR(1, 3);
+		}
+
+		for (; j < N; j++) {
+			DECLARE_RESULT_SCALAR(0, 0)
+			DECLARE_RESULT_SCALAR(0, 1)
+			DECLARE_RESULT_SCALAR(0, 2)
+			DECLARE_RESULT_SCALAR(0, 3)
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_SCALAR(0, 0);
+				BROADCAST_LOAD_A_SCALAR(0, 1);
+				BROADCAST_LOAD_A_SCALAR(0, 2);
+				BROADCAST_LOAD_A_SCALAR(0, 3);
+
+				LOAD_B_SCALAR(0, 0);
+
+				MATMUL_SCALAR(0, 0);
+				MATMUL_SCALAR(0, 1);
+				MATMUL_SCALAR(0, 2);
+				MATMUL_SCALAR(0, 3);
+			}
+			STORE_SCALAR(0, 0);
+			STORE_SCALAR(0, 1);
+			STORE_SCALAR(0, 2);
+			STORE_SCALAR(0, 3);
+		}
+	}
+
+	for (; i < m2; i+=2) {
+		j = 0;
+
+		for (; j < n64; j+= 64) {
+			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);    			DECLARE_RESULT_512(2, 0);    DECLARE_RESULT_512(3, 0);
+			DECLARE_RESULT_512(0, 1);    DECLARE_RESULT_512(1, 1);    			DECLARE_RESULT_512(2, 1);    DECLARE_RESULT_512(3, 1);
+
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+				BROADCAST_LOAD_A_512(x, 1);
+
+				LOAD_B_512(0, x);		LOAD_B_512(1, x);			LOAD_B_512(2, x);		LOAD_B_512(3, x);
+
+				MATMUL_512(0, 0);		MATMUL_512(1, 0);			MATMUL_512(2, 0);		MATMUL_512(3, 0);
+				MATMUL_512(0, 1);		MATMUL_512(1, 1);			MATMUL_512(2, 1);		MATMUL_512(3, 1);
+			}
+			STORE_512(0, 0);		STORE_512(1, 0);			STORE_512(2, 0);		STORE_512(3, 0);
+			STORE_512(0, 1);		STORE_512(1, 1);			STORE_512(2, 1);		STORE_512(3, 1);
+		}
+
+		for (; j < n32; j+= 32) {
+			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);
+			DECLARE_RESULT_512(0, 1);    DECLARE_RESULT_512(1, 1);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+				BROADCAST_LOAD_A_512(x, 1);
+
+				LOAD_B_512(0, x);		LOAD_B_512(1, x);
+
+				MATMUL_512(0, 0);		MATMUL_512(1, 0);
+				MATMUL_512(0, 1);		MATMUL_512(1, 1);
+			}
+			STORE_512(0, 0);		STORE_512(1, 0);
+			STORE_512(0, 1);		STORE_512(1, 1);
+		}
+
+
+		for (; j < n16; j+= 16) {
+			DECLARE_RESULT_512(0, 0);
+			DECLARE_RESULT_512(0, 1);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+				BROADCAST_LOAD_A_512(x, 1);
+
+				LOAD_B_512(0, x);
+
+				MATMUL_512(0, 0);
+				MATMUL_512(0, 1);
+			}
+			STORE_512(0, 0);
+			STORE_512(0, 1);
+		}
+
+		for (; j < n8; j+= 8) {
+			DECLARE_RESULT_256(0, 0);
+			DECLARE_RESULT_256(0, 1);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_256(x, 0);
+				BROADCAST_LOAD_A_256(x, 1);
+
+				LOAD_B_256(0, x);
+
+				MATMUL_256(0, 0);
+				MATMUL_256(0, 1);
+			}
+			STORE_256(0, 0);
+			STORE_256(0, 1);
+		}
+
+		for (; j < n4; j+= 4) {
+			DECLARE_RESULT_128(0, 0);
+			DECLARE_RESULT_128(0, 1);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_128(x, 0);
+				BROADCAST_LOAD_A_128(x, 1);
+
+				LOAD_B_128(0, x);
+
+				MATMUL_128(0, 0);
+				MATMUL_128(0, 1);
+			}
+			STORE_128(0, 0);
+			STORE_128(0, 1);
+		}
+		for (; j < n2; j+= 2) {
+			DECLARE_RESULT_SCALAR(0, 0);	DECLARE_RESULT_SCALAR(1, 0);
+			DECLARE_RESULT_SCALAR(0, 1);	DECLARE_RESULT_SCALAR(1, 1);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_SCALAR(x, 0);
+				BROADCAST_LOAD_A_SCALAR(x, 1);
+
+				LOAD_B_SCALAR(0, x);	LOAD_B_SCALAR(1, x);
+
+				MATMUL_SCALAR(0, 0);	MATMUL_SCALAR(1, 0);
+				MATMUL_SCALAR(0, 1);	MATMUL_SCALAR(1, 1);
+			}
+			STORE_SCALAR(0, 0);	STORE_SCALAR(1, 0);
+			STORE_SCALAR(0, 1);	STORE_SCALAR(1, 1);
+		}
+
+		for (; j < N; j++) {
+			DECLARE_RESULT_SCALAR(0, 0);
+			DECLARE_RESULT_SCALAR(0, 1);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_SCALAR(0, 0);
+				BROADCAST_LOAD_A_SCALAR(0, 1);
+
+				LOAD_B_SCALAR(0, 0);
+
+				MATMUL_SCALAR(0, 0);
+				MATMUL_SCALAR(0, 1);
+			}
+			STORE_SCALAR(0, 0);
+			STORE_SCALAR(0, 1);
+		}
+	}
+
+	for (; i < M; i+=1) {
+		j = 0;
+		for (; j < n64; j+= 64) {
+			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);    			DECLARE_RESULT_512(2, 0);    DECLARE_RESULT_512(3, 0);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+				LOAD_B_512(0, x);		LOAD_B_512(1, x);			LOAD_B_512(2, x);		LOAD_B_512(3, x);
+				MATMUL_512(0, 0);		MATMUL_512(1, 0);			MATMUL_512(2, 0);		MATMUL_512(3, 0);
+			}
+			STORE_512(0, 0);		STORE_512(1, 0);			STORE_512(2, 0);		STORE_512(3, 0);
+		}
+		for (; j < n32; j+= 32) {
+			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+				LOAD_B_512(0, x);		LOAD_B_512(1, x);
+				MATMUL_512(0, 0);		MATMUL_512(1, 0);
+			}
+			STORE_512(0, 0);		STORE_512(1, 0);
+		}
+
+
+		for (; j < n16; j+= 16) {
+			DECLARE_RESULT_512(0, 0);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_512(x, 0);
+
+				LOAD_B_512(0, x);
+
+				MATMUL_512(0, 0);
+			}
+			STORE_512(0, 0);
+		}
+
+		for (; j < n8; j+= 8) {
+			DECLARE_RESULT_256(0, 0);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_256(x, 0);
+				LOAD_B_256(0, x);
+				MATMUL_256(0, 0);
+			}
+			STORE_256(0, 0);
+		}
+
+		for (; j < n4; j+= 4) {
+			DECLARE_RESULT_128(0, 0);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_128(x, 0);
+				LOAD_B_128(0, x);
+				MATMUL_128(0, 0);
+			}
+			STORE_128(0, 0);
+		}
+
+		for (; j < n2; j+= 2) {
+			DECLARE_RESULT_SCALAR(0, 0);	DECLARE_RESULT_SCALAR(1, 0);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_SCALAR(x, 0);
+				LOAD_B_SCALAR(0, 0);	LOAD_B_SCALAR(1, 0);
+				MATMUL_SCALAR(0, 0);	MATMUL_SCALAR(1, 0);
+			}
+			STORE_SCALAR(0, 0);	STORE_SCALAR(1, 0);
+		}
+
+		for (; j < N; j++) {
+			DECLARE_RESULT_SCALAR(0, 0);
+
+			for (k = 0; k < K; k++) {
+				BROADCAST_LOAD_A_SCALAR(0, 0);
+				LOAD_B_SCALAR(0, 0);
+				MATMUL_SCALAR(0, 0);
+			}
+			STORE_SCALAR(0, 0);
+		}
+	}
+}
diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
index 76b82e65b..d174bbcc3 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c
@@ -1176,467 +1176,4 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, flo
 	return 0;
 }
 
-
-/*
- * "Direct sgemm" code. This code operates directly on the inputs and outputs
- * of the sgemm call, avoiding the copies, memory realignments and threading,
- * and only supports alpha = 1 and beta = 0.
- * This is a common case and provides value for relatively small matrixes.
- * For larger matrixes the "regular" sgemm code is superior, there the cost of
- * copying/shuffling the B matrix really pays off.
- */
-
-
-
-#define DECLARE_RESULT_512(N,M) __m512 result##N##M = _mm512_setzero_ps()
-#define BROADCAST_LOAD_A_512(N,M) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k  + strideA * (i+M)]))
-#define LOAD_B_512(N,M)  __m512 Bval##N = _mm512_loadu_ps(&B[strideB * k + j + (N*16)])
-#define MATMUL_512(N,M)  result##N##M = _mm512_fmadd_ps(Aval##M, Bval##N , result##N##M)
-#define STORE_512(N,M) _mm512_storeu_ps(&R[(i+M) * strideR + j+(N*16)], result##N##M)
-
-
-#define DECLARE_RESULT_256(N,M) __m256 result##N##M = _mm256_setzero_ps()
-#define BROADCAST_LOAD_A_256(N,M) __m256 Aval##M = _mm256_broadcastss_ps(_mm_load_ss(&A[k  + strideA * (i+M)]))
-#define LOAD_B_256(N,M)  __m256 Bval##N = _mm256_loadu_ps(&B[strideB * k + j + (N*8)])
-#define MATMUL_256(N,M)  result##N##M = _mm256_fmadd_ps(Aval##M, Bval##N , result##N##M)
-#define STORE_256(N,M) _mm256_storeu_ps(&R[(i+M) * strideR + j+(N*8)], result##N##M)
-
-#define DECLARE_RESULT_128(N,M) __m128 result##N##M = _mm_setzero_ps()
-#define BROADCAST_LOAD_A_128(N,M) __m128 Aval##M = _mm_broadcastss_ps(_mm_load_ss(&A[k  + strideA * (i+M)]))
-#define LOAD_B_128(N,M)  __m128 Bval##N = _mm_loadu_ps(&B[strideB * k + j + (N*4)])
-#define MATMUL_128(N,M)  result##N##M = _mm_fmadd_ps(Aval##M, Bval##N , result##N##M)
-#define STORE_128(N,M) _mm_storeu_ps(&R[(i+M) * strideR + j+(N*4)], result##N##M)
-
-#define DECLARE_RESULT_SCALAR(N,M) float result##N##M = 0;
-#define BROADCAST_LOAD_A_SCALAR(N,M) float Aval##M = A[k + strideA * (i + M)];
-#define LOAD_B_SCALAR(N,M)  float Bval##N  = B[k * strideB + j + N];
-#define MATMUL_SCALAR(N,M) result##N##M +=  Aval##M * Bval##N;
-#define STORE_SCALAR(N,M)  R[(i+M) * strideR + j + N] = result##N##M;
-
-int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
-{
-	unsigned long long mnk = M * N * K;
-	/* large matrixes -> not performant */
-	if (mnk >= 28 * 512 * 512)
-		return 0;
-
-	/*
-	 * if the B matrix is not a nice multiple if 4 we get many unaligned accesses,
-	 * and the regular sgemm copy/realignment of data pays off much quicker
-	 */
-	if ((N & 3) != 0 && (mnk >= 8 * 512 * 512))
-		return 0;
-
-#ifdef SMP
-	/* if we can run multithreaded, the threading changes the based threshold */
-	if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1)
-		return 0;
-#endif
-
-	return 1;
-}
-
-
-
-void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
-{
-	int i, j, k;
-
-        int m4 = M & ~3;
-	int m2 = M & ~1;
-
-	int n64 = N & ~63;
-	int n32 = N & ~31;
-	int n16 = N & ~15;
-	int n8 = N & ~7;
-	int n4 = N & ~3;
-	int n2 = N & ~1;
-
-	i = 0;
-
-	for (i = 0; i < m4; i+=4) {
-
-		for (j = 0; j < n64; j+= 64) {
-			k = 0;
-			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);    			DECLARE_RESULT_512(2, 0);    DECLARE_RESULT_512(3, 0);
-			DECLARE_RESULT_512(0, 1);    DECLARE_RESULT_512(1, 1);    			DECLARE_RESULT_512(2, 1);    DECLARE_RESULT_512(3, 1);
-			DECLARE_RESULT_512(0, 2);    DECLARE_RESULT_512(1, 2);    			DECLARE_RESULT_512(2, 2);    DECLARE_RESULT_512(3, 2);
-			DECLARE_RESULT_512(0, 3);    DECLARE_RESULT_512(1, 3);    			DECLARE_RESULT_512(2, 3);    DECLARE_RESULT_512(3, 3);
-
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-				BROADCAST_LOAD_A_512(x, 1);
-				BROADCAST_LOAD_A_512(x, 2);
-				BROADCAST_LOAD_A_512(x, 3);
-
-				LOAD_B_512(0, x);		LOAD_B_512(1, x);			LOAD_B_512(2, x);		LOAD_B_512(3, x);
-
-				MATMUL_512(0, 0);		MATMUL_512(1, 0);			MATMUL_512(2, 0);		MATMUL_512(3, 0);
-				MATMUL_512(0, 1);		MATMUL_512(1, 1);			MATMUL_512(2, 1);		MATMUL_512(3, 1);
-				MATMUL_512(0, 2);		MATMUL_512(1, 2);			MATMUL_512(2, 2);		MATMUL_512(3, 2);
-				MATMUL_512(0, 3);		MATMUL_512(1, 3);			MATMUL_512(2, 3);		MATMUL_512(3, 3);
-			}
-			STORE_512(0, 0);		STORE_512(1, 0);			STORE_512(2, 0);		STORE_512(3, 0);
-			STORE_512(0, 1);		STORE_512(1, 1);			STORE_512(2, 1);		STORE_512(3, 1);
-			STORE_512(0, 2);		STORE_512(1, 2);			STORE_512(2, 2);		STORE_512(3, 2);
-			STORE_512(0, 3);		STORE_512(1, 3);			STORE_512(2, 3);		STORE_512(3, 3);
-		}
-
-		for (; j < n32; j+= 32) {
-			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);
-			DECLARE_RESULT_512(0, 1);    DECLARE_RESULT_512(1, 1);
-			DECLARE_RESULT_512(0, 2);    DECLARE_RESULT_512(1, 2);
-			DECLARE_RESULT_512(0, 3);    DECLARE_RESULT_512(1, 3);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-				BROADCAST_LOAD_A_512(x, 1);
-				BROADCAST_LOAD_A_512(x, 2);
-				BROADCAST_LOAD_A_512(x, 3);
-
-				LOAD_B_512(0, x);		LOAD_B_512(1, x);
-
-				MATMUL_512(0, 0);		MATMUL_512(1, 0);
-				MATMUL_512(0, 1);		MATMUL_512(1, 1);
-				MATMUL_512(0, 2);		MATMUL_512(1, 2);
-				MATMUL_512(0, 3);		MATMUL_512(1, 3);
-			}
-			STORE_512(0, 0);		STORE_512(1, 0);
-			STORE_512(0, 1);		STORE_512(1, 1);
-			STORE_512(0, 2);		STORE_512(1, 2);
-			STORE_512(0, 3);		STORE_512(1, 3);
-		}
-
-		for (; j < n16; j+= 16) {
-			DECLARE_RESULT_512(0, 0);
-			DECLARE_RESULT_512(0, 1);
-			DECLARE_RESULT_512(0, 2);
-			DECLARE_RESULT_512(0, 3);
-
-		 	for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-				BROADCAST_LOAD_A_512(x, 1);
-				BROADCAST_LOAD_A_512(x, 2);
-				BROADCAST_LOAD_A_512(x, 3);
-
-				LOAD_B_512(0, x);
-
-				MATMUL_512(0, 0);
-				MATMUL_512(0, 1);
-				MATMUL_512(0, 2);
-				MATMUL_512(0, 3);
-			}
-			STORE_512(0, 0);
-			STORE_512(0, 1);
-			STORE_512(0, 2);
-			STORE_512(0, 3);
-		}
-
-		for (; j < n8; j+= 8) {
-			DECLARE_RESULT_256(0, 0);
-			DECLARE_RESULT_256(0, 1);
-			DECLARE_RESULT_256(0, 2);
-			DECLARE_RESULT_256(0, 3);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_256(x, 0);
-				BROADCAST_LOAD_A_256(x, 1);
-				BROADCAST_LOAD_A_256(x, 2);
-				BROADCAST_LOAD_A_256(x, 3);
-
-				LOAD_B_256(0, x);
-
-				MATMUL_256(0, 0);
-				MATMUL_256(0, 1);
-				MATMUL_256(0, 2);
-				MATMUL_256(0, 3);
-			}
-			STORE_256(0, 0);
-			STORE_256(0, 1);
-			STORE_256(0, 2);
-			STORE_256(0, 3);
-		}
-
-		for (; j < n4; j+= 4) {
-			DECLARE_RESULT_128(0, 0);
-			DECLARE_RESULT_128(0, 1);
-			DECLARE_RESULT_128(0, 2);
-			DECLARE_RESULT_128(0, 3);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_128(x, 0);
-				BROADCAST_LOAD_A_128(x, 1);
-				BROADCAST_LOAD_A_128(x, 2);
-				BROADCAST_LOAD_A_128(x, 3);
-
-				LOAD_B_128(0, x);
-
-				MATMUL_128(0, 0);
-				MATMUL_128(0, 1);
-				MATMUL_128(0, 2);
-				MATMUL_128(0, 3);
-			}
-			STORE_128(0, 0);
-			STORE_128(0, 1);
-			STORE_128(0, 2);
-			STORE_128(0, 3);
-		}
-
-		for (; j < n2; j+= 2) {
-			DECLARE_RESULT_SCALAR(0, 0);	DECLARE_RESULT_SCALAR(1, 0);
-			DECLARE_RESULT_SCALAR(0, 1);	DECLARE_RESULT_SCALAR(1, 1);
-			DECLARE_RESULT_SCALAR(0, 2);	DECLARE_RESULT_SCALAR(1, 2);
-			DECLARE_RESULT_SCALAR(0, 3);	DECLARE_RESULT_SCALAR(1, 3);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_SCALAR(x, 0);
-				BROADCAST_LOAD_A_SCALAR(x, 1);
-				BROADCAST_LOAD_A_SCALAR(x, 2);
-				BROADCAST_LOAD_A_SCALAR(x, 3);
-
-				LOAD_B_SCALAR(0, x);	LOAD_B_SCALAR(1, x);
-
-				MATMUL_SCALAR(0, 0);	MATMUL_SCALAR(1, 0);
-				MATMUL_SCALAR(0, 1);	MATMUL_SCALAR(1, 1);
-				MATMUL_SCALAR(0, 2);	MATMUL_SCALAR(1, 2);
-				MATMUL_SCALAR(0, 3);	MATMUL_SCALAR(1, 3);
-			}
-			STORE_SCALAR(0, 0);	STORE_SCALAR(1, 0);
-			STORE_SCALAR(0, 1);	STORE_SCALAR(1, 1);
-			STORE_SCALAR(0, 2);	STORE_SCALAR(1, 2);
-			STORE_SCALAR(0, 3);	STORE_SCALAR(1, 3);
-		}
-
-		for (; j < N; j++) {
-			DECLARE_RESULT_SCALAR(0, 0)
-			DECLARE_RESULT_SCALAR(0, 1)
-			DECLARE_RESULT_SCALAR(0, 2)
-			DECLARE_RESULT_SCALAR(0, 3)
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_SCALAR(0, 0);
-				BROADCAST_LOAD_A_SCALAR(0, 1);
-				BROADCAST_LOAD_A_SCALAR(0, 2);
-				BROADCAST_LOAD_A_SCALAR(0, 3);
-
-				LOAD_B_SCALAR(0, 0);
-
-				MATMUL_SCALAR(0, 0);
-				MATMUL_SCALAR(0, 1);
-				MATMUL_SCALAR(0, 2);
-				MATMUL_SCALAR(0, 3);
-			}
-			STORE_SCALAR(0, 0);
-			STORE_SCALAR(0, 1);
-			STORE_SCALAR(0, 2);
-			STORE_SCALAR(0, 3);
-		}
-	}
-
-	for (; i < m2; i+=2) {
-		j = 0;
-
-		for (; j < n64; j+= 64) {
-			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);    			DECLARE_RESULT_512(2, 0);    DECLARE_RESULT_512(3, 0);
-			DECLARE_RESULT_512(0, 1);    DECLARE_RESULT_512(1, 1);    			DECLARE_RESULT_512(2, 1);    DECLARE_RESULT_512(3, 1);
-
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-				BROADCAST_LOAD_A_512(x, 1);
-
-				LOAD_B_512(0, x);		LOAD_B_512(1, x);			LOAD_B_512(2, x);		LOAD_B_512(3, x);
-
-				MATMUL_512(0, 0);		MATMUL_512(1, 0);			MATMUL_512(2, 0);		MATMUL_512(3, 0);
-				MATMUL_512(0, 1);		MATMUL_512(1, 1);			MATMUL_512(2, 1);		MATMUL_512(3, 1);
-			}
-			STORE_512(0, 0);		STORE_512(1, 0);			STORE_512(2, 0);		STORE_512(3, 0);
-			STORE_512(0, 1);		STORE_512(1, 1);			STORE_512(2, 1);		STORE_512(3, 1);
-		}
-
-		for (; j < n32; j+= 32) {
-			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);
-			DECLARE_RESULT_512(0, 1);    DECLARE_RESULT_512(1, 1);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-				BROADCAST_LOAD_A_512(x, 1);
-
-				LOAD_B_512(0, x);		LOAD_B_512(1, x);
-
-				MATMUL_512(0, 0);		MATMUL_512(1, 0);
-				MATMUL_512(0, 1);		MATMUL_512(1, 1);
-			}
-			STORE_512(0, 0);		STORE_512(1, 0);
-			STORE_512(0, 1);		STORE_512(1, 1);
-		}
-
-
-		for (; j < n16; j+= 16) {
-			DECLARE_RESULT_512(0, 0);
-			DECLARE_RESULT_512(0, 1);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-				BROADCAST_LOAD_A_512(x, 1);
-
-				LOAD_B_512(0, x);
-
-				MATMUL_512(0, 0);
-				MATMUL_512(0, 1);
-			}
-			STORE_512(0, 0);
-			STORE_512(0, 1);
-		}
-
-		for (; j < n8; j+= 8) {
-			DECLARE_RESULT_256(0, 0);
-			DECLARE_RESULT_256(0, 1);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_256(x, 0);
-				BROADCAST_LOAD_A_256(x, 1);
-
-				LOAD_B_256(0, x);
-
-				MATMUL_256(0, 0);
-				MATMUL_256(0, 1);
-			}
-			STORE_256(0, 0);
-			STORE_256(0, 1);
-		}
-
-		for (; j < n4; j+= 4) {
-			DECLARE_RESULT_128(0, 0);
-			DECLARE_RESULT_128(0, 1);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_128(x, 0);
-				BROADCAST_LOAD_A_128(x, 1);
-
-				LOAD_B_128(0, x);
-
-				MATMUL_128(0, 0);
-				MATMUL_128(0, 1);
-			}
-			STORE_128(0, 0);
-			STORE_128(0, 1);
-		}
-		for (; j < n2; j+= 2) {
-			DECLARE_RESULT_SCALAR(0, 0);	DECLARE_RESULT_SCALAR(1, 0);
-			DECLARE_RESULT_SCALAR(0, 1);	DECLARE_RESULT_SCALAR(1, 1);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_SCALAR(x, 0);
-				BROADCAST_LOAD_A_SCALAR(x, 1);
-
-				LOAD_B_SCALAR(0, x);	LOAD_B_SCALAR(1, x);
-
-				MATMUL_SCALAR(0, 0);	MATMUL_SCALAR(1, 0);
-				MATMUL_SCALAR(0, 1);	MATMUL_SCALAR(1, 1);
-			}
-			STORE_SCALAR(0, 0);	STORE_SCALAR(1, 0);
-			STORE_SCALAR(0, 1);	STORE_SCALAR(1, 1);
-		}
-
-		for (; j < N; j++) {
-			DECLARE_RESULT_SCALAR(0, 0);
-			DECLARE_RESULT_SCALAR(0, 1);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_SCALAR(0, 0);
-				BROADCAST_LOAD_A_SCALAR(0, 1);
-
-				LOAD_B_SCALAR(0, 0);
-
-				MATMUL_SCALAR(0, 0);
-				MATMUL_SCALAR(0, 1);
-			}
-			STORE_SCALAR(0, 0);
-			STORE_SCALAR(0, 1);
-		}
-	}
-
-	for (; i < M; i+=1) {
-		j = 0;
-		for (; j < n64; j+= 64) {
-			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);    			DECLARE_RESULT_512(2, 0);    DECLARE_RESULT_512(3, 0);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-				LOAD_B_512(0, x);		LOAD_B_512(1, x);			LOAD_B_512(2, x);		LOAD_B_512(3, x);
-				MATMUL_512(0, 0);		MATMUL_512(1, 0);			MATMUL_512(2, 0);		MATMUL_512(3, 0);
-			}
-			STORE_512(0, 0);		STORE_512(1, 0);			STORE_512(2, 0);		STORE_512(3, 0);
-		}
-		for (; j < n32; j+= 32) {
-			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-				LOAD_B_512(0, x);		LOAD_B_512(1, x);
-				MATMUL_512(0, 0);		MATMUL_512(1, 0);
-			}
-			STORE_512(0, 0);		STORE_512(1, 0);
-		}
-
-
-		for (; j < n16; j+= 16) {
-			DECLARE_RESULT_512(0, 0);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-
-				LOAD_B_512(0, x);
-
-				MATMUL_512(0, 0);
-			}
-			STORE_512(0, 0);
-		}
-
-		for (; j < n8; j+= 8) {
-			DECLARE_RESULT_256(0, 0);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_256(x, 0);
-				LOAD_B_256(0, x);
-				MATMUL_256(0, 0);
-			}
-			STORE_256(0, 0);
-		}
-
-		for (; j < n4; j+= 4) {
-			DECLARE_RESULT_128(0, 0);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_128(x, 0);
-				LOAD_B_128(0, x);
-				MATMUL_128(0, 0);
-			}
-			STORE_128(0, 0);
-		}
-
-		for (; j < n2; j+= 2) {
-			DECLARE_RESULT_SCALAR(0, 0);	DECLARE_RESULT_SCALAR(1, 0);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_SCALAR(x, 0);
-				LOAD_B_SCALAR(0, 0);	LOAD_B_SCALAR(1, 0);
-				MATMUL_SCALAR(0, 0);	MATMUL_SCALAR(1, 0);
-			}
-			STORE_SCALAR(0, 0);	STORE_SCALAR(1, 0);
-		}
-
-		for (; j < N; j++) {
-			DECLARE_RESULT_SCALAR(0, 0);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_SCALAR(0, 0);
-				LOAD_B_SCALAR(0, 0);
-				MATMUL_SCALAR(0, 0);
-			}
-			STORE_SCALAR(0, 0);
-		}
-	}
-}
+#include "sgemm_direct_skylakex.c"
diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
index ee3417505..e4ca6b1bd 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
@@ -1,5 +1,5 @@
 /* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 for k_count, %5 for c_store */
-/* r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */
+/* r10 to assist prefetch, r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */
 
 #include "common.h"
 #include <stdint.h>
@@ -53,26 +53,25 @@
 #define SAVE_m16(ndim) SAVE_h_m16n##ndim "addq $64,%2;"
 #define COMPUTE_m16(ndim) \
     INIT_m16n##ndim\
-    "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5;"\
-    "cmpq $18,%4; jb "#ndim"016162f;"\
+    "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15; movq %2,%5; xorq %%r10,%%r10;"\
+    "cmpq $16,%4; jb "#ndim"016162f;"\
     #ndim"016161:\n\t"\
+    "cmpq $126,%%r10; movq $126,%%r10; cmoveq %3,%%r10;"\
     KERNEL_k1m16n##ndim\
     KERNEL_k1m16n##ndim\
+    "prefetcht1 (%5); subq $63,%5; addq %%r10,%5;"\
     KERNEL_k1m16n##ndim\
-    "prefetcht1 (%5); prefetcht1 63(%5); addq %3,%5;"\
     KERNEL_k1m16n##ndim\
-    KERNEL_k1m16n##ndim\
-    KERNEL_k1m16n##ndim\
-    "prefetcht1 (%8); addq $32,%8;"\
-    "subq $6,%4; cmpq $18,%4; jnb "#ndim"016161b;"\
+    "prefetcht1 (%6); addq $32,%6;"\
+    "subq $4,%4; cmpq $16,%4; jnb "#ndim"016161b;"\
     "movq %2,%5;"\
     #ndim"016162:\n\t"\
-    "testq %4,%4; jz "#ndim"016163f;"\
+    "testq %4,%4; jz "#ndim"016164f;"\
+    #ndim"016163:\n\t"\
     "prefetcht0 (%5); prefetcht0 63(%5); prefetcht0 (%5,%3,1); prefetcht0 63(%5,%3,1);"\
     KERNEL_k1m16n##ndim\
-    "leaq (%5,%3,2),%5;"\
-    "decq %4; jmp "#ndim"016162b;"\
-    #ndim"016163:\n\t"\
+    "leaq (%5,%3,2),%5; decq %4; jnz "#ndim"016163b;"\
+    #ndim"016164:\n\t"\
     "prefetcht0 (%%r14); prefetcht0 64(%%r14);"\
     SAVE_m16(ndim)
 
@@ -212,185 +211,152 @@
 #define COMPUTE_m4_n24 COMPUTE_L_m4(12,55555) COMPUTE_R_m4(12,55955)
 #define COMPUTE_m4(ndim) COMPUTE_m4_n##ndim
 
-/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */
+/* m = 2 *//* xmm0 for alpha, xmm1-xmm3 for temporary use, xmm4-xmm15 for accumulators */
 #define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
-#define KERNEL_k1m2n1(b_addr) \
+#define KERNEL_k1m2n1 \
     "vmovsd (%0),%%xmm1; addq $8,%0;"\
-    "vbroadcastss ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
-    "addq $4,"#b_addr";"
-#define SAVE_L_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);"
+    "vbroadcastss (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
+    "addq $4,%1;"
+#define SAVE_h_m2n1 "vmovsd (%2),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2);"
 #define INIT_m2n2 INIT_m2n1 "vpxor %%xmm5,%%xmm5,%%xmm5;"
-#define KERNEL_k1m2n2(b_addr) \
+#define KERNEL_k1m2n2 \
     "vmovsd (%0),%%xmm1; addq $8,%0;"\
-    "vbroadcastss  ("#b_addr"),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
-    "vbroadcastss 4("#b_addr"),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\
-    "addq $8,"#b_addr";"
-#define SAVE_L_m2n2 SAVE_L_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);"
+    "vbroadcastss  (%1),%%xmm2; vfmadd231ps %%xmm1,%%xmm2,%%xmm4;"\
+    "vbroadcastss 4(%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm5;"\
+    "addq $8,%1;"
+#define SAVE_h_m2n2 SAVE_h_m2n1 "vmovsd (%2,%3,1),%%xmm1; vfmadd213ps %%xmm1,%%xmm0,%%xmm5; vmovsd %%xmm5,(%2,%3,1);"
 #define INIT_m2n4  INIT_m2n2
 #define INIT_m2n8  INIT_m2n4 "vpxor %%xmm6,%%xmm6,%%xmm6; vpxor %%xmm7,%%xmm7,%%xmm7;"
 #define INIT_m2n12 INIT_m2n8 "vpxor %%xmm8,%%xmm8,%%xmm8; vpxor %%xmm9,%%xmm9,%%xmm9;"
-#define KERNEL_k1m2n4(b_addr) \
-    "vmovups ("#b_addr"),%%xmm3; addq $16,"#b_addr";"\
-    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
-    "vbroadcastss 4(%0),%%xmm2; vfmadd231ps %%xmm3,%%xmm2,%%xmm5;"\
-    "addq $8,%0;"
-#define KERNEL_k1m2n8(b_addr) \
-    "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; addq $16,"#b_addr";"\
-    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm6;"\
-    "vbroadcastss 4(%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm5; vfmadd231ps %%xmm2,%%xmm1,%%xmm7;"\
-    "addq $8,%0;"
-#define KERNEL_k1m2n12(b_addr) \
-    "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; vmovups ("#b_addr",%%r12,2),%%xmm1; addq $16,"#b_addr";"\
-    "vbroadcastss  (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm6; vfmadd231ps %%xmm1,%%xmm10,%%xmm8;"\
-    "vbroadcastss 4(%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm5; vfmadd231ps %%xmm2,%%xmm10,%%xmm7; vfmadd231ps %%xmm1,%%xmm10,%%xmm9;"\
-    "addq $8,%0;"
+#define INIT_m2n16 INIT_m2n12 "vpxor %%xmm10,%%xmm10,%%xmm10; vpxor %%xmm11,%%xmm11,%%xmm11;"
+#define INIT_m2n20 INIT_m2n16 "vpxor %%xmm12,%%xmm12,%%xmm12; vpxor %%xmm13,%%xmm13,%%xmm13;"
+#define INIT_m2n24 INIT_m2n20 "vpxor %%xmm14,%%xmm14,%%xmm14; vpxor %%xmm15,%%xmm15,%%xmm15;"
+#define KERNEL_h_k1m2n4 \
+    "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2; addq $8,%0;"\
+    "vmovups (%1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm4; vfmadd231ps %%xmm2,%%xmm3,%%xmm5;"
+#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;"
+#define KERNEL_h_k1m2n8 KERNEL_h_k1m2n4 "vmovups (%1,%%r12,1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm6; vfmadd231ps %%xmm2,%%xmm3,%%xmm7;"
+#define KERNEL_k1m2n8 KERNEL_h_k1m2n8 "addq $16,%1;"
+#define KERNEL_k1m2n12 KERNEL_h_k1m2n8 \
+    "vmovups (%1,%%r12,2),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm8; vfmadd231ps %%xmm2,%%xmm3,%%xmm9; addq $16,%1;"
+#define KERNEL_h_k1m2n16 KERNEL_k1m2n12 "vmovups (%%r15),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm10; vfmadd231ps %%xmm2,%%xmm3,%%xmm11;"
+#define KERNEL_k1m2n16 KERNEL_h_k1m2n16 "addq $16,%%r15;"
+#define KERNEL_h_k1m2n20 KERNEL_h_k1m2n16 "vmovups (%%r15,%%r12,1),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm12; vfmadd231ps %%xmm2,%%xmm3,%%xmm13;"
+#define KERNEL_k1m2n20 KERNEL_h_k1m2n20 "addq $16,%%r15;"
+#define KERNEL_h_k1m2n24 KERNEL_h_k1m2n20 "vmovups (%%r15,%%r12,2),%%xmm3; vfmadd231ps %%xmm1,%%xmm3,%%xmm14; vfmadd231ps %%xmm2,%%xmm3,%%xmm15;"
+#define KERNEL_k1m2n24 KERNEL_h_k1m2n24 "addq $16,%%r15;"
 #define unit_save_m2n4(c1,c2) \
     "vunpcklps "#c2","#c1",%%xmm1; vunpckhps "#c2","#c1",%%xmm2;"\
     "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1; vmovsd %%xmm1,(%5); vmovhpd %%xmm1,(%5,%3,1);"\
     "leaq (%5,%3,2),%5;"\
     "vmovsd (%5),%%xmm3; vmovhpd (%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2; vmovsd %%xmm2,(%5); vmovhpd %%xmm2,(%5,%3,1);"\
     "leaq (%5,%3,2),%5;"
-#define SAVE_L_m2n4  "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5)
-#define SAVE_L_m2n8  SAVE_L_m2n4   unit_save_m2n4(%%xmm6,%%xmm7)
-#define SAVE_L_m2n12 SAVE_L_m2n8   unit_save_m2n4(%%xmm8,%%xmm9)
-#define SAVE_R_m2n4                unit_save_m2n4(%%xmm4,%%xmm5)
-#define SAVE_R_m2n8  SAVE_R_m2n4   unit_save_m2n4(%%xmm6,%%xmm7)
-#define SAVE_R_m2n12 SAVE_R_m2n8   unit_save_m2n4(%%xmm8,%%xmm9)
-#define COMPUTE_L_m2(ndim,sim) \
-    INIT_m2n##ndim\
-    "movq %%r13,%4; movq %%r14,%1;"\
-    #ndim""#sim"222:\n\t"\
-    "testq %4,%4; jz "#ndim""#sim"223f;"\
-    KERNEL_k1m2n##ndim(%1)\
-    "decq %4; jmp "#ndim""#sim"222b;"\
-    #ndim""#sim"223:\n\t"\
-    SAVE_L_m2n##ndim "addq $8,%2;"
-#define COMPUTE_R_m2(ndim,sim) \
-    "salq $3,%%r13;subq %%r13,%0;sarq $3,%%r13;"\
+#define SAVE_h_m2n4  "movq %2,%5;" unit_save_m2n4(%%xmm4,%%xmm5)
+#define SAVE_h_m2n8  SAVE_h_m2n4   unit_save_m2n4(%%xmm6,%%xmm7)
+#define SAVE_h_m2n12 SAVE_h_m2n8   unit_save_m2n4(%%xmm8,%%xmm9)
+#define SAVE_h_m2n16 SAVE_h_m2n12  unit_save_m2n4(%%xmm10,%%xmm11)
+#define SAVE_h_m2n20 SAVE_h_m2n16  unit_save_m2n4(%%xmm12,%%xmm13)
+#define SAVE_h_m2n24 SAVE_h_m2n20  unit_save_m2n4(%%xmm14,%%xmm15)
+#define SAVE_m2(ndim) SAVE_h_m2n##ndim "addq $8,%2;"
+#define COMPUTE_m2(ndim) \
     INIT_m2n##ndim\
-    "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\
-    #ndim""#sim"222:\n\t"\
-    "testq %4,%4; jz "#ndim""#sim"223f;"\
-    KERNEL_k1m2n##ndim(%%r15)\
-    "decq %4; jmp "#ndim""#sim"222b;"\
-    #ndim""#sim"223:\n\t"\
-    SAVE_R_m2n##ndim
-#define COMPUTE_m2_n1  COMPUTE_L_m2(1,77877)
-#define COMPUTE_m2_n2  COMPUTE_L_m2(2,77877)
-#define COMPUTE_m2_n4  COMPUTE_L_m2(4,77877)
-#define COMPUTE_m2_n8  COMPUTE_L_m2(8,77877)
-#define COMPUTE_m2_n12 COMPUTE_L_m2(12,77877)
-#define COMPUTE_m2_n16 COMPUTE_L_m2(12,77777) COMPUTE_R_m2(4,77977)
-#define COMPUTE_m2_n20 COMPUTE_L_m2(12,77677) COMPUTE_R_m2(8,77977)
-#define COMPUTE_m2_n24 COMPUTE_L_m2(12,77577) COMPUTE_R_m2(12,77977)
-#define COMPUTE_m2(ndim) COMPUTE_m2_n##ndim
-
-/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm6 for accumulators */
+    "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\
+    "testq %4,%4; jz "#ndim"002022f;"\
+    #ndim"002021:\n\t"\
+    KERNEL_k1m2n##ndim "decq %4; jnz "#ndim"002021b;"\
+    #ndim"002022:\n\t"\
+    SAVE_m2(ndim)
+
+/* m = 1 *//* xmm0 for alpha, xmm1-xmm3 and xmm10 for temporary use, xmm4-xmm9 for accumulators */
 #define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
-#define KERNEL_k1m1n1(b_addr) \
-    "vmovss ("#b_addr"),%%xmm3; addq $4,"#b_addr";"\
+#define KERNEL_k1m1n1 \
+    "vmovss (%1),%%xmm3; addq $4,%1;"\
     "vmovss (%0),%%xmm1; vfmadd231ss %%xmm3,%%xmm1,%%xmm4;"\
     "addq $4,%0;"
-#define SAVE_L_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);"
+#define SAVE_h_m1n1 "vfmadd213ss (%2),%%xmm0,%%xmm4; vmovss %%xmm4,(%2);"
 #define INIT_m1n2 INIT_m1n1
-#define KERNEL_k1m1n2(b_addr) \
-    "vmovsd ("#b_addr"),%%xmm3; addq $8,"#b_addr";"\
+#define KERNEL_k1m1n2 \
+    "vmovsd (%1),%%xmm3; addq $8,%1;"\
     "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
     "addq $4,%0;"
-#define SAVE_L_m1n2 \
+#define SAVE_h_m1n2 \
     "vmovss (%2),%%xmm3; vinsertps $16,(%2,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm4;"\
     "vmovss %%xmm4,(%2); vextractps $1,%%xmm4,(%2,%3,1);"
 #define INIT_m1n4  INIT_m1n2
 #define INIT_m1n8  INIT_m1n4 "vpxor %%xmm5,%%xmm5,%%xmm5;"
 #define INIT_m1n12 INIT_m1n8 "vpxor %%xmm6,%%xmm6,%%xmm6;"
-#define KERNEL_k1m1n4(b_addr) \
-    "vmovups ("#b_addr"),%%xmm3; addq $16,"#b_addr";"\
-    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4;"\
-    "addq $4,%0;"
-#define KERNEL_k1m1n8(b_addr) \
-    "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; addq $16,"#b_addr";"\
-    "vbroadcastss  (%0),%%xmm1; vfmadd231ps %%xmm3,%%xmm1,%%xmm4; vfmadd231ps %%xmm2,%%xmm1,%%xmm5;"\
-    "addq $4,%0;"
-#define KERNEL_k1m1n12(b_addr) \
-    "vmovups ("#b_addr"),%%xmm3; vmovups ("#b_addr",%%r12,1),%%xmm2; vmovups ("#b_addr",%%r12,2),%%xmm1; addq $16,"#b_addr";"\
-    "vbroadcastss  (%0),%%xmm10; vfmadd231ps %%xmm3,%%xmm10,%%xmm4; vfmadd231ps %%xmm2,%%xmm10,%%xmm5; vfmadd231ps %%xmm1,%%xmm10,%%xmm6;"\
-    "addq $4,%0;"
+#define INIT_m1n16 INIT_m1n12 "vpxor %%xmm7,%%xmm7,%%xmm7;"
+#define INIT_m1n20 INIT_m1n16 "vpxor %%xmm8,%%xmm8,%%xmm8;"
+#define INIT_m1n24 INIT_m1n20 "vpxor %%xmm9,%%xmm9,%%xmm9;"
+#define KERNEL_h_k1m1n4 \
+    "vbroadcastss (%0),%%xmm1; addq $4,%0; vfmadd231ps (%1),%%xmm1,%%xmm4;"
+#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;"
+#define KERNEL_h_k1m1n8 KERNEL_h_k1m1n4 "vfmadd231ps (%1,%%r12,1),%%xmm1,%%xmm5;"
+#define KERNEL_k1m1n8 KERNEL_h_k1m1n8 "addq $16,%1;"
+#define KERNEL_k1m1n12 KERNEL_h_k1m1n8 "vfmadd231ps (%1,%%r12,2),%%xmm1,%%xmm6; addq $16,%1;"
+#define KERNEL_h_k1m1n16 KERNEL_k1m1n12 "vfmadd231ps (%%r15),%%xmm1,%%xmm7;"
+#define KERNEL_k1m1n16 KERNEL_h_k1m1n16 "addq $16,%%r15;"
+#define KERNEL_h_k1m1n20 KERNEL_h_k1m1n16 "vfmadd231ps (%%r15,%%r12,1),%%xmm1,%%xmm8;"
+#define KERNEL_k1m1n20 KERNEL_h_k1m1n20 "addq $16,%%r15;"
+#define KERNEL_h_k1m1n24 KERNEL_h_k1m1n20 "vfmadd231ps (%%r15,%%r12,2),%%xmm1,%%xmm9;"
+#define KERNEL_k1m1n24 KERNEL_h_k1m1n24 "addq $16,%%r15;"
 #define unit_save_m1n4(c1) \
     "vpxor %%xmm10,%%xmm10,%%xmm10; vmovsd "#c1",%%xmm10,%%xmm2; vmovhlps "#c1",%%xmm10,%%xmm1;"\
     "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm2;"\
     "vmovss %%xmm2,(%5); vextractps $1,%%xmm2,(%5,%3,1); leaq (%5,%3,2),%5;"\
     "vmovss (%5),%%xmm3; vinsertps $16,(%5,%3,1),%%xmm3,%%xmm3; vfmadd213ps %%xmm3,%%xmm0,%%xmm1;"\
     "vmovss %%xmm1,(%5); vextractps $1,%%xmm1,(%5,%3,1); leaq (%5,%3,2),%5;"
-#define SAVE_L_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4)
-#define SAVE_L_m1n8  SAVE_L_m1n4  unit_save_m1n4(%%xmm5)
-#define SAVE_L_m1n12 SAVE_L_m1n8  unit_save_m1n4(%%xmm6)
-#define SAVE_R_m1n4               unit_save_m1n4(%%xmm4)
-#define SAVE_R_m1n8  SAVE_R_m1n4  unit_save_m1n4(%%xmm5)
-#define SAVE_R_m1n12 SAVE_R_m1n8  unit_save_m1n4(%%xmm6)
-#define COMPUTE_L_m1(ndim,sim) \
-    INIT_m1n##ndim\
-    "movq %%r13,%4; movq %%r14,%1;"\
-    #ndim""#sim"112:\n\t"\
-    "testq %4,%4; jz "#ndim""#sim"113f;"\
-    KERNEL_k1m1n##ndim(%1)\
-    "decq %4; jmp "#ndim""#sim"112b;"\
-    #ndim""#sim"113:\n\t"\
-    SAVE_L_m1n##ndim "addq $4,%2;"
-#define COMPUTE_R_m1(ndim,sim) \
-    "salq $2,%%r13;subq %%r13,%0;sarq $2,%%r13;"\
+#define SAVE_h_m1n4 "movq %2,%5;" unit_save_m1n4(%%xmm4)
+#define SAVE_h_m1n8  SAVE_h_m1n4  unit_save_m1n4(%%xmm5)
+#define SAVE_h_m1n12 SAVE_h_m1n8  unit_save_m1n4(%%xmm6)
+#define SAVE_h_m1n16 SAVE_h_m1n12 unit_save_m1n4(%%xmm7)
+#define SAVE_h_m1n20 SAVE_h_m1n16 unit_save_m1n4(%%xmm8)
+#define SAVE_h_m1n24 SAVE_h_m1n20 unit_save_m1n4(%%xmm9)
+#define SAVE_m1(ndim) SAVE_h_m1n##ndim "addq $4,%2;"
+#define COMPUTE_m1(ndim) \
     INIT_m1n##ndim\
-    "movq %%r13,%4; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"\
-    #ndim""#sim"112:\n\t"\
-    "testq %4,%4; jz "#ndim""#sim"113f;"\
-    KERNEL_k1m1n##ndim(%%r15)\
-    "decq %4; jmp "#ndim""#sim"112b;"\
-    #ndim""#sim"113:\n\t"\
-    SAVE_R_m1n##ndim
-#define COMPUTE_m1_n1  COMPUTE_L_m1(1,99899)
-#define COMPUTE_m1_n2  COMPUTE_L_m1(2,99899)
-#define COMPUTE_m1_n4  COMPUTE_L_m1(4,99899)
-#define COMPUTE_m1_n8  COMPUTE_L_m1(8,99899)
-#define COMPUTE_m1_n12 COMPUTE_L_m1(12,99899)
-#define COMPUTE_m1_n16 COMPUTE_L_m1(12,99799) COMPUTE_R_m1(4,99999)
-#define COMPUTE_m1_n20 COMPUTE_L_m1(12,99699) COMPUTE_R_m1(8,99999)
-#define COMPUTE_m1_n24 COMPUTE_L_m1(12,99599) COMPUTE_R_m1(12,99999)
-#define COMPUTE_m1(ndim) COMPUTE_m1_n##ndim
+    "movq %%r13,%4; movq %%r14,%1; leaq (%1,%%r12,2),%%r15; addq %%r12,%%r15;"\
+    "testq %4,%4; jz "#ndim"001012f;"\
+    #ndim"001011:\n\t"\
+    KERNEL_k1m1n##ndim "decq %4; jnz "#ndim"001011b;"\
+    #ndim"001012:\n\t"\
+    SAVE_m1(ndim)
 
 /* %0 = "+r"(a_pointer), %1 = "+r"(b_pointer), %2 = "+r"(c_pointer), %3 = "+r"(ldc_in_bytes), %4 = "+r"(K), %5 = "+r"(ctemp) */
-/* %6 = "+r"(&alpha), %7 = "+r"(M), %8 = "+r"(next_b) */
-/* r11 = m(const), r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */
+/* %6 = "+r"(next_b), %7 = "m"(ALPHA), %8 = "m"(M) */
+/* r11 = m_counter, r12 = k << 4(const), r13 = k(const), r14 = b_head_pos(const), r15 = %1 + 3r12 */
 
 #define COMPUTE(ndim) {\
     next_b = b_pointer + ndim * K;\
     __asm__ __volatile__(\
-    "vbroadcastss (%6),%%zmm0;"\
-    "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %7,%%r11;"\
-    "cmpq $16,%7;jb 33101"#ndim"f;"\
+    "vbroadcastss %7,%%zmm0;"\
+    "movq %4,%%r13; movq %4,%%r12; salq $4,%%r12; movq %1,%%r14; movq %8,%%r11;"\
+    "cmpq $16,%%r11;jb 33101"#ndim"f;"\
     "33109"#ndim":\n\t"\
     COMPUTE_m16(ndim)\
-    "subq $16,%7;cmpq $16,%7;jnb 33109"#ndim"b;"\
+    "subq $16,%%r11;cmpq $16,%%r11;jnb 33109"#ndim"b;"\
     "33101"#ndim":\n\t"\
-    "cmpq $8,%7;jb 33102"#ndim"f;"\
+    "cmpq $8,%%r11;jb 33102"#ndim"f;"\
     COMPUTE_m8(ndim)\
-    "subq $8,%7;"\
+    "subq $8,%%r11;"\
     "33102"#ndim":\n\t"\
-    "cmpq $4,%7;jb 33103"#ndim"f;"\
+    "cmpq $4,%%r11;jb 33103"#ndim"f;"\
     COMPUTE_m4(ndim)\
-    "subq $4,%7;"\
+    "subq $4,%%r11;"\
     "33103"#ndim":\n\t"\
-    "cmpq $2,%7;jb 33104"#ndim"f;"\
+    "cmpq $2,%%r11;jb 33104"#ndim"f;"\
     COMPUTE_m2(ndim)\
-    "subq $2,%7;"\
+    "subq $2,%%r11;"\
     "33104"#ndim":\n\t"\
-    "testq %7,%7;jz 33105"#ndim"f;"\
+    "testq %%r11,%%r11;jz 33105"#ndim"f;"\
     COMPUTE_m1(ndim)\
     "33105"#ndim":\n\t"\
-    "movq %%r13,%4; movq %%r14,%1; movq %%r11,%7;"\
-    :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(alp),"+r"(M),"+r"(next_b)\
-    ::"r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\
+    "movq %%r13,%4; movq %%r14,%1; vzeroupper;"\
+    :"+r"(a_pointer),"+r"(b_pointer),"+r"(c_pointer),"+r"(ldc_in_bytes),"+r"(K),"+r"(ctemp),"+r"(next_b):"m"(ALPHA),"m"(M)\
+    :"r10","r11","r12","r13","r14","r15","zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14",\
     "zmm15","zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31",\
     "cc","memory");\
-    a_pointer -= M * K; b_pointer += ndim * K;c_pointer += LDC * ndim - M;\
+    a_pointer -= M * K; b_pointer += ndim * K; c_pointer += LDC * ndim - M;\
 }
 int __attribute__ ((noinline))
 CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, float * __restrict__ B, float * __restrict__ C, BLASLONG LDC)
@@ -399,7 +365,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
     int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float);float ALPHA = alpha;
     int64_t M = (int64_t)m, K = (int64_t)k;
     BLASLONG n_count = n;
-    float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*alp = &ALPHA,*next_b = B;
+    float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B;
     for(;n_count>23;n_count-=24) COMPUTE(24)
     for(;n_count>19;n_count-=20) COMPUTE(20)
     for(;n_count>15;n_count-=16) COMPUTE(16)
@@ -411,469 +377,4 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
     return 0;
 }
 
-#include <immintrin.h>
-/* codes below are copied from the sgemm kernel written by Arjan van der Ven */
-
-/*
- * "Direct sgemm" code. This code operates directly on the inputs and outputs
- * of the sgemm call, avoiding the copies, memory realignments and threading,
- * and only supports alpha = 1 and beta = 0.
- * This is a common case and provides value for relatively small matrixes.
- * For larger matrixes the "regular" sgemm code is superior, there the cost of
- * copying/shuffling the B matrix really pays off.
- */
-
-
-
-#define DECLARE_RESULT_512(N,M) __m512 result##N##M = _mm512_setzero_ps()
-#define BROADCAST_LOAD_A_512(N,M) __m512 Aval##M = _mm512_broadcastss_ps(_mm_load_ss(&A[k  + strideA * (i+M)]))
-#define LOAD_B_512(N,M)  __m512 Bval##N = _mm512_loadu_ps(&B[strideB * k + j + (N*16)])
-#define MATMUL_512(N,M)  result##N##M = _mm512_fmadd_ps(Aval##M, Bval##N , result##N##M)
-#define STORE_512(N,M) _mm512_storeu_ps(&R[(i+M) * strideR + j+(N*16)], result##N##M)
-
-
-#define DECLARE_RESULT_256(N,M) __m256 result##N##M = _mm256_setzero_ps()
-#define BROADCAST_LOAD_A_256(N,M) __m256 Aval##M = _mm256_broadcastss_ps(_mm_load_ss(&A[k  + strideA * (i+M)]))
-#define LOAD_B_256(N,M)  __m256 Bval##N = _mm256_loadu_ps(&B[strideB * k + j + (N*8)])
-#define MATMUL_256(N,M)  result##N##M = _mm256_fmadd_ps(Aval##M, Bval##N , result##N##M)
-#define STORE_256(N,M) _mm256_storeu_ps(&R[(i+M) * strideR + j+(N*8)], result##N##M)
-
-#define DECLARE_RESULT_128(N,M) __m128 result##N##M = _mm_setzero_ps()
-#define BROADCAST_LOAD_A_128(N,M) __m128 Aval##M = _mm_broadcastss_ps(_mm_load_ss(&A[k  + strideA * (i+M)]))
-#define LOAD_B_128(N,M)  __m128 Bval##N = _mm_loadu_ps(&B[strideB * k + j + (N*4)])
-#define MATMUL_128(N,M)  result##N##M = _mm_fmadd_ps(Aval##M, Bval##N , result##N##M)
-#define STORE_128(N,M) _mm_storeu_ps(&R[(i+M) * strideR + j+(N*4)], result##N##M)
-
-#define DECLARE_RESULT_SCALAR(N,M) float result##N##M = 0;
-#define BROADCAST_LOAD_A_SCALAR(N,M) float Aval##M = A[k + strideA * (i + M)];
-#define LOAD_B_SCALAR(N,M)  float Bval##N  = B[k * strideB + j + N];
-#define MATMUL_SCALAR(N,M) result##N##M +=  Aval##M * Bval##N;
-#define STORE_SCALAR(N,M)  R[(i+M) * strideR + j + N] = result##N##M;
-
-int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
-{
-	unsigned long long mnk = M * N * K;
-	/* large matrixes -> not performant */
-	if (mnk >= 28 * 512 * 512)
-		return 0;
-
-	/*
-	 * if the B matrix is not a nice multiple if 4 we get many unaligned accesses,
-	 * and the regular sgemm copy/realignment of data pays off much quicker
-	 */
-	if ((N & 3) != 0 && (mnk >= 8 * 512 * 512))
-		return 0;
-
-#ifdef SMP
-	/* if we can run multithreaded, the threading changes the based threshold */
-	if (mnk > 2 * 350 * 512 && num_cpu_avail(3)> 1)
-		return 0;
-#endif
-
-	return 1;
-}
-
-
-
-void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
-{
-	int i, j, k;
-
-        int m4 = M & ~3;
-	int m2 = M & ~1;
-
-	int n64 = N & ~63;
-	int n32 = N & ~31;
-	int n16 = N & ~15;
-	int n8 = N & ~7;
-	int n4 = N & ~3;
-	int n2 = N & ~1;
-
-	i = 0;
-
-	for (i = 0; i < m4; i+=4) {
-
-		for (j = 0; j < n64; j+= 64) {
-			k = 0;
-			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);    			DECLARE_RESULT_512(2, 0);    DECLARE_RESULT_512(3, 0);
-			DECLARE_RESULT_512(0, 1);    DECLARE_RESULT_512(1, 1);    			DECLARE_RESULT_512(2, 1);    DECLARE_RESULT_512(3, 1);
-			DECLARE_RESULT_512(0, 2);    DECLARE_RESULT_512(1, 2);    			DECLARE_RESULT_512(2, 2);    DECLARE_RESULT_512(3, 2);
-			DECLARE_RESULT_512(0, 3);    DECLARE_RESULT_512(1, 3);    			DECLARE_RESULT_512(2, 3);    DECLARE_RESULT_512(3, 3);
-
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-				BROADCAST_LOAD_A_512(x, 1);
-				BROADCAST_LOAD_A_512(x, 2);
-				BROADCAST_LOAD_A_512(x, 3);
-
-				LOAD_B_512(0, x);		LOAD_B_512(1, x);			LOAD_B_512(2, x);		LOAD_B_512(3, x);
-
-				MATMUL_512(0, 0);		MATMUL_512(1, 0);			MATMUL_512(2, 0);		MATMUL_512(3, 0);
-				MATMUL_512(0, 1);		MATMUL_512(1, 1);			MATMUL_512(2, 1);		MATMUL_512(3, 1);
-				MATMUL_512(0, 2);		MATMUL_512(1, 2);			MATMUL_512(2, 2);		MATMUL_512(3, 2);
-				MATMUL_512(0, 3);		MATMUL_512(1, 3);			MATMUL_512(2, 3);		MATMUL_512(3, 3);
-			}
-			STORE_512(0, 0);		STORE_512(1, 0);			STORE_512(2, 0);		STORE_512(3, 0);
-			STORE_512(0, 1);		STORE_512(1, 1);			STORE_512(2, 1);		STORE_512(3, 1);
-			STORE_512(0, 2);		STORE_512(1, 2);			STORE_512(2, 2);		STORE_512(3, 2);
-			STORE_512(0, 3);		STORE_512(1, 3);			STORE_512(2, 3);		STORE_512(3, 3);
-		}
-
-		for (; j < n32; j+= 32) {
-			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);
-			DECLARE_RESULT_512(0, 1);    DECLARE_RESULT_512(1, 1);
-			DECLARE_RESULT_512(0, 2);    DECLARE_RESULT_512(1, 2);
-			DECLARE_RESULT_512(0, 3);    DECLARE_RESULT_512(1, 3);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-				BROADCAST_LOAD_A_512(x, 1);
-				BROADCAST_LOAD_A_512(x, 2);
-				BROADCAST_LOAD_A_512(x, 3);
-
-				LOAD_B_512(0, x);		LOAD_B_512(1, x);
-
-				MATMUL_512(0, 0);		MATMUL_512(1, 0);
-				MATMUL_512(0, 1);		MATMUL_512(1, 1);
-				MATMUL_512(0, 2);		MATMUL_512(1, 2);
-				MATMUL_512(0, 3);		MATMUL_512(1, 3);
-			}
-			STORE_512(0, 0);		STORE_512(1, 0);
-			STORE_512(0, 1);		STORE_512(1, 1);
-			STORE_512(0, 2);		STORE_512(1, 2);
-			STORE_512(0, 3);		STORE_512(1, 3);
-		}
-
-		for (; j < n16; j+= 16) {
-			DECLARE_RESULT_512(0, 0);
-			DECLARE_RESULT_512(0, 1);
-			DECLARE_RESULT_512(0, 2);
-			DECLARE_RESULT_512(0, 3);
-
-		 	for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-				BROADCAST_LOAD_A_512(x, 1);
-				BROADCAST_LOAD_A_512(x, 2);
-				BROADCAST_LOAD_A_512(x, 3);
-
-				LOAD_B_512(0, x);
-
-				MATMUL_512(0, 0);
-				MATMUL_512(0, 1);
-				MATMUL_512(0, 2);
-				MATMUL_512(0, 3);
-			}
-			STORE_512(0, 0);
-			STORE_512(0, 1);
-			STORE_512(0, 2);
-			STORE_512(0, 3);
-		}
-
-		for (; j < n8; j+= 8) {
-			DECLARE_RESULT_256(0, 0);
-			DECLARE_RESULT_256(0, 1);
-			DECLARE_RESULT_256(0, 2);
-			DECLARE_RESULT_256(0, 3);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_256(x, 0);
-				BROADCAST_LOAD_A_256(x, 1);
-				BROADCAST_LOAD_A_256(x, 2);
-				BROADCAST_LOAD_A_256(x, 3);
-
-				LOAD_B_256(0, x);
-
-				MATMUL_256(0, 0);
-				MATMUL_256(0, 1);
-				MATMUL_256(0, 2);
-				MATMUL_256(0, 3);
-			}
-			STORE_256(0, 0);
-			STORE_256(0, 1);
-			STORE_256(0, 2);
-			STORE_256(0, 3);
-		}
-
-		for (; j < n4; j+= 4) {
-			DECLARE_RESULT_128(0, 0);
-			DECLARE_RESULT_128(0, 1);
-			DECLARE_RESULT_128(0, 2);
-			DECLARE_RESULT_128(0, 3);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_128(x, 0);
-				BROADCAST_LOAD_A_128(x, 1);
-				BROADCAST_LOAD_A_128(x, 2);
-				BROADCAST_LOAD_A_128(x, 3);
-
-				LOAD_B_128(0, x);
-
-				MATMUL_128(0, 0);
-				MATMUL_128(0, 1);
-				MATMUL_128(0, 2);
-				MATMUL_128(0, 3);
-			}
-			STORE_128(0, 0);
-			STORE_128(0, 1);
-			STORE_128(0, 2);
-			STORE_128(0, 3);
-		}
-
-		for (; j < n2; j+= 2) {
-			DECLARE_RESULT_SCALAR(0, 0);	DECLARE_RESULT_SCALAR(1, 0);
-			DECLARE_RESULT_SCALAR(0, 1);	DECLARE_RESULT_SCALAR(1, 1);
-			DECLARE_RESULT_SCALAR(0, 2);	DECLARE_RESULT_SCALAR(1, 2);
-			DECLARE_RESULT_SCALAR(0, 3);	DECLARE_RESULT_SCALAR(1, 3);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_SCALAR(x, 0);
-				BROADCAST_LOAD_A_SCALAR(x, 1);
-				BROADCAST_LOAD_A_SCALAR(x, 2);
-				BROADCAST_LOAD_A_SCALAR(x, 3);
-
-				LOAD_B_SCALAR(0, x);	LOAD_B_SCALAR(1, x);
-
-				MATMUL_SCALAR(0, 0);	MATMUL_SCALAR(1, 0);
-				MATMUL_SCALAR(0, 1);	MATMUL_SCALAR(1, 1);
-				MATMUL_SCALAR(0, 2);	MATMUL_SCALAR(1, 2);
-				MATMUL_SCALAR(0, 3);	MATMUL_SCALAR(1, 3);
-			}
-			STORE_SCALAR(0, 0);	STORE_SCALAR(1, 0);
-			STORE_SCALAR(0, 1);	STORE_SCALAR(1, 1);
-			STORE_SCALAR(0, 2);	STORE_SCALAR(1, 2);
-			STORE_SCALAR(0, 3);	STORE_SCALAR(1, 3);
-		}
-
-		for (; j < N; j++) {
-			DECLARE_RESULT_SCALAR(0, 0)
-			DECLARE_RESULT_SCALAR(0, 1)
-			DECLARE_RESULT_SCALAR(0, 2)
-			DECLARE_RESULT_SCALAR(0, 3)
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_SCALAR(0, 0);
-				BROADCAST_LOAD_A_SCALAR(0, 1);
-				BROADCAST_LOAD_A_SCALAR(0, 2);
-				BROADCAST_LOAD_A_SCALAR(0, 3);
-
-				LOAD_B_SCALAR(0, 0);
-
-				MATMUL_SCALAR(0, 0);
-				MATMUL_SCALAR(0, 1);
-				MATMUL_SCALAR(0, 2);
-				MATMUL_SCALAR(0, 3);
-			}
-			STORE_SCALAR(0, 0);
-			STORE_SCALAR(0, 1);
-			STORE_SCALAR(0, 2);
-			STORE_SCALAR(0, 3);
-		}
-	}
-
-	for (; i < m2; i+=2) {
-		j = 0;
-
-		for (; j < n64; j+= 64) {
-			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);    			DECLARE_RESULT_512(2, 0);    DECLARE_RESULT_512(3, 0);
-			DECLARE_RESULT_512(0, 1);    DECLARE_RESULT_512(1, 1);    			DECLARE_RESULT_512(2, 1);    DECLARE_RESULT_512(3, 1);
-
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-				BROADCAST_LOAD_A_512(x, 1);
-
-				LOAD_B_512(0, x);		LOAD_B_512(1, x);			LOAD_B_512(2, x);		LOAD_B_512(3, x);
-
-				MATMUL_512(0, 0);		MATMUL_512(1, 0);			MATMUL_512(2, 0);		MATMUL_512(3, 0);
-				MATMUL_512(0, 1);		MATMUL_512(1, 1);			MATMUL_512(2, 1);		MATMUL_512(3, 1);
-			}
-			STORE_512(0, 0);		STORE_512(1, 0);			STORE_512(2, 0);		STORE_512(3, 0);
-			STORE_512(0, 1);		STORE_512(1, 1);			STORE_512(2, 1);		STORE_512(3, 1);
-		}
-
-		for (; j < n32; j+= 32) {
-			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);
-			DECLARE_RESULT_512(0, 1);    DECLARE_RESULT_512(1, 1);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-				BROADCAST_LOAD_A_512(x, 1);
-
-				LOAD_B_512(0, x);		LOAD_B_512(1, x);
-
-				MATMUL_512(0, 0);		MATMUL_512(1, 0);
-				MATMUL_512(0, 1);		MATMUL_512(1, 1);
-			}
-			STORE_512(0, 0);		STORE_512(1, 0);
-			STORE_512(0, 1);		STORE_512(1, 1);
-		}
-
-
-		for (; j < n16; j+= 16) {
-			DECLARE_RESULT_512(0, 0);
-			DECLARE_RESULT_512(0, 1);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-				BROADCAST_LOAD_A_512(x, 1);
-
-				LOAD_B_512(0, x);
-
-				MATMUL_512(0, 0);
-				MATMUL_512(0, 1);
-			}
-			STORE_512(0, 0);
-			STORE_512(0, 1);
-		}
-
-		for (; j < n8; j+= 8) {
-			DECLARE_RESULT_256(0, 0);
-			DECLARE_RESULT_256(0, 1);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_256(x, 0);
-				BROADCAST_LOAD_A_256(x, 1);
-
-				LOAD_B_256(0, x);
-
-				MATMUL_256(0, 0);
-				MATMUL_256(0, 1);
-			}
-			STORE_256(0, 0);
-			STORE_256(0, 1);
-		}
-
-		for (; j < n4; j+= 4) {
-			DECLARE_RESULT_128(0, 0);
-			DECLARE_RESULT_128(0, 1);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_128(x, 0);
-				BROADCAST_LOAD_A_128(x, 1);
-
-				LOAD_B_128(0, x);
-
-				MATMUL_128(0, 0);
-				MATMUL_128(0, 1);
-			}
-			STORE_128(0, 0);
-			STORE_128(0, 1);
-		}
-		for (; j < n2; j+= 2) {
-			DECLARE_RESULT_SCALAR(0, 0);	DECLARE_RESULT_SCALAR(1, 0);
-			DECLARE_RESULT_SCALAR(0, 1);	DECLARE_RESULT_SCALAR(1, 1);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_SCALAR(x, 0);
-				BROADCAST_LOAD_A_SCALAR(x, 1);
-
-				LOAD_B_SCALAR(0, x);	LOAD_B_SCALAR(1, x);
-
-				MATMUL_SCALAR(0, 0);	MATMUL_SCALAR(1, 0);
-				MATMUL_SCALAR(0, 1);	MATMUL_SCALAR(1, 1);
-			}
-			STORE_SCALAR(0, 0);	STORE_SCALAR(1, 0);
-			STORE_SCALAR(0, 1);	STORE_SCALAR(1, 1);
-		}
-
-		for (; j < N; j++) {
-			DECLARE_RESULT_SCALAR(0, 0);
-			DECLARE_RESULT_SCALAR(0, 1);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_SCALAR(0, 0);
-				BROADCAST_LOAD_A_SCALAR(0, 1);
-
-				LOAD_B_SCALAR(0, 0);
-
-				MATMUL_SCALAR(0, 0);
-				MATMUL_SCALAR(0, 1);
-			}
-			STORE_SCALAR(0, 0);
-			STORE_SCALAR(0, 1);
-		}
-	}
-
-	for (; i < M; i+=1) {
-		j = 0;
-		for (; j < n64; j+= 64) {
-			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);    			DECLARE_RESULT_512(2, 0);    DECLARE_RESULT_512(3, 0);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-				LOAD_B_512(0, x);		LOAD_B_512(1, x);			LOAD_B_512(2, x);		LOAD_B_512(3, x);
-				MATMUL_512(0, 0);		MATMUL_512(1, 0);			MATMUL_512(2, 0);		MATMUL_512(3, 0);
-			}
-			STORE_512(0, 0);		STORE_512(1, 0);			STORE_512(2, 0);		STORE_512(3, 0);
-		}
-		for (; j < n32; j+= 32) {
-			DECLARE_RESULT_512(0, 0);    DECLARE_RESULT_512(1, 0);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-				LOAD_B_512(0, x);		LOAD_B_512(1, x);
-				MATMUL_512(0, 0);		MATMUL_512(1, 0);
-			}
-			STORE_512(0, 0);		STORE_512(1, 0);
-		}
-
-
-		for (; j < n16; j+= 16) {
-			DECLARE_RESULT_512(0, 0);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_512(x, 0);
-
-				LOAD_B_512(0, x);
-
-				MATMUL_512(0, 0);
-			}
-			STORE_512(0, 0);
-		}
-
-		for (; j < n8; j+= 8) {
-			DECLARE_RESULT_256(0, 0);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_256(x, 0);
-				LOAD_B_256(0, x);
-				MATMUL_256(0, 0);
-			}
-			STORE_256(0, 0);
-		}
-
-		for (; j < n4; j+= 4) {
-			DECLARE_RESULT_128(0, 0);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_128(x, 0);
-				LOAD_B_128(0, x);
-				MATMUL_128(0, 0);
-			}
-			STORE_128(0, 0);
-		}
-
-		for (; j < n2; j+= 2) {
-			DECLARE_RESULT_SCALAR(0, 0);	DECLARE_RESULT_SCALAR(1, 0);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_SCALAR(x, 0);
-				LOAD_B_SCALAR(0, 0);	LOAD_B_SCALAR(1, 0);
-				MATMUL_SCALAR(0, 0);	MATMUL_SCALAR(1, 0);
-			}
-			STORE_SCALAR(0, 0);	STORE_SCALAR(1, 0);
-		}
-
-		for (; j < N; j++) {
-			DECLARE_RESULT_SCALAR(0, 0);
-
-			for (k = 0; k < K; k++) {
-				BROADCAST_LOAD_A_SCALAR(0, 0);
-				LOAD_B_SCALAR(0, 0);
-				MATMUL_SCALAR(0, 0);
-			}
-			STORE_SCALAR(0, 0);
-		}
-	}
-}
+#include "sgemm_direct_skylakex.c"

From 952cc2ba3860419defed3c27af1c3becca9e40e9 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 13 Jan 2020 16:58:54 +0800
Subject: [PATCH 180/210] Update sgemm_kernel_16x4_skylakex_2.c

---
 kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
index e4ca6b1bd..6ca822b91 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c
@@ -376,5 +376,5 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
     if(n_count>0) COMPUTE(1)
     return 0;
 }
-
+#include <immintrin.h>
 #include "sgemm_direct_skylakex.c"

From e5dcdeb5506a8e0ab26e0956c5b8e7fed7e80e9a Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 13 Jan 2020 16:59:23 +0800
Subject: [PATCH 181/210] Update sgemm_direct_skylakex.c

---
 kernel/x86_64/sgemm_direct_skylakex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c
index 4f9af6e57..0e8f1318f 100644
--- a/kernel/x86_64/sgemm_direct_skylakex.c
+++ b/kernel/x86_64/sgemm_direct_skylakex.c
@@ -1,6 +1,6 @@
 
 /* the direct sgemm code written by Arjan van der Ven */
-#include <immintrin.h>
+//#include <immintrin.h>
 
 /*
  * "Direct sgemm" code. This code operates directly on the inputs and outputs

From 78100b80935753a7a86c6a5380e2a53bc9469b7f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 18 Jan 2020 15:06:39 +0100
Subject: [PATCH 182/210] Free Windows thread memory with MEM_RELEASE rather
 than MEM_DECOMMIT

as suggested by hjmndv in #2370
---
 driver/others/memory.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/driver/others/memory.c b/driver/others/memory.c
index 55dce72b8..62a5a0214 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -822,7 +822,7 @@ static void *alloc_qalloc(void *address){
 
 static void alloc_windows_free(struct alloc_t *alloc_info){
 
-  VirtualFree(alloc_info, allocation_block_size, MEM_DECOMMIT);
+  VirtualFree(alloc_info, 0, MEM_RELEASE);
 
 }
 
@@ -935,7 +935,7 @@ static void alloc_hugetlb_free(struct alloc_t *alloc_info){
 
 #ifdef OS_WINDOWS
 
-  VirtualFree(alloc_info, allocation_block_size, MEM_LARGE_PAGES | MEM_DECOMMIT);
+  VirtualFree(alloc_info, 0, MEM_LARGE_PAGES | MEM_RELEASE);
 
 #endif
 
@@ -2310,7 +2310,7 @@ static void *alloc_qalloc(void *address){
 
 static void alloc_windows_free(struct release_t *release){
 
-  VirtualFree(release -> address, BUFFER_SIZE, MEM_DECOMMIT);
+  VirtualFree(release -> address, 0, MEM_RELEASE);
 
 }
 
@@ -2432,7 +2432,7 @@ static void alloc_hugetlb_free(struct release_t *release){
 
 #ifdef OS_WINDOWS
 
-  VirtualFree(release -> address, BUFFER_SIZE, MEM_LARGE_PAGES | MEM_DECOMMIT);
+  VirtualFree(release -> address, 0, MEM_LARGE_PAGES | MEM_RELEASE);
 
 #endif
 

From 23f322f997c8b018977be24122c56fb62d728a05 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 19 Jan 2020 13:28:27 +0100
Subject: [PATCH 183/210] Do not run any cleanup if the program is exiting
 anyway

From keno's PR #2350 - this avoids the potential hang in blas_thread_shutdown where we may wait for threads to exit while they are waiting on the loader lock from DllMain
---
 exports/dllinit.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/exports/dllinit.c b/exports/dllinit.c
index 4a05c0e14..88f9af658 100644
--- a/exports/dllinit.c
+++ b/exports/dllinit.c
@@ -50,7 +50,10 @@ BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) {
         gotoblas_init();
         break;
       case DLL_PROCESS_DETACH:
-        gotoblas_quit();
+        // If the process is about to exit, don't bother releasing any resources
+        // The kernel is much better at bulk releasing then.
+        if (!reserved)
+          gotoblas_quit();
         break;
       case DLL_THREAD_ATTACH:
         break;

From ff42e68652fbba58936c9c66d0b060c3a6d694e7 Mon Sep 17 00:00:00 2001
From: Qiyu8 <fangchunlin@huawei.com>
Date: Mon, 20 Jan 2020 11:49:42 +0800
Subject: [PATCH 184/210] Optimize genenal Gemm Beta

---
 kernel/generic/gemm_beta.c | 132 ++++++++++++-------------------------
 1 file changed, 42 insertions(+), 90 deletions(-)

diff --git a/kernel/generic/gemm_beta.c b/kernel/generic/gemm_beta.c
index c4e4f7abe..fa9d7680d 100644
--- a/kernel/generic/gemm_beta.c
+++ b/kernel/generic/gemm_beta.c
@@ -42,101 +42,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
 	  FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5,
 	  FLOAT *c, BLASLONG ldc){
 
+
   BLASLONG i, j;
+  BLASLONG chunk, remain;
   FLOAT *c_offset1, *c_offset;
-  FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
-  FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
-
   c_offset = c;
-
+  chunk = m >> 3;
+  remain = m & 7;
   if (beta == ZERO){
-
-    j = n;
-    do {
-      c_offset1 = c_offset;
-      c_offset += ldc;
-
-      i = (m >> 3);
-      if (i > 0){
-	do {
-	  *(c_offset1 + 0) = ZERO;
-	  *(c_offset1 + 1) = ZERO;
-	  *(c_offset1 + 2) = ZERO;
-	  *(c_offset1 + 3) = ZERO;
-	  *(c_offset1 + 4) = ZERO;
-	  *(c_offset1 + 5) = ZERO;
-	  *(c_offset1 + 6) = ZERO;
-	  *(c_offset1 + 7) = ZERO;
-	  c_offset1 += 8;
-	  i --;
-	} while (i > 0);
-      }
-
-      i = (m & 7);
-      if (i > 0){
-	do {
-	  *c_offset1 = ZERO;
-	  c_offset1 ++;
-	  i --;
-	} while (i > 0);
-      }
-      j --;
-    } while (j > 0);
-
+	  for(j=n; j>0; j--){
+		c_offset1 = c_offset;
+		c_offset += ldc;
+		for(i=chunk; i>0; i--){
+			*(c_offset1 + 0) = ZERO;
+			*(c_offset1 + 1) = ZERO;
+			*(c_offset1 + 2) = ZERO;
+			*(c_offset1 + 3) = ZERO;
+			*(c_offset1 + 4) = ZERO;
+			*(c_offset1 + 5) = ZERO;
+			*(c_offset1 + 6) = ZERO;
+			*(c_offset1 + 7) = ZERO;
+			c_offset1 += 8;
+		}
+		for(i=remain; i>0; i--){
+			*c_offset1 = ZERO;
+			c_offset1 ++;
+		}
+	  }
   } else {
-
-    j = n;
-    do {
-      c_offset1 = c_offset;
-      c_offset += ldc;
-
-      i = (m >> 3);
-      if (i > 0){
-	do {
-	  ctemp1 = *(c_offset1 + 0);
-	  ctemp2 = *(c_offset1 + 1);
-	  ctemp3 = *(c_offset1 + 2);
-	  ctemp4 = *(c_offset1 + 3);
-	  ctemp5 = *(c_offset1 + 4);
-	  ctemp6 = *(c_offset1 + 5);
-	  ctemp7 = *(c_offset1 + 6);
-	  ctemp8 = *(c_offset1 + 7);
-
-	  ctemp1 *= beta;
-	  ctemp2 *= beta;
-	  ctemp3 *= beta;
-	  ctemp4 *= beta;
-	  ctemp5 *= beta;
-	  ctemp6 *= beta;
-	  ctemp7 *= beta;
-	  ctemp8 *= beta;
-
-	  *(c_offset1 + 0) = ctemp1;
-	  *(c_offset1 + 1) = ctemp2;
-	  *(c_offset1 + 2) = ctemp3;
-	  *(c_offset1 + 3) = ctemp4;
-	  *(c_offset1 + 4) = ctemp5;
-	  *(c_offset1 + 5) = ctemp6;
-	  *(c_offset1 + 6) = ctemp7;
-	  *(c_offset1 + 7) = ctemp8;
-	  c_offset1 += 8;
-	  i --;
-	} while (i > 0);
-      }
-
-      i = (m & 7);
-      if (i > 0){
-	do {
-	  ctemp1 = *c_offset1;
-	  ctemp1 *= beta;
-	  *c_offset1 = ctemp1;
-	  c_offset1 ++;
-	  i --;
-	} while (i > 0);
-      }
-      j --;
-    } while (j > 0);
-
+	  for(j=n; j>0; j--){
+		c_offset1 = c_offset;
+		c_offset += ldc;
+		for(i=chunk; i>0; i--){
+			*(c_offset1 + 0) *= beta;
+			*(c_offset1 + 1) *= beta;
+			*(c_offset1 + 2) *= beta;
+			*(c_offset1 + 3) *= beta;
+			*(c_offset1 + 4) *= beta;
+			*(c_offset1 + 5) *= beta;
+			*(c_offset1 + 6) *= beta;
+			*(c_offset1 + 7) *= beta;
+			c_offset1 += 8;
+		}
+		for(i=remain; i>0; i--){
+			*c_offset1 *= beta;
+			c_offset1 ++;
+		}
+	  }
   }
   return 0;
 };

From fbf4f48f4a3d324dd268aaad51624022ee4f0ea2 Mon Sep 17 00:00:00 2001
From: "Wang,Long" <long1.wang@intel.com>
Date: Wed, 22 Jan 2020 15:07:50 +0000
Subject: [PATCH 185/210] fix a few performance drop in some matrix size per
 data type

Signed-off-by: Wang,Long <long1.wang@intel.com>
---
 param.h | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/param.h b/param.h
index 3baae31cf..075c12ca2 100644
--- a/param.h
+++ b/param.h
@@ -1507,8 +1507,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define SYMV_P  8
 
-#define SWITCH_RATIO	32
-#define GEMM_PREFERED_SIZE	16
+#if defined(XDOUBLE) || defined(DOUBLE)
+#define SWITCH_RATIO            4
+#define GEMM_PREFERED_SIZE      4
+#else
+#define SWITCH_RATIO            8
+#define GEMM_PREFERED_SIZE      8
+#endif
 
 #ifdef ARCH_X86
 
@@ -1627,8 +1632,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define SYMV_P  8
 
-#define SWITCH_RATIO	32
-#define GEMM_PREFERED_SIZE	32
+#if defined(XDOUBLE) || defined(DOUBLE)
+#define SWITCH_RATIO           8
+#define GEMM_PREFERED_SIZE     8
+#else
+#define SWITCH_RATIO           16
+#define GEMM_PREFERED_SIZE     16
+#endif
 #define USE_SGEMM_KERNEL_DIRECT 1
 
 #ifdef ARCH_X86

From e9fb8f62b1822c456ccc0b9db23f49aa66dd6801 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Wed, 22 Jan 2020 17:40:03 +0000
Subject: [PATCH 186/210] Update level3_gemm3m_thread.c

---
 driver/level3/level3_gemm3m_thread.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/driver/level3/level3_gemm3m_thread.c b/driver/level3/level3_gemm3m_thread.c
index 21d431b60..9216daaed 100644
--- a/driver/level3/level3_gemm3m_thread.c
+++ b/driver/level3/level3_gemm3m_thread.c
@@ -104,7 +104,7 @@ typedef struct {
 #define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
 	GEMM_BETA((M_TO) - (M_FROM), (N_TO - N_FROM), 0, \
 		  BETA[0], BETA[1], NULL, 0, NULL, 0, \
-		  (FLOAT *)(C) + (M_FROM) + (N_FROM) * (LDC) * COMPSIZE, LDC)
+		  (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, LDC)
 #endif
 
 #ifndef ICOPYB_OPERATION
@@ -414,7 +414,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
       for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
 	min_jj = MIN(n_to, xxx + div_n) - jjs;
-	if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
+	if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
 
 	START_RPCC();
 
@@ -550,7 +550,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
       for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
 	min_jj = MIN(n_to, xxx + div_n) - jjs;
-	if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
+	if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
 
 	START_RPCC();
 
@@ -687,7 +687,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 
       for(jjs = xxx; jjs < MIN(n_to, xxx + div_n); jjs += min_jj){
 	min_jj = MIN(n_to, xxx + div_n) - jjs;
-	if (min_jj > GEMM3M_UNROLL_N) min_jj = GEMM3M_UNROLL_N;
+	if (min_jj > GEMM3M_UNROLL_N*3) min_jj = GEMM3M_UNROLL_N*3;
 
 	START_RPCC();
 

From 8dc9fd4dfeb894d8b7553c8e5fcc991917335557 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 30 Jan 2020 12:41:18 +0100
Subject: [PATCH 187/210] Add -march option for AVX512

---
 cmake/cc.cmake | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/cmake/cc.cmake b/cmake/cc.cmake
index 37da0d6ed..22217575c 100644
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@@ -96,3 +96,10 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN")
   endif ()
 endif ()
 
+if (${CORE} STREQUAL "SKYLAKEX")
+  if (NOT DYNAMIC_ARCH)
+    if (NOT NO_AVX512)
+      set (CCOMMON_OPT = "${CCOMMON_OPT} -march=skylake-avx512")
+    endif ()
+  endif ()
+endif ()

From 8019e70211f5e6679e1e5afd5658016d8045de19 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 3 Feb 2020 21:32:56 +0800
Subject: [PATCH 188/210] AVX512 16x2 DGEMM kernel

---
 kernel/x86_64/dgemm_kernel_16x2_skylakex.c | 488 +++++++++++++++++++++
 1 file changed, 488 insertions(+)
 create mode 100644 kernel/x86_64/dgemm_kernel_16x2_skylakex.c

diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c
new file mode 100644
index 000000000..250ff8d49
--- /dev/null
+++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c
@@ -0,0 +1,488 @@
+#if (defined (LEFT) && !defined(TRANSA)) || (!defined (LEFT) && defined(TRANSA))
+  #define BACKWARDS 1
+#else
+  #define BACKWARDS 0
+#endif
+#define GEMM_SET_PB "movq %%r14,%1; leaq (%%r14,%%r12,2),%%r15; addq %%r12,%%r15;"
+#define set_p_copy1(ptr) "sarq $1,%%r12; addq %%r12,"#ptr"; salq $1,%%r12; salq $3,%%r13; subq %%r13,"#ptr"; sarq $3,%%r13;"
+#define set_p_copy2(ptr) "addq %%r12,"#ptr"; salq $4,%%r13; subq %%r13,"#ptr"; sarq $4,%%r13;"
+#define set_p_copy4(ptr) "leaq ("#ptr",%%r12,2),"#ptr"; salq $5,%%r13; subq %%r13,"#ptr"; sarq $5,%%r13;"
+#define set_p_copy8(ptr) "leaq ("#ptr",%%r12,4),"#ptr"; salq $6,%%r13; subq %%r13,"#ptr"; sarq $6,%%r13;"
+#define set_p_copy16(ptr) "leaq ("#ptr",%%r12,8),"#ptr"; salq $7,%%r13; subq %%r13,"#ptr"; sarq $7,%%r13;"
+#define set_p_b_dim1(ptr) set_p_copy1(ptr)
+#define set_p_b_dim2(ptr) set_p_copy2(ptr)
+#define set_p_b_dim4(ptr) set_p_copy2(ptr)
+#define set_p_b_dim6(ptr) set_p_copy2(ptr)
+#define set_p_b_dim8(ptr) set_p_copy2(ptr)
+#define set_p_b_dim10(ptr) set_p_copy2(ptr)
+#define set_p_b_dim12(ptr) set_p_copy2(ptr)
+#ifdef TRMMKERNEL
+  #if BACKWARDS == 1
+    #define INIT_set_papb(mdim,ndim) GEMM_SET_PB set_p_copy##mdim(%0) set_p_b_dim##ndim(%1) set_p_b_dim##ndim(%%r15)
+    #define SAVE_set_pa(mdim) ""
+  #else
+    #define INIT_set_papb(mdim,ndim) GEMM_SET_PB
+    #define SAVE_set_pa(mdim) set_p_copy##mdim(%0)
+  #endif
+#else
+  #define INIT_set_papb(mdim,ndim) GEMM_SET_PB
+  #define SAVE_set_pa(mdim) ""
+#endif
+#if defined(TRMMKERNEL) && !defined(LEFT)
+  #if BACKWARDS == 1
+    #define HEAD_SET_OFF(ndim) {}
+    #define TAIL_SET_OFF(ndim) {off += ndim;}
+    #define kernel_kstart_n4(mdim,updk) KERNEL_k1m##mdim##n2 KERNEL_k1m##mdim##n2 "addq $32,%%r15; "#updk" $2,%5;"
+    #define kernel_kstart_n6(mdim,updk) kernel_kstart_n4(mdim,updk) KERNEL_k1m##mdim##n4 KERNEL_k1m##mdim##n4 "addq $32,%%r15; "#updk" $2,%5;"
+    #define kernel_kstart_n8(mdim,updk) kernel_kstart_n6(mdim,updk) KERNEL_k1m##mdim##n6 KERNEL_k1m##mdim##n6 "addq $32,%%r15; "#updk" $2,%5;"
+    #define kernel_kstart_n10(mdim,updk) kernel_kstart_n8(mdim,updk) KERNEL_k1m##mdim##n8 KERNEL_k1m##mdim##n8 #updk" $2,%5;"
+    #define kernel_kstart_n12(mdim,updk) kernel_kstart_n10(mdim,updk) KERNEL_k1m##mdim##n10 KERNEL_k1m##mdim##n10 #updk" $2,%5;"
+    #define kernel_kend_n4(mdim) ""
+    #define kernel_kend_n6(mdim) ""
+    #define kernel_kend_n8(mdim) ""
+    #define kernel_kend_n10(mdim) ""
+    #define kernel_kend_n12(mdim) ""
+  #else
+    #define HEAD_SET_OFF(ndim) {off += (ndim > 2 ? 2 : ndim);}
+    #define TAIL_SET_OFF(ndim) {off += (ndim > 2 ? (ndim-2) : 0);}
+    #define kernel_kstart_n4(mdim,updk) ""
+    #define kernel_kstart_n6(mdim,updk) ""
+    #define kernel_kstart_n8(mdim,updk) ""
+    #define kernel_kstart_n10(mdim,updk) ""
+    #define kernel_kstart_n12(mdim,updk) ""
+    #define kernel_kend_n4(mdim) "xorq %3,%3;"\
+      loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8)\
+      loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24)
+    #define kernel_kend_n6(mdim) "xorq %3,%3;"\
+      loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8)\
+      loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24)\
+      loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40)\
+      loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56)
+    #define kernel_kend_n8(mdim) "xorq %3,%3;"\
+      loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8)\
+      loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24)\
+      loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40)\
+      loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56)\
+      loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72)\
+      loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88)
+    #define kernel_kend_n10(mdim) "xorq %3,%3;"\
+      loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8)\
+      loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24)\
+      loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40)\
+      loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56)\
+      loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72)\
+      loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88)\
+      loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104)\
+      loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120)
+    #define kernel_kend_n12(mdim) "xorq %3,%3;"\
+      loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(0,8) acc_kend_nc3_k1m##mdim(0,8) acc_kend_nc4_k1m##mdim(0,8) acc_kend_nc5_k1m##mdim(0,8) acc_kend_nc6_k1m##mdim(0,8)\
+      loada_kend_k1m##mdim acc_kend_nc2_k1m##mdim(16,24) acc_kend_nc3_k1m##mdim(16,24) acc_kend_nc4_k1m##mdim(16,24) acc_kend_nc5_k1m##mdim(16,24) acc_kend_nc6_k1m##mdim(16,24)\
+      loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(32,40) acc_kend_nc4_k1m##mdim(32,40) acc_kend_nc5_k1m##mdim(32,40) acc_kend_nc6_k1m##mdim(32,40)\
+      loada_kend_k1m##mdim acc_kend_nc3_k1m##mdim(48,56) acc_kend_nc4_k1m##mdim(48,56) acc_kend_nc5_k1m##mdim(48,56) acc_kend_nc6_k1m##mdim(48,56)\
+      loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(64,72) acc_kend_nc5_k1m##mdim(64,72) acc_kend_nc6_k1m##mdim(64,72)\
+      loada_kend_k1m##mdim acc_kend_nc4_k1m##mdim(80,88) acc_kend_nc5_k1m##mdim(80,88) acc_kend_nc6_k1m##mdim(80,88)\
+      loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(96,104) acc_kend_nc6_k1m##mdim(96,104)\
+      loada_kend_k1m##mdim acc_kend_nc5_k1m##mdim(112,120) acc_kend_nc6_k1m##mdim(112,120)\
+      loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(128,136)\
+      loada_kend_k1m##mdim acc_kend_nc6_k1m##mdim(144,152)
+  #endif
+#else
+  #define HEAD_SET_OFF(ndim) {}
+  #define TAIL_SET_OFF(ndim) {}
+  #define kernel_kstart_n4(mdim,updk) ""
+  #define kernel_kstart_n6(mdim,updk) ""
+  #define kernel_kstart_n8(mdim,updk) ""
+  #define kernel_kstart_n10(mdim,updk) ""
+  #define kernel_kstart_n12(mdim,updk) ""
+  #define kernel_kend_n4(mdim) ""
+  #define kernel_kend_n6(mdim) ""
+  #define kernel_kend_n8(mdim) ""
+  #define kernel_kend_n10(mdim) ""
+  #define kernel_kend_n12(mdim) ""
+#endif
+#define kernel_kstart_n1(mdim,updk) ""
+#define kernel_kstart_n2(mdim,updk) ""
+#define kernel_kend_n1(mdim) ""
+#define kernel_kend_n2(mdim) ""
+
+#ifdef TRMMKERNEL
+  #if BACKWARDS == 1
+    #define INITASM_SET_K "movq %10,%%r13; subq %9,%%r13;"
+  #else
+    #define INITASM_SET_K "movq %9,%%r13;"
+  #endif
+#else
+  #define INITASM_SET_K "movq %10,%%r13;"
+#endif
+#if defined(TRMMKERNEL) && defined(LEFT)
+  #if BACKWARDS==1
+    #define init_update_k(mdim) ""
+    #define save_update_k(mdim) "subq $"#mdim",%%r13;"
+  #else
+    #define init_update_k(mdim) "addq $"#mdim",%%r13;"
+    #define save_update_k(mdim) ""
+  #endif
+#else
+  #define init_update_k(mdim) ""
+  #define save_update_k(mdim) ""
+#endif
+    
+#define KERNEL_h_k1m16n1 \
+  "vmovupd (%0),%%zmm1; vmovupd 64(%0),%%zmm2; addq $128,%0;"\
+  "vbroadcastsd (%1),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm8; vfmadd231pd %%zmm2,%%zmm3,%%zmm9;"
+#define KERNEL_k1m16n1 KERNEL_h_k1m16n1 "addq $8,%1;"
+#define KERNEL_h_k1m16n2 KERNEL_h_k1m16n1\
+  "vbroadcastsd 8(%1),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm10; vfmadd231pd %%zmm2,%%zmm4,%%zmm11;"
+#define KERNEL_k1m16n2 KERNEL_h_k1m16n2 "addq $16,%1;"
+#define unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,boff1,boff2,...)\
+  "vbroadcastsd "#boff1"("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";"\
+  "vbroadcastsd "#boff2"("#__VA_ARGS__"),%%zmm4; vfmadd231pd %%zmm1,%%zmm4,%%zmm"#c3_no"; vfmadd231pd %%zmm2,%%zmm4,%%zmm"#c4_no";"
+#define unit_acc_m16n2(c1_no,c2_no,c3_no,c4_no,...) unit_acc_gen_m16n2(c1_no,c2_no,c3_no,c4_no,0,8,__VA_ARGS__)
+#define KERNEL_h_k1m16n4 KERNEL_h_k1m16n2 "prefetcht0 384(%0);" unit_acc_m16n2(12,13,14,15,%1,%%r12,1)
+#define KERNEL_k1m16n4 KERNEL_h_k1m16n4 "addq $16,%1;"
+#define KERNEL_k1m16n6 KERNEL_h_k1m16n4 unit_acc_m16n2(16,17,18,19,%1,%%r12,2) "addq $16,%1;"
+#define KERNEL_h_k1m16n8 KERNEL_k1m16n6 "prefetcht0 448(%0);" unit_acc_m16n2(20,21,22,23,%%r15)
+#define KERNEL_k1m16n8 KERNEL_h_k1m16n8 "addq $16,%%r15;"
+#define KERNEL_h_k1m16n10 KERNEL_h_k1m16n8 unit_acc_m16n2(24,25,26,27,%%r15,%%r12,1)
+#define KERNEL_k1m16n10 KERNEL_h_k1m16n10 "addq $16,%%r15;"
+#define KERNEL_h_k1m16n12 KERNEL_h_k1m16n10 unit_acc_m16n2(28,29,30,31,%%r15,%%r12,2)
+#define KERNEL_k1m16n12 KERNEL_h_k1m16n12 "addq $16,%%r15;"
+#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
+  #define loada_kend_k1m16 "vmovupd (%0,%3,1),%%zmm1; vmovupd 64(%0,%3,1),%%zmm2; addq $128,%3;"
+  #define acc_kend_nc2_k1m16(boff1,boff2) unit_acc_gen_m16n2(12,13,14,15,boff1,boff2,%1,%%r12,1)
+  #define acc_kend_nc3_k1m16(boff1,boff2) unit_acc_gen_m16n2(16,17,18,19,boff1,boff2,%1,%%r12,2)
+  #define acc_kend_nc4_k1m16(boff1,boff2) unit_acc_gen_m16n2(20,21,22,23,boff1,boff2,%%r15)
+  #define acc_kend_nc5_k1m16(boff1,boff2) unit_acc_gen_m16n2(24,25,26,27,boff1,boff2,%%r15,%%r12,1)
+  #define acc_kend_nc6_k1m16(boff1,boff2) unit_acc_gen_m16n2(28,29,30,31,boff1,boff2,%%r15,%%r12,2)
+#endif
+#define save_init_m16 "movq %2,%3; addq $128,%2;"
+#ifdef TRMMKERNEL
+  #define SAVE_m16n1 "vmulpd %%zmm8,%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vmulpd %%zmm9,%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;"
+  #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\
+    "vmulpd %%zmm"#c1_no",%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vmulpd %%zmm"#c2_no",%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\
+    "vmulpd %%zmm"#c3_no",%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vmulpd %%zmm"#c4_no",%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;"
+#else
+  #define SAVE_m16n1 "vfmadd213pd (%2),%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); vfmadd213pd 64(%2),%%zmm0,%%zmm9; vmovupd %%zmm9,64(%2); addq $128,%2;"
+  #define unit_save_m16n2(c1_no,c2_no,c3_no,c4_no)\
+    "vfmadd213pd (%3),%%zmm0,%%zmm"#c1_no"; vmovupd %%zmm"#c1_no",(%3); vfmadd213pd 64(%3),%%zmm0,%%zmm"#c2_no"; vmovupd %%zmm"#c2_no",64(%3);"\
+    "vfmadd213pd (%3,%4,1),%%zmm0,%%zmm"#c3_no"; vmovupd %%zmm"#c3_no",(%3,%4,1); vfmadd213pd 64(%3,%4,1),%%zmm0,%%zmm"#c4_no"; vmovupd %%zmm"#c4_no",64(%3,%4,1); leaq (%3,%4,2),%3;"
+#endif
+#define SAVE_m16n2 save_init_m16 unit_save_m16n2(8,9,10,11)
+#define SAVE_m16n4 SAVE_m16n2 unit_save_m16n2(12,13,14,15)
+#define SAVE_m16n6 SAVE_m16n4 unit_save_m16n2(16,17,18,19)
+#define SAVE_m16n8 SAVE_m16n6 unit_save_m16n2(20,21,22,23)
+#define SAVE_m16n10 SAVE_m16n8 unit_save_m16n2(24,25,26,27)
+#define SAVE_m16n12 SAVE_m16n10 unit_save_m16n2(28,29,30,31)
+#define unit_init_2zmm(c1_no,c2_no) "vpxorq %%zmm"#c1_no",%%zmm"#c1_no",%%zmm"#c1_no"; vpxorq %%zmm"#c2_no",%%zmm"#c2_no",%%zmm"#c2_no";"
+#define unit_init_4zmm(c1_no,c2_no,c3_no,c4_no) unit_init_2zmm(c1_no,c2_no) unit_init_2zmm(c3_no,c4_no)
+#define INIT_m16n1 unit_init_2zmm(8,9)
+#define INIT_m16n2 unit_init_4zmm(8,9,10,11)
+#define INIT_m16n4 INIT_m16n2 unit_init_4zmm(12,13,14,15)
+#define INIT_m16n6 INIT_m16n4 unit_init_4zmm(16,17,18,19)
+#define INIT_m16n8 INIT_m16n6 unit_init_4zmm(20,21,22,23)
+#define INIT_m16n10 INIT_m16n8 unit_init_4zmm(24,25,26,27)
+#define INIT_m16n12 INIT_m16n10 unit_init_4zmm(28,29,30,31)
+
+#define KERNEL_k1m8n1 \
+  "vbroadcastsd (%1),%%zmm1; addq $8,%1;"\
+  "vfmadd231pd (%0),%%zmm1,%%zmm8; addq $64,%0;"
+#define unit_acc_gen_m8n2(c1_no,c2_no,boff,...)\
+  "vbroadcastf32x4 "#boff"("#__VA_ARGS__"),%%zmm3; vfmadd231pd %%zmm1,%%zmm3,%%zmm"#c1_no"; vfmadd231pd %%zmm2,%%zmm3,%%zmm"#c2_no";"
+#define unit_acc_m8n2(c1_no,c2_no,...) unit_acc_gen_m8n2(c1_no,c2_no,0,__VA_ARGS__)
+#define KERNEL_h_k1m8n2 \
+  "vmovddup (%0),%%zmm1; vmovddup 8(%0),%%zmm2; addq $64,%0;" unit_acc_m8n2(8,9,%1)
+#define KERNEL_k1m8n2 KERNEL_h_k1m8n2 "addq $16,%1;"
+#define KERNEL_h_k1m8n4 KERNEL_h_k1m8n2 unit_acc_m8n2(10,11,%1,%%r12,1)
+#define KERNEL_k1m8n4 KERNEL_h_k1m8n4 "addq $16,%1;"
+#define KERNEL_k1m8n6 KERNEL_h_k1m8n4 unit_acc_m8n2(12,13,%1,%%r12,2) "addq $16,%1;"
+#define KERNEL_h_k1m8n8 KERNEL_k1m8n6 unit_acc_m8n2(14,15,%%r15)
+#define KERNEL_k1m8n8 KERNEL_h_k1m8n8 "addq $16,%%r15;"
+#define KERNEL_h_k1m8n10 KERNEL_h_k1m8n8 unit_acc_m8n2(16,17,%%r15,%%r12,1)
+#define KERNEL_k1m8n10 KERNEL_h_k1m8n10 "addq $16,%%r15;"
+#define KERNEL_h_k1m8n12 KERNEL_h_k1m8n10 unit_acc_m8n2(18,19,%%r15,%%r12,2)
+#define KERNEL_k1m8n12 KERNEL_h_k1m8n12 "addq $16,%%r15;"
+#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
+  #define loada_kend_k1m8 "vmovddup (%0,%3,1),%%zmm1; vmovddup 8(%0,%3,1),%%zmm2; addq $64,%3;"
+  #define acc_kend_nc2_k1m8(boff1,boff2) unit_acc_gen_m8n2(10,11,boff1,%1,%%r12,1)
+  #define acc_kend_nc3_k1m8(boff1,boff2) unit_acc_gen_m8n2(12,13,boff1,%1,%%r12,2)
+  #define acc_kend_nc4_k1m8(boff1,boff2) unit_acc_gen_m8n2(14,15,boff1,%%r15)
+  #define acc_kend_nc5_k1m8(boff1,boff2) unit_acc_gen_m8n2(16,17,boff1,%%r15,%%r12,1)
+  #define acc_kend_nc6_k1m8(boff1,boff2) unit_acc_gen_m8n2(18,19,boff1,%%r15,%%r12,2)
+#endif
+#define save_init_m8 "movq %2,%3; addq $64,%2;"
+#ifdef TRMMKERNEL
+  #define SAVE_m8n1 "vmulpd %%zmm8,%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); addq $64,%2;"
+  #define unit_save_m8n2(c1_no,c2_no)\
+    "vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vmulpd %%zmm1,%%zmm0,%%zmm1; vmovupd %%zmm1,(%3);"\
+    "vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm2; vmulpd %%zmm2,%%zmm0,%%zmm2; vmovupd %%zmm2,(%3,%4,1); leaq (%3,%4,2),%3;"
+#else
+  #define SAVE_m8n1 "vfmadd213pd (%2),%%zmm0,%%zmm8; vmovupd %%zmm8,(%2); addq $64,%2;"
+  #define unit_save_m8n2(c1_no,c2_no)\
+    "vunpcklpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm1; vfmadd213pd (%3),%%zmm0,%%zmm1; vmovupd %%zmm1,(%3);"\
+    "vunpckhpd %%zmm"#c2_no",%%zmm"#c1_no",%%zmm2; vfmadd213pd (%3,%4,1),%%zmm0,%%zmm2; vmovupd %%zmm2,(%3,%4,1); leaq (%3,%4,2),%3;"
+#endif
+#define SAVE_m8n2 save_init_m8 unit_save_m8n2(8,9)
+#define SAVE_m8n4 SAVE_m8n2 unit_save_m8n2(10,11)
+#define SAVE_m8n6 SAVE_m8n4 unit_save_m8n2(12,13)
+#define SAVE_m8n8 SAVE_m8n6 unit_save_m8n2(14,15)
+#define SAVE_m8n10 SAVE_m8n8 unit_save_m8n2(16,17)
+#define SAVE_m8n12 SAVE_m8n10 unit_save_m8n2(18,19)
+#define INIT_m8n1 "vpxorq %%zmm8,%%zmm8,%%zmm8;"
+#define INIT_m8n2 unit_init_2zmm(8,9)
+#define INIT_m8n4 INIT_m8n2 unit_init_2zmm(10,11)
+#define INIT_m8n6 INIT_m8n4 unit_init_2zmm(12,13)
+#define INIT_m8n8 INIT_m8n6 unit_init_2zmm(14,15)
+#define INIT_m8n10 INIT_m8n8 unit_init_2zmm(16,17)
+#define INIT_m8n12 INIT_m8n10 unit_init_2zmm(18,19)
+
+#define KERNEL_k1m4n1 \
+  "vbroadcastsd (%1),%%ymm1; addq $8,%1;"\
+  "vfmadd231pd (%0),%%ymm1,%%ymm4; addq $32,%0;"
+#define unit_acc_gen_m4n2(c1_no,c2_no,boff,...)\
+  "vbroadcastf128 "#boff"("#__VA_ARGS__"),%%ymm3; vfmadd231pd %%ymm1,%%ymm3,%%ymm"#c1_no"; vfmadd231pd %%ymm2,%%ymm3,%%ymm"#c2_no";"
+#define unit_acc_m4n2(c1_no,c2_no,...) unit_acc_gen_m4n2(c1_no,c2_no,0,__VA_ARGS__)
+#define KERNEL_h_k1m4n2 \
+  "vmovddup (%0),%%ymm1; vmovddup 8(%0),%%ymm2; addq $32,%0;" unit_acc_m4n2(4,5,%1)
+#define KERNEL_k1m4n2 KERNEL_h_k1m4n2 "addq $16,%1;"
+#define KERNEL_h_k1m4n4 KERNEL_h_k1m4n2 unit_acc_m4n2(6,7,%1,%%r12,1)
+#define KERNEL_k1m4n4 KERNEL_h_k1m4n4 "addq $16,%1;"
+#define KERNEL_k1m4n6 KERNEL_h_k1m4n4 unit_acc_m4n2(8,9,%1,%%r12,2) "addq $16,%1;"
+#define KERNEL_h_k1m4n8 KERNEL_k1m4n6 unit_acc_m4n2(10,11,%%r15)
+#define KERNEL_k1m4n8 KERNEL_h_k1m4n8 "addq $16,%%r15;"
+#define KERNEL_h_k1m4n10 KERNEL_h_k1m4n8 unit_acc_m4n2(12,13,%%r15,%%r12,1)
+#define KERNEL_k1m4n10 KERNEL_h_k1m4n10 "addq $16,%%r15;"
+#define KERNEL_h_k1m4n12 KERNEL_h_k1m4n10 unit_acc_m4n2(14,15,%%r15,%%r12,2)
+#define KERNEL_k1m4n12 KERNEL_h_k1m4n12 "addq $16,%%r15;"
+#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
+  #define loada_kend_k1m4 "vmovddup (%0,%3,1),%%ymm1; vmovddup 8(%0,%3,1),%%ymm2; addq $32,%3;"
+  #define acc_kend_nc2_k1m4(boff1,boff2) unit_acc_gen_m4n2(6,7,boff1,%1,%%r12,1)
+  #define acc_kend_nc3_k1m4(boff1,boff2) unit_acc_gen_m4n2(8,9,boff1,%1,%%r12,2)
+  #define acc_kend_nc4_k1m4(boff1,boff2) unit_acc_gen_m4n2(10,11,boff1,%%r15)
+  #define acc_kend_nc5_k1m4(boff1,boff2) unit_acc_gen_m4n2(12,13,boff1,%%r15,%%r12,1)
+  #define acc_kend_nc6_k1m4(boff1,boff2) unit_acc_gen_m4n2(14,15,boff1,%%r15,%%r12,2)
+#endif
+#define save_init_m4 "movq %2,%3; addq $32,%2;"
+#ifdef TRMMKERNEL
+  #define SAVE_m4n1 "vmulpd %%ymm4,%%ymm0,%%ymm4; vmovupd %%ymm4,(%2); addq $32,%2;"
+  #define unit_save_m4n2(c1_no,c2_no)\
+    "vunpcklpd %%ymm"#c2_no",%%ymm"#c1_no",%%ymm1; vmulpd %%ymm1,%%ymm0,%%ymm1; vmovupd %%ymm1,(%3);"\
+    "vunpckhpd %%ymm"#c2_no",%%ymm"#c1_no",%%ymm2; vmulpd %%ymm2,%%ymm0,%%ymm2; vmovupd %%ymm2,(%3,%4,1); leaq (%3,%4,2),%3;"
+#else
+  #define SAVE_m4n1 "vfmadd213pd (%2),%%ymm0,%%ymm4; vmovupd %%ymm4,(%2); addq $32,%2;"
+  #define unit_save_m4n2(c1_no,c2_no)\
+    "vunpcklpd %%ymm"#c2_no",%%ymm"#c1_no",%%ymm1; vfmadd213pd (%3),%%ymm0,%%ymm1; vmovupd %%ymm1,(%3);"\
+    "vunpckhpd %%ymm"#c2_no",%%ymm"#c1_no",%%ymm2; vfmadd213pd (%3,%4,1),%%ymm0,%%ymm2; vmovupd %%ymm2,(%3,%4,1); leaq (%3,%4,2),%3;"
+#endif
+#define SAVE_m4n2 save_init_m4 unit_save_m4n2(4,5)
+#define SAVE_m4n4 SAVE_m4n2 unit_save_m4n2(6,7)
+#define SAVE_m4n6 SAVE_m4n4 unit_save_m4n2(8,9)
+#define SAVE_m4n8 SAVE_m4n6 unit_save_m4n2(10,11)
+#define SAVE_m4n10 SAVE_m4n8 unit_save_m4n2(12,13)
+#define SAVE_m4n12 SAVE_m4n10 unit_save_m4n2(14,15)
+#define INIT_m4n1 "vpxor %%ymm4,%%ymm4,%%ymm4;"
+#define unit_init_2ymm(c1_no,c2_no) "vpxor %%ymm"#c1_no",%%ymm"#c1_no",%%ymm"#c1_no"; vpxor %%ymm"#c2_no",%%ymm"#c2_no",%%ymm"#c2_no";"
+#define INIT_m4n2 unit_init_2ymm(4,5)
+#define INIT_m4n4 INIT_m4n2 unit_init_2ymm(6,7)
+#define INIT_m4n6 INIT_m4n4 unit_init_2ymm(8,9)
+#define INIT_m4n8 INIT_m4n6 unit_init_2ymm(10,11)
+#define INIT_m4n10 INIT_m4n8 unit_init_2ymm(12,13)
+#define INIT_m4n12 INIT_m4n10 unit_init_2ymm(14,15)
+
+#define KERNEL_k1m2n1 \
+  "vmovddup (%1),%%xmm1; addq $8,%1;"\
+  "vfmadd231pd (%0),%%xmm1,%%xmm4; addq $16,%0;"
+#define unit_acc_gen_m2n2(c1_no,c2_no,boff,...)\
+  "vmovupd "#boff"("#__VA_ARGS__"),%%xmm3; vfmadd231pd %%xmm1,%%xmm3,%%xmm"#c1_no"; vfmadd231pd %%xmm2,%%xmm3,%%xmm"#c2_no";"
+#define unit_acc_m2n2(c1_no,c2_no,...) unit_acc_gen_m2n2(c1_no,c2_no,0,__VA_ARGS__)
+#define KERNEL_h_k1m2n2 \
+  "vmovddup (%0),%%xmm1; vmovddup 8(%0),%%xmm2; addq $16,%0;" unit_acc_m2n2(4,5,%1)
+#define KERNEL_k1m2n2 KERNEL_h_k1m2n2 "addq $16,%1;"
+#define KERNEL_h_k1m2n4 KERNEL_h_k1m2n2 unit_acc_m2n2(6,7,%1,%%r12,1)
+#define KERNEL_k1m2n4 KERNEL_h_k1m2n4 "addq $16,%1;"
+#define KERNEL_k1m2n6 KERNEL_h_k1m2n4 unit_acc_m2n2(8,9,%1,%%r12,2) "addq $16,%1;"
+#define KERNEL_h_k1m2n8 KERNEL_k1m2n6 unit_acc_m2n2(10,11,%%r15)
+#define KERNEL_k1m2n8 KERNEL_h_k1m2n8 "addq $16,%%r15;"
+#define KERNEL_h_k1m2n10 KERNEL_h_k1m2n8 unit_acc_m2n2(12,13,%%r15,%%r12,1)
+#define KERNEL_k1m2n10 KERNEL_h_k1m2n10 "addq $16,%%r15;"
+#define KERNEL_h_k1m2n12 KERNEL_h_k1m2n10 unit_acc_m2n2(14,15,%%r15,%%r12,2)
+#define KERNEL_k1m2n12 KERNEL_h_k1m2n12 "addq $16,%%r15;"
+#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
+  #define loada_kend_k1m2 "vmovddup (%0,%3,1),%%xmm1; vmovddup 8(%0,%3,1),%%xmm2; addq $16,%3;"
+  #define acc_kend_nc2_k1m2(boff1,boff2) unit_acc_gen_m2n2(6,7,boff1,%1,%%r12,1)
+  #define acc_kend_nc3_k1m2(boff1,boff2) unit_acc_gen_m2n2(8,9,boff1,%1,%%r12,2)
+  #define acc_kend_nc4_k1m2(boff1,boff2) unit_acc_gen_m2n2(10,11,boff1,%%r15)
+  #define acc_kend_nc5_k1m2(boff1,boff2) unit_acc_gen_m2n2(12,13,boff1,%%r15,%%r12,1)
+  #define acc_kend_nc6_k1m2(boff1,boff2) unit_acc_gen_m2n2(14,15,boff1,%%r15,%%r12,2)
+#endif
+#define save_init_m2 "movq %2,%3; addq $16,%2;"
+#ifdef TRMMKERNEL
+  #define SAVE_m2n1 "vmulpd %%xmm4,%%xmm0,%%xmm4; vmovupd %%xmm4,(%2); addq $16,%2;"
+  #define unit_save_m2n2(c1_no,c2_no)\
+    "vunpcklpd %%xmm"#c2_no",%%xmm"#c1_no",%%xmm1; vmulpd %%xmm1,%%xmm0,%%xmm1; vmovupd %%xmm1,(%3);"\
+    "vunpckhpd %%xmm"#c2_no",%%xmm"#c1_no",%%xmm2; vmulpd %%xmm2,%%xmm0,%%xmm2; vmovupd %%xmm2,(%3,%4,1); leaq (%3,%4,2),%3;"
+#else
+  #define SAVE_m2n1 "vfmadd213pd (%2),%%xmm0,%%xmm4; vmovupd %%xmm4,(%2); addq $16,%2;"
+  #define unit_save_m2n2(c1_no,c2_no)\
+    "vunpcklpd %%xmm"#c2_no",%%xmm"#c1_no",%%xmm1; vfmadd213pd (%3),%%xmm0,%%xmm1; vmovupd %%xmm1,(%3);"\
+    "vunpckhpd %%xmm"#c2_no",%%xmm"#c1_no",%%xmm2; vfmadd213pd (%3,%4,1),%%xmm0,%%xmm2; vmovupd %%xmm2,(%3,%4,1); leaq (%3,%4,2),%3;"
+#endif
+#define SAVE_m2n2 save_init_m2 unit_save_m2n2(4,5)
+#define SAVE_m2n4 SAVE_m2n2 unit_save_m2n2(6,7)
+#define SAVE_m2n6 SAVE_m2n4 unit_save_m2n2(8,9)
+#define SAVE_m2n8 SAVE_m2n6 unit_save_m2n2(10,11)
+#define SAVE_m2n10 SAVE_m2n8 unit_save_m2n2(12,13)
+#define SAVE_m2n12 SAVE_m2n10 unit_save_m2n2(14,15)
+#define INIT_m2n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
+#define unit_init_2xmm(c1_no,c2_no) "vpxor %%xmm"#c1_no",%%xmm"#c1_no",%%xmm"#c1_no"; vpxor %%xmm"#c2_no",%%xmm"#c2_no",%%xmm"#c2_no";"
+#define INIT_m2n2 unit_init_2xmm(4,5)
+#define INIT_m2n4 INIT_m2n2 unit_init_2xmm(6,7)
+#define INIT_m2n6 INIT_m2n4 unit_init_2xmm(8,9)
+#define INIT_m2n8 INIT_m2n6 unit_init_2xmm(10,11)
+#define INIT_m2n10 INIT_m2n8 unit_init_2xmm(12,13)
+#define INIT_m2n12 INIT_m2n10 unit_init_2xmm(14,15)
+
+#define KERNEL_k1m1n1 \
+  "vmovsd (%1),%%xmm1; addq $8,%1;"\
+  "vfmadd231sd (%0),%%xmm1,%%xmm4; addq $8,%0;"
+#define KERNEL_h_k1m1n2 \
+  "vmovddup (%0),%%xmm1; addq $8,%0;"\
+  "vfmadd231pd (%1),%%xmm1,%%xmm4;"
+#define KERNEL_k1m1n2 KERNEL_h_k1m1n2 "addq $16,%1;"
+#define KERNEL_h_k1m1n4 KERNEL_h_k1m1n2 "vfmadd231pd (%1,%%r12,1),%%xmm1,%%xmm5;"
+#define KERNEL_k1m1n4 KERNEL_h_k1m1n4 "addq $16,%1;"
+#define KERNEL_k1m1n6 KERNEL_h_k1m1n4 "vfmadd231pd (%1,%%r12,2),%%xmm1,%%xmm6; addq $16,%1;"
+#define KERNEL_h_k1m1n8 KERNEL_k1m1n6 "vfmadd231pd (%%r15),%%xmm1,%%xmm7;"
+#define KERNEL_k1m1n8 KERNEL_h_k1m1n8 "addq $16,%%r15;"
+#define KERNEL_h_k1m1n10 KERNEL_h_k1m1n8 "vfmadd231pd (%%r15,%%r12,1),%%xmm1,%%xmm8;"
+#define KERNEL_k1m1n10 KERNEL_h_k1m1n10 "addq $16,%%r15;"
+#define KERNEL_h_k1m1n12 KERNEL_h_k1m1n10 "vfmadd231pd (%%r15,%%r12,2),%%xmm1,%%xmm9;"
+#define KERNEL_k1m1n12 KERNEL_h_k1m1n12 "addq $16,%%r15;"
+#if defined(TRMMKERNEL) && !defined(LEFT) && (BACKWARDS == 0)
+  #define loada_kend_k1m1 "vmovddup (%0,%3,1),%%xmm1; addq $8,%3;"
+  #define acc_kend_nc2_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,1),%%xmm1,%%xmm5;"
+  #define acc_kend_nc3_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%1,%%r12,2),%%xmm1,%%xmm6;"
+  #define acc_kend_nc4_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15),%%xmm1,%%xmm7;"
+  #define acc_kend_nc5_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,1),%%xmm1,%%xmm8;"
+  #define acc_kend_nc6_k1m1(boff1,boff2) "vfmadd231pd "#boff1"(%%r15,%%r12,2),%%xmm1,%%xmm9;"
+#endif
+#define save_init_m1 "movq %2,%3; addq $8,%2;"
+#ifdef TRMMKERNEL
+  #define SAVE_m1n1 "vmulsd %%xmm4,%%xmm0,%%xmm4; vmovsd %%xmm4,(%2); addq $8,%2;"
+  #define unit_save_m1n2(c1_no)\
+    "vmulpd %%xmm"#c1_no",%%xmm0,%%xmm2; vmovsd %%xmm2,(%3); vmovhpd %%xmm2,(%3,%4,1); leaq (%3,%4,2),%3;"
+#else
+  #define SAVE_m1n1 "vfmadd213sd (%2),%%xmm0,%%xmm4; vmovsd %%xmm4,(%2); addq $8,%2;"
+  #define unit_save_m1n2(c1_no)\
+    "vmovsd (%3),%%xmm2; vmovhpd (%3,%4,1),%%xmm2,%%xmm2; vfmadd231pd %%xmm"#c1_no",%%xmm0,%%xmm2; vmovsd %%xmm2,(%3); vmovhpd %%xmm2,(%3,%4,1); leaq (%3,%4,2),%3;"
+#endif
+#define SAVE_m1n2 save_init_m1 unit_save_m1n2(4)
+#define SAVE_m1n4 SAVE_m1n2 unit_save_m1n2(5)
+#define SAVE_m1n6 SAVE_m1n4 unit_save_m1n2(6)
+#define SAVE_m1n8 SAVE_m1n6 unit_save_m1n2(7)
+#define SAVE_m1n10 SAVE_m1n8 unit_save_m1n2(8)
+#define SAVE_m1n12 SAVE_m1n10 unit_save_m1n2(9)
+#define INIT_m1n1 "vpxor %%xmm4,%%xmm4,%%xmm4;"
+#define INIT_m1n2 INIT_m1n1
+#define INIT_m1n4 INIT_m1n2 "vpxor %%xmm5,%%xmm5,%%xmm5;"
+#define INIT_m1n6 INIT_m1n4 "vpxor %%xmm6,%%xmm6,%%xmm6;"
+#define INIT_m1n8 INIT_m1n6 "vpxor %%xmm7,%%xmm7,%%xmm7;"
+#define INIT_m1n10 INIT_m1n8 "vpxor %%xmm8,%%xmm8,%%xmm8;"
+#define INIT_m1n12 INIT_m1n10 "vpxor %%xmm9,%%xmm9,%%xmm9;"
+
+#define COMPUTE_SIMPLE(mdim,ndim)\
+  init_update_k(mdim) INIT_m##mdim##n##ndim "testq %%r13,%%r13; jz 7"#mdim"7"#ndim"9f;"\
+  "movq %%r13,%5;" INIT_set_papb(mdim,ndim)\
+  kernel_kstart_n##ndim(mdim,subq)\
+  "7"#mdim"7"#ndim"1:\n\t"\
+  KERNEL_k1m##mdim##n##ndim "decq %5; jnz 7"#mdim"7"#ndim"1b;"\
+  "7"#mdim"7"#ndim"9:\n\t"\
+  kernel_kend_n##ndim(mdim)\
+  SAVE_set_pa(mdim) SAVE_m##mdim##n##ndim save_update_k(mdim)
+#define COMPUTE_m16n1 COMPUTE_SIMPLE(16,1)
+#define COMPUTE_m16n2 COMPUTE_SIMPLE(16,2)
+#define COMPUTE_m16n4 COMPUTE_SIMPLE(16,4)
+#define COMPUTE_m16n6 COMPUTE_SIMPLE(16,6)
+#define COMPUTE_m16n8 COMPUTE_SIMPLE(16,8)
+#define COMPUTE_m16n10 COMPUTE_SIMPLE(16,10)
+#if defined(TRMMKERNEL) && !defined(LEFT) && defined(TRANSA)
+  #define INVERSE_K_MID "negq %5; leaq 6(%%r13,%5,1),%5;"
+#else
+  #define INVERSE_K_MID "negq %5; leaq 16(%%r13,%5,1),%5;"
+#endif
+#define COMPUTE_m16n12 \
+  init_update_k(16) INIT_m16n12 "movq %%r13,%5;" INIT_set_papb(16,12) "movq %2,%3;"\
+  kernel_kstart_n12(16,subq)\
+  "cmpq $16,%5; jb 7167123f; movq $16,%5;"\
+  "7167121:\n\t"\
+  KERNEL_k1m16n12 "addq $4,%5; testq $12,%5; movq $172,%%r10; cmovz %4,%%r10;"\
+  KERNEL_k1m16n12 "prefetcht1 (%3); subq $129,%3; addq %%r10,%3;"\
+  KERNEL_k1m16n12 "prefetcht1 (%6); addq $32,%6; cmpq $208,%5; cmoveq %2,%3;"\
+  KERNEL_k1m16n12 "cmpq %5,%%r13; jnb 7167121b;"\
+  "movq %2,%3;" INVERSE_K_MID\
+  "7167123:\n\t"\
+  "testq %5,%5; jz 7167129f;"\
+  "7167125:\n\t"\
+  "prefetcht0 (%3); prefetcht0 64(%3); prefetcht0 127(%3);"\
+  KERNEL_k1m16n12 "addq %4,%3; decq %5;jnz 7167125b;"\
+  "7167129:\n\t"\
+  kernel_kend_n12(16)\
+  "prefetcht0 (%%r14);" SAVE_set_pa(16) SAVE_m16n12 save_update_k(16)
+#define COMPUTE(ndim) {\
+  b_pref = b_ptr + ndim * K; HEAD_SET_OFF(ndim)\
+  __asm__ __volatile__(\
+    "vbroadcastsd %8,%%zmm0; movq %7,%%r11; movq %1,%%r14; movq %10,%%r12; salq $4,%%r12;" INITASM_SET_K\
+    "cmpq $16,%%r11; jb "#ndim"33102f;"\
+    #ndim"33101:\n\t"\
+    COMPUTE_m16n##ndim "subq $16,%%r11; cmpq $16,%%r11; jnb "#ndim"33101b;"\
+    #ndim"33102:\n\t"\
+    "cmpq $8,%%r11; jb "#ndim"33103f;"\
+    COMPUTE_SIMPLE(8,ndim) "subq $8,%%r11;"\
+    #ndim"33103:\n\t"\
+    "cmpq $4,%%r11; jb "#ndim"33104f;"\
+    COMPUTE_SIMPLE(4,ndim) "subq $4,%%r11;"\
+    #ndim"33104:\n\t"\
+    "cmpq $2,%%r11; jb "#ndim"33105f;"\
+    COMPUTE_SIMPLE(2,ndim) "subq $2,%%r11;"\
+    #ndim"33105:\n\t"\
+    "testq %%r11,%%r11; jz "#ndim"33106f;"\
+    COMPUTE_SIMPLE(1,ndim) "subq $1,%%r11;"\
+    #ndim"33106:\n\t"\
+    "movq %%r14,%1;"\
+  :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_in_bytes),"+r"(k_count),"+r"(b_pref):"m"(M),"m"(ALPHA),"m"(off),"m"(K):"r10","r11","r12","r13","r14","r15","cc","memory",\
+    "zmm0","zmm1","zmm2","zmm3","zmm4","zmm5","zmm6","zmm7","zmm8","zmm9","zmm10","zmm11","zmm12","zmm13","zmm14","zmm15",\
+    "zmm16","zmm17","zmm18","zmm19","zmm20","zmm21","zmm22","zmm23","zmm24","zmm25","zmm26","zmm27","zmm28","zmm29","zmm30","zmm31");\
+  a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ndim * ldc - M; TAIL_SET_OFF(ndim)\
+}
+
+#include "common.h"
+#include <stdint.h>
+
+int __attribute__ ((noinline))
+CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A, double * __restrict__ B, double * __restrict__ C, BLASLONG ldc
+#ifdef TRMMKERNEL
+  , BLASLONG offset
+#endif
+)
+{
+    if(m==0||n==0||k==0||alpha==0.0) return 0;
+    int64_t ldc_in_bytes = (int64_t)ldc * sizeof(double); double ALPHA = alpha;
+    int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0;
+    BLASLONG n_count = n, off = 0;
+    double *a_ptr = A,*b_ptr = B,*c_ptr = C,*c_tmp = C,*b_pref = B;
+#ifdef TRMMKERNEL
+  #ifdef LEFT
+    off = offset;
+  #else
+    off = -offset;
+  #endif
+#endif
+    for(;n_count>11;n_count-=12) COMPUTE(12)
+    for(;n_count>9;n_count-=10) COMPUTE(10)
+    for(;n_count>7;n_count-=8) COMPUTE(8)
+    for(;n_count>5;n_count-=6) COMPUTE(6)
+    for(;n_count>3;n_count-=4) COMPUTE(4)
+    for(;n_count>1;n_count-=2) COMPUTE(2)
+    if(n_count>0) COMPUTE(1)
+    return 0;
+}
+

From f3f969f681ffc67f81ac78d37e51f83129232985 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 3 Feb 2020 21:34:12 +0800
Subject: [PATCH 189/210] Update param.h

---
 param.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/param.h b/param.h
index 075c12ca2..219d99fc6 100644
--- a/param.h
+++ b/param.h
@@ -1660,14 +1660,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 
 #define SGEMM_DEFAULT_UNROLL_M 16
-#define DGEMM_DEFAULT_UNROLL_M 4
+#define DGEMM_DEFAULT_UNROLL_M 16
 #define QGEMM_DEFAULT_UNROLL_M 2
 #define CGEMM_DEFAULT_UNROLL_M 8
 #define ZGEMM_DEFAULT_UNROLL_M 4
 #define XGEMM_DEFAULT_UNROLL_M 1
 
 #define SGEMM_DEFAULT_UNROLL_N 4
-#define DGEMM_DEFAULT_UNROLL_N 8
+#define DGEMM_DEFAULT_UNROLL_N 2
 #define QGEMM_DEFAULT_UNROLL_N 2
 #define CGEMM_DEFAULT_UNROLL_N 2
 #define ZGEMM_DEFAULT_UNROLL_N 2
@@ -1701,12 +1701,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #else
 
 #define SGEMM_DEFAULT_P 640
-#define DGEMM_DEFAULT_P 384
+#define DGEMM_DEFAULT_P 192
 #define CGEMM_DEFAULT_P 384
 #define ZGEMM_DEFAULT_P 256
 
 #define SGEMM_DEFAULT_Q 320
-#define DGEMM_DEFAULT_Q 168
+#define DGEMM_DEFAULT_Q 384
 #define CGEMM_DEFAULT_Q 192
 #define ZGEMM_DEFAULT_Q 128
 

From 081b1885294afedf7f5ee87e1d9bd8b82d096664 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Mon, 3 Feb 2020 21:38:08 +0800
Subject: [PATCH 190/210] Update KERNEL.SKYLAKEX

---
 kernel/x86_64/KERNEL.SKYLAKEX | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index 0e6275748..dcd201649 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -7,10 +7,13 @@ SGEMMITCOPY    =  sgemm_tcopy_16_skylakex.c
 SGEMMONCOPY    =  sgemm_ncopy_4_skylakex.c
 SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
 
-DGEMMKERNEL    =  dgemm_kernel_4x8_skylakex_2.c
-
-DGEMMONCOPY    =  dgemm_ncopy_8_skylakex.c
-DGEMMOTCOPY    =  dgemm_tcopy_8_skylakex.c
+DGEMMKERNEL    =  dgemm_kernel_16x2_skylakex.c
+DTRMMKERNEL    =  dgemm_kernel_16x2_skylakex.c
+DGEMMINCOPY    =  ../generic/gemm_ncopy_16.c
+DGEMMITCOPY    =  ../generic/gemm_tcopy_16.c
+DGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
+DGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
+DTRSMKERNEL_RN =  ../generic/trsm_kernel_RN.c
 
 SGEMM_BETA = sgemm_beta_skylakex.c
 DGEMM_BETA = dgemm_beta_skylakex.c

From 83b6be7976dd02973851c6df68e579a024aebfcc Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Tue, 4 Feb 2020 19:55:26 +0800
Subject: [PATCH 191/210] Update param.h

---
 param.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/param.h b/param.h
index 219d99fc6..e6ab93aa5 100644
--- a/param.h
+++ b/param.h
@@ -1711,7 +1711,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define ZGEMM_DEFAULT_Q 128
 
 #define SGEMM_DEFAULT_R sgemm_r
-#define DGEMM_DEFAULT_R 13824
+#define DGEMM_DEFAULT_R 8640
 #define CGEMM_DEFAULT_R cgemm_r
 #define ZGEMM_DEFAULT_R zgemm_r
 

From 1c3e20ce483c5b6a9bb457bd199c16a0b0bbd8de Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Tue, 4 Feb 2020 20:30:23 +0800
Subject: [PATCH 192/210] Update level3.c

---
 driver/level3/level3.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/driver/level3/level3.c b/driver/level3/level3.c
index 1ab7a740e..9aa67286f 100644
--- a/driver/level3/level3.c
+++ b/driver/level3/level3.c
@@ -332,13 +332,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
 #else
       for(jjs = js; jjs < js + min_j; jjs += min_jj){
 	min_jj = min_j + js - jjs;
-
+#ifdef SKYLAKEX
+	/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */
+	if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
+#else
         if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
         else
         	if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
         	else
           		if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+#endif
 
 
 	START_RPCC();

From 77b8f49556096952fb4495b7019b58597f81dce8 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Tue, 4 Feb 2020 20:33:08 +0800
Subject: [PATCH 193/210] Update level3_thread.c

---
 driver/level3/level3_thread.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c
index cfbff7554..bf558447e 100644
--- a/driver/level3/level3_thread.c
+++ b/driver/level3/level3_thread.c
@@ -365,12 +365,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
       /* Split local region of B into parts */
       for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){
 	min_jj = MIN(n_to, js + div_n) - jjs;
+#ifdef SKYLAKEX
+	/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
+	if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
+#else
 	if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N;
 	else
           if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N;
           else
             if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+#endif
         /* Copy part of local region of B into workspace */
 	START_RPCC();
 	OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs,

From 833bd0f8ffd1b7921ca8e98196e40183158f8cb7 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Wed, 5 Feb 2020 10:09:41 +0800
Subject: [PATCH 194/210] Update trmm_L.c

---
 driver/level3/trmm_L.c | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c
index 8a81d31a0..9117090b5 100644
--- a/driver/level3/trmm_L.c
+++ b/driver/level3/trmm_L.c
@@ -135,10 +135,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 
     for(jjs = js; jjs < js + min_j; jjs += min_jj){
       min_jj = min_j + js - jjs;
+#ifdef SKYLAKEX
+      /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
+      if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
+#else
       if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
       else
       	if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+#endif
       START_RPCC();
 
       GEMM_ONCOPY(min_l, min_jj, b + (jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
@@ -201,10 +205,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 
       for(jjs = js; jjs < js + min_j; jjs += min_jj){
 	min_jj = min_j + js - jjs;
+#ifdef SKYLAKEX
+	/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
+	if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
+#else
         if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
         else
 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+#endif
 	START_RPCC();
 
 	GEMM_ONCOPY(min_l, min_jj, b + (ls + jjs * ldb) * COMPSIZE, ldb, sb + min_l * (jjs - js) * COMPSIZE);
@@ -292,10 +300,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 
     for(jjs = js; jjs < js + min_j; jjs += min_jj){
       min_jj = min_j + js - jjs;
+#ifdef SKYLAKEX
+      /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
+      if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
+#else
       if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
       else
         if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+#endif
       START_RPCC();
 
       GEMM_ONCOPY(min_l, min_jj, b + (m - min_l + jjs * ldb) * COMPSIZE, ldb,
@@ -358,10 +370,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 
       for(jjs = js; jjs < js + min_j; jjs += min_jj){
 	min_jj = min_j + js - jjs;
+#ifdef SKYLAKEX
+	/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
+	if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
+#else
         if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
         else
 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+#endif
 	START_RPCC();
 
 	GEMM_ONCOPY(min_l, min_jj, b + (ls - min_l + jjs * ldb) * COMPSIZE, ldb,

From 2f96a2c55b2cfae827973b3002ae5af8bfa6d7d6 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Wed, 5 Feb 2020 10:15:02 +0800
Subject: [PATCH 195/210] Update trmm_R.c

---
 driver/level3/trmm_R.c | 36 ++++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c
index 0882aa496..62c6a2442 100644
--- a/driver/level3/trmm_R.c
+++ b/driver/level3/trmm_R.c
@@ -122,10 +122,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 
       for(jjs = 0; jjs < ls - js; jjs += min_jj){
 	min_jj = ls - js - jjs;
+#ifdef SKYLAKEX
+	/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
+	if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
+#else
 	if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
 	else
 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+#endif
 #ifndef TRANSA
 	GEMM_ONCOPY(min_l, min_jj, a + (ls + (js + jjs) * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE);
 #else
@@ -142,10 +146,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 
       for(jjs = 0; jjs < min_l; jjs += min_jj){
 	min_jj = min_l - jjs;
+#ifdef SKYLAKEX
+	/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
+	if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
+#else
 	if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
 	else
 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+#endif
 #ifndef TRANSA
 	TRMM_OLNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * (ls - js + jjs) * COMPSIZE);
 #else
@@ -195,10 +203,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 
       for(jjs = js; jjs < js + min_j; jjs += min_jj){
 	min_jj = min_j + js - jjs;
+#ifdef SKYLAKEX
+	/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
+	if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
+#else
 	if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
 	else
 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+#endif
 #ifndef TRANSA
 	GEMM_ONCOPY(min_l, min_jj, a + (ls + jjs * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
 #else
@@ -246,10 +258,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 
       for(jjs = 0; jjs < min_l; jjs += min_jj){
 	min_jj = min_l - jjs;
+#ifdef SKYLAKEX
+	/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
+	if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
+#else
 	if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
 	else
 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+#endif
 #ifndef TRANSA
 	TRMM_OUNCOPY(min_l, min_jj, a, lda, ls, ls + jjs, sb + min_l * jjs * COMPSIZE);
 #else
@@ -267,10 +283,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 
       for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){
 	min_jj = js - ls - min_l - jjs;
+#ifdef SKYLAKEX
+	/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
+	if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
+#else
 	if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
 	else
 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+#endif
 #ifndef TRANSA
 	GEMM_ONCOPY(min_l, min_jj, a + (ls + (ls + min_l + jjs) * lda) * COMPSIZE, lda,
 		    sb + min_l * (min_l + jjs) * COMPSIZE);
@@ -324,10 +344,14 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
 
       for(jjs = js; jjs < js + min_j; jjs += min_jj){
 	min_jj = min_j + js - jjs;
+#ifdef SKYLAKEX
+	/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
+	if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
+#else
 	if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
 	else
 	  if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
-
+#endif
 #ifndef TRANSA
 	GEMM_ONCOPY(min_l, min_jj, a + (ls + (jjs - min_j) * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE);
 #else

From 096da2f51acc37d67e84646336753a77b4b007ad Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Wed, 5 Feb 2020 13:36:57 +0800
Subject: [PATCH 196/210] Update dgemm_kernel_16x2_skylakex.c

---
 kernel/x86_64/dgemm_kernel_16x2_skylakex.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c
index 250ff8d49..743ad5aa7 100644
--- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c
+++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c
@@ -1,3 +1,6 @@
+/* %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc(bytes), %5 = k_counter, %6 = b_pref */
+/* r10 = tmp, r11 = m_counter, r12 = size_of_1_tile_in_b, r13 = k, r14 = b_head, r15 = %1+3*r12 */
+
 #if (defined (LEFT) && !defined(TRANSA)) || (!defined (LEFT) && defined(TRANSA))
   #define BACKWARDS 1
 #else

From 0b909203cb31f4667bfc9172d475d20d53bbad75 Mon Sep 17 00:00:00 2001
From: w00421467 <wuzeyi1@huawei.com>
Date: Wed, 5 Feb 2020 14:53:37 +0800
Subject: [PATCH 197/210] Fix bugs in benchmark of gemv

---
 benchmark/gemv.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmark/gemv.c b/benchmark/gemv.c
index b6a42f42f..781df695e 100644
--- a/benchmark/gemv.c
+++ b/benchmark/gemv.c
@@ -197,7 +197,7 @@ int main(int argc, char *argv[]){
    		fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
    		for(j = 0; j < m; j++){
       			for(i = 0; i < n * COMPSIZE; i++){
-				a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+				a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
       			}
    		}
 
@@ -208,7 +208,7 @@ int main(int argc, char *argv[]){
 				x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    			}
 
-   			for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
+   			for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
 				y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    			}
     			gettimeofday( &start, (struct timezone *)0);

From ce9ea8f826e5a9ded2719b4104ca4901bab5b2fe Mon Sep 17 00:00:00 2001
From: w00421467 <wuzeyi1@huawei.com>
Date: Wed, 5 Feb 2020 15:07:18 +0800
Subject: [PATCH 198/210] Fix another branch

---
 benchmark/gemv.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmark/gemv.c b/benchmark/gemv.c
index 781df695e..adf8f3d91 100644
--- a/benchmark/gemv.c
+++ b/benchmark/gemv.c
@@ -234,7 +234,7 @@ int main(int argc, char *argv[]){
    		fprintf(stderr, " %6dx%d : ", (int)m,(int)n);
    		for(j = 0; j < m; j++){
       			for(i = 0; i < n * COMPSIZE; i++){
-				a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+				a[j + i * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
       			}
    		}
 
@@ -245,7 +245,7 @@ int main(int argc, char *argv[]){
 				x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    			}
 
-   			for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){
+   			for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
 				y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
    			}
     			gettimeofday( &start, (struct timezone *)0);

From 4e00d96a78b8b2d7b6cda9dafc9a72b6777d8828 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Thu, 6 Feb 2020 01:46:36 +0000
Subject: [PATCH 199/210] Update dgemm_kernel_16x2_skylakex.c

---
 kernel/x86_64/dgemm_kernel_16x2_skylakex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c
index 743ad5aa7..4c4c2f4e4 100644
--- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c
+++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c
@@ -467,7 +467,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double * __restrict__ A,
 #endif
 )
 {
-    if(m==0||n==0||k==0||alpha==0.0) return 0;
+    if(m==0||n==0) return 0;
     int64_t ldc_in_bytes = (int64_t)ldc * sizeof(double); double ALPHA = alpha;
     int64_t M = (int64_t)m, K = (int64_t)k, k_count = 0;
     BLASLONG n_count = n, off = 0;

From 8b5cdcc64c0b5ed4fc58e9bd6a314759eb6c1294 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Thu, 6 Feb 2020 01:47:46 +0000
Subject: [PATCH 200/210] Update sgemm_kernel_8x4_haswell.c

---
 kernel/x86_64/sgemm_kernel_8x4_haswell.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/x86_64/sgemm_kernel_8x4_haswell.c b/kernel/x86_64/sgemm_kernel_8x4_haswell.c
index 2b8aa9862..2250e4b97 100644
--- a/kernel/x86_64/sgemm_kernel_8x4_haswell.c
+++ b/kernel/x86_64/sgemm_kernel_8x4_haswell.c
@@ -467,7 +467,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
 ,BLASLONG offset
 #endif
 ){
-    if(m==0||n==0||k==0||alpha==0.0) return 0;
+    if(m==0||n==0) return 0;
     int64_t ldc_in_bytes = (int64_t)LDC * sizeof(float);
     float constval = alpha;
     float *const_val=&constval;

From 3447d04eaf3c98b4fcd1ba41fa13774ff15c72e3 Mon Sep 17 00:00:00 2001
From: wjc404 <52632443+wjc404@users.noreply.github.com>
Date: Thu, 6 Feb 2020 02:14:10 +0000
Subject: [PATCH 201/210] Update dgemm_kernel_16x2_skylakex.c

---
 kernel/x86_64/dgemm_kernel_16x2_skylakex.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c
index 4c4c2f4e4..416ace59b 100644
--- a/kernel/x86_64/dgemm_kernel_16x2_skylakex.c
+++ b/kernel/x86_64/dgemm_kernel_16x2_skylakex.c
@@ -393,9 +393,10 @@
 #define INIT_m1n12 INIT_m1n10 "vpxor %%xmm9,%%xmm9,%%xmm9;"
 
 #define COMPUTE_SIMPLE(mdim,ndim)\
-  init_update_k(mdim) INIT_m##mdim##n##ndim "testq %%r13,%%r13; jz 7"#mdim"7"#ndim"9f;"\
+  init_update_k(mdim) INIT_m##mdim##n##ndim\
   "movq %%r13,%5;" INIT_set_papb(mdim,ndim)\
   kernel_kstart_n##ndim(mdim,subq)\
+  "testq %5,%5; jz 7"#mdim"7"#ndim"9f;"\
   "7"#mdim"7"#ndim"1:\n\t"\
   KERNEL_k1m##mdim##n##ndim "decq %5; jnz 7"#mdim"7"#ndim"1b;"\
   "7"#mdim"7"#ndim"9:\n\t"\

From 9694037b2317b944e925a98a4d3e222e9d70311e Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 7 Feb 2020 10:09:25 +0100
Subject: [PATCH 202/210] Set SUFFIX in tempfile commands, fix bad architecture
 option for PGI compiler in avx512 test

---
 c_check | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/c_check b/c_check
index 3d82aa73c..543789207 100644
--- a/c_check
+++ b/c_check
@@ -188,13 +188,13 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) {
     if ($@){ 
 	warn "could not load PERL module File::Temp, so could not check MSA capatibility";
     } else {
-	$tmpf = new File::Temp( UNLINK => 1 );
+	$tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 );
 	$code = '"addvi.b $w0, $w1, 1"';
 	$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
 	print $tmpf "#include <msa.h>\n\n";
 	print $tmpf "void main(void){ __asm__ volatile($code); }\n";
 
-	$args = "$msa_flags -o $tmpf.o -x c $tmpf";
+	$args = "$msa_flags -o $tmpf.o $tmpf";
 	my @cmd = ("$compiler_name $args");
 	system(@cmd) == 0;
 	if ($? != 0) {
@@ -229,10 +229,13 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) {
 	$no_avx512 = 0;
     } else {
 #	$tmpf = new File::Temp( UNLINK => 1 );
-	($fh,$tmpf) = tempfile( UNLINK => 1 );
+	($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 );
 	$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
 	print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
-	$args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf";
+	$args = " -march=skylake-avx512 -c -o $tmpf.o $tmpf";
+	if ($compiler eq "PGI") {
+	    $args = " -tp skylake -c -o $tmpf.o $tmpf";
+	}
 	my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
 	system(@cmd) == 0;
 	if ($? != 0) {

From 68a43db35882aebe8eefff1756eb86d2ffb31b1f Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 7 Feb 2020 10:15:18 +0100
Subject: [PATCH 203/210] Fix utest compilation with PGI

---
 utest/Makefile | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/utest/Makefile b/utest/Makefile
index 8c7e6b9f8..bd4bdf3ae 100644
--- a/utest/Makefile
+++ b/utest/Makefile
@@ -31,6 +31,10 @@ OBJS += test_fork.o
 endif
 endif
 
+ifeq ($(C_COMPILER), PGI)
+OBJS = utest_main2.o
+endif
+
 all : run_test
 
 $(UTESTBIN): $(OBJS)

From 598984152426e611e22ebe064745b7af602914af Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 7 Feb 2020 13:01:31 +0100
Subject: [PATCH 204/210] Add PGI to avx512-supporting compilers

---
 getarch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/getarch.c b/getarch.c
index 1f590390a..3a78771c2 100644
--- a/getarch.c
+++ b/getarch.c
@@ -91,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <unistd.h>
 #endif
 
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6) || defined(__PGI))
 #else
 #define NO_AVX512
 #endif

From d55b10830f9a077fea3a1e785bf214370ef0f959 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 7 Feb 2020 16:02:17 +0100
Subject: [PATCH 205/210] Remove OpenMP libraries from link list

---
 c_check | 1 +
 1 file changed, 1 insertion(+)

diff --git a/c_check b/c_check
index 543789207..fbd1838aa 100644
--- a/c_check
+++ b/c_check
@@ -321,6 +321,7 @@ $linker_a = "";
 	    && ($flags !~ /kernel32/)
 	    && ($flags !~ /advapi32/)
 	    && ($flags !~ /shell32/)
+	    && ($flags !~ /omp/)
 	    ) {
 	    $linker_l .= $flags . " "
 	}

From cfe63d8cc202ca7dc9cf58b27f3a898b6087cfac Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 7 Feb 2020 16:03:51 +0100
Subject: [PATCH 206/210] Remove OpenMP libraries from link list

---
 f_check | 1 +
 1 file changed, 1 insertion(+)

diff --git a/f_check b/f_check
index 79b24e2dc..fac8fc707 100644
--- a/f_check
+++ b/f_check
@@ -334,6 +334,7 @@ if ($link ne "") {
 	    && ($flags !~ /kernel32/)
 	    && ($flags !~ /advapi32/)
 	    && ($flags !~ /shell32/)
+	    && ($flags !~ /omp/)
 		&& ($flags !~ /^\-l$/)
 	    ) {
 	    $linker_l .= $flags . " ";

From b3cbd60d7aa39ea76d5a2590ee5eb54038fea3c3 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 8 Feb 2020 10:20:13 +0100
Subject: [PATCH 207/210] Remove PGI from list again as it is actually still
 not capable

---
 getarch.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/getarch.c b/getarch.c
index 3a78771c2..1f590390a 100644
--- a/getarch.c
+++ b/getarch.c
@@ -91,7 +91,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <unistd.h>
 #endif
 
-#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6) || defined(__PGI))
+#if (( defined(__GNUC__)  && __GNUC__   > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
 #else
 #define NO_AVX512
 #endif

From 50545b19d083558117337a393eff9ee656b22fd7 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 9 Feb 2020 00:06:07 +0100
Subject: [PATCH 208/210] Update  CPU and OS support and document DYNAMIC_ARCH
 option in README.md

prompted by #2388
---
 README.md | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 14815ff00..b1ecc6521 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,8 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge
 
 Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code
 using Git from https://github.com/xianyi/OpenBLAS.git.
+Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option.
+Most can also be given directly on the make or cmake command line.
 
 ### Dependencies
 
@@ -101,7 +103,7 @@ The default installation directory is `/opt/OpenBLAS`.
 
 ## Supported CPUs and Operating Systems
 
-Please read `GotoBLAS_01Readme.txt`.
+Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by the 2010 GotoBLAS.
 
 ### Additional supported CPUs
 
@@ -109,8 +111,8 @@ Please read `GotoBLAS_01Readme.txt`.
 
 - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
 - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64.
-- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA  on x86-64.
-- **Intel Skylake**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA  on x86-64.
+- **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64.
+- **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64.
 - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
 - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar)
 - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations.
@@ -129,8 +131,15 @@ Please read `GotoBLAS_01Readme.txt`.
 
 #### ARM64
 
-- **ARMv8**: Experimental
-- **ARM Cortex-A57**: Experimental
+- **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS
+- **Cortex-A53**: same as ARMV8 (different cpu specifications)
+- **Cortex A57**: Optimized Level-3 and Level-2 functions
+- **Cortex A72**: same as A57 ( different cpu specifications)
+- **Cortex A73**: same as A57 (different cpu specifications)
+- **Falkor**: same as A57 (different cpu specifications)
+- **ThunderX**: Optimized some Level-1 functions
+- **ThunderX2T99**: Optimized Level-3 BLAS and parts of Levels 1 and 2
+- **TSV110**: Optimized some Level-3 helper functions
 
 #### PPC/PPC64
 
@@ -139,18 +148,34 @@ Please read `GotoBLAS_01Readme.txt`.
 
 #### IBM zEnterprise System
 
-- **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision)
-- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision)
+- **Z13**: Optimized Level-3 BLAS and Level-1,2
+- **Z14**: Optimized Level-3 BLAS and (single precision) Level-1,2
+
+### Support for multiple targets in a single library
+
+OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying DYNAMIC_ARCH=1 in Makefile.rule, on the gmake command line or as -DDYNAMIC_ARCH=TRUE in cmake.
+For **x86_64**, the list of targets this activates contains Prescott, Core2, Nehalem, Barcelona, Sandybridge, Bulldozer, Piledriver, Steamroller, Excavator, Haswell, Zen, SkylakeX. For cpu generations not included in this list, the corresponding older model is used. If you also specify DYNAMIC_OLDER=1, specific support for Penryn, Dunnington, Opteron, Opteron/SSE3, Bobcat, Atom and Nano is added. Finally there is an option DYNAMIC_LIST that allows to specify an individual list of targets to include instead of the default.
+DYNAMIC_ARCH is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
+Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.
+On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
+For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
+The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the
+common code in the library, usually you will want to set this to the oldest model you expect to encounter.
+Please not that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.
 
 ### Supported OS
 
 - **GNU/Linux**
 - **MinGW or Visual Studio (CMake)/Windows**: Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-in-Microsoft-Visual-Studio>.
-- **Darwin/macOS**: Experimental. Although GotoBLAS2 supports Darwin, we are not macOS experts.
+- **Darwin/macOS/OSX/iOS**: Experimental. Although GotoBLAS2 already supports Darwin, we are not OSX/iOS experts.
 - **FreeBSD**: Supported by the community. We don't actively test the library on this OS.
 - **OpenBSD**: Supported by the community. We don't actively test the library on this OS.
+- **NetBSD**: Supported by the community. We don't actively test the library on this OS.
 - **DragonFly BSD**: Supported by the community. We don't actively test the library on this OS.
 - **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>.
+- **AIX**: Supported on PPC up to POWER8
+- **Haiku**: Supported by the community. We don't actively test the library on this OS.
+- **SunOS**: Supported by the community. We don't actively test the library on this OS:
 
 ## Usage
 

From 47c1bf7f4de7d6b91d6b8f419eb0854e4e94e413 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 9 Feb 2020 01:06:40 +0100
Subject: [PATCH 209/210] typo fixes

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index b1ecc6521..04f43f4c7 100644
--- a/README.md
+++ b/README.md
@@ -161,7 +161,7 @@ On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73,
 For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
 The TARGET option can be used in conjunction with DYNAMIC_ARCH=1 to specify which cpu model should be assumed for all the
 common code in the library, usually you will want to set this to the oldest model you expect to encounter.
-Please not that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.
+Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.
 
 ### Supported OS
 
@@ -230,7 +230,7 @@ Please see Changelog.txt to view the differences between OpenBLAS and GotoBLAS2
 * Please use Clang version 3.1 and above to compile the library on Sandy Bridge microarchitecture.
   Clang 3.0 will generate the wrong AVX binary code.
 * Please use GCC version 6 or LLVM version 6 and above to compile Skylake AVX512 kernels.
-* The number of CPUs/cores should less than or equal to 256. On Linux `x86_64` (`amd64`),
+* The number of CPUs/cores should be less than or equal to 256. On Linux `x86_64` (`amd64`),
   there is experimental support for up to 1024 CPUs/cores and 128 numa nodes if you build
   the library with `BIGNUMA=1`.
 * OpenBLAS does not set processor affinity by default.

From 32d97330b3071fff273cb2a3fa136666a21b7fb4 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sun, 9 Feb 2020 23:00:36 +0100
Subject: [PATCH 210/210] Update with changes from 0.3.8

---
 Changelog.txt | 55 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/Changelog.txt b/Changelog.txt
index f160a4e13..549ca4aa5 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,59 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.8
+ 9-Feb-2020
+
+common: 
+`	* LAPACK has been updated to 3.9.0 (plus patches up to 
+	  January 2nd, 2020)
+	* CMAKE support has been improved in several areas including
+	  cross-compilation
+	* a thread race condition in the GEMM3M kernels was resolved
+	* the "generic" (plain C) gemm beta kernel used by many targets
+	  has been sped up
+	* an optimized version of the LAPACK trtrs functions has been added
+	* an incompatibilty between the LAPACK tests and the OpenBLAS
+	  implementation of XERBLA was resolved, removing the numerous
+	  warnings about wrong error exits in the former 
+	* support for NetBSD has been added
+	* support for compilation with g95 and non-GNU versions of ld
+	  has been improved
+	* support for compilation with (upcoming) gcc 10 has been added
+
+POWER:
+	* worked around miscompilation of several POWER8 and POWER9 
+	  kernels by older versions of gcc
+	* added support for big-endian POWER8 and for compilation on AIX
+	* corrected bugs in the big-endian support for PPC440 and PPC970
+	* DYNAMIC_ARCH support is now available in CMAKE builds as well
+
+ARMV8:
+	* performance of DGEMM_BETA and SGEMM_NCOPY has been improved
+	* compilation for 32bit works again 
+	* performance of the RPCC function has been improved
+	* improved performance on small systems
+	* DYNAMIC_ARCH support is now available in CMAKE builds as well
+	* cross-compilation from OSX to IOS was simplified
+ 
+x86_64:
+	* a new AVX512 DGEMM kernel was added and the AVX512 SGEMM kernel
+	  was significantly improved
+	* optimized AVX512 kernels for CGEMM and ZGEMM have been added
+	* AVX2 kernels for STRMM, SGEMM, and CGEMM have been significantly
+	  sped up and optimized CGEMM3M and ZGEMM3M kernels have been added 
+	* added support for QEMU virtual cpus
+	* a compilation problem with PGI and SUN compilers was fixed
+	* Intel "Goldmont plus" is now autodetected
+	* a potential crash on program exit on MS Windows has been fixed
+
+x86:
+	* an unwanted case sensitivity in the implementation of LSAME
+  	  on older 32bit AMD cpus was fixed
+
+zarch:
+	* Z15 is now supported as Z14
+	* DYNAMIC_ARCH is now available on ZARCH as well
+
 ====================================================================
 Version 0.3.7
 11-Aug 2019