Added shgemm_kernel_8x8 for RISCV64_ZVL128B and shgemm_kernel_16x8 for RISCV64_ZVL256B

Added HFLOAT16 support for RISCV64. Added shgemm_kernel_8x8 for RISCV64_ZVL128B and shgemm_kernel_16x8 for RISCV64_ZVL256B, both based on HFLOAT16. The kernels use the ZVFH and ZFH instruction-set extensions, which need to be supported by the RVV 1.0 target. Related to issue #5279. Co-authored-by: Linjin Li <linjin_li@163.com>
5 months ago · 670ec6f757
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -152,6 +152,9 @@ endif ()
 if (NOT DEFINED BUILD_BFLOAT16)
 set (BUILD_BFLOAT16 false)
 endif ()
 # HFLOAT16 support is opt-in: default BUILD_HFLOAT16 to false unless the
 # user has already defined it (e.g. with -DBUILD_HFLOAT16=...).
 if (NOT DEFINED BUILD_HFLOAT16)
 set (BUILD_HFLOAT16 false)
 endif ()
 # set which float types we want to build for
 if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
  # if none are defined, build for all
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@@ -64,11 +64,11 @@ TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
 endif
 ifeq ($(TARGET), RISCV64_ZVL256B)
 TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
 TARGET_FLAGS = -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
 endif
 ifeq ($(TARGET), RISCV64_ZVL128B)
 TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
 TARGET_FLAGS = -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
 endif
 ifeq ($(TARGET), RISCV64_GENERIC)
--- a/Makefile.riscv64
+++ b/Makefile.riscv64
@@ -7,12 +7,12 @@ CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d
 FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
 endif
 ifeq ($(CORE), RISCV64_ZVL256B)
 CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
 FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
 CCOMMON_OPT += -march=rv64imafdcv_zvl256b_zvfh_zfh -mabi=lp64d
 FCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
 endif
 ifeq ($(CORE), RISCV64_ZVL128B)
 CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d 
 FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d
 CCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d 
 FCOMMON_OPT += -march=rv64imafdcv_zvfh_zfh -mabi=lp64d
 endif
 ifeq ($(CORE), RISCV64_GENERIC)
 CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -308,6 +308,8 @@ COMMON_PROF = -pg
 # If you want to enable the experimental BFLOAT16 support
 # BUILD_BFLOAT16 = 1
 # If you want to enable the experimental HFLOAT16 support
 # (kept commented out so it stays opt-in; when set to 1 the build adds
 # -DBUILD_HFLOAT16 and hfloat16 becomes the compiler's _Float16 type)
 # BUILD_HFLOAT16 = 1
 # Set the thread number threshold beyond which the job array for the threaded level3 BLAS
 # will be allocated on the heap rather than the stack. (This array alone requires 
--- a/Makefile.system
+++ b/Makefile.system
@@ -280,6 +280,7 @@ GEMM_GEMV_FORWARD_BF16 = 1
 endif
 ifeq ($(ARCH), riscv)
 GEMM_GEMV_FORWARD = 1
 BUILD_HFLOAT16 = 1
 endif
 ifeq ($(ARCH), power)
 GEMM_GEMV_FORWARD = 1
@@ -1547,6 +1548,9 @@ endif
 ifeq ($(BUILD_BFLOAT16), 1)
 CCOMMON_OPT += -DBUILD_BFLOAT16
 endif
 ifeq ($(BUILD_HFLOAT16), 1)
 CCOMMON_OPT += -DBUILD_HFLOAT16
 endif
 ifeq ($(BUILD_SINGLE), 1)
 CCOMMON_OPT += -DBUILD_SINGLE=1
 endif
--- a/benchmark/gemm.c
+++ b/benchmark/gemm.c
@@ -35,6 +35,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM   BLASFUNC(dgemm)
 #elif defined(HALF)
 #define GEMM   BLASFUNC(sbgemm)
 #elif defined(HFLOAT16)
 #define GEMM   BLASFUNC(shgemm)
 #else
 #define GEMM   BLASFUNC(sgemm)
 #endif
--- a/cblas.h
+++ b/cblas.h
@@ -446,7 +446,7 @@ void   cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum C
 void cblas_sbgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
 		       OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST bfloat16 ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST bfloat16 ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
 /*** FLOAT16 extensions */
 /*** FLOAT16 extensions ***/
 void cblas_shgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		    OPENBLAS_CONST float alpha, OPENBLAS_CONST hfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST hfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
--- a/common.h
+++ b/common.h
@@ -266,9 +266,12 @@ typedef uint16_t bfloat16;
 #define BFLOAT16CONVERSION 1
 #endif
 #ifndef hfloat16
 #include <stdint.h>
 typedef uint16_t hfloat16;
 #ifdef BUILD_HFLOAT16
  #ifndef hfloat16
  typedef _Float16 hfloat16;
  #endif
 #else
  typedef uint16_t hfloat16;
 #endif
 #ifdef USE64BITINT
--- a/driver/level3/CMakeLists.txt
+++ b/driver/level3/CMakeLists.txt
@@ -18,6 +18,12 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES})
      GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16")
    endif ()
  endif ()
  # When HFLOAT16 is enabled, generate the gemm.c objects for the current
  # GEMM_DEFINE with the "HFLOAT16" type argument. The THREADED_LEVEL3
  # variant (gemm_thread_*) is added only when USE_THREAD is set and
  # USE_SIMPLE_THREADED_LEVEL3 is not.
  # NOTE(review): the meaning of the positional arguments (0 "" "" false) is
  # defined by GenerateNamedObjects, which is not visible here.
  if (BUILD_HFLOAT16)
    GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "HFLOAT16")
    if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3)
      GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "HFLOAT16")
    endif ()
  endif ()
 endforeach ()
 if ( BUILD_COMPLEX16 AND NOT  BUILD_DOUBLE)
--- a/driver/level3/Makefile
+++ b/driver/level3/Makefile
@@ -23,6 +23,10 @@ ifeq ($(BUILD_BFLOAT16),1)
 SBBLASOBJS       += sbgemm_nn.$(SUFFIX) sbgemm_nt.$(SUFFIX) sbgemm_tn.$(SUFFIX) sbgemm_tt.$(SUFFIX)
 endif
 ifeq ($(BUILD_HFLOAT16),1)
 SHBLASOBJS       += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX)
 endif
 SBLASOBJS	+= \
 	sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \
 	strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \
@@ -210,6 +214,9 @@ ifneq ($(USE_SIMPLE_THREADED_LEVEL3), 1)
 ifeq ($(BUILD_BFLOAT16),1)
 SBBLASOBJS    += sbgemm_thread_nn.$(SUFFIX) sbgemm_thread_nt.$(SUFFIX) sbgemm_thread_tn.$(SUFFIX) sbgemm_thread_tt.$(SUFFIX)
 endif
 ifeq ($(BUILD_HFLOAT16),1)
 SHBLASOBJS    += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX)
 endif
 SBLASOBJS    += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX)
 DBLASOBJS    += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX)
 QBLASOBJS    += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX)
@@ -355,6 +362,18 @@ sbgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h
 sbgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
 shgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
 shgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)
 shgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)
 shgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
 sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
@@ -562,6 +581,18 @@ sbgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 sbgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
 shgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
 shgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)
 shgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)
 shgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
 sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
@@ -2747,6 +2778,18 @@ sbgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h
 sbgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
 shgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
 shgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)
 shgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)
 shgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
 sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
@@ -2970,6 +3013,18 @@ sbgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 sbgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
 shgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
 shgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)
 shgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)
 shgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
 sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
 	$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
--- a/driver/others/Makefile
+++ b/driver/others/Makefile
@@ -218,7 +218,7 @@ mulx.$(SUFFIX) : $(ARCH)/mulx.c
 	$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F)
 detect_riscv64.$(SUFFIX): detect_riscv64.c
 	$(CC) $(CFLAGS) -c -march=rv64imafdcv $< -o $(@F)
 	$(CC) $(CFLAGS) -c -march=rv64imafdcv_zvfh_zfh $< -o $(@F)
 xerbla.$(PSUFFIX) : xerbla.c
 	$(CC) $(PFLAGS) -c $< -o $(@F)
--- a/driver/others/parameter.c
+++ b/driver/others/parameter.c
@@ -67,6 +67,11 @@ BLASLONG sbgemm_p = DEFAULT_GEMM_P;
 #else
 BLASLONG sbgemm_p = SBGEMM_P;
 #endif
 /* Define the global shgemm_p and pick its initial value.
    In an #if expression any identifier that is not a macro evaluates to 0, so
    the comparison is true when SHGEMM_P expands to the name shgemm_p (or is
    not defined), and false when SHGEMM_P is a nonzero constant. In the first
    case DEFAULT_GEMM_P is used; otherwise the SHGEMM_P constant itself.
    NOTE(review): where SHGEMM_P is defined is not visible here; presumably
    param.h or a related header. */
 #if SHGEMM_P == shgemm_p
 BLASLONG shgemm_p = DEFAULT_GEMM_P;
 #else
 BLASLONG shgemm_p = SHGEMM_P;
 #endif
 #if SGEMM_P == sgemm_p
 BLASLONG sgemm_p = DEFAULT_GEMM_P;
 #else
@@ -93,6 +98,11 @@ BLASLONG sbgemm_q = DEFAULT_GEMM_Q;
 #else
 BLASLONG sbgemm_q = SBGEMM_Q;
 #endif
 /* Define the global shgemm_q and pick its initial value.
    Non-macro identifiers evaluate to 0 inside #if, so the test is true when
    SHGEMM_Q expands to the name shgemm_q (or is not defined): DEFAULT_GEMM_Q
    is used then. When SHGEMM_Q is a nonzero constant the test is false and
    that constant is used directly. */
 #if SHGEMM_Q == shgemm_q
 BLASLONG shgemm_q = DEFAULT_GEMM_Q;
 #else
 BLASLONG shgemm_q = SHGEMM_Q;
 #endif
 #if SGEMM_Q == sgemm_q
 BLASLONG sgemm_q = DEFAULT_GEMM_Q;
 #else
@@ -119,6 +129,11 @@ BLASLONG sbgemm_r = DEFAULT_GEMM_R;
 #else
 BLASLONG sbgemm_r = SBGEMM_R;
 #endif
 /* Define the global shgemm_r and pick its initial value.
    Non-macro identifiers evaluate to 0 inside #if, so the test is true when
    SHGEMM_R expands to the name shgemm_r (or is not defined): DEFAULT_GEMM_R
    is used then. When SHGEMM_R is a nonzero constant the test is false and
    that constant is used directly. */
 #if SHGEMM_R == shgemm_r
 BLASLONG shgemm_r = DEFAULT_GEMM_R;
 #else
 BLASLONG shgemm_r = SHGEMM_R;
 #endif
 #if SGEMM_R == sgemm_r
 BLASLONG sgemm_r = DEFAULT_GEMM_R;
 #else
@@ -526,6 +541,9 @@ void blas_set_parameter(void){
 #ifdef BUILD_BFLOAT16
  sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q *  4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q *  4)) - 15) & ~15;
 #endif
 #ifdef BUILD_HFLOAT16
  shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q *  4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q *  4)) - 15) & ~15;
 #endif
  sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q *  4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q *  4)) - 15) & ~15;
  dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q *  8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q *  8)) - 15) & ~15;
@@ -619,6 +637,7 @@ void blas_set_parameter(void){
  size = BITMASK(cpuid3, 16, 0xff);
  sbgemm_p = 192 * (size + 1);
  shgemm_p = 192 * (size + 1);
  sgemm_p = 192 * (size + 1);
  dgemm_p =  96 * (size + 1);
  cgemm_p =  96 * (size + 1);
@@ -634,6 +653,9 @@ void blas_set_parameter(void){
 #ifdef BUILD_BFLOAT16
  sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q *  4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q *  4)) - 15) & ~15;
 #endif
 #ifdef BUILD_HFLOAT16
  shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q *  4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q *  4)) - 15) & ~15;
 #endif
  sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q *  4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q *  4)) - 15) & ~15;
  dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q *  8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q *  8)) - 15) & ~15;
--- a/exports/Makefile
+++ b/exports/Makefile
@@ -39,6 +39,9 @@ endif
 ifndef BUILD_BFLOAT16
 BUILD_BFLOAT16 = 0
 endif
 ifndef BUILD_HFLOAT16
 BUILD_HFLOAT16 = 0
 endif
 ifndef BUILD_SINGLE
 BUILD_SINGLE = 0
 endif
--- a/exports/gensymbol
+++ b/exports/gensymbol
@@ -52,6 +52,7 @@ blasobjsz="
 blasobjs="lsame xerbla"
 bfblasobjs="sbgemm sbgemmt sbgemmtr sbgemv sbdot sbstobf16 sbdtobf16 sbf16tos dbf16tod"
 hfblasobjs="shgemm"
 cblasobjsc="
    cblas_caxpy cblas_ccopy cblas_cdotc cblas_cdotu cblas_cgbmv cblas_cgemm cblas_cgemv
    cblas_cgerc cblas_cgeru cblas_chbmv cblas_chemm cblas_chemv cblas_cher2 cblas_cher2k
@@ -100,6 +101,7 @@ cblasobjsz="
 cblasobjs="cblas_xerbla"
 bfcblasobjs="cblas_sbgemm cblas_sbgemv cblas_sbdot cblas_sbstobf16 cblas_sbdtobf16 cblas_sbf16tos cblas_dbf16tod cblas_sbgemm_batch"
 hfcblasobjs="cblas_shgemm"
 exblasobjs="
    qamax qamin qasum qaxpy qcabs1 qcopy qdot qgbmv qgemm
@@ -3816,8 +3818,8 @@ shift
 p17=$9
 if [ $p13 -eq 1 ]; then
 	blasobjs="$blasobjs $bfblasobjs"
 	cblasobjs="$cblasobjs $bfcblasobjs"
 	blasobjs="$blasobjs $bfblasobjs $hfblasobjs"
 	cblasobjs="$cblasobjs $bfcblasobjs $hfcblasobjs"
 fi
 if [ $p14 -eq 1 ]; then
--- a/exports/gensymbol.pl
+++ b/exports/gensymbol.pl
@@ -52,6 +52,7 @@
@blasobjs = (lsame, xerbla);
@bfblasobjs = (sbgemm, sbgemmt, sbgemmtr, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
@hfblasobjs = (shgemm);
@cblasobjsc = (
    cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
    cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k,
@@ -97,7 +98,7 @@
@cblasobjs = (  cblas_xerbla );
@bfcblasobjs = (cblas_sbgemm, cblas_sbgemmt, cblas_sbgemmtr, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod, cblas_sbgemm_batch);
@hfcblasobjs = (cblas_shgemm);
@exblasobjs = (
    qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
    qgemv,qger,qmax,qmin,
@@ -3773,8 +3774,8 @@ use File::Basename;
 my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib");
 if ($ARGV[12] == 1) {
 	@blasobjs = (@blasobjs, @bfblasobjs);
 	@cblasobjs = (@cblasobjs, @bfcblasobjs);
 	@blasobjs = (@blasobjs, @bfblasobjs, @hfblasobjs);
 	@cblasobjs = (@cblasobjs, @bfcblasobjs, @hfcblasobjs);
 }
 if ($ARGV[13] == 1) {
 	@blasobjs = (@blasobjs, @blasobjss);
--- a/getarch_2nd.c
+++ b/getarch_2nd.c
@@ -19,6 +19,8 @@ int main(int argc, char **argv) {
  if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) {
    printf("SBGEMM_UNROLL_M=%d\n", SBGEMM_DEFAULT_UNROLL_M);
    printf("SBGEMM_UNROLL_N=%d\n", SBGEMM_DEFAULT_UNROLL_N);
    printf("SHGEMM_UNROLL_M=%d\n", SHGEMM_DEFAULT_UNROLL_M);
    printf("SHGEMM_UNROLL_N=%d\n", SHGEMM_DEFAULT_UNROLL_N);
    printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M);
    printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N);
    printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M);
--- a/install/generate.py
+++ b/install/generate.py
@@ -0,0 +1,58 @@
 import numpy as np
 import torch
 # Matrix dimensions: A is M x K, B is K x N and the product C is M x N.
 M, K, N = 31, 31, 31  # may be increased to exercise larger problems
 # Random input matrices of type float16 holding small integers in [0, 10].
 A = np.random.randint(0, 11, size=(M, K)).astype(np.float16)
 B = np.random.randint(0, 11, size=(K, N)).astype(np.float16)
 # Use the GPU when one is present; otherwise fall back to the CPU so the
 # generator does not crash on machines without CUDA.
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 A_torch = torch.tensor(A, dtype=torch.float16, device=device)
 B_torch = torch.tensor(B, dtype=torch.float16, device=device)
 if device == 'cuda':
    C_torch = torch.matmul(A_torch, B_torch)
 else:
    # A float16 matmul may not be implemented by every CPU build of torch, so
    # multiply in float32 and round the product to float16; the reference
    # therefore keeps float16 output precision on this path as well.
    C_torch = torch.matmul(A_torch.float(), B_torch.float()).half()
 # Reference result, widened to float32 to match the type of the C output array.
 C_ref = C_torch.cpu().numpy().astype(np.float32)
 def format_array_c(name, array, c_type="hfloat16"):
    """Render `array` as a one-line C array definition.

    The array is flattened and each element is printed with five digits
    after the decimal point. `c_type` is the C element type and `name` the
    C identifier. The returned string ends with a newline, for example
    ``hfloat16 A[2] = { 1.00000, 2.00000 };``.
    """
    values = [format(v, ".5f") for v in array.flatten()]
    initializer = ", ".join(values)
    return "{} {}[{}] = {{ {} }};\n".format(c_type, name, len(values), initializer)
 def format_array_c_float(name, array):
    """Render `array` as a one-line C definition of a `float` array.

    The array is flattened and each element is printed with five digits
    after the decimal point. The returned string ends with a newline, for
    example ``float C[2] = { 1.00000, 2.00000 };``.
    """
    parts = list(map("{:.5f}".format, array.flatten()))
    body = ", ".join(parts)
    return "float {}[{}] = {{ {} }};\n".format(name, len(parts), body)
 # Write the C test program to generated_test.c.
 with open("generated_test.c", "w") as out:
    # Headers: libc plus the CBLAS interface used by the test.
    out.writelines((
        '#include <stdio.h>\n',
        '#include <stdlib.h>\n',
        '#include <string.h>\n',
        '#include <cblas.h>\n\n',
    ))
    # Problem size, scaling factors, the two input arrays and a zeroed output.
    out.write(f"const int M = {M}, K = {K}, N = {N};\n")
    out.write("const float alpha = 1.0f, beta = 0.0f;\n\n")
    for label, matrix in (("A", A), ("B", B)):
        out.write(format_array_c(label, matrix))
    out.write(f"float C[{M*N}] = {{ 0 }};\n\n")
    # main(): one row-major, non-transposed cblas_shgemm call, then print C
    # with a line break after every N values.
    out.writelines((
        "int main() {\n",
        "    cblas_shgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,\n",
        "                 M, N, K,\n",
        "                 alpha,\n",
        "                 A, K,\n",
        "                 B, N,\n",
        "                 beta,\n",
        "                 C, N);\n\n",
        '    printf("Result C = A * B:\\n");\n',
        "    for (int i = 0; i < M * N; i++) {\n",
        "        printf(\"%.5f \", C[i]);\n",
        "        if ((i + 1) % N == 0) printf(\"\\n\");\n",
        "    }\n",
        "    return 0;\n",
        "}\n\n",
    ))
    # Trailing C comment carrying the flattened reference product C_ref.
    out.write("// Reference result computed in Python:\n")
    reference = ", ".join(f"{x:.5f}" for x in C_ref.flatten())
    out.write(f"// C_ref = {{ {reference} }}\n")
--- a/install/generated_test
+++ b/install/generated_test
--- a/install/generated_test.c
+++ b/install/generated_test.c
--- a/install/include/cblas.h
+++ b/install/include/cblas.h
@@ -0,0 +1,457 @@
 #ifndef CBLAS_H
 #define CBLAS_H
 #include <stddef.h>
 #include "openblas_config.h"
 #ifdef __cplusplus
 extern "C" {
 	/* Assume C declarations for C++ */
 #endif  /* __cplusplus */
 /* Set the number of threads at runtime. */
 void openblas_set_num_threads(int num_threads);
 void goto_set_num_threads(int num_threads);
 int openblas_set_num_threads_local(int num_threads);
 /* Get the number of threads at runtime. */
 int openblas_get_num_threads(void);
 /*Get the number of physical processors (cores).*/
 int openblas_get_num_procs(void);
 /* Get the build configuration at runtime. */
 char* openblas_get_config(void);
 /* Get the CPU core name at runtime. */
 char* openblas_get_corename(void);
 /*Set the threading backend to a custom callback.*/
 typedef void (*openblas_dojob_callback)(int thread_num, void *jobdata, int dojob_data);
 typedef void (*openblas_threads_callback)(int sync, openblas_dojob_callback dojob, int numjobs, size_t jobdata_elsize, void *jobdata, int dojob_data);
 void openblas_set_threads_callback_function(openblas_threads_callback callback);
 #ifdef OPENBLAS_OS_LINUX
 /* Sets thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */
 int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set);
 /* Queries thread affinity for OpenBLAS threads. `thread_idx` is in [0, openblas_get_num_threads()-1]. */
 int openblas_getaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set);
 #endif
 /* Get the parallelization type which is used by OpenBLAS */
 int openblas_get_parallel(void);
 /* OpenBLAS is compiled for sequential use  */
 #define OPENBLAS_SEQUENTIAL  0
 /* OpenBLAS is compiled using normal threading model */
 #define OPENBLAS_THREAD  1
 /* OpenBLAS is compiled using OpenMP threading model */
 #define OPENBLAS_OPENMP 2
 /*
 * Since all of GotoBlas was written without const,
 * we disable it at build time.
 */
 #ifndef OPENBLAS_CONST
 # define OPENBLAS_CONST const
 #endif
 #define CBLAS_INDEX size_t
 typedef enum CBLAS_ORDER     {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER;
 typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE;
 typedef enum CBLAS_UPLO      {CblasUpper=121, CblasLower=122} CBLAS_UPLO;
 typedef enum CBLAS_DIAG      {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG;
 typedef enum CBLAS_SIDE      {CblasLeft=141, CblasRight=142} CBLAS_SIDE;
 typedef CBLAS_ORDER CBLAS_LAYOUT;
 float  cblas_sdsdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
 double cblas_dsdot (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float *y, OPENBLAS_CONST blasint incy);
 float  cblas_sdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float  *y, OPENBLAS_CONST blasint incy);
 double cblas_ddot(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST double *y, OPENBLAS_CONST blasint incy);
 openblas_complex_float  cblas_cdotu(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void  *y, OPENBLAS_CONST blasint incy);
 openblas_complex_float  cblas_cdotc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void  *y, OPENBLAS_CONST blasint incy);
 openblas_complex_double cblas_zdotu(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy);
 openblas_complex_double cblas_zdotc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy);
 void  cblas_cdotu_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void  *y, OPENBLAS_CONST blasint incy, void  *ret);
 void  cblas_cdotc_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void  *y, OPENBLAS_CONST blasint incy, void  *ret);
 void  cblas_zdotu_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy, void *ret);
 void  cblas_zdotc_sub(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST void *y, OPENBLAS_CONST blasint incy, void *ret);
 float  cblas_sasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 float  cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
 float  cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 float  cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
 float  cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float  *X, OPENBLAS_CONST blasint incX);
 double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX);
 float  cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void  *X, OPENBLAS_CONST blasint incX);
 double cblas_dznrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX);
 CBLAS_INDEX cblas_isamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_idamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_icamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_izamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_isamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
 float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
 float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
 void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
 void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
 void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
 void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
 void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 void cblas_zcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 void cblas_sswap(OPENBLAS_CONST blasint n, float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
 void cblas_dswap(OPENBLAS_CONST blasint n, double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
 void cblas_cswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
 void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double  s);
 void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s);
 void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s);
 void cblas_srotg(float *a, float *b, float *c, float *s);
 void cblas_drotg(double *a, double *b, double *c, double *s);
 void cblas_crotg(void *a, void *b, float *c, void *s);
 void cblas_zrotg(void *a, void *b, double *c, void *s);
 void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P);
 void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P);
 void cblas_srotmg(float *d1, float *d2, float *b1, OPENBLAS_CONST float b2, float *P);
 void cblas_drotmg(double *d1, double *d2, double *b1, OPENBLAS_CONST double b2, double *P);
 void cblas_sscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, float *X, OPENBLAS_CONST blasint incX);
 void cblas_dscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, double *X, OPENBLAS_CONST blasint incX);
 void cblas_cscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, void *X, OPENBLAS_CONST blasint incX);
 void cblas_zscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, void *X, OPENBLAS_CONST blasint incX);
 void cblas_csscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, void *X, OPENBLAS_CONST blasint incX);
 void cblas_zdscal(OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, void *X, OPENBLAS_CONST blasint incX);
 /* xGEMV (standard BLAS: y := alpha*op(A)*x + beta*y, with op selected by trans).
  * a is passed with leading dimension lda; y is the only writable argument.
  * The c/z variants pass alpha and beta by pointer and use void * for all data. */
 void cblas_sgemv(OPENBLAS_CONST enum CBLAS_ORDER order,  OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,  OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n,
 		 OPENBLAS_CONST float alpha, OPENBLAS_CONST float  *a, OPENBLAS_CONST blasint lda,  OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx,  OPENBLAS_CONST float beta,  float  *y, OPENBLAS_CONST blasint incy);
 void cblas_dgemv(OPENBLAS_CONST enum CBLAS_ORDER order,  OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,  OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n,
 		 OPENBLAS_CONST double alpha, OPENBLAS_CONST double  *a, OPENBLAS_CONST blasint lda,  OPENBLAS_CONST double  *x, OPENBLAS_CONST blasint incx,  OPENBLAS_CONST double beta,  double  *y, OPENBLAS_CONST blasint incy);
 void cblas_cgemv(OPENBLAS_CONST enum CBLAS_ORDER order,  OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,  OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void  *a, OPENBLAS_CONST blasint lda,  OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx,  OPENBLAS_CONST void *beta,  void  *y, OPENBLAS_CONST blasint incy);
 void cblas_zgemv(OPENBLAS_CONST enum CBLAS_ORDER order,  OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,  OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void  *a, OPENBLAS_CONST blasint lda,  OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx,  OPENBLAS_CONST void *beta,  void  *y, OPENBLAS_CONST blasint incy);
 /* Rank-1 update of a general M-by-N matrix (standard BLAS xGER / xGERU / xGERC:
  * A := alpha*x*y' + A, with y conjugated in the *gerc variants).
  * A is the only writable argument. */
 void cblas_sger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float   alpha, OPENBLAS_CONST float  *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float  *Y, OPENBLAS_CONST blasint incY, float  *A, OPENBLAS_CONST blasint lda);
 void cblas_dger (OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double  alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda);
 void cblas_cgeru(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void  *alpha, OPENBLAS_CONST void  *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void  *Y, OPENBLAS_CONST blasint incY, void  *A, OPENBLAS_CONST blasint lda);
 void cblas_cgerc(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void  *alpha, OPENBLAS_CONST void  *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void  *Y, OPENBLAS_CONST blasint incY, void  *A, OPENBLAS_CONST blasint lda);
 void cblas_zgeru(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda);
 void cblas_zgerc(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda);
 /* Triangular solve (xTRSV) and triangular matrix-vector product (xTRMV).
  * All eight prototypes share one parameter list: A (N-by-N, leading dimension
  * lda) is read-only and X is overwritten in place with the result. */
 void cblas_strsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
 void cblas_dtrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);
 void cblas_ctrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);
 void cblas_ztrsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);
 void cblas_strmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
 void cblas_dtrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);
 void cblas_ctrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);
 void cblas_ztrmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);
 /* Symmetric / Hermitian rank-1 (xSYR, xHER) and rank-2 (xSYR2, xHER2) updates.
  * A is the only writable argument; Uplo selects which triangle is referenced.
  * Note the asymmetry: cher/zher take a real alpha by value, whereas
  * cher2/zher2 take alpha through a read-only void pointer. */
 void cblas_ssyr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, float *A, OPENBLAS_CONST blasint lda);
 void cblas_dsyr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, double *A, OPENBLAS_CONST blasint lda);
 void cblas_cher(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, void *A, OPENBLAS_CONST blasint lda);
 void cblas_zher(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, void *A, OPENBLAS_CONST blasint lda);
 void cblas_ssyr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo,OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X,
                OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A, OPENBLAS_CONST blasint lda);
 void cblas_dsyr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X,
                OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A, OPENBLAS_CONST blasint lda);
 void cblas_cher2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX,
                OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda);
 void cblas_zher2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX,
                OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *A, OPENBLAS_CONST blasint lda);
 /* xGBMV: general band matrix-vector product (standard BLAS:
  * y := alpha*op(A)*x + beta*y).  KL and KU are the numbers of sub- and
  * super-diagonals of the band matrix A.  Y is the only writable argument. */
 void cblas_sgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY);
 void cblas_dgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY);
 void cblas_cgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);
 void cblas_zgbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST blasint KL, OPENBLAS_CONST blasint KU, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);
 /* xSBMV: symmetric band matrix-vector product (standard BLAS:
  * y := alpha*A*x + beta*y) where A has K off-diagonals on the side chosen by
  * Uplo.  Y is the only writable argument. */
 void cblas_ssbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A,
                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY);
 void cblas_dsbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A,
                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY);
 /* Triangular band matrix-vector product (xTBMV) and triangular band solve
  * (xTBSV).  All eight prototypes share one parameter list: A is a read-only
  * band matrix with K off-diagonals and X is overwritten in place. */
 void cblas_stbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
 void cblas_dtbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);
 void cblas_ctbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);
 void cblas_ztbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);
 void cblas_stbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *X, OPENBLAS_CONST blasint incX);
 void cblas_dtbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *X, OPENBLAS_CONST blasint incX);
 void cblas_ctbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);
 void cblas_ztbsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *X, OPENBLAS_CONST blasint incX);
 /* Packed triangular matrix-vector product (xTPMV) and packed triangular solve
  * (xTPSV).  The matrix is passed as the read-only packed array Ap, so there is
  * no leading-dimension argument; X is overwritten in place. */
 void cblas_stpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST float *Ap, float *X, OPENBLAS_CONST blasint incX);
 void cblas_dtpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST double *Ap, double *X, OPENBLAS_CONST blasint incX);
 void cblas_ctpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST void *Ap, void *X, OPENBLAS_CONST blasint incX);
 void cblas_ztpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST void *Ap, void *X, OPENBLAS_CONST blasint incX);
 void cblas_stpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST float *Ap, float *X, OPENBLAS_CONST blasint incX);
 void cblas_dtpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST double *Ap, double *X, OPENBLAS_CONST blasint incX);
 void cblas_ctpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST void *Ap, void *X, OPENBLAS_CONST blasint incX);
 void cblas_ztpsv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_DIAG Diag,
                 OPENBLAS_CONST blasint N, OPENBLAS_CONST void *Ap, void *X, OPENBLAS_CONST blasint incX);
 /* xSYMV / xHEMV: symmetric (s/d) and Hermitian (c/z) matrix-vector product
  * (standard BLAS: y := alpha*A*x + beta*y).  Uplo selects the referenced
  * triangle of A; Y is the only writable argument. */
 void cblas_ssymv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A,
                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY);
 void cblas_dsymv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A,
                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY);
 void cblas_chemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A,
                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);
 void cblas_zhemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A,
                 OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);
 /* xSPMV: symmetric packed matrix-vector product (standard BLAS:
  * y := alpha*A*x + beta*y).  The matrix is supplied as the packed array Ap,
  * so no lda is passed; Y is the only writable argument. */
 void cblas_sspmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *Ap,
                 OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float beta, float *Y, OPENBLAS_CONST blasint incY);
 void cblas_dspmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *Ap,
                 OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double beta, double *Y, OPENBLAS_CONST blasint incY);
 /* Packed symmetric / Hermitian rank-1 (xSPR, xHPR) and rank-2 (xSPR2, xHPR2)
  * updates.  The packed matrix is the last and only writable argument; it is
  * named Ap in some prototypes and A in others, but carries no lda in either
  * case.  chpr/zhpr take a real alpha by value, chpr2/zhpr2 take it by pointer. */
 void cblas_sspr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, float *Ap);
 void cblas_dspr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, double *Ap);
 void cblas_chpr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, void *A);
 void cblas_zhpr(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST void *X,OPENBLAS_CONST blasint incX, void *A);
 void cblas_sspr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST float *Y, OPENBLAS_CONST blasint incY, float *A);
 void cblas_dspr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST double *Y, OPENBLAS_CONST blasint incY, double *A);
 void cblas_chpr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *Ap);
 void cblas_zhpr2(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *Y, OPENBLAS_CONST blasint incY, void *Ap);
 /* Hermitian band (xHBMV, K off-diagonals, with lda) and Hermitian packed
  * (xHPMV, packed Ap, no lda) matrix-vector products (standard BLAS:
  * y := alpha*A*x + beta*y).  alpha and beta are passed by pointer;
  * Y is the only writable argument. */
 void cblas_chbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);
 void cblas_zhbmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);
 void cblas_chpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *Ap, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);
 void cblas_zhpmv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint N,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *Ap, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX, OPENBLAS_CONST void *beta, void *Y, OPENBLAS_CONST blasint incY);
 /* xGEMM: general matrix-matrix product (standard BLAS:
  * C := alpha*op(A)*op(B) + beta*C, with op chosen by TransA / TransB).
  * C (leading dimension ldc) is the only writable argument.
  * cgemm3m / zgemm3m are declared with exactly the same parameter list as
  * cgemm / zgemm. */
 void cblas_sgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
 void cblas_dgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
 void cblas_cgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_cgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 /* xGEMMT: same shape as xGEMM but with an extra Uplo selector and only two
  * dimensions (M and K, no N).  C is the only writable argument.
  * NOTE(review): presumably C is M-by-M and only the triangle chosen by Uplo
  * is updated -- confirm against the OpenBLAS extension documentation. */
 void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
 void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
 void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 /* xSYMM: symmetric matrix-matrix product (standard BLAS:
  * C := alpha*A*B + beta*C or C := alpha*B*A + beta*C depending on Side, with
  * A symmetric and Uplo selecting its referenced triangle).
  * C is the only writable argument. */
 void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
 void cblas_dsymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
 void cblas_csymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zsymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 /* Symmetric rank-k (xSYRK: one input matrix A) and rank-2k (xSYR2K: two input
  * matrices A and B) updates of the N-by-N matrix C.  Uplo selects the triangle
  * of C that is referenced; C is the only writable argument.  The c/z variants
  * pass both alpha and beta by pointer. */
 void cblas_ssyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
 		 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
 void cblas_dsyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
 		 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
 void cblas_csyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
 		 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zsyrk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
 		 OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_ssyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
 		  OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
 void cblas_dsyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
 		  OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
 void cblas_csyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
 		  OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zsyr2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans,
 		  OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 /* Triangular matrix-matrix product (xTRMM) and triangular solve with multiple
  * right-hand sides (xTRSM).  All eight prototypes share one parameter list:
  * the triangular matrix A is read-only and the M-by-N matrix B is overwritten
  * in place.  There is no beta and no separate output matrix. */
 void cblas_strmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *B, OPENBLAS_CONST blasint ldb);
 void cblas_dtrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *B, OPENBLAS_CONST blasint ldb);
 void cblas_ctrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *B, OPENBLAS_CONST blasint ldb);
 void cblas_ztrmm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *B, OPENBLAS_CONST blasint ldb);
 void cblas_strsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, float *B, OPENBLAS_CONST blasint ldb);
 void cblas_dtrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, double *B, OPENBLAS_CONST blasint ldb);
 void cblas_ctrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *B, OPENBLAS_CONST blasint ldb);
 void cblas_ztrsm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA,
                 OPENBLAS_CONST enum CBLAS_DIAG Diag, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, void *B, OPENBLAS_CONST blasint ldb);
 void cblas_chemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zhemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
 /* HERK prototypes (c, z). Unlike HEMM, alpha and beta are real scalars passed by value
    (float for c, double for z); A and C are void pointers and only C is writable. */
 void cblas_cherk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
                 OPENBLAS_CONST float alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zherk(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
                 OPENBLAS_CONST double alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc);
 /* HER2K prototypes (c, z). alpha is passed through a void pointer, while beta is a
    real scalar passed by value (float for c, double for z). Only C is writable. */
 void cblas_cher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
                  OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, void *C, OPENBLAS_CONST blasint ldc);
 void cblas_zher2k(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE Trans, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
                  OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, void *C, OPENBLAS_CONST blasint ldc);
 /* Variadic error-reporting entry point: p is a blasint, rout and form are C strings,
    followed by optional arguments. NOTE(review): form is presumably a printf-style
    format string consuming the trailing arguments - confirm against the implementation. */
 void cblas_xerbla(blasint p, OPENBLAS_CONST char *rout, OPENBLAS_CONST char *form, ...);
 /*** BLAS extensions ***/
 /* AXPBY prototypes for s, d, c and z. Both scalars (alpha for x, beta for y) are passed
    by value in the real variants and through void pointers in the complex variants.
    x is read-only; y is the only writable vector. */
 void cblas_saxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy);
 void cblas_daxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST double beta, double *y, OPENBLAS_CONST blasint incy);
 void cblas_caxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST void *beta, void *y, OPENBLAS_CONST blasint incy);
 void cblas_zaxpby(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx,OPENBLAS_CONST void *beta, void *y, OPENBLAS_CONST blasint incy);
 /* OMATCOPY prototypes: out-of-place variants with a read-only source a (leading
    dimension clda) and a separate writable destination b (leading dimension cldb).
    The complex variants take calpha by pointer and, unlike most complex routines in
    this header, use float* / double* rather than void* for their array arguments. */
 void cblas_somatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a, 
 		     OPENBLAS_CONST blasint clda, float *b, OPENBLAS_CONST blasint cldb); 
 void cblas_domatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a,
 		     OPENBLAS_CONST blasint clda, double *b, OPENBLAS_CONST blasint cldb); 
 void cblas_comatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, OPENBLAS_CONST float* a, 
 		     OPENBLAS_CONST blasint clda, float*b, OPENBLAS_CONST blasint cldb); 
 void cblas_zomatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, OPENBLAS_CONST double* a, 
 		     OPENBLAS_CONST blasint clda,  double *b, OPENBLAS_CONST blasint cldb); 
 /* IMATCOPY prototypes: in-place counterparts of OMATCOPY. There is a single writable
    array a, with two leading dimensions: clda and cldb.
    NOTE(review): presumably clda describes the input layout and cldb the output
    layout of the same buffer - confirm against the implementation. */
 void cblas_simatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, float *a, 
 		     OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); 
 void cblas_dimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, double *a,
 		     OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); 
 void cblas_cimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float* calpha, float* a, 
 		     OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); 
 void cblas_zimatcopy(OPENBLAS_CONST enum CBLAS_ORDER CORDER, OPENBLAS_CONST enum CBLAS_TRANSPOSE CTRANS, OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double* calpha, double* a, 
 		     OPENBLAS_CONST blasint clda, OPENBLAS_CONST blasint cldb); 
 /* GEADD prototypes: a is read-only and scaled by calpha, c is writable and scaled by
    cbeta (scalar roles taken from the parameter names). Real variants pass the scalars
    by value; complex variants pass them as float* / double* pointers. */
 void cblas_sgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float cbeta, 
 		  float *c, OPENBLAS_CONST blasint cldc); 
 void cblas_dgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double cbeta, 
 		  double *c, OPENBLAS_CONST blasint cldc); 
 void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST float *calpha, OPENBLAS_CONST float *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST float *cbeta, 
 		  float *c, OPENBLAS_CONST blasint cldc); 
 void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, OPENBLAS_CONST double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, 
 		  double *c, OPENBLAS_CONST blasint cldc); 
 /* Batched GEMM prototypes for s, d, c and z. Order is a single value; transpose flags,
    dimensions, scalars, leading dimensions and matrix pointers are all passed as arrays.
    group_count is a scalar and group_size is an array.
    NOTE(review): presumably the per-parameter arrays hold one entry per group and the
    matrix-pointer arrays one entry per problem - confirm against the implementation. */
 void cblas_sgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
 		       OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST float ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST float ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
 void cblas_dgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
 		       OPENBLAS_CONST double * alpha_array, OPENBLAS_CONST double ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST double ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST double * beta_array, double ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
 void cblas_cgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
 		       OPENBLAS_CONST void * alpha_array, OPENBLAS_CONST void ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST void ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST void * beta_array, void ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
 void cblas_zgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
 		       OPENBLAS_CONST void * alpha_array, OPENBLAS_CONST void ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST void ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST void * beta_array, void ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
 /*** BFLOAT16 and INT8 extensions ***/
 /* convert float array to BFLOAT16 array by rounding */
 void   cblas_sbstobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout);
 /* convert double array to BFLOAT16 array by rounding */
 void   cblas_sbdtobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout);
 /* convert BFLOAT16 array to float array */
 void   cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, float  *out, OPENBLAS_CONST blasint incout);
 /* convert BFLOAT16 array to double array */
 void   cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout);
 /* dot product of two BFLOAT16 input arrays, returned as float */
 float  cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy);
 /* GEMV with BFLOAT16 inputs (a, x) and float scalars (alpha, beta) and output vector y */
 void   cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order,  OPENBLAS_CONST enum CBLAS_TRANSPOSE trans,  OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy);
 /* GEMM with BFLOAT16 inputs (A, B) and float scalars (alpha, beta) and output matrix C */
 void   cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		    OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
 /* batched form of cblas_sbgemm: same argument layout as the other *_gemm_batch routines, with BFLOAT16 A/B pointer arrays */
 void cblas_sbgemm_batch(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransA_array, OPENBLAS_CONST enum CBLAS_TRANSPOSE * TransB_array, OPENBLAS_CONST blasint * M_array, OPENBLAS_CONST blasint * N_array, OPENBLAS_CONST blasint * K_array,
 		       OPENBLAS_CONST float * alpha_array, OPENBLAS_CONST bfloat16 ** A_array, OPENBLAS_CONST blasint * lda_array, OPENBLAS_CONST bfloat16 ** B_array, OPENBLAS_CONST blasint * ldb_array, OPENBLAS_CONST float * beta_array, float ** C_array, OPENBLAS_CONST blasint * ldc_array, OPENBLAS_CONST blasint group_count, OPENBLAS_CONST blasint * group_size);
 /*** FLOAT16 extensions ***/
 /* GEMM with hfloat16 inputs (A, B) and float scalars (alpha, beta) and output matrix C.
    The parameter list mirrors cblas_sbgemm with hfloat16 in place of bfloat16. */
 void cblas_shgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		    OPENBLAS_CONST float alpha, OPENBLAS_CONST hfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST hfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
 #ifdef __cplusplus
 }
 #endif  /* __cplusplus */
 #endif
--- a/install/include/f77blas.h
+++ b/install/include/f77blas.h
@@ -0,0 +1,811 @@
 #ifndef OPENBLAS_F77BLAS_H
 #define OPENBLAS_F77BLAS_H
 #include "openblas_config.h"
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
 /* without modification, are permitted provided that the following   */
 /* conditions are met:                                               */
 /*                                                                   */
 /*   1. Redistributions of source code must retain the above         */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer.                                                  */
 /*                                                                   */
 /*   2. Redistributions in binary form must reproduce the above      */
 /*      copyright notice, this list of conditions and the following  */
 /*      disclaimer in the documentation and/or other materials       */
 /*      provided with the distribution.                              */
 /*                                                                   */
 /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 /*    POSSIBILITY OF SUCH DAMAGE.                                    */
 /*                                                                   */
 /* The views and conclusions contained in the software and           */
 /* documentation are those of the authors and should not be          */
 /* interpreted as representing official policies, either expressed   */
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/
 #ifndef ASSEMBLER
 #ifdef __cplusplus
 extern "C" {
 	/* Assume C declarations for C++ */
 #endif  /* __cplusplus */
 /* Error handler: a char* routine name, a pointer to the info value and a trailing
    by-value blasint. NOTE(review): the last argument is presumably the hidden Fortran
    string length of the first argument - confirm. */
 int    BLASFUNC(xerbla)(char *, blasint *info, blasint);
 /* Thread-count setter taking its argument by pointer; note the hard-coded trailing
    underscore instead of the BLASFUNC() name wrapper. */
 void    openblas_set_num_threads_(int *);
 /*Set the threading backend to a custom callback.*/
 /* Per-job worker signature: receives a thread number, a pointer to the job data and an int. */
 typedef void (*openblas_dojob_callback)(int thread_num, void *jobdata, int dojob_data);
 /* Backend signature: receives a sync flag, the worker to invoke, the job count, the
    size of one jobdata element, the jobdata pointer and the int forwarded as dojob_data. */
 typedef void (*openblas_threads_callback)(int sync, openblas_dojob_callback dojob, int numjobs, size_t jobdata_elsize, void *jobdata, int dojob_data);
 /* Global hook holding the backend callback; declared here, defined elsewhere. */
 extern openblas_threads_callback openblas_threads_callback_;
 /* Real dot products. Every argument is passed by pointer. The single-precision results
    use the FLOATRET return type (defined outside this chunk), while dsdot takes float
    inputs and returns double. sdsdot has one extra float* argument before the vectors. */
 FLOATRET  BLASFUNC(sdot)  (blasint *, float  *, blasint *, float  *, blasint *);
 FLOATRET  BLASFUNC(sdsdot)(blasint *, float  *,        float  *, blasint *, float  *, blasint *);
 double BLASFUNC(dsdot) (blasint *, float  *, blasint *, float  *, blasint *);
 double BLASFUNC(ddot)  (blasint *, double *, blasint *, double *, blasint *);
 xdouble BLASFUNC(qdot)  (blasint *, xdouble *, blasint *, xdouble *, blasint *);
 /* BFLOAT16 dot product and conversion helpers. sbdot returns plain float rather than
    FLOATRET. The conversion routines take (count, input, input stride, output, output
    stride), mirroring the cblas_* counterparts of the same names. */
 float  BLASFUNC(sbdot)     (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *);
 void   BLASFUNC(sbstobf16) (blasint *, float *,    blasint *, bfloat16 *, blasint *);
 void   BLASFUNC(sbdtobf16) (blasint *, double *,   blasint *, bfloat16 *, blasint *);
 void   BLASFUNC(sbf16tos)  (blasint *, bfloat16 *, blasint *, float *,    blasint *);
 void   BLASFUNC(dbf16tod)  (blasint *, bfloat16 *, blasint *, double *,   blasint *);
 /* Complex dot products (cdotu/cdotc, zdotu/zdotc, xdotu/xdotc) are declared with one of
    three return conventions, selected at compile time:
      - RETURN_BY_STRUCT: the result is returned as a local {r, i} struct;
      - RETURN_BY_STACK:  the function returns void and writes the result through an
                          extra leading pointer argument;
      - otherwise:        the result is returned as an openblas_complex_* value. */
 #ifdef RETURN_BY_STRUCT
 /* Plain two-member structs used only as return types in this branch. */
 typedef struct {
  float r, i;
 } myccomplex_t;
 typedef struct {
  double r, i;
 } myzcomplex_t;
 typedef struct {
  xdouble r, i;
 } myxcomplex_t;
 myccomplex_t    BLASFUNC(cdotu)  (blasint *, float  *, blasint *, float  *, blasint *);
 myccomplex_t    BLASFUNC(cdotc)  (blasint *, float  *, blasint *, float  *, blasint *);
 myzcomplex_t    BLASFUNC(zdotu)  (blasint *, double  *, blasint *, double  *, blasint *);
 myzcomplex_t    BLASFUNC(zdotc)  (blasint *, double  *, blasint *, double  *, blasint *);
 myxcomplex_t    BLASFUNC(xdotu)  (blasint *, xdouble  *, blasint *, xdouble  *, blasint *);
 myxcomplex_t    BLASFUNC(xdotc)  (blasint *, xdouble  *, blasint *, xdouble  *, blasint *);
 #elif defined RETURN_BY_STACK
 /* Result pointer comes first, followed by the usual (n, x, incx, y, incy) arguments. */
 void  BLASFUNC(cdotu)  (openblas_complex_float   *,  blasint *, float  * , blasint *, float  *,  blasint *);
 void  BLASFUNC(cdotc)  (openblas_complex_float   *,  blasint *, float  *,  blasint *, float  *,  blasint *);
 void  BLASFUNC(zdotu)  (openblas_complex_double  *, blasint *, double  *, blasint *, double  *, blasint *);
 void  BLASFUNC(zdotc)  (openblas_complex_double  *, blasint *, double  *, blasint *, double  *, blasint *);
 void  BLASFUNC(xdotu)  (openblas_complex_xdouble *, blasint *, xdouble  *, blasint *, xdouble  *, blasint *);
 void  BLASFUNC(xdotc)  (openblas_complex_xdouble *, blasint *, xdouble  *, blasint *, xdouble  *, blasint *);
 #else
 /* Default: return the openblas_complex_* types by value. */
 openblas_complex_float   BLASFUNC(cdotu)  (blasint *, float  *, blasint *, float  *, blasint *);
 openblas_complex_float   BLASFUNC(cdotc)  (blasint *, float  *, blasint *, float  *, blasint *);
 openblas_complex_double  BLASFUNC(zdotu)  (blasint *, double  *, blasint *, double  *, blasint *);
 openblas_complex_double  BLASFUNC(zdotc)  (blasint *, double  *, blasint *, double  *, blasint *);
 openblas_complex_xdouble BLASFUNC(xdotu)  (blasint *, xdouble  *, blasint *, xdouble  *, blasint *);
 openblas_complex_xdouble BLASFUNC(xdotc)  (blasint *, xdouble  *, blasint *, xdouble  *, blasint *);
 #endif
 /* AXPY prototypes: (n, alpha, x, incx, y, incy), all passed by pointer. Prefixes map to
    element types as s/c -> float*, d/z -> double*, q/x -> xdouble*; the complex variants
    (c, z, x) use the same pointer types as their real counterparts. The *axpyc routines
    exist only for the complex prefixes. */
 void    BLASFUNC(saxpy) (blasint *, float  *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(daxpy) (blasint *, double *, double *, blasint *, double *, blasint *);
 void    BLASFUNC(qaxpy) (blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *);
 void    BLASFUNC(caxpy) (blasint *, float  *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(zaxpy) (blasint *, double *, double *, blasint *, double *, blasint *);
 void    BLASFUNC(xaxpy) (blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *);
 void    BLASFUNC(caxpyc)(blasint *, float  *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(zaxpyc)(blasint *, double *, double *, blasint *, double *, blasint *);
 void    BLASFUNC(xaxpyc)(blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *);
 /* COPY prototypes: (n, x, incx, y, incy) for all six prefixes, all passed by pointer. */
 void    BLASFUNC(scopy) (blasint *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(dcopy) (blasint *, double *, blasint *, double *, blasint *);
 void    BLASFUNC(qcopy) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
 void    BLASFUNC(ccopy) (blasint *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(zcopy) (blasint *, double *, blasint *, double *, blasint *);
 void    BLASFUNC(xcopy) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
 /* SWAP prototypes: same (n, x, incx, y, incy) argument list as COPY, for all six prefixes. */
 void    BLASFUNC(sswap) (blasint *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(dswap) (blasint *, double *, blasint *, double *, blasint *);
 void    BLASFUNC(qswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
 void    BLASFUNC(cswap) (blasint *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(zswap) (blasint *, double *, blasint *, double *, blasint *);
 void    BLASFUNC(xswap) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
 /* ASUM reductions over one strided vector (n, x, incx). The routines taking complex
    input (scasum, dzasum, qxasum) return a real value of the matching precision. */
 FLOATRET  BLASFUNC(sasum) (blasint *, float  *, blasint *);
 FLOATRET  BLASFUNC(scasum)(blasint *, float  *, blasint *);
 double BLASFUNC(dasum) (blasint *, double *, blasint *);
 xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *);
 double BLASFUNC(dzasum)(blasint *, double *, blasint *);
 xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *);
 /* SUM reductions: same signatures and return types as the ASUM family above. */
 FLOATRET  BLASFUNC(ssum) (blasint *, float  *, blasint *);
 FLOATRET  BLASFUNC(scsum)(blasint *, float  *, blasint *);
 double BLASFUNC(dsum) (blasint *, double *, blasint *);
 xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *);
 double BLASFUNC(dzsum)(blasint *, double *, blasint *);
 xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *);
 /* Index-returning searches over one strided vector (n, x, incx); each returns a blasint.
    Four families are declared for all six prefixes: i?amax, i?max, i?amin and i?min.
    NOTE(review): the returned index is presumably 1-based, following the Fortran BLAS
    convention - confirm against the implementation. */
 blasint    BLASFUNC(isamax)(blasint *, float  *, blasint *);
 blasint    BLASFUNC(idamax)(blasint *, double *, blasint *);
 blasint    BLASFUNC(iqamax)(blasint *, xdouble *, blasint *);
 blasint    BLASFUNC(icamax)(blasint *, float  *, blasint *);
 blasint    BLASFUNC(izamax)(blasint *, double *, blasint *);
 blasint    BLASFUNC(ixamax)(blasint *, xdouble *, blasint *);
 blasint    BLASFUNC(ismax) (blasint *, float  *, blasint *);
 blasint    BLASFUNC(idmax) (blasint *, double *, blasint *);
 blasint    BLASFUNC(iqmax) (blasint *, xdouble *, blasint *);
 blasint    BLASFUNC(icmax) (blasint *, float  *, blasint *);
 blasint    BLASFUNC(izmax) (blasint *, double *, blasint *);
 blasint    BLASFUNC(ixmax) (blasint *, xdouble *, blasint *);
 blasint    BLASFUNC(isamin)(blasint *, float  *, blasint *);
 blasint    BLASFUNC(idamin)(blasint *, double *, blasint *);
 blasint    BLASFUNC(iqamin)(blasint *, xdouble *, blasint *);
 blasint    BLASFUNC(icamin)(blasint *, float  *, blasint *);
 blasint    BLASFUNC(izamin)(blasint *, double *, blasint *);
 blasint    BLASFUNC(ixamin)(blasint *, xdouble *, blasint *);
 blasint    BLASFUNC(ismin)(blasint *, float  *, blasint *);
 blasint    BLASFUNC(idmin)(blasint *, double *, blasint *);
 blasint    BLASFUNC(iqmin)(blasint *, xdouble *, blasint *);
 blasint    BLASFUNC(icmin)(blasint *, float  *, blasint *);
 blasint    BLASFUNC(izmin)(blasint *, double *, blasint *);
 blasint    BLASFUNC(ixmin)(blasint *, xdouble *, blasint *);
 /* Value-returning counterparts of the index searches: ?amax, ?amin, ?max and ?min.
    Each takes (n, x, incx) and returns a real value; the two-letter names (sc, dz, qx)
    take complex-typed input and return the matching real type. */
 FLOATRET  BLASFUNC(samax) (blasint *, float  *, blasint *);
 double BLASFUNC(damax) (blasint *, double *, blasint *);
 xdouble BLASFUNC(qamax) (blasint *, xdouble *, blasint *);
 FLOATRET  BLASFUNC(scamax)(blasint *, float  *, blasint *);
 double BLASFUNC(dzamax)(blasint *, double *, blasint *);
 xdouble BLASFUNC(qxamax)(blasint *, xdouble *, blasint *);
 FLOATRET  BLASFUNC(samin) (blasint *, float  *, blasint *);
 double BLASFUNC(damin) (blasint *, double *, blasint *);
 xdouble BLASFUNC(qamin) (blasint *, xdouble *, blasint *);
 FLOATRET  BLASFUNC(scamin)(blasint *, float  *, blasint *);
 double BLASFUNC(dzamin)(blasint *, double *, blasint *);
 xdouble BLASFUNC(qxamin)(blasint *, xdouble *, blasint *);
 FLOATRET  BLASFUNC(smax)  (blasint *, float  *, blasint *);
 double BLASFUNC(dmax)  (blasint *, double *, blasint *);
 xdouble BLASFUNC(qmax)  (blasint *, xdouble *, blasint *);
 FLOATRET  BLASFUNC(scmax) (blasint *, float  *, blasint *);
 double BLASFUNC(dzmax) (blasint *, double *, blasint *);
 xdouble BLASFUNC(qxmax) (blasint *, xdouble *, blasint *);
 FLOATRET  BLASFUNC(smin)  (blasint *, float  *, blasint *);
 double BLASFUNC(dmin)  (blasint *, double *, blasint *);
 xdouble BLASFUNC(qmin)  (blasint *, xdouble *, blasint *);
 FLOATRET  BLASFUNC(scmin) (blasint *, float  *, blasint *);
 double BLASFUNC(dzmin) (blasint *, double *, blasint *);
 xdouble BLASFUNC(qxmin) (blasint *, xdouble *, blasint *);
 /* SCAL prototypes: (n, alpha, x, incx), all passed by pointer, for all six prefixes.
    csscal, zdscal and xqscal share the pointer types of cscal, zscal and xscal.
    NOTE(review): presumably these scale a complex vector by a real alpha - confirm. */
 void    BLASFUNC(sscal) (blasint *,  float  *, float  *, blasint *);
 void    BLASFUNC(dscal) (blasint *,  double *, double *, blasint *);
 void    BLASFUNC(qscal) (blasint *,  xdouble *, xdouble *, blasint *);
 void    BLASFUNC(cscal) (blasint *,  float  *, float  *, blasint *);
 void    BLASFUNC(zscal) (blasint *,  double *, double *, blasint *);
 void    BLASFUNC(xscal) (blasint *,  xdouble *, xdouble *, blasint *);
 void    BLASFUNC(csscal)(blasint *,  float  *, float  *, blasint *);
 void    BLASFUNC(zdscal)(blasint *,  double *, double *, blasint *);
 void    BLASFUNC(xqscal)(blasint *,  xdouble *, xdouble *, blasint *);
 /* NRM2 reductions over one strided vector (n, x, incx). As with ASUM, the
    complex-input variants (scnrm2, dznrm2, qxnrm2) return a real value. */
 FLOATRET  BLASFUNC(snrm2) (blasint *, float  *, blasint *);
 FLOATRET  BLASFUNC(scnrm2)(blasint *, float  *, blasint *);
 double BLASFUNC(dnrm2) (blasint *, double *, blasint *);
 xdouble BLASFUNC(qnrm2) (blasint *, xdouble *, blasint *);
 double BLASFUNC(dznrm2)(blasint *, double *, blasint *);
 xdouble BLASFUNC(qxnrm2)(blasint *, xdouble *, blasint *);
 /* ROT prototypes: (n, x, incx, y, incy) followed by two trailing scalar pointers. */
 void  BLASFUNC(srot)  (blasint *, float  *, blasint *, float  *, blasint *, float  *, float  *);
 void  BLASFUNC(drot)  (blasint *, double *, blasint *, double *, blasint *, double *, double *);
 void  BLASFUNC(qrot)  (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *);
 void  BLASFUNC(csrot) (blasint *, float  *, blasint *, float  *, blasint *, float  *, float  *);
 void  BLASFUNC(zdrot) (blasint *, double *, blasint *, double *, blasint *, double *, double *);
 void  BLASFUNC(xqrot) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *);
 /* ROTG prototypes: four scalar pointers each, declared for all six prefixes. */
 void  BLASFUNC(srotg) (float  *, float  *, float  *, float  *);
 void  BLASFUNC(drotg) (double *, double *, double *, double *);
 void  BLASFUNC(qrotg) (xdouble *, xdouble *, xdouble *, xdouble *);
 void  BLASFUNC(crotg) (float  *, float  *, float  *, float  *);
 void  BLASFUNC(zrotg) (double *, double *, double *, double *);
 void  BLASFUNC(xrotg) (xdouble *, xdouble *, xdouble *, xdouble *);
 /* ROTMG takes five pointers and is declared for s and d only (no q variant here). */
 void  BLASFUNC(srotmg)(float  *, float  *, float  *, float  *, float  *);
 void  BLASFUNC(drotmg)(double *, double *, double *, double *, double *);
 /* ROTM takes (n, x, incx, y, incy) plus one trailing pointer; declared for s, d and q. */
 void  BLASFUNC(srotm) (blasint *, float  *, blasint *, float  *, blasint *, float  *);
 void  BLASFUNC(drotm) (blasint *, double *, blasint *, double *, blasint *, double *);
 void  BLASFUNC(qrotm) (blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *);
 /* Level 2 routines */
 /* GER prototypes: two blasint dimensions, a scalar pointer, then two (vector, stride)
    pairs and a (matrix, leading dimension) pair. Real prefixes declare ?ger; complex
    prefixes declare the ?geru / ?gerc pair with identical argument lists. */
 void BLASFUNC(sger)(blasint *,    blasint *, float *,  float *, blasint *,
 		   float *,  blasint *, float *,  blasint *);
 void BLASFUNC(dger)(blasint *,    blasint *, double *, double *, blasint *,
 		   double *, blasint *, double *, blasint *);
 void BLASFUNC(qger)(blasint *,    blasint *, xdouble *, xdouble *, blasint *,
 		   xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(cgeru)(blasint *,    blasint *, float *,  float *, blasint *,
 		    float *,  blasint *, float *,  blasint *);
 void BLASFUNC(cgerc)(blasint *,    blasint *, float *,  float *, blasint *,
 		    float *,  blasint *, float *,  blasint *);
 void BLASFUNC(zgeru)(blasint *,    blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, blasint *);
 void BLASFUNC(zgerc)(blasint *,    blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, blasint *);
 void BLASFUNC(xgeru)(blasint *,    blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(xgerc)(blasint *,    blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, blasint *);
 /* GEMV prototypes: a char* option, two blasint dimensions, a scalar pointer, a
    (matrix, leading dimension) pair, an (input vector, stride) pair, a second scalar
    pointer and an (output vector, stride) pair. sbgemv takes bfloat16 matrix and input
    vector with float scalars and output, matching cblas_sbgemv. */
 void BLASFUNC(sbgemv)(char *, blasint *, blasint *, float  *, bfloat16 *, blasint *,
            bfloat16  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(sgemv)(char *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(cgemv)(char *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zgemv)(char *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xgemv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 /* TRSV prototypes: three char* options, a blasint dimension, a (matrix, leading
    dimension) pair and a (vector, stride) pair, for all six prefixes. */
 void BLASFUNC(strsv) (char *, char *, char *, blasint *, float  *, blasint *,
 		     float  *, blasint *);
 void BLASFUNC(dtrsv) (char *, char *, char *, blasint *, double *, blasint *,
 		     double *, blasint *);
 void BLASFUNC(qtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *,
 		     xdouble *, blasint *);
 void BLASFUNC(ctrsv) (char *, char *, char *, blasint *, float  *, blasint *,
 		     float  *, blasint *);
 void BLASFUNC(ztrsv) (char *, char *, char *, blasint *, double *, blasint *,
 		     double *, blasint *);
 void BLASFUNC(xtrsv) (char *, char *, char *, blasint *, xdouble *, blasint *,
 		     xdouble *, blasint *);
 /* TRMV prototypes: identical argument lists to TRSV. */
 void BLASFUNC(strmv) (char *, char *, char *, blasint *, float  *, blasint *,
 		     float  *, blasint *);
 void BLASFUNC(dtrmv) (char *, char *, char *, blasint *, double *, blasint *,
 		     double *, blasint *);
 void BLASFUNC(qtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *,
 		     xdouble *, blasint *);
 void BLASFUNC(ctrmv) (char *, char *, char *, blasint *, float  *, blasint *,
 		     float  *, blasint *);
 void BLASFUNC(ztrmv) (char *, char *, char *, blasint *, double *, blasint *,
 		     double *, blasint *);
 void BLASFUNC(xtrmv) (char *, char *, char *, blasint *, xdouble *, blasint *,
 		     xdouble *, blasint *);
 /* TPSV / TPMV prototypes: like TRSV / TRMV but the matrix argument has no leading
    dimension, i.e. (three char* options, n, matrix, vector, stride). */
 void BLASFUNC(stpsv) (char *, char *, char *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dtpsv) (char *, char *, char *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qtpsv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(ctpsv) (char *, char *, char *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(ztpsv) (char *, char *, char *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xtpsv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(stpmv) (char *, char *, char *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dtpmv) (char *, char *, char *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qtpmv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(ctpmv) (char *, char *, char *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(ztpmv) (char *, char *, char *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xtpmv) (char *, char *, char *, blasint *, xdouble *, xdouble *, blasint *);
 /* TBMV / TBSV prototypes: like TRMV / TRSV with one extra blasint* after the dimension,
    i.e. (three char* options, two blasint*, matrix, leading dimension, vector, stride). */
 void BLASFUNC(stbmv) (char *, char *, char *, blasint *, blasint *, float  *, blasint *, float  *, blasint *);
 void BLASFUNC(dtbmv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *);
 void BLASFUNC(qtbmv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(ctbmv) (char *, char *, char *, blasint *, blasint *, float  *, blasint *, float  *, blasint *);
 void BLASFUNC(ztbmv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *);
 void BLASFUNC(xtbmv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(stbsv) (char *, char *, char *, blasint *, blasint *, float  *, blasint *, float  *, blasint *);
 void BLASFUNC(dtbsv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *);
 void BLASFUNC(qtbsv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(ctbsv) (char *, char *, char *, blasint *, blasint *, float  *, blasint *, float  *, blasint *);
 void BLASFUNC(ztbsv) (char *, char *, char *, blasint *, blasint *, double *, blasint *, double *, blasint *);
 void BLASFUNC(xtbsv) (char *, char *, char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
 /* SYMV prototypes: one char* option, a blasint dimension, a scalar pointer, a
    (matrix, leading dimension) pair, an (input vector, stride) pair, a second scalar
    pointer and an (output vector, stride) pair, for all six prefixes. */
 void BLASFUNC(ssymv) (char *, blasint *, float  *, float *, blasint *,
 		     float  *, blasint *, float *, float *, blasint *);
 void BLASFUNC(dsymv) (char *, blasint *, double  *, double *, blasint *,
 		     double  *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qsymv) (char *, blasint *, xdouble  *, xdouble *, blasint *,
 		     xdouble  *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(csymv) (char *, blasint *, float  *, float *, blasint *,
 		     float  *, blasint *, float *, float *, blasint *);
 void BLASFUNC(zsymv) (char *, blasint *, double  *, double *, blasint *,
 		     double  *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xsymv) (char *, blasint *, xdouble  *, xdouble *, blasint *,
 		     xdouble  *, blasint *, xdouble *, xdouble *, blasint *);
 /* SPMV prototypes: same as SYMV except the matrix argument has no leading dimension. */
 void BLASFUNC(sspmv) (char *, blasint *, float  *, float *,
 		     float  *, blasint *, float *, float *, blasint *);
 void BLASFUNC(dspmv) (char *, blasint *, double  *, double *,
 		     double  *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qspmv) (char *, blasint *, xdouble  *, xdouble *,
 		     xdouble  *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(cspmv) (char *, blasint *, float  *, float *,
 		     float  *, blasint *, float *, float *, blasint *);
 void BLASFUNC(zspmv) (char *, blasint *, double  *, double *,
 		     double  *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xspmv) (char *, blasint *, xdouble  *, xdouble *,
 		     xdouble  *, blasint *, xdouble *, xdouble *, blasint *);
 /* ?syr, ?syr2, ?spr and ?spr2 prototypes, declared for all six prefixes
  * (s, d, q, c, z, x); all return void and take every argument by pointer.
  *   ?syr  : 7 parameters        ?spr  : ?syr  without the trailing blasint*
  *   ?syr2 : 9 parameters        ?spr2 : ?syr2 without the trailing blasint*
  * NOTE(review): presumably the symmetric rank-1 / rank-2 updates (full and
  * packed storage) of reference BLAS; argument roles are not stated in this
  * header. */
 void BLASFUNC(ssyr) (char *, blasint *, float   *, float  *, blasint *,
 		    float  *, blasint *);
 void BLASFUNC(dsyr) (char *, blasint *, double  *, double *, blasint *,
 		    double *, blasint *);
 void BLASFUNC(qsyr) (char *, blasint *, xdouble  *, xdouble *, blasint *,
 		    xdouble *, blasint *);
 void BLASFUNC(csyr) (char *, blasint *, float   *, float  *, blasint *,
 		    float  *, blasint *);
 void BLASFUNC(zsyr) (char *, blasint *, double  *, double *, blasint *,
 		    double *, blasint *);
 void BLASFUNC(xsyr) (char *, blasint *, xdouble  *, xdouble *, blasint *,
 		    xdouble *, blasint *);
 void BLASFUNC(ssyr2) (char *, blasint *, float   *,
 		     float  *, blasint *, float  *, blasint *, float  *, blasint *);
 void BLASFUNC(dsyr2) (char *, blasint *, double  *,
 		     double *, blasint *, double *, blasint *, double *, blasint *);
 void BLASFUNC(qsyr2) (char *, blasint *, xdouble  *,
 		     xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(csyr2) (char *, blasint *, float   *,
 		     float  *, blasint *, float  *, blasint *, float  *, blasint *);
 void BLASFUNC(zsyr2) (char *, blasint *, double  *,
 		     double *, blasint *, double *, blasint *, double *, blasint *);
 void BLASFUNC(xsyr2) (char *, blasint *, xdouble  *,
 		     xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(sspr) (char *, blasint *, float   *, float  *, blasint *,
 		    float  *);
 void BLASFUNC(dspr) (char *, blasint *, double  *, double *, blasint *,
 		    double *);
 void BLASFUNC(qspr) (char *, blasint *, xdouble  *, xdouble *, blasint *,
 		    xdouble *);
 void BLASFUNC(cspr) (char *, blasint *, float   *, float  *, blasint *,
 		    float  *);
 void BLASFUNC(zspr) (char *, blasint *, double  *, double *, blasint *,
 		    double *);
 void BLASFUNC(xspr) (char *, blasint *, xdouble  *, xdouble *, blasint *,
 		    xdouble *);
 void BLASFUNC(sspr2) (char *, blasint *, float   *,
 		     float  *, blasint *, float  *, blasint *, float  *);
 void BLASFUNC(dspr2) (char *, blasint *, double  *,
 		     double *, blasint *, double *, blasint *, double *);
 void BLASFUNC(qspr2) (char *, blasint *, xdouble  *,
 		     xdouble *, blasint *, xdouble *, blasint *, xdouble *);
 void BLASFUNC(cspr2) (char *, blasint *, float   *,
 		     float  *, blasint *, float  *, blasint *, float  *);
 void BLASFUNC(zspr2) (char *, blasint *, double  *,
 		     double *, blasint *, double *, blasint *, double *);
 void BLASFUNC(xspr2) (char *, blasint *, xdouble  *,
 		     xdouble *, blasint *, xdouble *, blasint *, xdouble *);
 /* ?her, ?hpr, ?her2, ?hpr2, ?hemv and ?hpmv prototypes. Only the c/z/x
  * prefixes are declared (float*, double*, xdouble* respectively); all return
  * void. Their parameter lists have the same shape as the c/z/x prototypes of
  * ?syr, ?spr, ?syr2, ?spr2, ?symv and ?spmv in this header.
  * NOTE(review): presumably the Hermitian counterparts of those routines;
  * whether each scalar pointer addresses a real or a complex value is not
  * visible here -- confirm against the implementation. */
 void BLASFUNC(cher) (char *, blasint *, float   *, float  *, blasint *,
 		    float  *, blasint *);
 void BLASFUNC(zher) (char *, blasint *, double  *, double *, blasint *,
 		    double *, blasint *);
 void BLASFUNC(xher) (char *, blasint *, xdouble  *, xdouble *, blasint *,
 		    xdouble *, blasint *);
 void BLASFUNC(chpr) (char *, blasint *, float   *, float  *, blasint *, float  *);
 void BLASFUNC(zhpr) (char *, blasint *, double  *, double *, blasint *, double *);
 void BLASFUNC(xhpr) (char *, blasint *, xdouble  *, xdouble *, blasint *, xdouble *);
 void BLASFUNC(cher2) (char *, blasint *, float   *,
 		     float  *, blasint *, float  *, blasint *, float  *, blasint *);
 void BLASFUNC(zher2) (char *, blasint *, double  *,
 		     double *, blasint *, double *, blasint *, double *, blasint *);
 void BLASFUNC(xher2) (char *, blasint *, xdouble  *,
 		     xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(chpr2) (char *, blasint *, float   *,
 		     float  *, blasint *, float  *, blasint *, float  *);
 void BLASFUNC(zhpr2) (char *, blasint *, double  *,
 		     double *, blasint *, double *, blasint *, double *);
 void BLASFUNC(xhpr2) (char *, blasint *, xdouble  *,
 		     xdouble *, blasint *, xdouble *, blasint *, xdouble *);
 void BLASFUNC(chemv) (char *, blasint *, float  *, float *, blasint *,
 		     float  *, blasint *, float *, float *, blasint *);
 void BLASFUNC(zhemv) (char *, blasint *, double  *, double *, blasint *,
 		     double  *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xhemv) (char *, blasint *, xdouble  *, xdouble *, blasint *,
 		     xdouble  *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(chpmv) (char *, blasint *, float  *, float *,
 		     float  *, blasint *, float *, float *, blasint *);
 void BLASFUNC(zhpmv) (char *, blasint *, double  *, double *,
 		     double  *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xhpmv) (char *, blasint *, xdouble  *, xdouble *,
 		     xdouble  *, blasint *, xdouble *, xdouble *, blasint *);
 /* ?norm prototypes (s/d/c/z only; no q/x variants are declared). Unlike the
  * void prototypes around them, these return int. Parameters: one char*, two
  * blasint*, one data pointer (float* or double*) and a trailing blasint*.
  * NOTE(review): "?norm" is not a reference BLAS routine name; what the int
  * return value and the char* selector mean is not visible in this header --
  * confirm against the implementation. */
 int BLASFUNC(snorm)(char *, blasint *, blasint *, float  *, blasint *);
 int BLASFUNC(dnorm)(char *, blasint *, blasint *, double *, blasint *);
 int BLASFUNC(cnorm)(char *, blasint *, blasint *, float  *, blasint *);
 int BLASFUNC(znorm)(char *, blasint *, blasint *, double *, blasint *);
 /* ?gbmv, ?sbmv and ?hbmv prototypes; all return void and take every argument
  * by pointer.
  *   ?gbmv : 13 parameters (one char*, four blasint*, then data/blasint
  *           pointers), declared for s, d, q, c, z, x.
  *   ?sbmv : 11 parameters (one char*, two blasint*, then data/blasint
  *           pointers), declared for s, d, q, c, z, x.
  *   ?hbmv : same parameter shape as ?sbmv, declared for c, z, x only.
  * NOTE(review): presumably the general / symmetric / Hermitian band
  * matrix-vector products of reference BLAS; argument roles are not stated
  * in this header. */
 void BLASFUNC(sgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(cgbmv)(char *, blasint *, blasint *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zgbmv)(char *, blasint *, blasint *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xgbmv)(char *, blasint *, blasint *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(ssbmv)(char *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dsbmv)(char *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(csbmv)(char *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zsbmv)(char *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xsbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(chbmv)(char *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zhbmv)(char *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 		    xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 /* Level 3 routines */
 /* ?gemm family; all return void and take every argument by pointer.
  *   ?gemm   : 13 parameters (two char*, three blasint*, then data/blasint
  *             pointers), declared for s, d, q, c, z, x.
  *   shgemm  : same shape as sgemm, but the two pointers that are each
  *             followed by a blasint* are hfloat16*; the remaining data
  *             pointers stay float*.
  *   sbgemm  : as shgemm, with bfloat16* in place of hfloat16*.
  *   ?gemm3m : same shape as ?gemm, declared for c, z, x only.
  *   ?gemmt  : three char* and two blasint* up front (instead of two and
  *             three), declared for s, d, c, z only.
  * NOTE(review): hfloat16 / bfloat16 are declared outside this excerpt;
  * presumably 16-bit half-precision and brain-float storage types -- verify
  * against their definitions. */
 void BLASFUNC(shgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
 	   hfloat16  *, blasint *, hfloat16 *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(sbgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
 	   bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *);
 void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
 	   float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dgemm)(char *, char *, blasint *, blasint *, blasint *, double *,
 	   double *, blasint *, double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qgemm)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
 	   xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(cgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
 	   float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zgemm)(char *, char *, blasint *, blasint *, blasint *, double *,
 	   double *, blasint *, double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xgemm)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
 	   xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(cgemm3m)(char *, char *, blasint *, blasint *, blasint *, float *,
 	   float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double *,
 	   double *, blasint *, double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
 	   xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *,
 	   float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *,
 	   double *, blasint *, double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *,
 	   float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *,
 	   double *, blasint *, double *, blasint *, double *, double *, blasint *);
 /* ?ge2mm prototypes (s/d/c/z only). These return int, unlike the void ?gemm
  * prototypes, and take three char* and two blasint* arguments before the
  * data/blasint pointers.
  * NOTE(review): "?ge2mm" is not a reference BLAS routine name; its semantics
  * and the meaning of the int return value are not visible in this header --
  * confirm against the implementation. */
 int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *,
 		     float *, float  *, blasint *, float  *, blasint *,
 		     float *, float  *, blasint *);
 int BLASFUNC(dge2mm)(char *, char *, char *, blasint *, blasint *,
 		     double *, double  *, blasint *, double  *, blasint *,
 		     double *, double  *, blasint *);
 int BLASFUNC(cge2mm)(char *, char *, char *, blasint *, blasint *,
 		     float *, float  *, blasint *, float  *, blasint *,
 		     float *, float  *, blasint *);
 int BLASFUNC(zge2mm)(char *, char *, char *, blasint *, blasint *,
 		     double *, double  *, blasint *, double  *, blasint *,
 		     double *, double  *, blasint *);
 /* ?trsm / ?trmm prototypes, declared for all six prefixes (s, d, q, c, z, x).
  * The two families have identical parameter lists: four char*, two blasint*,
  * one data pointer, then two (data pointer, blasint*) pairs. All return void.
  * NOTE(review): presumably the triangular solve / triangular multiply
  * routines of reference BLAS; argument roles are not stated in this header. */
 void BLASFUNC(strsm)(char *, char *, char *, char *, blasint *, blasint *,
 	   float *,  float *, blasint *, float *, blasint *);
 void BLASFUNC(dtrsm)(char *, char *, char *, char *, blasint *, blasint *,
 	   double *,  double *, blasint *, double *, blasint *);
 void BLASFUNC(qtrsm)(char *, char *, char *, char *, blasint *, blasint *,
 	   xdouble *,  xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(ctrsm)(char *, char *, char *, char *, blasint *, blasint *,
 	   float *,  float *, blasint *, float *, blasint *);
 void BLASFUNC(ztrsm)(char *, char *, char *, char *, blasint *, blasint *,
 	   double *,  double *, blasint *, double *, blasint *);
 void BLASFUNC(xtrsm)(char *, char *, char *, char *, blasint *, blasint *,
 	   xdouble *,  xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(strmm)(char *, char *, char *, char *, blasint *, blasint *,
 	   float *,  float *, blasint *, float *, blasint *);
 void BLASFUNC(dtrmm)(char *, char *, char *, char *, blasint *, blasint *,
 	   double *,  double *, blasint *, double *, blasint *);
 void BLASFUNC(qtrmm)(char *, char *, char *, char *, blasint *, blasint *,
 	   xdouble *,  xdouble *, blasint *, xdouble *, blasint *);
 void BLASFUNC(ctrmm)(char *, char *, char *, char *, blasint *, blasint *,
 	   float *,  float *, blasint *, float *, blasint *);
 void BLASFUNC(ztrmm)(char *, char *, char *, char *, blasint *, blasint *,
 	   double *,  double *, blasint *, double *, blasint *);
 void BLASFUNC(xtrmm)(char *, char *, char *, char *, blasint *, blasint *,
 	   xdouble *,  xdouble *, blasint *, xdouble *, blasint *);
 /* ?symm / ?symm3m prototypes; all return void and take every argument by
  * pointer. Parameter shape: two char*, two blasint*, then eight data/blasint
  * pointers (12 parameters in total).
  *   ?symm   : declared for s, d, q, c, z, x.
  *   ?symm3m : same shape, declared for c, z, x only.
  * NOTE(review): presumably the symmetric matrix-matrix product and its "3m"
  * complex variant; argument roles are not stated in this header. */
 void BLASFUNC(ssymm)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dsymm)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(qsymm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(csymm)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zsymm)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xsymm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(csymm3m)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zsymm3m)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xsymm3m)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 /* ?syrk / ?syr2k prototypes, declared for all six prefixes (s, d, q, c, z, x);
  * all return void and take every argument by pointer.
  *   ?syrk  : 10 parameters (two char*, two blasint*, then data/blasint
  *            pointers).
  *   ?syr2k : 12 parameters -- the ?syrk list with one extra
  *            (data pointer, blasint*) pair.
  * NOTE(review): presumably the symmetric rank-k / rank-2k updates of
  * reference BLAS; argument roles are not stated in this header. */
 void BLASFUNC(ssyrk)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float  *, float  *, blasint *);
 void BLASFUNC(dsyrk)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double *, double *, blasint *);
 void BLASFUNC(qsyrk)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble *, xdouble *, blasint *);
 void BLASFUNC(csyrk)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float  *, float  *, blasint *);
 void BLASFUNC(zsyrk)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double *, double *, blasint *);
 void BLASFUNC(xsyrk)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble *, xdouble *, blasint *);
 void BLASFUNC(ssyr2k)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(dsyr2k)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double*, blasint *, double *, double *, blasint *);
 void BLASFUNC(qsyr2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble*, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(csyr2k)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zsyr2k)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double*, blasint *, double *, double *, blasint *);
 void BLASFUNC(xsyr2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble*, blasint *, xdouble *, xdouble *, blasint *);
 /* ?hemm, ?hemm3m, ?herk, ?her2k and ?her2m prototypes; only the c/z/x
  * prefixes are declared (float*, double*, xdouble* respectively).
  *   ?hemm / ?hemm3m : same parameter shape as ?symm (12 parameters), void.
  *   ?herk           : same parameter shape as ?syrk (10 parameters), void.
  *   ?her2k          : same parameter shape as ?syr2k (12 parameters), void.
  *   ?her2m          : returns int and takes three char* (one more than
  *                     ?her2k) ahead of the same remaining parameters.
  * NOTE(review): presumably Hermitian counterparts of the symmetric Level 3
  * routines; "?her2m" is not a reference BLAS name and the meaning of its int
  * return value is not visible here -- confirm against the implementation. */
 void BLASFUNC(chemm)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zhemm)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xhemm)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(chemm3m)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float  *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zhemm3m)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double *, blasint *, double *, double *, blasint *);
 void BLASFUNC(xhemm3m)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 void BLASFUNC(cherk)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float  *, float  *, blasint *);
 void BLASFUNC(zherk)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double *, double *, blasint *);
 void BLASFUNC(xherk)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble *, xdouble *, blasint *);
 void BLASFUNC(cher2k)(char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float *, blasint *, float  *, float  *, blasint *);
 void BLASFUNC(zher2k)(char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double*, blasint *, double *, double *, blasint *);
 void BLASFUNC(xher2k)(char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble*, blasint *, xdouble *, xdouble *, blasint *);
 int BLASFUNC(cher2m)(char *, char *, char *, blasint *, blasint *, float  *, float  *, blasint *,
 	   float *, blasint *, float  *, float  *, blasint *);
 int BLASFUNC(zher2m)(char *, char *, char *, blasint *, blasint *, double *, double *, blasint *,
 	   double*, blasint *, double *, double *, blasint *);
 int BLASFUNC(xher2m)(char *, char *, char *, blasint *, blasint *, xdouble *, xdouble *, blasint *,
 	   xdouble*, blasint *, xdouble *, xdouble *, blasint *);
 /* ?gemt, ?gema, ?gems and ?gemc prototypes; all of them return int and take
  * every argument by pointer.
  *   ?gemt : 8 parameters (one char*, two blasint*, ...), s/d/c/z only.
  *   ?gema : 12 parameters (two char*, two blasint*, ...), s/d/c/z only.
  *   ?gems : identical parameter list to ?gema, s/d/c/z only.
  *   ?gemc : 15 parameters (two char*, three blasint*, ...), declared for
  *           s, d, q, c, z, x.
  * NOTE(review): none of these are reference BLAS routine names; their
  * semantics and the meaning of the int return value are not visible in this
  * header -- confirm against the implementation. */
 int BLASFUNC(sgemt)(char *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *);
 int BLASFUNC(dgemt)(char *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *);
 int BLASFUNC(cgemt)(char *, blasint *, blasint *, float  *, float  *, blasint *,
 		    float  *, blasint *);
 int BLASFUNC(zgemt)(char *, blasint *, blasint *, double *, double *, blasint *,
 		    double *, blasint *);
 int BLASFUNC(sgema)(char *, char *, blasint *, blasint *, float  *,
 		    float  *, blasint *, float *, float  *, blasint *, float *, blasint *);
 int BLASFUNC(dgema)(char *, char *, blasint *, blasint *, double *,
 		    double *, blasint *, double*, double *, blasint *, double*, blasint *);
 int BLASFUNC(cgema)(char *, char *, blasint *, blasint *, float  *,
 		    float  *, blasint *, float *, float  *, blasint *, float *, blasint *);
 int BLASFUNC(zgema)(char *, char *, blasint *, blasint *, double *,
 		    double *, blasint *, double*, double *, blasint *, double*, blasint *);
 int BLASFUNC(sgems)(char *, char *, blasint *, blasint *, float  *,
 		    float  *, blasint *, float *, float  *, blasint *, float *, blasint *);
 int BLASFUNC(dgems)(char *, char *, blasint *, blasint *, double *,
 		    double *, blasint *, double*, double *, blasint *, double*, blasint *);
 int BLASFUNC(cgems)(char *, char *, blasint *, blasint *, float  *,
 		    float  *, blasint *, float *, float  *, blasint *, float *, blasint *);
 int BLASFUNC(zgems)(char *, char *, blasint *, blasint *, double *,
 		    double *, blasint *, double*, double *, blasint *, double*, blasint *);
 int BLASFUNC(sgemc)(char *, char *, blasint *, blasint *, blasint *, float *,
 	   float  *, blasint *, float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
 int BLASFUNC(dgemc)(char *, char *, blasint *, blasint *, blasint *, double *,
 	   double *, blasint *, double *, blasint *, double *, blasint *, double *, double *, blasint *);
 int BLASFUNC(qgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
 	   xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *,  xdouble *, xdouble *, blasint *);
 int BLASFUNC(cgemc)(char *, char *, blasint *, blasint *, blasint *, float *,
 	   float  *, blasint *, float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
 int BLASFUNC(zgemc)(char *, char *, blasint *, blasint *, blasint *, double *,
 	   double *, blasint *, double *, blasint *, double *, blasint *, double *, double *, blasint *);
 int BLASFUNC(xgemc)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
 	   xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);
 /* LAPACK routines */
 /* ?getf2, ?getrf, ?laswp, ?getrs and ?gesv prototypes, declared for all six
  * prefixes (s, d, q, c, z, x). All return int and take every argument by
  * pointer.
  *   ?getf2 / ?getrf : identical lists -- two blasint*, one data pointer,
  *                     three blasint*.
  *   ?laswp          : one blasint*, one data pointer, five blasint*.
  *   ?getrs          : one char*, two blasint*, then two
  *                     (data pointer, blasint*, blasint*) groups.
  *   ?gesv           : two blasint*, data pointer, two blasint*, data pointer,
  *                     two blasint*.
  * NOTE(review): presumably the LU factorisation / row-swap / solve routines
  * of reference LAPACK, with the final blasint* acting as the INFO output;
  * neither is stated in this header -- confirm against the implementation. */
 int BLASFUNC(sgetf2)(blasint *, blasint *, float  *, blasint *, blasint *, blasint *);
 int BLASFUNC(dgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *);
 int BLASFUNC(qgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *);
 int BLASFUNC(cgetf2)(blasint *, blasint *, float  *, blasint *, blasint *, blasint *);
 int BLASFUNC(zgetf2)(blasint *, blasint *, double *, blasint *, blasint *, blasint *);
 int BLASFUNC(xgetf2)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *);
 int BLASFUNC(sgetrf)(blasint *, blasint *, float  *, blasint *, blasint *, blasint *);
 int BLASFUNC(dgetrf)(blasint *, blasint *, double *, blasint *, blasint *, blasint *);
 int BLASFUNC(qgetrf)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *);
 int BLASFUNC(cgetrf)(blasint *, blasint *, float  *, blasint *, blasint *, blasint *);
 int BLASFUNC(zgetrf)(blasint *, blasint *, double *, blasint *, blasint *, blasint *);
 int BLASFUNC(xgetrf)(blasint *, blasint *, xdouble *, blasint *, blasint *, blasint *);
 int BLASFUNC(slaswp)(blasint *, float  *, blasint *, blasint *, blasint *, blasint *, blasint *);
 int BLASFUNC(dlaswp)(blasint *, double *, blasint *, blasint *, blasint *, blasint *, blasint *);
 int BLASFUNC(qlaswp)(blasint *, xdouble *, blasint *, blasint *, blasint *, blasint *, blasint *);
 int BLASFUNC(claswp)(blasint *, float  *, blasint *, blasint *, blasint *, blasint *, blasint *);
 int BLASFUNC(zlaswp)(blasint *, double *, blasint *, blasint *, blasint *, blasint *, blasint *);
 int BLASFUNC(xlaswp)(blasint *, xdouble *, blasint *, blasint *, blasint *, blasint *, blasint *);
 int BLASFUNC(sgetrs)(char *, blasint *, blasint *, float  *, blasint *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(dgetrs)(char *, blasint *, blasint *, double *, blasint *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qgetrs)(char *, blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(cgetrs)(char *, blasint *, blasint *, float  *, blasint *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(zgetrs)(char *, blasint *, blasint *, double *, blasint *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xgetrs)(char *, blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(sgesv)(blasint *, blasint *, float  *, blasint *, blasint *, float *, blasint *, blasint *);
 int BLASFUNC(dgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *);
 int BLASFUNC(qgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *);
 int BLASFUNC(cgesv)(blasint *, blasint *, float  *, blasint *, blasint *, float *, blasint *, blasint *);
 int BLASFUNC(zgesv)(blasint *, blasint *, double *, blasint *, blasint *, double*, blasint *, blasint *);
 int BLASFUNC(xgesv)(blasint *, blasint *, xdouble *, blasint *, blasint *, xdouble*, blasint *, blasint *);
 /* ?potf2, ?potrf, ?potri and ?potrs prototypes, declared for all six prefixes
  * (s, d, q, c, z, x). All return int and take every argument by pointer.
  *   ?potf2 / ?potrf / ?potri : identical lists -- one char*, one blasint*,
  *                              one data pointer, two blasint*.
  *   ?potrs                   : one char*, two blasint*, two
  *                              (data pointer, blasint*) pairs, one blasint*.
  * NOTE(review): presumably the Cholesky factorisation / inverse / solve
  * routines of reference LAPACK, with the final blasint* acting as the INFO
  * output; neither is stated in this header -- confirm against the
  * implementation. */
 int BLASFUNC(spotf2)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(dpotf2)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qpotf2)(char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(cpotf2)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(zpotf2)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xpotf2)(char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(spotrf)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(dpotrf)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qpotrf)(char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(cpotrf)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(spotri)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(dpotri)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qpotri)(char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(cpotri)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(zpotri)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xpotri)(char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(spotrs)(char *, blasint *, blasint *, float   *, blasint *, float   *, blasint *, blasint *);
 int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double  *, blasint *, double  *, blasint *, blasint *);
 int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(cpotrs)(char *, blasint *, blasint *, float   *, blasint *, float   *, blasint *, blasint *);
 int BLASFUNC(zpotrs)(char *, blasint *, blasint *, double  *, blasint *, double  *, blasint *, blasint *);
 int BLASFUNC(xpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *);
 /* ?lauu2, ?lauum, ?trti2 and ?trtri prototypes, declared for all six prefixes
  * (s, d, q, c, z, x). All return int and take every argument by pointer.
  *   ?lauu2 / ?lauum : one char*, one blasint*, one data pointer, two blasint*.
  *   ?trti2 / ?trtri : as above with a second leading char*.
  * NOTE(review): presumably the triangular product (U*U**H / L**H*L) and
  * triangular inverse routines of reference LAPACK, with the final blasint*
  * acting as the INFO output; neither is stated in this header -- confirm
  * against the implementation. */
 int BLASFUNC(slauu2)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(dlauu2)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qlauu2)(char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(clauu2)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(zlauu2)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xlauu2)(char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(slauum)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(dlauum)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qlauum)(char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(clauum)(char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(zlauum)(char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xlauum)(char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(strti2)(char *, char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(dtrti2)(char *, char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qtrti2)(char *, char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(ctrti2)(char *, char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(ztrti2)(char *, char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xtrti2)(char *, char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(strtri)(char *, char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(dtrtri)(char *, char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(qtrtri)(char *, char *, blasint *, xdouble *, blasint *, blasint *);
 int BLASFUNC(ctrtri)(char *, char *, blasint *, float  *, blasint *, blasint *);
 int BLASFUNC(ztrtri)(char *, char *, blasint *, double *, blasint *, blasint *);
 int BLASFUNC(xtrtri)(char *, char *, blasint *, xdouble *, blasint *, blasint *);
 /* ?lamch / ?lamc3 prototypes (s, d, q only). These return a floating-point
  * value directly rather than int or void: the s variants return FLOATRET,
  * the d variants double, and the q variants xdouble.
  *   ?lamch : takes a single char* selector.
  *   ?lamc3 : takes two pointers of the matching precision.
  * NOTE(review): FLOATRET is defined outside this excerpt -- presumably float
  * or double depending on the Fortran calling convention in use; verify.
  * Presumably these are the LAPACK machine-parameter helpers. */
 FLOATRET  BLASFUNC(slamch)(char *);
 double    BLASFUNC(dlamch)(char *);
 xdouble   BLASFUNC(qlamch)(char *);
 FLOATRET  BLASFUNC(slamc3)(float *, float *);
 double    BLASFUNC(dlamc3)(double *, double *);
 xdouble   BLASFUNC(qlamc3)(xdouble *, xdouble *);
 /* BLAS extensions */
 /* ?axpby, ?omatcopy, ?imatcopy and ?geadd prototypes (s/d/c/z only; no q/x
  * variants are declared). All return void and take every argument by pointer.
  *   ?axpby    : 7 parameters. In caxpby / zaxpby the 2nd and 5th parameters
  *               are void* where saxpby / daxpby use float* / double*.
  *   ?omatcopy : two char*, two blasint*, a scalar-position pointer, then two
  *               (data pointer, blasint*) pairs.
  *   ?imatcopy : two char*, two blasint*, two data pointers, two blasint*.
  *   ?geadd    : two blasint*, then two (pointer, data pointer, blasint*)
  *               groups.
  * NOTE(review): presumably y := alpha*x + beta*y, out-of-place / in-place
  * scaled matrix copy-transpose, and C := alpha*A + beta*C respectively;
  * these semantics are not stated in this header -- confirm against the
  * implementation. */
 void    BLASFUNC(saxpby) (blasint *, float  *, float  *, blasint *, float *, float  *, blasint *);
 void    BLASFUNC(daxpby) (blasint *, double  *, double  *, blasint *, double *, double  *, blasint *);
 void    BLASFUNC(caxpby) (blasint *, void  *, float  *, blasint *, void *, float  *, blasint *);
 void    BLASFUNC(zaxpby) (blasint *, void  *, double *, blasint *, void *, double  *, blasint *);
 void    BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float  *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double  *, double  *, blasint *, double  *, blasint *);
 void    BLASFUNC(comatcopy) (char *, char *, blasint *, blasint *, float  *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(zomatcopy) (char *, char *, blasint *, blasint *, double  *, double  *, blasint *, double  *, blasint *);
 void    BLASFUNC(simatcopy) (char *, char *, blasint *, blasint *, float  *, float  *, blasint *, blasint *);
 void    BLASFUNC(dimatcopy) (char *, char *, blasint *, blasint *, double  *, double  *, blasint *, blasint *);
 void    BLASFUNC(cimatcopy) (char *, char *, blasint *, blasint *, float  *, float  *, blasint *, blasint *);
 void    BLASFUNC(zimatcopy) (char *, char *, blasint *, blasint *, double  *, double  *, blasint *, blasint *);
 void    BLASFUNC(sgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*); 
 void    BLASFUNC(dgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*); 
 void    BLASFUNC(cgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*); 
 void    BLASFUNC(zgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*); 
 #ifdef __cplusplus
 }
 #endif  /* __cplusplus */
 #endif
 #endif
--- a/install/include/lapack.h
+++ b/install/include/lapack.h
--- a/install/include/lapacke.h
+++ b/install/include/lapacke.h
--- a/install/include/lapacke_config.h
+++ b/install/include/lapacke_config.h
@@ -0,0 +1,159 @@
 /*****************************************************************************
  Copyright (c) 2010, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 ******************************************************************************
 * Contents: Native C interface to LAPACK
 * Author: Intel Corporation
 *****************************************************************************/
 #ifndef _LAPACKE_CONFIG_H_
 #define _LAPACKE_CONFIG_H_
 #ifdef __cplusplus
 #if defined(LAPACK_COMPLEX_CPP)
 #include <complex>
 #endif
 extern "C" {
 #endif /* __cplusplus */
 #include <stdlib.h>
 #include <stdint.h>
 #include <inttypes.h>
 /* Integer type used by the LAPACKE interface: 64-bit when LAPACK_ILP64 is
  * defined, 32-bit otherwise.  Code that defines lapack_int before including
  * this header keeps its own choice. */
 #if !defined(lapack_int)
 #  ifdef LAPACK_ILP64
 #    define lapack_int        int64_t
 #  else
 #    define lapack_int        int32_t
 #  endif
 #endif
 /* printf-style conversion specifier matching the default lapack_int width. */
 #if !defined(LAPACK_IFMT)
 #  ifdef LAPACK_ILP64
 #    define LAPACK_IFMT       PRId64
 #  else
 #    define LAPACK_IFMT       PRId32
 #  endif
 #endif
 /* Logical values are carried in a lapack_int unless overridden beforehand. */
 #if !defined(lapack_logical)
 #  define lapack_logical    lapack_int
 #endif
 /* Intel's clang-based compiler in MSVC mode (_MSC_VER and
  * __INTEL_CLANG_COMPILER both defined): select the struct-based complex
  * representation, make LAPACK_GLOBAL yield the lower-case name unchanged,
  * and define NOCHANGE. */
 #if defined(_MSC_VER) && defined(__INTEL_CLANG_COMPILER)
 #define LAPACK_COMPLEX_STRUCTURE
 #define LAPACK_GLOBAL(lcname,UCNAME)  lcname
 #define NOCHANGE
 #endif
 /* Complex type selection.  The whole section is skipped when the including
  * code has defined LAPACK_COMPLEX_CUSTOM (and therefore supplies its own
  * lapack_complex_float / lapack_complex_double and accessor macros). */
 #ifndef LAPACK_COMPLEX_CUSTOM
 #if defined(_MSC_VER) && !defined(__INTEL_CLANG_COMPILER)
 /* Microsoft compiler: std::complex in C++ mode on request, otherwise the
  * CRT's _Fcomplex / _Dcomplex types. */
 #if defined(LAPACK_COMPLEX_CPP)
    #include <complex>
    #define lapack_complex_float std::complex<float>
    #define lapack_complex_double std::complex<double>
    #define lapack_complex_float_real(z)       ((z).real())
    #define lapack_complex_float_imag(z)       ((z).imag())
    #define lapack_complex_double_real(z)       ((z).real())
    #define lapack_complex_double_imag(z)       ((z).imag())
    #define _CRT_USE_C_COMPLEX_H
 #else
    #include <complex.h>
    #define LAPACK_COMPLEX_CUSTOM
    #define lapack_complex_float _Fcomplex
    #define lapack_complex_double _Dcomplex
    /* The float accessors must use crealf/cimagf: in C, creal/cimag are
     * declared for _Dcomplex only, so handing them an _Fcomplex does not
     * compile. */
    #define lapack_complex_float_real(z)       (crealf(z))
    #define lapack_complex_float_imag(z)       (cimagf(z))
    #define lapack_complex_double_real(z)       (creal(z))
    #define lapack_complex_double_imag(z)       (cimag(z))
 #endif
 #else
 #if defined(LAPACK_COMPLEX_STRUCTURE)
 /* Plain struct representation with named real/imag members. */
 typedef struct { float real, imag; } _lapack_complex_float;
 typedef struct { double real, imag; } _lapack_complex_double;
 #define lapack_complex_float  _lapack_complex_float
 #define lapack_complex_double _lapack_complex_double
 #define lapack_complex_float_real(z)  ((z).real)
 #define lapack_complex_float_imag(z)  ((z).imag)
 #define lapack_complex_double_real(z)  ((z).real)
 #define lapack_complex_double_imag(z)  ((z).imag)
 #elif defined(LAPACK_COMPLEX_C99)
 /* C99 _Complex types.  crealf/cimagf keep the float accessors in float
  * instead of widening the value through double. */
 #include <complex.h>
 #define lapack_complex_float    float _Complex
 #define lapack_complex_double   double _Complex
 #define lapack_complex_float_real(z)       (crealf(z))
 #define lapack_complex_float_imag(z)       (cimagf(z))
 #define lapack_complex_double_real(z)       (creal(z))
 #define lapack_complex_double_imag(z)       (cimag(z))
 #elif defined(LAPACK_COMPLEX_CPP)
 /* C++ std::complex. */
 #define lapack_complex_float std::complex<float>
 #define lapack_complex_double std::complex<double>
 #define lapack_complex_float_real(z)       ((z).real())
 #define lapack_complex_float_imag(z)       ((z).imag())
 #define lapack_complex_double_real(z)       ((z).real())
 #define lapack_complex_double_imag(z)       ((z).imag())
 #else
 /* Default when nothing was requested: same definitions as the C99 case. */
 #include <complex.h>
 #define lapack_complex_float    float _Complex
 #define lapack_complex_double   double _Complex
 #define lapack_complex_float_real(z)       (crealf(z))
 #define lapack_complex_float_imag(z)       (cimagf(z))
 #define lapack_complex_double_real(z)       (creal(z))
 #define lapack_complex_double_imag(z)       (cimag(z))
 #endif
 #endif
 /* Constructors that build a complex value from its real and imaginary parts. */
 lapack_complex_float lapack_make_complex_float( float re, float im );
 lapack_complex_double lapack_make_complex_double( double re, double im );
 #endif
 /* Allocation hooks: fall back to the C library's malloc/free unless the
  * including code has already supplied replacements. */
 #if !defined(LAPACK_malloc)
 #  define LAPACK_malloc( size )   malloc( size )
 #endif
 #if !defined(LAPACK_free)
 #  define LAPACK_free( p )        free( p )
 #endif
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */
 #endif /* _LAPACKE_CONFIG_H_ */
--- a/install/include/lapacke_mangling.h
+++ b/install/include/lapacke_mangling.h
@@ -0,0 +1,17 @@
 /* Maps a routine's lower-case / upper-case spelling to the symbol name used
  * for linking, according to the selected name-mangling pattern. */
 #ifndef LAPACK_HEADER_INCLUDED
 #define LAPACK_HEADER_INCLUDED
 #if !defined(LAPACK_GLOBAL)
 /* Lower case with a trailing underscore is produced both when it is asked
  * for explicitly (LAPACK_GLOBAL_PATTERN_LC or ADD_) and when no pattern is
  * selected at all, so only the upper-case and unchanged patterns need their
  * own test.  An explicit lower-case request wins over both of them. */
 #if !defined(LAPACK_GLOBAL_PATTERN_LC) && !defined(ADD_) && \
     (defined(LAPACK_GLOBAL_PATTERN_UC) || defined(UPPER))
 #define LAPACK_GLOBAL(lcname,UCNAME)  UCNAME
 #elif !defined(LAPACK_GLOBAL_PATTERN_LC) && !defined(ADD_) && \
       (defined(LAPACK_GLOBAL_PATTERN_MC) || defined(NOCHANGE))
 #define LAPACK_GLOBAL(lcname,UCNAME)  lcname
 #else
 #define LAPACK_GLOBAL(lcname,UCNAME)  lcname##_
 #endif
 #endif
 #endif
--- a/install/include/lapacke_utils.h
+++ b/install/include/lapacke_utils.h
@@ -0,0 +1,612 @@
 /*****************************************************************************
  Copyright (c) 2014, Intel Corp.
  All rights reserved.
  Redistribution and use in source and binary forms, with or without
  modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of Intel Corporation nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.
  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
  THE POSSIBILITY OF SUCH DAMAGE.
 ******************************************************************************
 * Contents: Native C interface to LAPACK utility functions
 * Author: Intel Corporation
 *****************************************************************************/
 #ifndef _LAPACKE_UTILS_H_
 #define _LAPACKE_UTILS_H_
 #include "lapacke.h"
 #ifdef __cplusplus
 extern "C" {
 #endif /* __cplusplus */
 /* Generic arithmetic helper macros.  Each one is defined only when the
  * including code has not provided its own.  Arguments can be evaluated more
  * than once, so they must be free of side effects. */
 #if !defined(ABS)
 #  define ABS(x) (((x) < 0) ? -(x) : (x))
 #endif
 #if !defined(MAX)
 #  define MAX(x,y) (((x) > (y)) ? (x) : (y))
 #endif
 #if !defined(MIN)
 #  define MIN(x,y) (((x) < (y)) ? (x) : (y))
 #endif
 /* Three-argument forms, built by nesting the two-argument ones. */
 #if !defined(MAX3)
 #  define MAX3(x,y,z) MAX((x), MAX(y,z))
 #endif
 #if !defined(MIN3)
 #  define MIN3(x,y,z) MIN((x), MIN(y,z))
 #endif
 /* Non-zero tests written with ordered comparisons only, so a NaN argument
  * yields false.  The complex forms look at the real and imaginary parts by
  * viewing the argument's storage as two consecutive floats / doubles, which
  * means the argument has to be an lvalue; it is parenthesized under the
  * address-of operator so that any lvalue expression is taken as a whole.
  * Arguments are evaluated more than once. */
 #define IS_S_NONZERO(x) ( (x) < 0 || (x) > 0 )
 #define IS_D_NONZERO(x) ( (x) < 0 || (x) > 0 )
 #define IS_C_NONZERO(x) ( IS_S_NONZERO(*((float*)&(x))) ||  \
                           IS_S_NONZERO(*(((float*)&(x))+1)) )
 #define IS_Z_NONZERO(x) ( IS_D_NONZERO(*((double*)&(x))) || \
                           IS_D_NONZERO(*(((double*)&(x))+1)) )
 /* Error handler */
 /* Takes a routine name and an integer status value.
  * NOTE(review): the meaning and sign convention of `info` are not visible
  * in this header -- confirm against the implementation. */
 void LAPACKE_xerbla( const char *name, lapack_int info );
 /* Compare two chars (case-insensitive) */
 /* Under GCC-compatible compilers the function is marked
  * __attribute__((const)), i.e. its result is declared to depend on the two
  * arguments only. */
 lapack_logical LAPACKE_lsame( char ca,  char cb )
 #if defined __GNUC__
  __attribute__((const))
 #endif
 	;
 /* Functions to convert column-major to row-major 2d arrays and vice versa. */
 /* Single-precision complex variants (lapack_complex_float data).  Every
  * routine reads from `in` (const) and writes to `out`.  Most forms also
  * take ldin/ldout (presumably the leading dimensions of `in` and `out`);
  * the hp/pf/pp/sp/tf/tp forms take no such arguments. */
 void LAPACKE_cgb_trans( int matrix_layout, lapack_int m, lapack_int n,
                         lapack_int kl, lapack_int ku,
                         const lapack_complex_float *in, lapack_int ldin,
                         lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_cge_trans( int matrix_layout, lapack_int m, lapack_int n,
                         const lapack_complex_float* in, lapack_int ldin,
                         lapack_complex_float* out, lapack_int ldout );
 void LAPACKE_cgg_trans( int matrix_layout, lapack_int m, lapack_int n,
                         const lapack_complex_float* in, lapack_int ldin,
                         lapack_complex_float* out, lapack_int ldout );
 void LAPACKE_chb_trans( int matrix_layout, char uplo, lapack_int n,
                         lapack_int kd,
                         const lapack_complex_float *in, lapack_int ldin,
                         lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_che_trans( int matrix_layout, char uplo, lapack_int n,
                         const lapack_complex_float *in, lapack_int ldin,
                         lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_chp_trans( int matrix_layout, char uplo, lapack_int n,
                         const lapack_complex_float *in,
                         lapack_complex_float *out );
 void LAPACKE_chs_trans( int matrix_layout, lapack_int n,
                         const lapack_complex_float *in, lapack_int ldin,
                         lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_cpb_trans( int matrix_layout, char uplo, lapack_int n,
                         lapack_int kd,
                         const lapack_complex_float *in, lapack_int ldin,
                         lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_cpf_trans( int matrix_layout, char transr, char uplo,
                         lapack_int n, const lapack_complex_float *in,
                         lapack_complex_float *out );
 void LAPACKE_cpo_trans( int matrix_layout, char uplo, lapack_int n,
                         const lapack_complex_float *in, lapack_int ldin,
                         lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_cpp_trans( int matrix_layout, char uplo, lapack_int n,
                         const lapack_complex_float *in,
                         lapack_complex_float *out );
 void LAPACKE_csp_trans( int matrix_layout, char uplo, lapack_int n,
                         const lapack_complex_float *in,
                         lapack_complex_float *out );
 void LAPACKE_csy_trans( int matrix_layout, char uplo, lapack_int n,
                         const lapack_complex_float *in, lapack_int ldin,
                         lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_ctb_trans( int matrix_layout, char uplo, char diag,
                         lapack_int n, lapack_int kd,
                         const lapack_complex_float *in, lapack_int ldin,
                         lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_ctf_trans( int matrix_layout, char transr, char uplo, char diag,
                         lapack_int n, const lapack_complex_float *in,
                         lapack_complex_float *out );
 void LAPACKE_ctp_trans( int matrix_layout, char uplo, char diag,
                         lapack_int n, const lapack_complex_float *in,
                         lapack_complex_float *out );
 void LAPACKE_ctr_trans( int matrix_layout, char uplo, char diag, lapack_int n,
                         const lapack_complex_float *in, lapack_int ldin,
                         lapack_complex_float *out, lapack_int ldout );
 void LAPACKE_ctz_trans( int matrix_layout, char direct, char uplo,
                         char diag, lapack_int m, lapack_int n,
                         const lapack_complex_float *in, lapack_int ldin,
                         lapack_complex_float *out, lapack_int ldout );
 /* Double-precision real variants of the layout-conversion routines.  Every
  * routine reads from `in` (const double*) and writes to `out`.  Most forms
  * also take ldin/ldout (presumably the leading dimensions of `in` and
  * `out`); the pf/pp/sp/tf/tp forms take no such arguments. */
 void LAPACKE_dgb_trans( int matrix_layout, lapack_int m, lapack_int n,
                         lapack_int kl, lapack_int ku,
                         const double *in, lapack_int ldin,
                         double *out, lapack_int ldout );
 void LAPACKE_dge_trans( int matrix_layout, lapack_int m, lapack_int n,
                         const double* in, lapack_int ldin,
                         double* out, lapack_int ldout );
 void LAPACKE_dgg_trans( int matrix_layout, lapack_int m, lapack_int n,
                         const double* in, lapack_int ldin,
                         double* out, lapack_int ldout );
 void LAPACKE_dhs_trans( int matrix_layout, lapack_int n,
                         const double *in, lapack_int ldin,
                         double *out, lapack_int ldout );
 void LAPACKE_dpb_trans( int matrix_layout, char uplo, lapack_int n,
                         lapack_int kd,
                         const double *in, lapack_int ldin,
                         double *out, lapack_int ldout );
 void LAPACKE_dpf_trans( int matrix_layout, char transr, char uplo,
                         lapack_int n, const double *in,
                         double *out );
 void LAPACKE_dpo_trans( int matrix_layout, char uplo, lapack_int n,
                         const double *in, lapack_int ldin,
                         double *out, lapack_int ldout );
 void LAPACKE_dpp_trans( int matrix_layout, char uplo, lapack_int n,
                         const double *in,
                         double *out );
 void LAPACKE_dsb_trans( int matrix_layout, char uplo, lapack_int n,
                         lapack_int kd,
                         const double *in, lapack_int ldin,
                         double *out, lapack_int ldout );
 void LAPACKE_dsp_trans( int matrix_layout, char uplo, lapack_int n,
                         const double *in,
                         double *out );
 void LAPACKE_dsy_trans( int matrix_layout, char uplo, lapack_int n,
                         const double *in, lapack_int ldin,
                         double *out, lapack_int ldout );
 void LAPACKE_dtb_trans( int matrix_layout, char uplo, char diag,
                         lapack_int n, lapack_int kd,
                         const double *in, lapack_int ldin,
                         double *out, lapack_int ldout );
 void LAPACKE_dtf_trans( int matrix_layout, char transr, char uplo, char diag,
                         lapack_int n, const double *in,
                         double *out );
 void LAPACKE_dtp_trans( int matrix_layout, char uplo, char diag,
                         lapack_int n, const double *in,
                         double *out );
 void LAPACKE_dtr_trans( int matrix_layout, char uplo, char diag, lapack_int n,
                         const double *in, lapack_int ldin,
                         double *out, lapack_int ldout );
 void LAPACKE_dtz_trans( int matrix_layout, char direct, char uplo,
                         char diag, lapack_int m, lapack_int n,
                         const double *in, lapack_int ldin,
                         double *out, lapack_int ldout );
 /* Single-precision real variants of the layout-conversion routines.  Every
  * routine reads from `in` (const float*) and writes to `out`.  Most forms
  * also take ldin/ldout (presumably the leading dimensions of `in` and
  * `out`); the pf/pp/sp/tf/tp forms take no such arguments. */
 void LAPACKE_sgb_trans( int matrix_layout, lapack_int m, lapack_int n,
                         lapack_int kl, lapack_int ku,
                         const float *in, lapack_int ldin,
                         float *out, lapack_int ldout );
 void LAPACKE_sge_trans( int matrix_layout, lapack_int m, lapack_int n,
                         const float* in, lapack_int ldin,
                         float* out, lapack_int ldout );
 void LAPACKE_sgg_trans( int matrix_layout, lapack_int m, lapack_int n,
                         const float* in, lapack_int ldin,
                         float* out, lapack_int ldout );
 void LAPACKE_shs_trans( int matrix_layout, lapack_int n,
                         const float *in, lapack_int ldin,
                         float *out, lapack_int ldout );
 void LAPACKE_spb_trans( int matrix_layout, char uplo, lapack_int n,
                         lapack_int kd,
                         const float *in, lapack_int ldin,
                         float *out, lapack_int ldout );
 void LAPACKE_spf_trans( int matrix_layout, char transr, char uplo,
                         lapack_int n, const float *in,
                         float *out );
 void LAPACKE_spo_trans( int matrix_layout, char uplo, lapack_int n,
                         const float *in, lapack_int ldin,
                         float *out, lapack_int ldout );
 void LAPACKE_spp_trans( int matrix_layout, char uplo, lapack_int n,
                         const float *in,
                         float *out );
 void LAPACKE_ssb_trans( int matrix_layout, char uplo, lapack_int n,
                         lapack_int kd,
                         const float *in, lapack_int ldin,
                         float *out, lapack_int ldout );
 void LAPACKE_ssp_trans( int matrix_layout, char uplo, lapack_int n,
                         const float *in,
                         float *out );
 void LAPACKE_ssy_trans( int matrix_layout, char uplo, lapack_int n,
                         const float *in, lapack_int ldin,
                         float *out, lapack_int ldout );
 void LAPACKE_stb_trans( int matrix_layout, char uplo, char diag,
                         lapack_int n, lapack_int kd,
                         const float *in, lapack_int ldin,
                         float *out, lapack_int ldout );
 void LAPACKE_stf_trans( int matrix_layout, char transr, char uplo, char diag,
                         lapack_int n, const float *in,
                         float *out );
 void LAPACKE_stp_trans( int matrix_layout, char uplo, char diag,
                         lapack_int n, const float *in,
                         float *out );
 void LAPACKE_str_trans( int matrix_layout, char uplo, char diag, lapack_int n,
                         const float *in, lapack_int ldin,
                         float *out, lapack_int ldout );
 void LAPACKE_stz_trans( int matrix_layout, char direct, char uplo,
                         char diag, lapack_int m, lapack_int n,
                         const float *in, lapack_int ldin,
                         float *out, lapack_int ldout );
 /* Double-precision complex variants (lapack_complex_double data) of the
  * layout-conversion routines.  Every routine reads from `in` (const) and
  * writes to `out`.  Most forms also take ldin/ldout (presumably the leading
  * dimensions of `in` and `out`); the hp/pf/pp/sp/tf/tp forms take no such
  * arguments. */
 void LAPACKE_zgb_trans( int matrix_layout, lapack_int m, lapack_int n,
                         lapack_int kl, lapack_int ku,
                         const lapack_complex_double *in, lapack_int ldin,
                         lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_zge_trans( int matrix_layout, lapack_int m, lapack_int n,
                         const lapack_complex_double* in, lapack_int ldin,
                         lapack_complex_double* out, lapack_int ldout );
 void LAPACKE_zgg_trans( int matrix_layout, lapack_int m, lapack_int n,
                         const lapack_complex_double* in, lapack_int ldin,
                         lapack_complex_double* out, lapack_int ldout );
 void LAPACKE_zhb_trans( int matrix_layout, char uplo, lapack_int n,
                         lapack_int kd,
                         const lapack_complex_double *in, lapack_int ldin,
                         lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_zhe_trans( int matrix_layout, char uplo, lapack_int n,
                         const lapack_complex_double *in, lapack_int ldin,
                         lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_zhp_trans( int matrix_layout, char uplo, lapack_int n,
                         const lapack_complex_double *in,
                         lapack_complex_double *out );
 void LAPACKE_zhs_trans( int matrix_layout, lapack_int n,
                         const lapack_complex_double *in, lapack_int ldin,
                         lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_zpb_trans( int matrix_layout, char uplo, lapack_int n,
                         lapack_int kd,
                         const lapack_complex_double *in, lapack_int ldin,
                         lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_zpf_trans( int matrix_layout, char transr, char uplo,
                         lapack_int n, const lapack_complex_double *in,
                         lapack_complex_double *out );
 void LAPACKE_zpo_trans( int matrix_layout, char uplo, lapack_int n,
                         const lapack_complex_double *in, lapack_int ldin,
                         lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_zpp_trans( int matrix_layout, char uplo, lapack_int n,
                         const lapack_complex_double *in,
                         lapack_complex_double *out );
 void LAPACKE_zsp_trans( int matrix_layout, char uplo, lapack_int n,
                         const lapack_complex_double *in,
                         lapack_complex_double *out );
 void LAPACKE_zsy_trans( int matrix_layout, char uplo, lapack_int n,
                         const lapack_complex_double *in, lapack_int ldin,
                         lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_ztb_trans( int matrix_layout, char uplo, char diag,
                         lapack_int n, lapack_int kd,
                         const lapack_complex_double *in, lapack_int ldin,
                         lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_ztf_trans( int matrix_layout, char transr, char uplo, char diag,
                         lapack_int n, const lapack_complex_double *in,
                         lapack_complex_double *out );
 void LAPACKE_ztp_trans( int matrix_layout, char uplo, char diag,
                         lapack_int n, const lapack_complex_double *in,
                         lapack_complex_double *out );
 void LAPACKE_ztr_trans( int matrix_layout, char uplo, char diag, lapack_int n,
                         const lapack_complex_double *in, lapack_int ldin,
                         lapack_complex_double *out, lapack_int ldout );
 void LAPACKE_ztz_trans( int matrix_layout, char direct, char uplo,
                         char diag, lapack_int m, lapack_int n,
                         const lapack_complex_double *in, lapack_int ldin,
                         lapack_complex_double *out, lapack_int ldout );
 /* NaN checkers */
 /* A value is NaN exactly when it compares unequal to itself.  The argument
  * is parenthesized at every use so that an expression argument (for example
  * a conditional expression) is compared as a whole.  The complex forms view
  * the argument's storage as two consecutive floats / doubles, so their
  * argument has to be an lvalue.  Arguments are evaluated more than once. */
 #define LAPACK_SISNAN( x ) ( (x) != (x) )
 #define LAPACK_DISNAN( x ) ( (x) != (x) )
 #define LAPACK_CISNAN( x ) ( LAPACK_SISNAN(*((float*) &(x))) || \
                              LAPACK_SISNAN(*(((float*) &(x))+1)) )
 #define LAPACK_ZISNAN( x ) ( LAPACK_DISNAN(*((double*)&(x))) || \
                              LAPACK_DISNAN(*(((double*)&(x))+1)) )
 /* NaN checkers for vectors */
 /* One routine per data type (c, d, s, z).  Each takes an element count `n`,
  * a read-only array `x` and an increment `incx`, and returns a
  * lapack_logical.
  * NOTE(review): presumably the result is non-zero when a NaN is found --
  * confirm against the implementation. */
 lapack_logical LAPACKE_c_nancheck( lapack_int n,
                                    const lapack_complex_float *x,
                                    lapack_int incx );
 lapack_logical LAPACKE_d_nancheck( lapack_int n,
                                    const double *x,
                                    lapack_int incx );
 lapack_logical LAPACKE_s_nancheck( lapack_int n,
                                    const float *x,
                                    lapack_int incx );
 lapack_logical LAPACKE_z_nancheck( lapack_int n,
                                    const lapack_complex_double *x,
                                    lapack_int incx );
 /* NaN checkers for matrices */
 /* Single-precision complex variants.  All array arguments are read-only and
  * every routine returns a lapack_logical.  The gt/hp/pf/pp/pt/sp/st forms
  * take no matrix_layout argument.
  * NOTE(review): presumably the result is non-zero when a NaN is found --
  * confirm against the implementation. */
 lapack_logical LAPACKE_cgb_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n, lapack_int kl,
                                      lapack_int ku,
                                      const lapack_complex_float *ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_cge_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n,
                                      const lapack_complex_float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_cgg_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n,
                                      const lapack_complex_float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_cgt_nancheck( lapack_int n,
                                      const lapack_complex_float *dl,
                                      const lapack_complex_float *d,
                                      const lapack_complex_float *du );
 lapack_logical LAPACKE_chb_nancheck( int matrix_layout, char uplo,
                                      lapack_int n, lapack_int kd,
                                      const lapack_complex_float* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_che_nancheck( int matrix_layout, char uplo,
                                      lapack_int n,
                                      const lapack_complex_float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_chp_nancheck( lapack_int n,
                                      const lapack_complex_float *ap );
 lapack_logical LAPACKE_chs_nancheck( int matrix_layout, lapack_int n,
                                      const lapack_complex_float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_cpb_nancheck( int matrix_layout, char uplo,
                                      lapack_int n, lapack_int kd,
                                      const lapack_complex_float* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_cpf_nancheck( lapack_int n,
                                      const lapack_complex_float *a );
 lapack_logical LAPACKE_cpo_nancheck( int matrix_layout, char uplo,
                                      lapack_int n,
                                      const lapack_complex_float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_cpp_nancheck( lapack_int n,
                                      const lapack_complex_float *ap );
 lapack_logical LAPACKE_cpt_nancheck( lapack_int n,
                                      const float *d,
                                      const lapack_complex_float *e );
 lapack_logical LAPACKE_csp_nancheck( lapack_int n,
                                      const lapack_complex_float *ap );
 lapack_logical LAPACKE_cst_nancheck( lapack_int n,
                                      const lapack_complex_float *d,
                                      const lapack_complex_float *e );
 lapack_logical LAPACKE_csy_nancheck( int matrix_layout, char uplo,
                                      lapack_int n,
                                      const lapack_complex_float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_ctb_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n, lapack_int kd,
                                      const lapack_complex_float* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_ctf_nancheck( int matrix_layout, char transr,
                                      char uplo, char diag,
                                      lapack_int n,
                                      const lapack_complex_float *a );
 lapack_logical LAPACKE_ctp_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n,
                                      const lapack_complex_float *ap );
 lapack_logical LAPACKE_ctr_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n,
                                      const lapack_complex_float *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_ctz_nancheck( int matrix_layout, char direct, char uplo,
                                     char diag, lapack_int m, lapack_int n,
                                     const lapack_complex_float *a,
                                     lapack_int lda );
 /* Double-precision real variants of the matrix NaN checkers.  All array
  * arguments are read-only (const double*) and every routine returns a
  * lapack_logical.  The gt/pf/pp/pt/sp/st forms take no matrix_layout
  * argument.
  * NOTE(review): presumably the result is non-zero when a NaN is found --
  * confirm against the implementation. */
 lapack_logical LAPACKE_dgb_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n, lapack_int kl,
                                      lapack_int ku,
                                      const double *ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_dge_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n,
                                      const double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_dgg_nancheck( int matrix_layout, lapack_int m,
                                      lapack_int n,
                                      const double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_dgt_nancheck( lapack_int n,
                                      const double *dl,
                                      const double *d,
                                      const double *du );
 lapack_logical LAPACKE_dhs_nancheck( int matrix_layout, lapack_int n,
                                      const double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_dpb_nancheck( int matrix_layout, char uplo,
                                      lapack_int n, lapack_int kd,
                                      const double* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_dpf_nancheck( lapack_int n,
                                      const double *a );
 lapack_logical LAPACKE_dpo_nancheck( int matrix_layout, char uplo,
                                      lapack_int n,
                                      const double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_dpp_nancheck( lapack_int n,
                                      const double *ap );
 lapack_logical LAPACKE_dpt_nancheck( lapack_int n,
                                      const double *d,
                                      const double *e );
 lapack_logical LAPACKE_dsb_nancheck( int matrix_layout, char uplo,
                                      lapack_int n, lapack_int kd,
                                      const double* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_dsp_nancheck( lapack_int n,
                                      const double *ap );
 lapack_logical LAPACKE_dst_nancheck( lapack_int n,
                                      const double *d,
                                      const double *e );
 lapack_logical LAPACKE_dsy_nancheck( int matrix_layout, char uplo,
                                      lapack_int n,
                                      const double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_dtb_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n, lapack_int kd,
                                      const double* ab,
                                      lapack_int ldab );
 lapack_logical LAPACKE_dtf_nancheck( int matrix_layout, char transr,
                                      char uplo, char diag,
                                      lapack_int n,
                                      const double *a );
 lapack_logical LAPACKE_dtp_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n,
                                      const double *ap );
 lapack_logical LAPACKE_dtr_nancheck( int matrix_layout, char uplo, char diag,
                                      lapack_int n,
                                      const double *a,
                                      lapack_int lda );
 lapack_logical LAPACKE_dtz_nancheck( int matrix_layout, char direct, char uplo,
                                     char diag, lapack_int m, lapack_int n,
                                     const double *a, lapack_int lda );
 /* Prototypes of the single-precision nancheck helpers.  Every routine reads
    const float data and returns a lapack_logical; the two letters after the
    leading "s" name the variant and decide which size/layout arguments are
    taken. */
 lapack_logical LAPACKE_sgb_nancheck(
     int matrix_layout, lapack_int m, lapack_int n, lapack_int kl,
     lapack_int ku, const float *ab, lapack_int ldab );
 lapack_logical LAPACKE_sge_nancheck(
     int matrix_layout, lapack_int m, lapack_int n, const float *a,
     lapack_int lda );
 lapack_logical LAPACKE_sgg_nancheck(
     int matrix_layout, lapack_int m, lapack_int n, const float *a,
     lapack_int lda );
 lapack_logical LAPACKE_sgt_nancheck(
     lapack_int n, const float *dl, const float *d, const float *du );
 lapack_logical LAPACKE_shs_nancheck(
     int matrix_layout, lapack_int n, const float *a, lapack_int lda );
 lapack_logical LAPACKE_spb_nancheck(
     int matrix_layout, char uplo, lapack_int n, lapack_int kd,
     const float *ab, lapack_int ldab );
 lapack_logical LAPACKE_spf_nancheck( lapack_int n, const float *a );
 lapack_logical LAPACKE_spo_nancheck(
     int matrix_layout, char uplo, lapack_int n, const float *a,
     lapack_int lda );
 lapack_logical LAPACKE_spp_nancheck( lapack_int n, const float *ap );
 lapack_logical LAPACKE_spt_nancheck(
     lapack_int n, const float *d, const float *e );
 lapack_logical LAPACKE_ssb_nancheck(
     int matrix_layout, char uplo, lapack_int n, lapack_int kd,
     const float *ab, lapack_int ldab );
 lapack_logical LAPACKE_ssp_nancheck( lapack_int n, const float *ap );
 lapack_logical LAPACKE_sst_nancheck(
     lapack_int n, const float *d, const float *e );
 lapack_logical LAPACKE_ssy_nancheck(
     int matrix_layout, char uplo, lapack_int n, const float *a,
     lapack_int lda );
 lapack_logical LAPACKE_stb_nancheck(
     int matrix_layout, char uplo, char diag, lapack_int n, lapack_int kd,
     const float *ab, lapack_int ldab );
 lapack_logical LAPACKE_stf_nancheck(
     int matrix_layout, char transr, char uplo, char diag, lapack_int n,
     const float *a );
 lapack_logical LAPACKE_stp_nancheck(
     int matrix_layout, char uplo, char diag, lapack_int n,
     const float *ap );
 lapack_logical LAPACKE_str_nancheck(
     int matrix_layout, char uplo, char diag, lapack_int n,
     const float *a, lapack_int lda );
 lapack_logical LAPACKE_stz_nancheck(
     int matrix_layout, char direct, char uplo, char diag, lapack_int m,
     lapack_int n, const float *a, lapack_int lda );
 /* Prototypes of the double-complex nancheck helpers.  Every routine reads
    const lapack_complex_double data (zpt additionally takes a real
    const double diagonal) and returns a lapack_logical; the two letters
    after the leading "z" name the variant and decide which size/layout
    arguments are taken. */
 lapack_logical LAPACKE_zgb_nancheck(
     int matrix_layout, lapack_int m, lapack_int n, lapack_int kl,
     lapack_int ku, const lapack_complex_double *ab, lapack_int ldab );
 lapack_logical LAPACKE_zge_nancheck(
     int matrix_layout, lapack_int m, lapack_int n,
     const lapack_complex_double *a, lapack_int lda );
 lapack_logical LAPACKE_zgg_nancheck(
     int matrix_layout, lapack_int m, lapack_int n,
     const lapack_complex_double *a, lapack_int lda );
 lapack_logical LAPACKE_zgt_nancheck(
     lapack_int n, const lapack_complex_double *dl,
     const lapack_complex_double *d, const lapack_complex_double *du );
 lapack_logical LAPACKE_zhb_nancheck(
     int matrix_layout, char uplo, lapack_int n, lapack_int kd,
     const lapack_complex_double *ab, lapack_int ldab );
 lapack_logical LAPACKE_zhe_nancheck(
     int matrix_layout, char uplo, lapack_int n,
     const lapack_complex_double *a, lapack_int lda );
 lapack_logical LAPACKE_zhp_nancheck(
     lapack_int n, const lapack_complex_double *ap );
 lapack_logical LAPACKE_zhs_nancheck(
     int matrix_layout, lapack_int n, const lapack_complex_double *a,
     lapack_int lda );
 lapack_logical LAPACKE_zpb_nancheck(
     int matrix_layout, char uplo, lapack_int n, lapack_int kd,
     const lapack_complex_double *ab, lapack_int ldab );
 lapack_logical LAPACKE_zpf_nancheck(
     lapack_int n, const lapack_complex_double *a );
 lapack_logical LAPACKE_zpo_nancheck(
     int matrix_layout, char uplo, lapack_int n,
     const lapack_complex_double *a, lapack_int lda );
 lapack_logical LAPACKE_zpp_nancheck(
     lapack_int n, const lapack_complex_double *ap );
 lapack_logical LAPACKE_zpt_nancheck(
     lapack_int n, const double *d, const lapack_complex_double *e );
 lapack_logical LAPACKE_zsp_nancheck(
     lapack_int n, const lapack_complex_double *ap );
 lapack_logical LAPACKE_zst_nancheck(
     lapack_int n, const lapack_complex_double *d,
     const lapack_complex_double *e );
 lapack_logical LAPACKE_zsy_nancheck(
     int matrix_layout, char uplo, lapack_int n,
     const lapack_complex_double *a, lapack_int lda );
 lapack_logical LAPACKE_ztb_nancheck(
     int matrix_layout, char uplo, char diag, lapack_int n, lapack_int kd,
     const lapack_complex_double *ab, lapack_int ldab );
 lapack_logical LAPACKE_ztf_nancheck(
     int matrix_layout, char transr, char uplo, char diag, lapack_int n,
     const lapack_complex_double *a );
 lapack_logical LAPACKE_ztp_nancheck(
     int matrix_layout, char uplo, char diag, lapack_int n,
     const lapack_complex_double *ap );
 lapack_logical LAPACKE_ztr_nancheck(
     int matrix_layout, char uplo, char diag, lapack_int n,
     const lapack_complex_double *a, lapack_int lda );
 lapack_logical LAPACKE_ztz_nancheck(
     int matrix_layout, char direct, char uplo, char diag, lapack_int m,
     lapack_int n, const lapack_complex_double *a, lapack_int lda );
 #ifdef __cplusplus
 }
 #endif /* __cplusplus */
 #endif  /* _LAPACKE_UTILS_H_ */
--- a/install/include/openblas_config.h
+++ b/install/include/openblas_config.h
@@ -0,0 +1,136 @@
 #ifndef OPENBLAS_CONFIG_H
 #define OPENBLAS_CONFIG_H
 /* Platform, compiler and ABI facts recorded for this build. */
 #define OPENBLAS_OS_LINUX 1
 #define OPENBLAS_ARCH_RISCV64 1
 #define OPENBLAS_C_GCC 1
 #define OPENBLAS___64BIT__ 1
 #define OPENBLAS_HAVE_C11 1
 #define OPENBLAS_PTHREAD_CREATE_FUNC pthread_create
 /* Symbol decoration; OPENBLAS_NEEDBUNDERSCORE makes BLASFUNC() below append
    a trailing underscore. */
 #define OPENBLAS_BUNDERSCORE _
 #define OPENBLAS_NEEDBUNDERSCORE 1
 /* Target core and its cache/TLB parameters (units presumably bytes --
    TODO confirm against the generator). */
 #define OPENBLAS_RISCV64_ZVL128B 
 #define OPENBLAS_L1_DATA_SIZE 32768
 #define OPENBLAS_L1_DATA_LINESIZE 32
 #define OPENBLAS_L2_SIZE 1048576
 #define OPENBLAS_L2_LINESIZE 32
 #define OPENBLAS_DTB_DEFAULT_ENTRIES 128
 #define OPENBLAS_DTB_SIZE 4096
 #define OPENBLAS_L2_ASSOCIATIVE 4
 #define OPENBLAS_CORE_RISCV64_ZVL128B 
 #define OPENBLAS_CHAR_CORENAME "RISCV64_ZVL128B"
 #define OPENBLAS_GEMM_MULTITHREAD_THRESHOLD 4
 /* Library version string (note the surrounding blanks are part of it). */
 #define OPENBLAS_VERSION " OpenBLAS 0.3.29.dev "
 /*This is only for "make install" target.*/
 #if defined(OPENBLAS_OS_WINNT) || defined(OPENBLAS_OS_CYGWIN_NT) || defined(OPENBLAS_OS_INTERIX)
 #define OPENBLAS_WINDOWS_ABI
 #define OPENBLAS_OS_WINDOWS
 #ifdef DOUBLE
 #define DOUBLE_DEFINED DOUBLE
 #undef  DOUBLE
 #endif
 #endif
 #ifdef OPENBLAS_NEEDBUNDERSCORE
 #define BLASFUNC(FUNC) FUNC##_
 #else
 #define BLASFUNC(FUNC) FUNC
 #endif
 /* xdouble: a two-word struct under OPENBLAS_QUAD_PRECISION, long double
    under OPENBLAS_EXPRECISION, and plain double otherwise. */
 #ifdef OPENBLAS_QUAD_PRECISION
 typedef struct {
  unsigned long x[2];
 }  xdouble;
 #elif defined OPENBLAS_EXPRECISION
 #define xdouble long double
 #else
 #define xdouble double
 #endif
 /* BLASLONG/BLASULONG: long long on 64-bit Windows, long everywhere else. */
 #if defined(OPENBLAS_OS_WINDOWS) && defined(OPENBLAS___64BIT__)
 typedef long long BLASLONG;
 typedef unsigned long long BLASULONG;
 #else
 typedef long BLASLONG;
 typedef unsigned long BLASULONG;
 #endif
 /* bfloat16 is a 16-bit unsigned integer typedef, declared only when no
    BFLOAT16 macro is defined. */
 #ifndef BFLOAT16
 #include <stdint.h>
 typedef uint16_t bfloat16;
 #endif
 /* hfloat16 is the native _Float16 when __GNUC__ reports 12 or newer;
    otherwise it falls back to uint16_t.
    NOTE(review): with the uint16_t fallback, floating-point initialisers
    convert to integers rather than half-precision values -- confirm callers
    account for this. */
 #if defined(__GNUC__) && (__GNUC__ >= 12)
 typedef _Float16 hfloat16;
 #else
 #include <stdint.h>
 typedef uint16_t hfloat16;
 #endif
 /* blasint: BLASLONG when OPENBLAS_USE64BITINT is defined, int otherwise. */
 #ifdef OPENBLAS_USE64BITINT
 typedef BLASLONG blasint;
 #else
 typedef int blasint;
 #endif
 /* FLOATRET: FLOAT when XDOUBLE or DOUBLE is defined; otherwise double if
    NEED_F2CCONV is defined, else float. */
 #if defined(XDOUBLE) || defined(DOUBLE)
 #define FLOATRET	FLOAT
 #else
 #ifdef NEED_F2CCONV
 #define FLOATRET	double
 #else
 #define FLOATRET	float
 #endif
 #endif
 /* Inclusion of a standard header file is needed for definition of __STDC_*
   predefined macros with some compilers (e.g. GCC 4.7 on Linux).  This occurs
   as a side effect of including either <features.h> or <stdc-predef.h>. */
 #include <stdio.h>
 /* C99 supports complex floating numbers natively, which GCC also offers as an
   extension since version 3.0.  If neither are available, use a compatible
   structure as fallback (see Clause 6.2.5.13 of the C99 standard). */
 /* The native branch is skipped when FORCE_OPENBLAS_COMPLEX_STRUCT or
    _MSC_VER is defined. */
 #if ((defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \
      (__GNUC__ >= 3 && !defined(__cplusplus))) && !(defined(FORCE_OPENBLAS_COMPLEX_STRUCT))) && !defined(_MSC_VER)
  #define OPENBLAS_COMPLEX_C99
 /* <complex.h> is only pulled in for C translation units. */
 #ifndef __cplusplus
  #include <complex.h>
 #endif
  typedef float _Complex openblas_complex_float;
  typedef double _Complex openblas_complex_double;
  typedef xdouble _Complex openblas_complex_xdouble;
  /* Constructors and accessors for the native _Complex representation. */
  #define openblas_make_complex_float(real, imag)    ((real) + ((imag) * _Complex_I))
  #define openblas_make_complex_double(real, imag)   ((real) + ((imag) * _Complex_I))
  #define openblas_make_complex_xdouble(real, imag)  ((real) + ((imag) * _Complex_I))
  #define openblas_complex_float_real(z)             (creal(z))
  #define openblas_complex_float_imag(z)             (cimag(z))
  #define openblas_complex_double_real(z)            (creal(z))
  #define openblas_complex_double_imag(z)            (cimag(z))
  #define openblas_complex_xdouble_real(z)           (creal(z))
  #define openblas_complex_xdouble_imag(z)           (cimag(z))
 #else
  #define OPENBLAS_COMPLEX_STRUCT
  /* Fallback: {real, imag} structs with the same constructor/accessor
     macro names; the make_* macros expand to brace initialisers. */
  typedef struct { float real, imag; } openblas_complex_float;
  typedef struct { double real, imag; } openblas_complex_double;
  typedef struct { xdouble real, imag; } openblas_complex_xdouble;
  #define openblas_make_complex_float(real, imag)    {(real), (imag)}
  #define openblas_make_complex_double(real, imag)   {(real), (imag)}
  #define openblas_make_complex_xdouble(real, imag)  {(real), (imag)}
  #define openblas_complex_float_real(z)             ((z).real)
  #define openblas_complex_float_imag(z)             ((z).imag)
  #define openblas_complex_double_real(z)            ((z).real)
  #define openblas_complex_double_imag(z)            ((z).imag)
  #define openblas_complex_xdouble_real(z)           ((z).real)
  #define openblas_complex_xdouble_imag(z)           ((z).imag)
 #endif
 /* Inclusion of Linux-specific header is needed for definition of cpu_set_t. */
 /* NOTE(review): <stdio.h> has already been included earlier in this header,
    so defining _GNU_SOURCE at this point may be too late to influence the
    feature-test macros of headers processed before it -- confirm cpu_set_t is
    still exposed when the includer did not define _GNU_SOURCE itself. */
 #ifdef OPENBLAS_OS_LINUX
 #ifndef _GNU_SOURCE
 #define _GNU_SOURCE
 #endif
 #include <sched.h>
 #endif
 #endif /* OPENBLAS_CONFIG_H */
--- a/install/lib/cmake/openblas/OpenBLASConfig.cmake
+++ b/install/lib/cmake/openblas/OpenBLASConfig.cmake
@@ -0,0 +1,4 @@
 # Package configuration for an installed OpenBLAS tree.
 # Sets:
 #   OpenBLAS_VERSION       - version string of this build
 #   OpenBLAS_INCLUDE_DIRS  - <root>/include
 #   OpenBLAS_LIBRARIES     - <root>/lib/libopenblas.so
 # <root> is resolved as three directory levels above the directory that
 # contains this file.
 set(OpenBLAS_VERSION "0.3.29.dev")
 # Path expansions are quoted so a location containing ';' stays a single
 # value instead of being split into a CMake list.
 file(REAL_PATH "../../.." _OpenBLAS_ROOT_DIR BASE_DIRECTORY "${CMAKE_CURRENT_LIST_DIR}")
 set(OpenBLAS_INCLUDE_DIRS "${_OpenBLAS_ROOT_DIR}/include")
 set(OpenBLAS_LIBRARIES "${_OpenBLAS_ROOT_DIR}/lib/libopenblas.so")
--- a/install/lib/cmake/openblas/OpenBLASConfigVersion.cmake
+++ b/install/lib/cmake/openblas/OpenBLASConfigVersion.cmake
@@ -0,0 +1,9 @@
 # Version check consulted by find_package(): the request is compatible
 # unless it asks for something newer than this package, and it is exact only
 # when the requested string equals the package version string.
 set (PACKAGE_VERSION "0.3.29.dev")
 if (NOT PACKAGE_VERSION VERSION_LESS PACKAGE_FIND_VERSION)
  set (PACKAGE_VERSION_COMPATIBLE TRUE)
  if (PACKAGE_FIND_VERSION STREQUAL PACKAGE_VERSION)
    set (PACKAGE_VERSION_EXACT TRUE)
  endif ()
 else ()
  set (PACKAGE_VERSION_COMPATIBLE FALSE)
 endif ()
--- a/install/lib/pkgconfig/openblas.pc
+++ b/install/lib/pkgconfig/openblas.pc
@@ -0,0 +1,16 @@
 # pkg-config metadata for OpenBLAS.
 # libdir and includedir are derived from ${prefix}, so the install tree can
 # be relocated by overriding that single variable
 # (e.g. pkg-config --define-variable=prefix=/new/root openblas).
 prefix=/home/da/OpenBLAS/install
 libdir=${prefix}/lib
 libprefix=
 libnamesuffix=
 libsuffix=
 includedir=${prefix}/include
 omp_opt=
 openblas_config= USE_64BITINT= DYNAMIC_ARCH= DYNAMIC_OLDER= NO_CBLAS= NO_LAPACK= NO_LAPACKE= NO_AFFINITY=1 USE_OPENMP= RISCV64_ZVL128B MAX_THREADS=32
 version=0.3.29.dev
 # Extra libraries for static linking; order is kept exactly as generated.
 extralib=-lm -lpthread -lgfortran -lm -lpthread -lgfortran
 Name: openblas
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
 Version: ${version}
 URL: https://github.com/xianyi/OpenBLAS
 Libs: -L${libdir} -l${libprefix}openblas${libsuffix}${libnamesuffix}
 Libs.private: ${extralib}
 Cflags: -I${includedir} ${omp_opt}
--- a/install/test_shgemm
+++ b/install/test_shgemm
--- a/install/test_shgemm.c
+++ b/install/test_shgemm.c
@@ -0,0 +1,45 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <cblas.h>
 #include <riscv_vector.h>
 // Print an M x N row-major float matrix: each element as "%f " and a
 // newline after every row.
 void print_matrix(float *C, int M, int N) {
    const float *cell = C;  // walks the elements in storage order
    for (int row = 0; row < M; ++row) {
        for (int col = 0; col < N; ++col, ++cell)
            printf("%f ", *cell);
        printf("\n");
    }
 }
 // Smoke test for cblas_shgemm: multiplies two 2x2 half-precision matrices
 // into a float result, prints it, and checks it against the hand-computed
 // product.  Returns 0 when every element matches, 1 otherwise.
 int main(void) {
    const int M = 2, N = 2, K = 2;
    const float alpha = 1.0f;
    const float beta = 0.0f;
    // NOTE(review): the initialisers below assume hfloat16 is a floating
    // type; if it is typedef'd to an integer type they are stored as
    // integers -- confirm the build configuration.
    // A[M x K], row-major
    hfloat16 A[4] = {1.0, 2.0,
                     3.0, 4.0};
    // B[K x N], row-major
    hfloat16 B[4] = {5.0, 6.0,
                     7.0, 8.0};
    // C[M x N], row-major
    float C[4] = {0};
    // [1 2; 3 4] * [5 6; 7 8] = [19 22; 43 50]
    const float expected[4] = {19.0f, 22.0f,
                               43.0f, 50.0f};
    int mismatches = 0;
    // Call OpenBLAS float16 GEMM
    cblas_shgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                 M, N, K,
                 alpha,
                 A, K,  // lda = K
                 B, N,  // ldb = N
                 beta,
                 C, N); // ldc = N
    printf("Result C = A*B:\n");
    print_matrix(C, M, N);
    for (int i = 0; i < M * N; i++) {
        float diff = C[i] - expected[i];
        if (diff < 0.0f)
            diff = -diff;
        // Written as !(diff <= tol) so a NaN result is reported too.
        if (!(diff <= 1e-3f)) {
            fprintf(stderr, "Mismatch at C[%d]: got %f, expected %f\n",
                    i, C[i], expected[i]);
            mismatches++;
        }
    }
    return mismatches ? 1 : 0;
 }
--- a/install/zvl_test
+++ b/install/zvl_test
--- a/install/zvl_test.c
+++ b/install/zvl_test.c
@@ -0,0 +1,22 @@
 #include <riscv_vector.h>
 #include <stdio.h>
 #include <stdlib.h>
 /* Smoke test for the RVV intrinsics: requests a vector length of up to
    eight e32/m2 elements, loads that many floats from a heap buffer, stores
    them to a scratch array and prints them.
    Returns 0 on success, 1 if the input buffer cannot be allocated. */
 int main(void){
 	const size_t count = 4 * 4;
 	/* vsetvl never grants more than the 8 elements requested, so tmp[8]
 	   below is always large enough. */
 	size_t gvl = __riscv_vsetvl_e32m2(8);
 	float *A = malloc(count * sizeof *A);
 	if (A == NULL) {
 		fprintf(stderr, "malloc failed\n");
 		return 1;
 	}
 	for (size_t i = 0; i < count; i++) {
 		A[i] = (float)(i % 10);
 	}
 	vfloat32m2_t A0 = __riscv_vle32_v_f32m2(&A[0], gvl);
 	float tmp[8];
 	__riscv_vse32_v_f32m2(tmp, A0, gvl);
 	printf("A0 vector contents:\n");
 	for (size_t i = 0; i < gvl; i++) {
 		printf("tmp[%zu] = %.2f\n", i, tmp[i]);
 	}
 	free(A);
 	return 0;
 }
--- a/interface/CMakeLists.txt
+++ b/interface/CMakeLists.txt
@@ -136,6 +136,9 @@ if (BUILD_BFLOAT16)
 	GenerateNamedObjects("gemm_batch.c" "" "sbgemm_batch" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 endif ()
 endif ()
 # When BUILD_HFLOAT16 is enabled, register gemm.c under the object name
 # "shgemm" with HFLOAT16 as the trailing argument.
 if (BUILD_HFLOAT16)
   GenerateNamedObjects(
     "gemm.c" "" "shgemm"
     ${CBLAS_FLAG} "" "" true
     "HFLOAT16"
   )
 endif ()
 # complex-specific sources
 foreach (float_type ${FLOAT_TYPES})
--- a/interface/Makefile
+++ b/interface/Makefile
@@ -53,6 +53,10 @@ SBBLAS3OBJS    = sbgemm.$(SUFFIX) sbgemmt.$(SUFFIX) sbgemmtr.$(SUFFIX)
 SBEXTOBJS      = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX)
 endif
 # HFLOAT16 level-3 interface objects: only shgemm, and only when
 # BUILD_HFLOAT16 is 1.
 ifeq ($(BUILD_HFLOAT16),1)
 SHBLAS3OBJS    = shgemm.$(SUFFIX)
 endif
 DBLAS1OBJS    = \
 		daxpy.$(SUFFIX) dswap.$(SUFFIX) \
 		dcopy.$(SUFFIX) dscal.$(SUFFIX) \
@@ -291,6 +295,10 @@ CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) cblas_sbgemmt.$(SUFFIX) cblas_sbgemmtr.$(S
 CSBEXTOBJS   = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX)
 endif
 # CBLAS counterpart of the HFLOAT16 level-3 objects: only cblas_shgemm, and
 # only when BUILD_HFLOAT16 is 1.
 ifeq ($(BUILD_HFLOAT16),1)
 CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX)
 endif
 CDBLAS1OBJS   = \
 	cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \
 	cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
@@ -388,6 +396,7 @@ SBLAS3OBJS   += $(CSBLAS3OBJS)
 SBBLAS1OBJS  += $(CSBBLAS1OBJS)
 SBBLAS2OBJS  += $(CSBBLAS2OBJS)
 SBBLAS3OBJS  += $(CSBBLAS3OBJS)
 # Fold the CBLAS HFLOAT16 objects into the HFLOAT16 level-3 list.
 SHBLAS3OBJS  += $(CSHBLAS3OBJS)
 DBLAS1OBJS   += $(CDBLAS1OBJS)
 DBLAS2OBJS   += $(CDBLAS2OBJS)
 DBLAS3OBJS   += $(CDBLAS3OBJS)
@@ -405,6 +414,7 @@ endif
 SBLASOBJS    = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS)
 SBBLASOBJS   = $(SBBLAS1OBJS) $(SBBLAS2OBJS) $(SBBLAS3OBJS)
 # All HFLOAT16 interface objects; only a level-3 list contributes here.
 SHBLASOBJS   = $(SHBLAS3OBJS)
 DBLASOBJS    = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS)
 QBLASOBJS    = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS)
 CBLASOBJS    = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS)
@@ -512,7 +522,7 @@ ifneq ($(BUILD_COMPLEX16),1)
 	ZBLASOBJS=
 endif
 # NOTE(review): FUNCOBJS is assigned twice in a row; in make the later
 # assignment wins, so the effective list is the one ending in $(SHBLASOBJS).
 # This looks like the before/after pair of a diff hunk -- confirm.
 FUNCOBJS    = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS)
 FUNCOBJS    = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(SHBLASOBJS)
 ifeq ($(EXPRECISION), 1)
 FUNCOBJS   += $(QBLASOBJS) $(XBLASOBJS)
@@ -550,7 +560,7 @@ level1 : $(SBEXTOBJS) $(SBBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $
 level2 : $(SBBLAS2OBJS) $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS)
 	$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
 # Archive every level-3 object into $(TOPDIR)/$(LIBNAME).
 # NOTE(review): two level3 prerequisite lines appear back to back; the second
 # adds $(SHBLAS3OBJS).  This looks like the before/after pair of a diff
 # hunk -- confirm.
 level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) 
 level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) $(SHBLAS3OBJS)
 	$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
 aux :	$(CBAUXOBJS)
@@ -1309,6 +1319,11 @@ sbgemmtr.$(SUFFIX) sbgemmtr.$(PSUFFIX) : sbgemmt.c ../param.h
 	$(CC) -c $(CFLAGS) -DRNAME $< -o $(@F)
 endif
 # Build the shgemm interface object (regular and profiled suffix) from
 # gemm.c; rebuilt when ../param.h changes.  The recipe itself passes no
 # HFLOAT16 define -- presumably supplied through CFLAGS; verify.
 ifeq ($(BUILD_HFLOAT16),1)
 shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 endif
 sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)
@@ -1968,6 +1983,11 @@ cblas_sbgemm.$(SUFFIX) cblas_sbgemm.$(PSUFFIX) : gemm.c ../param.h
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
 endif
 # Build the cblas_shgemm object (regular and profiled suffix) from gemm.c
 # with -DCBLAS; rebuilt when ../param.h changes.
 ifeq ($(BUILD_HFLOAT16),1)
 cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
 endif
 cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
--- a/kernel/CMakeLists.txt
+++ b/kernel/CMakeLists.txt
@@ -351,6 +351,22 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
 	GenerateNamedObjects("${KERNELDIR}/${SBGEMMKERNEL}" "" "gemm_kernel" false "" "" false "BFLOAT16")
 	GenerateNamedObjects("${KERNELDIR}/${SBGEMM_BETA}" "" "gemm_beta" false "" "" false "BFLOAT16")
    endif ()
  # HFLOAT16 GEMM objects: one copy object for each of the IN/IT/ON/OT copy
  # sources that is set, followed by the kernel and beta objects.
  if (BUILD_HFLOAT16)
    foreach (shgemm_copy INCOPY ITCOPY ONCOPY OTCOPY)
      if (SHGEMM${shgemm_copy})
        GenerateNamedObjects("${KERNELDIR}/${SHGEMM${shgemm_copy}}" "" "${SHGEMM${shgemm_copy}OBJ}" false "" "" true "HFLOAT16")
      endif ()
    endforeach ()
    GenerateNamedObjects("${KERNELDIR}/${SHGEMMKERNEL}" "" "gemm_kernel" false "" "" false "HFLOAT16")
    GenerateNamedObjects("${KERNELDIR}/${SHGEMM_BETA}" "" "gemm_beta" false "" "" false "HFLOAT16")
  endif ()
    foreach (float_type ${FLOAT_TYPES})
      string(SUBSTRING ${float_type} 0 1 float_char)
      if (${float_char}GEMMINCOPY)
@@ -769,6 +785,45 @@ endif ()
            GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TN}" "B0" "gemm_small_kernel_b0_tn" false "" "" false "BFLOAT16")
            GenerateNamedObjects("${KERNELDIR}/${SBGEMM_SMALL_K_B0_TT}" "B0" "gemm_small_kernel_b0_tt" false "" "" false "BFLOAT16")
        endif ()
    # HFLOAT16 small-matrix GEMM: any source variable the target left
    # undefined falls back to the matching generic file, then the permit
    # object, the four transpose variants and their B0 forms are registered
    # in that order.
    if (BUILD_HFLOAT16)
      if (NOT DEFINED SHGEMM_SMALL_M_PERMIT)
        set(SHGEMM_SMALL_M_PERMIT ../generic/gemm_small_matrix_permit.c)
      endif ()
      foreach (shgemm_small_prefix SHGEMM_SMALL_K SHGEMM_SMALL_K_B0)
        foreach (shgemm_trans NN NT TN TT)
          if (NOT DEFINED ${shgemm_small_prefix}_${shgemm_trans})
            string(TOLOWER "${shgemm_trans}" shgemm_trans_lc)
            set(${shgemm_small_prefix}_${shgemm_trans} ../generic/gemm_small_matrix_kernel_${shgemm_trans_lc}.c)
          endif ()
        endforeach ()
      endforeach ()
      GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_M_PERMIT}" "" "gemm_small_matrix_permit" false "" "" false "HFLOAT16")
      foreach (shgemm_trans NN NT TN TT)
        string(TOLOWER "${shgemm_trans}" shgemm_trans_lc)
        GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_${shgemm_trans}}" "" "gemm_small_kernel_${shgemm_trans_lc}" false "" "" false "HFLOAT16")
      endforeach ()
      foreach (shgemm_trans NN NT TN TT)
        string(TOLOWER "${shgemm_trans}" shgemm_trans_lc)
        GenerateNamedObjects("${KERNELDIR}/${SHGEMM_SMALL_K_B0_${shgemm_trans}}" "B0" "gemm_small_kernel_b0_${shgemm_trans_lc}" false "" "" false "HFLOAT16")
      endforeach ()
    endif ()
      endif ()
      if (NOT DEFINED ${float_char}OMATCOPY_CN)
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -129,6 +129,26 @@ SBKERNELOBJS	+= \
 	$(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ)
 endif
 # Defaults for the HFLOAT16 GEMM kernel sources and the list of kernel
 # objects built from them.
 ifeq ($(BUILD_HFLOAT16), 1)
 # Guarded on its own so that a target which supplies SHGEMMKERNEL but no
 # SHGEMM_BETA still has a beta source for the shgemm_beta rule; a
 # target-provided SHGEMM_BETA is always left untouched.
 ifndef SHGEMM_BETA
 SHGEMM_BETA = ../generic/gemm_beta.c
 endif
 # Generic 2x2 kernel and copy routines, used only when the target did not
 # select its own kernel.
 ifndef SHGEMMKERNEL
 SHGEMMKERNEL    = ../generic/gemmkernel_2x2.c
 SHGEMMINCOPY    = ../generic/gemm_ncopy_2.c
 SHGEMMITCOPY    = ../generic/gemm_tcopy_2.c
 SHGEMMONCOPY    = ../generic/gemm_ncopy_2.c
 SHGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
 SHGEMMINCOPYOBJ =  shgemm_incopy$(TSUFFIX).$(SUFFIX)
 SHGEMMITCOPYOBJ =  shgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SHGEMMONCOPYOBJ =  shgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SHGEMMOTCOPYOBJ =  shgemm_otcopy$(TSUFFIX).$(SUFFIX)
 endif
 SHKERNELOBJS	+= \
 	shgemm_kernel$(TSUFFIX).$(SUFFIX) \
 	$(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \
 	$(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ)
 endif
 ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
 SKERNELOBJS	+= \
 	sgemm_kernel$(TSUFFIX).$(SUFFIX) \
@@ -192,6 +212,9 @@ XKERNELOBJS	+= \
 ifeq ($(BUILD_BFLOAT16),1)
 SBBLASOBJS      += $(SBKERNELOBJS)
 endif
 # Add the HFLOAT16 kernel objects to the HFLOAT16 object list.
 ifeq ($(BUILD_HFLOAT16),1)
 SHBLASOBJS      += $(SHKERNELOBJS)
 endif
 SBLASOBJS	+= $(SKERNELOBJS)
 DBLASOBJS	+= $(DKERNELOBJS)
 QBLASOBJS	+= $(QKERNELOBJS)
@@ -202,6 +225,9 @@ XBLASOBJS	+= $(XKERNELOBJS)
 ifeq ($(BUILD_BFLOAT16),1)
 SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX)
 endif
 # Add the HFLOAT16 gemm_beta object to the HFLOAT16 object list.
 ifeq ($(BUILD_HFLOAT16),1)
 SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX)
 endif
 ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
 SBLASOBJS	+= \
@@ -493,6 +519,15 @@ SBBLASOBJS += \
 	sbgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX)
 endif
 # Add the HFLOAT16 small-matrix objects: the permit object, the nn/nt/tn/tt
 # kernels and their b0 counterparts.
 ifeq ($(BUILD_HFLOAT16),1)
 SHBLASOBJS += \
 	shgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
 	shgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
 	shgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) \
 	shgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) \
 	shgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) shgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX)
 endif
 SBLASOBJS += \
 	sgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) \
 	sgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) sgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) \
@@ -599,6 +634,13 @@ SBGEMMONCOPYOBJ_P = $(SBGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
 SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
 endif
 # Profiled names of the HFLOAT16 copy objects: the same names with the
 # .$(SUFFIX) ending replaced by .$(PSUFFIX).
 ifeq ($(BUILD_HFLOAT16), 1)
 SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
 SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
 SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
 SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
 endif
 SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
 SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
 SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
@@ -629,6 +671,11 @@ $(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
 	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 endif
 # Compile the HFLOAT16 gemm_beta object from $(SHGEMM_BETA) with HFLOAT16
 # defined and DOUBLE/COMPLEX undefined.
 ifeq ($(BUILD_HFLOAT16),1)
 $(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA)
 	$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 endif
 $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
@@ -671,6 +718,25 @@ $(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY)
 endif
 endif
 # Compile the HFLOAT16 copy objects, each with HFLOAT16 defined and
 # DOUBLE/COMPLEX undefined.  The ON/OT rules are always emitted.
 ifeq ($(BUILD_HFLOAT16), 1)
 $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY)
 	$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 $(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY)
 	$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 # The IN/IT rules exist only when the M and N unroll factors differ
 # (presumably the ON/OT objects are reused when they match -- verify).
 ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
 $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY)
 	$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 $(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY)
 	$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 endif
 endif
 $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY)
 	$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
@@ -853,6 +919,12 @@ $(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMM
 	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 endif
 # Compile the HFLOAT16 GEMM kernel from $(SHGEMMKERNEL); $(SHGEMMDEPEND)
 # lists extra prerequisites, only the first prerequisite ($<) is compiled.
 ifeq ($(BUILD_HFLOAT16), 1)
 $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND)
 	$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 endif
 $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND)
 ifeq ($(OS), AIX)
 	$(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s
@@ -2840,6 +2912,11 @@ $(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
 	$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 endif
 # Profiled build of the HFLOAT16 gemm_beta object: same source and defines
 # as the regular rule, compiled with $(PFLAGS) into the .$(PSUFFIX) object.
 ifeq ($(BUILD_HFLOAT16),1)
 $(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA)
 	$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 endif
 $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA)
 	$(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@
@@ -2873,6 +2950,23 @@ $(SBGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMITCOPY)
 endif
 endif
 # Profiled builds of the HFLOAT16 copy objects, compiled with $(PFLAGS).
 # The targets here carry no $(KDIR) prefix.
 ifeq ($(BUILD_HFLOAT16), 1)
 $(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY)
 	$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 $(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY)
 	$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 # IN/IT profiled rules exist only when the M and N unroll factors differ.
 ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
 $(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY)
 	$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 $(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY)
 	$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 endif
 endif
 $(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY)
 	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
@@ -2983,6 +3077,11 @@ $(KDIR)sbgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEM
 	$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 endif
 # Profiled build of the HFLOAT16 GEMM kernel, compiled with $(PFLAGS) into
 # the .$(PSUFFIX) object.
 ifeq ($(BUILD_HFLOAT16), 1)
 $(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND)
 	$(CC) $(PFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 endif
 $(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
 	$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
@@ -4843,6 +4942,71 @@ $(KDIR)sbgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_SMA
 	$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
 endif
 # HFLOAT16 build only: small-matrix SHGEMM objects.
 # Each SHGEMM_SMALL_* source variable falls back to the matching file under
 # ../generic unless it was already defined before this point.
 ifeq ($(BUILD_HFLOAT16), 1)
 ifndef SHGEMM_SMALL_M_PERMIT
 SHGEMM_SMALL_M_PERMIT = ../generic/gemm_small_matrix_permit.c
 endif
 ifndef SHGEMM_SMALL_K_NN
 SHGEMM_SMALL_K_NN = ../generic/gemm_small_matrix_kernel_nn.c
 endif
 ifndef SHGEMM_SMALL_K_NT
 SHGEMM_SMALL_K_NT = ../generic/gemm_small_matrix_kernel_nt.c
 endif
 ifndef SHGEMM_SMALL_K_TN
 SHGEMM_SMALL_K_TN = ../generic/gemm_small_matrix_kernel_tn.c
 endif
 ifndef SHGEMM_SMALL_K_TT
 SHGEMM_SMALL_K_TT = ../generic/gemm_small_matrix_kernel_tt.c
 endif
 # Permit check plus the nn/nt/tn/tt kernels, compiled with -DHFLOAT16.
 $(KDIR)shgemm_small_matrix_permit$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_M_PERMIT)
 	$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 $(KDIR)shgemm_small_kernel_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_NN)
 	$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 $(KDIR)shgemm_small_kernel_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_NT)
 	$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 $(KDIR)shgemm_small_kernel_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_TN)
 	$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 $(KDIR)shgemm_small_kernel_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_TT)
 	$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
 # The b0 variants default to the same generic sources as above; their rules
 # differ only by the additional -DB0 define.
 ifndef SHGEMM_SMALL_K_B0_NN
 SHGEMM_SMALL_K_B0_NN = ../generic/gemm_small_matrix_kernel_nn.c
 endif
 ifndef SHGEMM_SMALL_K_B0_NT
 SHGEMM_SMALL_K_B0_NT = ../generic/gemm_small_matrix_kernel_nt.c
 endif
 ifndef SHGEMM_SMALL_K_B0_TN
 SHGEMM_SMALL_K_B0_TN = ../generic/gemm_small_matrix_kernel_tn.c
 endif
 ifndef SHGEMM_SMALL_K_B0_TT
 SHGEMM_SMALL_K_B0_TT = ../generic/gemm_small_matrix_kernel_tt.c
 endif
 $(KDIR)shgemm_small_kernel_b0_nn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_NN)
 	$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
 $(KDIR)shgemm_small_kernel_b0_nt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_NT)
 	$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
 $(KDIR)shgemm_small_kernel_b0_tn$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_TN)
 	$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
 $(KDIR)shgemm_small_kernel_b0_tt$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_SMALL_K_B0_TT)
 	$(CC) $(CFLAGS) -c -DHFLOAT16 -UDOUBLE -UCOMPLEX -DB0 $< -o $@
 endif
 ifndef CGEMM_SMALL_M_PERMIT
 CGEMM_SMALL_M_PERMIT = ../generic/zgemm_small_matrix_permit.c
 endif
--- a/kernel/riscv64/KERNEL.RISCV64_ZVL128B
+++ b/kernel/riscv64/KERNEL.RISCV64_ZVL128B
@@ -245,3 +245,12 @@ endif
 ifndef ZGEMM_BETA
 ZGEMM_BETA = zgemm_beta_rvv.c
 endif
 # SHGEMM (HFLOAT16) kernel selection for this target.
 # The kernel source name is built from SHGEMM_UNROLL_M x SHGEMM_UNROLL_N with
 # a _zvl128b suffix; the ON/OT copy routines reuse the generic ncopy/tcopy
 # sources sized by SHGEMM_UNROLL_N.
 # NOTE(review): no SHGEMMINCOPY/SHGEMMITCOPY are set in this block, so it
 # presumably relies on SHGEMM_UNROLL_M == SHGEMM_UNROLL_N - confirm.
 SHGEMMKERNEL    =  shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl128b.c
 SHGEMMONCOPY    =  ../generic/gemm_ncopy_$(SHGEMM_UNROLL_N).c
 SHGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SHGEMM_UNROLL_N).c
 SHGEMMONCOPYOBJ =  shgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SHGEMMOTCOPYOBJ =  shgemm_otcopy$(TSUFFIX).$(SUFFIX)
 # SHGEMM_BETA defaults to gemm_beta_rvv.c unless already defined.
 ifndef SHGEMM_BETA
 SHGEMM_BETA =  gemm_beta_rvv.c
 endif
--- a/kernel/riscv64/KERNEL.RISCV64_ZVL256B
+++ b/kernel/riscv64/KERNEL.RISCV64_ZVL256B
@@ -207,3 +207,19 @@ COMATCOPY_CN = zomatcopy_cn_vector.c
 DOMATCOPY_CN = omatcopy_cn_vector.c
 SOMATCOPY_CN = omatcopy_cn_vector.c
 # SHGEMM (HFLOAT16) kernel selection for this target.
 # The kernel source name is built from SHGEMM_UNROLL_M x SHGEMM_UNROLL_N with
 # a _zvl256b suffix.
 SHGEMMKERNEL    =  shgemm_kernel_$(SHGEMM_UNROLL_M)x$(SHGEMM_UNROLL_N)_zvl256b.c
 # IN/IT copy routines (generic ncopy/tcopy sized by SHGEMM_UNROLL_M) are only
 # defined when the two unroll factors differ.
 ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
 SHGEMMINCOPY    =  ../generic/gemm_ncopy_$(SHGEMM_UNROLL_M).c
 SHGEMMITCOPY    =  ../generic/gemm_tcopy_$(SHGEMM_UNROLL_M).c
 SHGEMMINCOPYOBJ =  shgemm_incopy$(TSUFFIX).$(SUFFIX)
 SHGEMMITCOPYOBJ =  shgemm_itcopy$(TSUFFIX).$(SUFFIX)
 endif
 # ON/OT copy routines always use the generic sources sized by SHGEMM_UNROLL_N.
 SHGEMMONCOPY    =  ../generic/gemm_ncopy_$(SHGEMM_UNROLL_N).c
 SHGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SHGEMM_UNROLL_N).c
 SHGEMMONCOPYOBJ =  shgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SHGEMMOTCOPYOBJ =  shgemm_otcopy$(TSUFFIX).$(SUFFIX)
 # SHGEMM_BETA defaults to gemm_beta_rvv.c unless already defined.
 ifndef SHGEMM_BETA
 SHGEMM_BETA =  gemm_beta_rvv.c
 endif
--- a/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c
+++ b/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c
--- a/kernel/riscv64/shgemm_kernel_8x8_zvl128b.c
+++ b/kernel/riscv64/shgemm_kernel_8x8_zvl128b.c
@@ -1,5 +1,6 @@
 #include "common.h"
 #include <riscv_vector.h>
 int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc)
 {
@@ -14,7 +15,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
        for (BLASLONG i=0; i<M/8; i+=1) {
            BLASLONG ai=m_top*K;	
            BLASLONG bi=n_top*K;	
            BLASLONG bi=n_top*K;
            _Float16 B0 = B[bi+0];
            _Float16 B1 = B[bi+1];
@@ -50,17 +51,17 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
                bi += 8;
                A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
                ai += 16;
                ai += 8;
                result0 = __riscv_vfwmacc_vf_f32m2(result0, A0, B0, gvl);
                result1 = __riscv_vfwmacc_vf_f32m2(result1, A0, B1, gvl);
                result2 = __riscv_vfwmacc_vf_f32m2(result2, A0, B2, gvl);
                result3 = __riscv_vfwmacc_vf_f32m2(result3, A0, B3, gvl);
                result4 = __riscv_vfwmacc_vf_f32m2(result4, A0, B4, gvl);
                result5 = __riscv_vfwmacc_vf_f32m2(result5, A0, B5, gvl);
                result6 = __riscv_vfwmacc_vf_f32m2(result6, A0, B6, gvl);
                result7 = __riscv_vfwmacc_vf_f32m2(result7, A0, B7, gvl);
                result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
                result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
                result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
                result3 = __riscv_vfwmacc_vf_f32m2(result3, B3, A0, gvl);
                result4 = __riscv_vfwmacc_vf_f32m2(result4, B4, A0, gvl);
                result5 = __riscv_vfwmacc_vf_f32m2(result5, B5, A0, gvl);
                result6 = __riscv_vfwmacc_vf_f32m2(result6, B6, A0, gvl);
                result7 = __riscv_vfwmacc_vf_f32m2(result7, B7, A0, gvl);
            }
@@ -86,14 +87,14 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            ci = n_top * ldc + m_top;
            __riscv_vse16_v_f16m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
            __riscv_vse16_v_f16m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
            __riscv_vse16_v_f16m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
            __riscv_vse16_v_f16m1( &C[ci], c3, gvl); ci += ldc-gvl*0;
            __riscv_vse16_v_f16m1( &C[ci], c4, gvl); ci += ldc-gvl*0;
            __riscv_vse16_v_f16m1( &C[ci], c5, gvl); ci += ldc-gvl*0;
            __riscv_vse16_v_f16m1( &C[ci], c6, gvl); ci += ldc-gvl*0;
            __riscv_vse16_v_f16m1( &C[ci], c7, gvl); ci += ldc-gvl*0;
            __riscv_vse32_v_f32m2( &C[ci], c0, gvl); ci += ldc-gvl*0;
            __riscv_vse32_v_f32m2( &C[ci], c1, gvl); ci += ldc-gvl*0;
            __riscv_vse32_v_f32m2( &C[ci], c2, gvl); ci += ldc-gvl*0;
            __riscv_vse32_v_f32m2( &C[ci], c3, gvl); ci += ldc-gvl*0;
            __riscv_vse32_v_f32m2( &C[ci], c4, gvl); ci += ldc-gvl*0;
            __riscv_vse32_v_f32m2( &C[ci], c5, gvl); ci += ldc-gvl*0;
            __riscv_vse32_v_f32m2( &C[ci], c6, gvl); ci += ldc-gvl*0;
            __riscv_vse32_v_f32m2( &C[ci], c7, gvl); ci += ldc-gvl*0;
            m_top += 8;
        }
@@ -332,10 +333,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
                A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
                ai += 8;
                result0 = __riscv_vfwmacc_vf_f32m2(result0, A0, B0, gvl);
                result1 = __riscv_vfwmacc_vf_f32m2(result1, A0, B1, gvl);
                result2 = __riscv_vfwmacc_vf_f32m2(result2, A0, B2, gvl);
                result3 = __riscv_vfwmacc_vf_f32m2(result3, A0, B3, gvl);
                result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
                result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
                result2 = __riscv_vfwmacc_vf_f32m2(result2, B2, A0, gvl);
                result3 = __riscv_vfwmacc_vf_f32m2(result3, B3, A0, gvl);
            }
@@ -353,10 +354,10 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            ci = n_top * ldc + m_top;
            __riscv_vse16_v_f16m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
            __riscv_vse16_v_f16m1( &C[ci], c1, gvl); ci += ldc-gvl*0;
            __riscv_vse16_v_f16m1( &C[ci], c2, gvl); ci += ldc-gvl*0;
            __riscv_vse16_v_f16m1( &C[ci], c3, gvl);
            __riscv_vse32_v_f32m2( &C[ci], c0, gvl); ci += ldc-gvl*0;
            __riscv_vse32_v_f32m2( &C[ci], c1, gvl); ci += ldc-gvl*0;
            __riscv_vse32_v_f32m2( &C[ci], c2, gvl); ci += ldc-gvl*0;
            __riscv_vse32_v_f32m2( &C[ci], c3, gvl);
            m_top += 8;
        }
@@ -521,8 +522,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
                A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
                ai += 8;
                result0 = __riscv_vfwmacc_vf_f32m2(result0, A0, B0, gvl);
                result1 = __riscv_vfwmacc_vf_f32m2(result1, A0, B1, gvl);
                result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
                result1 = __riscv_vfwmacc_vf_f32m2(result1, B1, A0, gvl);
            }
@@ -536,8 +537,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            ci = n_top * ldc + m_top;
            __riscv_vse16_v_f16m1( &C[ci], c0, gvl); ci += ldc-gvl*0;
            __riscv_vse16_v_f16m1( &C[ci], c1, gvl); 
            __riscv_vse32_v_f32m2( &C[ci], c0, gvl); ci += ldc-gvl*0;
            __riscv_vse32_v_f32m2( &C[ci], c1, gvl); 
            m_top += 8;
        }
@@ -604,7 +605,6 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
                bi+=2;
            }
            BLASLONG ci=n_top*ldc+m_top;
            C[ci + 0 * ldc + 0] += alpha * result0;
            C[ci + 0 * ldc + 1] += alpha * result1;
@@ -665,7 +665,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
                A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl );
                ai += 8;
                result0 = __riscv_vfwmacc_vf_f32m2(result0, A0, B0, gvl);
                result0 = __riscv_vfwmacc_vf_f32m2(result0, B0, A0, gvl);
            }
@@ -677,7 +677,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B,
            ci = n_top * ldc + m_top;
            __riscv_vse16_v_f16m1( &C[ci], c0, gvl);
            __riscv_vse32_v_f32m2( &C[ci], c0, gvl);
            m_top += 8;
        }
--- a/kernel/setparam-ref.c
+++ b/kernel/setparam-ref.c
@@ -125,6 +125,23 @@ gotoblas_t TABLE_NAME = {
 #endif
 #endif
 /* HFLOAT16 (shgemm) entries of the table initializer, emitted only when
  * BUILD_HFLOAT16 is defined: three zero slots, the M/N/MN unroll factors,
  * the kernel and beta routines, then two pairs of copy routines.
  * NOTE(review): the three zeros are presumably the shgemm p/q/r slots that
  * init_parameter() fills in later - confirm against the gotoblas_t layout. */
 #ifdef BUILD_HFLOAT16
  0, 0, 0,
  SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N,
 /* Use the explicit MN unroll if one is defined, otherwise the larger of M and N. */
 #ifdef SHGEMM_DEFAULT_UNROLL_MN
 SHGEMM_DEFAULT_UNROLL_MN,
 #else
 MAX(SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N),
 #endif
  shgemm_kernelTS, shgemm_betaTS,
 /* When M and N unroll factors are equal, the on/ot copy routines are reused
  * in place of the in/it ones. */
 #if SHGEMM_DEFAULT_UNROLL_M != SHGEMM_DEFAULT_UNROLL_N
  shgemm_incopyTS, shgemm_itcopyTS,
 #else
  shgemm_oncopyTS, shgemm_otcopyTS,
 #endif
  shgemm_oncopyTS, shgemm_otcopyTS,
 #endif
 #if ( BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1)
  0, 0, 0,
  SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N,
@@ -1252,6 +1269,9 @@ static void init_parameter(void) {
 #ifdef BUILD_BFLOAT16
  TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
 #endif
 /* HFLOAT16 builds: seed shgemm_p from the compile-time default. */
 #ifdef BUILD_HFLOAT16
  TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P;
 #endif
  TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
  TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
@@ -1260,6 +1280,9 @@ static void init_parameter(void) {
 #ifdef BUILD_BFLOAT16
  TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
 #endif
 /* HFLOAT16 builds: seed shgemm_r from the compile-time default. */
 #ifdef BUILD_HFLOAT16
  TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R;
 #endif
  TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
  TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R;
@@ -1269,6 +1292,9 @@ static void init_parameter(void) {
 #ifdef BUILD_BFLOAT16
  TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
 #endif
 /* HFLOAT16 builds: seed shgemm_q from the compile-time default. */
 #ifdef BUILD_HFLOAT16
  TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q;
 #endif
  TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
  TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
@@ -1417,6 +1443,10 @@ static void init_parameter(void) {
  TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
  TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
 #endif
 /* HFLOAT16 builds: seed shgemm_p and shgemm_q from the compile-time defaults. */
 #ifdef BUILD_HFLOAT16
  TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P;
  TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q;
 #endif
 #if  (BUILD_SINGLE==1) || (BUILD_COMPLEX==1)
  TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
 #endif
@@ -2012,6 +2042,13 @@ static void init_parameter(void) {
 			       ) / (TABLE_NAME.sbgemm_q *  4) - 15) & ~15);
 #endif
 /* HFLOAT16 builds: derive shgemm_r from BUFFER_SIZE. The space taken by
  * shgemm_p * shgemm_q * 4 plus offsetA (adjusted with TABLE_NAME.align) is
  * subtracted from BUFFER_SIZE, the remainder is divided by shgemm_q * 4,
  * then 15 is subtracted and the low four bits are cleared.
  * NOTE(review): the factor 4 matches the neighbouring sbgemm/sgemm formulas;
  * confirm it is the intended element size for half-precision inputs. */
 #if BUILD_HFLOAT16==1
  TABLE_NAME.shgemm_r = (((BUFFER_SIZE -
 			       ((TABLE_NAME.shgemm_p * TABLE_NAME.shgemm_q *  4 + TABLE_NAME.offsetA
 				 + TABLE_NAME.align) & ~TABLE_NAME.align)
 			       ) / (TABLE_NAME.shgemm_q *  4) - 15) & ~15);
 #endif
 #if BUILD_SINGLE==1
  TABLE_NAME.sgemm_r = (((BUFFER_SIZE -
 			       ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q *  4 + TABLE_NAME.offsetA
--- a/lapack/CMakeLists.txt
+++ b/lapack/CMakeLists.txt
@@ -3,6 +3,7 @@ include_directories(${PROJECT_SOURCE_DIR})
 include_directories(${PROJECT_BINARY_DIR})
 list (REMOVE_ITEM FLOAT_TYPES "BFLOAT16")
 list (REMOVE_ITEM FLOAT_TYPES "HFLOAT16")
 set(LAPACK_SOURCES
  potrf/potrf_U_single.c
--- a/openblas_config_template.h
+++ b/openblas_config_template.h
@@ -39,7 +39,9 @@ typedef unsigned long BLASULONG;
 typedef uint16_t bfloat16;
 #endif
 #ifndef HFLOAT16
 #if defined(__GNUC__) && (__GNUC__ >= 12)
 typedef _Float16 hfloat16;
 #else
 #include <stdint.h>
 typedef uint16_t hfloat16;
 #endif
--- a/param.h
+++ b/param.h
@@ -74,6 +74,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SHGEMM_DEFAULT_UNROLL_N 8
 #define SHGEMM_DEFAULT_UNROLL_M 8
 #define SHGEMM_DEFAULT_UNROLL_MN 32
 #define SHGEMM_DEFAULT_P 128
 #define SHGEMM_DEFAULT_R 240
 #define SHGEMM_DEFAULT_Q 12288