From c167a3d6f41ed2f5680c2a72846b4c9b5d416543 Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Mon, 16 Apr 2018 12:34:43 -0700 Subject: [PATCH 001/121] Added RISCV build --- Makefile.riscv64 | 0 Makefile.system | 4 + c_check | 1 + common.h | 5 ++ common_riscv64.h | 93 +++++++++++++++++++ cpuid_riscv64.c | 111 +++++++++++++++++++++++ ctest.c | 4 + getarch.c | 19 ++++ kernel/riscv64/KERNEL | 149 +++++++++++++++++++++++++++++++ kernel/riscv64/amax.c | 75 ++++++++++++++++ kernel/riscv64/amin.c | 75 ++++++++++++++++ kernel/riscv64/asum.c | 67 ++++++++++++++ kernel/riscv64/axpby.c | 96 ++++++++++++++++++++ kernel/riscv64/axpy.c | 64 ++++++++++++++ kernel/riscv64/copy.c | 59 +++++++++++++ kernel/riscv64/dot.c | 64 ++++++++++++++ kernel/riscv64/gemv_n.c | 67 ++++++++++++++ kernel/riscv64/gemv_t.c | 68 ++++++++++++++ kernel/riscv64/iamax.c | 77 ++++++++++++++++ kernel/riscv64/iamin.c | 77 ++++++++++++++++ kernel/riscv64/imax.c | 69 +++++++++++++++ kernel/riscv64/imin.c | 67 ++++++++++++++ kernel/riscv64/izamax.c | 81 +++++++++++++++++ kernel/riscv64/izamin.c | 81 +++++++++++++++++ kernel/riscv64/max.c | 65 ++++++++++++++ kernel/riscv64/min.c | 65 ++++++++++++++ kernel/riscv64/nrm2.c | 88 ++++++++++++++++++ kernel/riscv64/omatcopy_cn.c | 90 +++++++++++++++++++ kernel/riscv64/omatcopy_ct.c | 89 +++++++++++++++++++ kernel/riscv64/omatcopy_rn.c | 90 +++++++++++++++++++ kernel/riscv64/omatcopy_rt.c | 62 +++++++++++++ kernel/riscv64/rot.c | 62 +++++++++++++ kernel/riscv64/scal.c | 63 +++++++++++++ kernel/riscv64/swap.c | 62 +++++++++++++ kernel/riscv64/symv_L.c | 70 +++++++++++++++ kernel/riscv64/symv_U.c | 71 +++++++++++++++ kernel/riscv64/zamax.c | 79 +++++++++++++++++ kernel/riscv64/zamin.c | 79 +++++++++++++++++ kernel/riscv64/zasum.c | 72 +++++++++++++++ kernel/riscv64/zaxpby.c | 118 +++++++++++++++++++++++++ kernel/riscv64/zaxpy.c | 74 ++++++++++++++++ kernel/riscv64/zcopy.c | 65 ++++++++++++++ kernel/riscv64/zdot.c | 80 +++++++++++++++++ kernel/riscv64/zgemv_n.c | 157 
+++++++++++++++++++++++++++++++++ kernel/riscv64/zgemv_t.c | 140 +++++++++++++++++++++++++++++ kernel/riscv64/znrm2.c | 106 ++++++++++++++++++++++ kernel/riscv64/zomatcopy_cn.c | 70 +++++++++++++++ kernel/riscv64/zomatcopy_cnc.c | 69 +++++++++++++++ kernel/riscv64/zomatcopy_ct.c | 71 +++++++++++++++ kernel/riscv64/zomatcopy_ctc.c | 71 +++++++++++++++ kernel/riscv64/zomatcopy_rn.c | 70 +++++++++++++++ kernel/riscv64/zomatcopy_rnc.c | 69 +++++++++++++++ kernel/riscv64/zomatcopy_rt.c | 72 +++++++++++++++ kernel/riscv64/zomatcopy_rtc.c | 72 +++++++++++++++ kernel/riscv64/zrot.c | 70 +++++++++++++++ kernel/riscv64/zscal.c | 88 ++++++++++++++++++ kernel/riscv64/zswap.c | 72 +++++++++++++++ lapack/laswp/riscv64/Makefile | 13 +++ param.h | 39 ++++++++ 59 files changed, 4166 insertions(+) create mode 100644 Makefile.riscv64 create mode 100644 common_riscv64.h create mode 100644 cpuid_riscv64.c create mode 100644 kernel/riscv64/KERNEL create mode 100644 kernel/riscv64/amax.c create mode 100644 kernel/riscv64/amin.c create mode 100644 kernel/riscv64/asum.c create mode 100644 kernel/riscv64/axpby.c create mode 100644 kernel/riscv64/axpy.c create mode 100644 kernel/riscv64/copy.c create mode 100644 kernel/riscv64/dot.c create mode 100644 kernel/riscv64/gemv_n.c create mode 100644 kernel/riscv64/gemv_t.c create mode 100644 kernel/riscv64/iamax.c create mode 100644 kernel/riscv64/iamin.c create mode 100644 kernel/riscv64/imax.c create mode 100644 kernel/riscv64/imin.c create mode 100644 kernel/riscv64/izamax.c create mode 100644 kernel/riscv64/izamin.c create mode 100644 kernel/riscv64/max.c create mode 100644 kernel/riscv64/min.c create mode 100644 kernel/riscv64/nrm2.c create mode 100644 kernel/riscv64/omatcopy_cn.c create mode 100644 kernel/riscv64/omatcopy_ct.c create mode 100644 kernel/riscv64/omatcopy_rn.c create mode 100644 kernel/riscv64/omatcopy_rt.c create mode 100644 kernel/riscv64/rot.c create mode 100644 kernel/riscv64/scal.c create mode 100644 kernel/riscv64/swap.c 
create mode 100644 kernel/riscv64/symv_L.c create mode 100644 kernel/riscv64/symv_U.c create mode 100644 kernel/riscv64/zamax.c create mode 100644 kernel/riscv64/zamin.c create mode 100644 kernel/riscv64/zasum.c create mode 100644 kernel/riscv64/zaxpby.c create mode 100644 kernel/riscv64/zaxpy.c create mode 100644 kernel/riscv64/zcopy.c create mode 100644 kernel/riscv64/zdot.c create mode 100644 kernel/riscv64/zgemv_n.c create mode 100644 kernel/riscv64/zgemv_t.c create mode 100644 kernel/riscv64/znrm2.c create mode 100644 kernel/riscv64/zomatcopy_cn.c create mode 100644 kernel/riscv64/zomatcopy_cnc.c create mode 100644 kernel/riscv64/zomatcopy_ct.c create mode 100644 kernel/riscv64/zomatcopy_ctc.c create mode 100644 kernel/riscv64/zomatcopy_rn.c create mode 100644 kernel/riscv64/zomatcopy_rnc.c create mode 100644 kernel/riscv64/zomatcopy_rt.c create mode 100644 kernel/riscv64/zomatcopy_rtc.c create mode 100644 kernel/riscv64/zrot.c create mode 100644 kernel/riscv64/zscal.c create mode 100644 kernel/riscv64/zswap.c create mode 100644 lapack/laswp/riscv64/Makefile diff --git a/Makefile.riscv64 b/Makefile.riscv64 new file mode 100644 index 000000000..e69de29bb diff --git a/Makefile.system b/Makefile.system index 142cb420f..02d392d9c 100644 --- a/Makefile.system +++ b/Makefile.system @@ -593,7 +593,9 @@ endif ifndef BINARY_DEFINED ifneq ($(OSNAME), AIX) ifdef BINARY64 +ifneq ($(ARCH), riscv64) CCOMMON_OPT += -m64 +endif else CCOMMON_OPT += -m32 endif @@ -687,8 +689,10 @@ endif else ifdef BINARY64 ifneq ($(OSNAME), AIX) +ifneq ($(ARCH), riscv64) FCOMMON_OPT += -m64 endif +endif ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -fdefault-integer-8 diff --git a/c_check b/c_check index a3b337602..c564855f3 100644 --- a/c_check +++ b/c_check @@ -76,6 +76,7 @@ $architecture = ia64 if ($data =~ /ARCH_IA64/); $architecture = arm if ($data =~ /ARCH_ARM/); $architecture = arm64 if ($data =~ /ARCH_ARM64/); $architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture 
= riscv64 if ($data =~ /ARCH_RISCV64/); $defined = 0; diff --git a/common.h b/common.h index 5a599a5af..3d23d9ee6 100644 --- a/common.h +++ b/common.h @@ -408,6 +408,11 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips.h" #endif + +#ifdef ARCH_RISCV64 +#include "common_riscv64.h" +#endif + #ifdef ARCH_MIPS64 #include "common_mips64.h" #endif diff --git a/common_riscv64.h b/common_riscv64.h new file mode 100644 index 000000000..fe4e0a6d3 --- /dev/null +++ b/common_riscv64.h @@ -0,0 +1,93 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_RISCV64 +#define COMMON_RISCV64 + +#define MB __sync_synchronize() +#define WMB __sync_synchronize() + +#define INLINE inline + +#ifndef ASSEMBLER + + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#endif + + + +#define BUFFER_SIZE ( 32 << 20) +#define SEEK_ADDRESS + +#endif diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c new file mode 100644 index 000000000..129ed11b0 --- /dev/null +++ b/cpuid_riscv64.c @@ -0,0 +1,111 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define CPU_UNKNOWN 0 + +static char *cpuname[] = { + "UNKNOWN", +}; + +int detect(void){ + return CPU_UNKNOWN; +} + +char *get_corename(void){ + return cpuname[detect()]; +} + +void get_architecture(void){ + printf("RISCV64"); +} + +void get_subarchitecture(void){ +} + +void get_subdirname(void){ + printf("riscv64"); +} + +void get_cpuconfig(void){ + printf("#define UNKNOWN\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 524288\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); +} + +void get_libname(void){ + printf("riscv64\n"); +} diff --git a/ctest.c b/ctest.c index 00be423d1..cab939887 100644 --- a/ctest.c +++ b/ctest.c @@ -149,3 +149,7 @@ ARCH_ARM ARCH_ARM64 #endif +#if defined(__riscv) +ARCH_RISCV64 +#endif + diff --git a/getarch.c b/getarch.c index 992fc2b95..7f7fd97c4 100644 --- a/getarch.c +++ b/getarch.c @@ -604,6 +604,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif + +#ifdef FORCE_PPCG4 #define FORCE #define ARCHITECTURE "POWER" @@ -859,6 +860,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_RISCV64 +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64" +#define CORENAME "RISCV64" +#else +#endif + +#ifdef FORCE_CORTEXA15 #define FORCE #define ARCHITECTURE "ARM" @@ -1051,6 +1066,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define OPENBLAS_SUPPORTED #endif +#ifdef __riscv +#include "cpuid_riscv64.c" +#endif + #ifdef __arm__ #include "cpuid_arm.c" #define OPENBLAS_SUPPORTED diff --git a/kernel/riscv64/KERNEL b/kernel/riscv64/KERNEL new file mode 100644 index 000000000..7d854ced6 --- /dev/null +++ b/kernel/riscv64/KERNEL @@ -0,0 +1,149 @@ +SAMAXKERNEL = ../riscv64/amax.c +DAMAXKERNEL = ../riscv64/amax.c +CAMAXKERNEL = ../riscv64/zamax.c +ZAMAXKERNEL = ../riscv64/zamax.c + +SAMINKERNEL = ../riscv64/amin.c +DAMINKERNEL = ../riscv64/amin.c +CAMINKERNEL = ../riscv64/zamin.c +ZAMINKERNEL = ../riscv64/zamin.c + +SMAXKERNEL = ../riscv64/max.c +DMAXKERNEL = ../riscv64/max.c + +SMINKERNEL = ../riscv64/min.c +DMINKERNEL = ../riscv64/min.c + +ISAMAXKERNEL = ../riscv64/iamax.c +IDAMAXKERNEL = ../riscv64/iamax.c +ICAMAXKERNEL = ../riscv64/izamax.c +IZAMAXKERNEL = ../riscv64/izamax.c + +ISAMINKERNEL = ../riscv64/iamin.c +IDAMINKERNEL = ../riscv64/iamin.c +ICAMINKERNEL = ../riscv64/izamin.c +IZAMINKERNEL = ../riscv64/izamin.c + +ISMAXKERNEL = ../riscv64/imax.c +IDMAXKERNEL = ../riscv64/imax.c + +ISMINKERNEL = ../riscv64/imin.c +IDMINKERNEL = ../riscv64/imin.c + +SASUMKERNEL = ../riscv64/asum.c +DASUMKERNEL = ../riscv64/asum.c +CASUMKERNEL = ../riscv64/zasum.c +ZASUMKERNEL = ../riscv64/zasum.c + +SAXPYKERNEL = ../riscv64/axpy.c +DAXPYKERNEL = ../riscv64/axpy.c +CAXPYKERNEL = ../riscv64/zaxpy.c +ZAXPYKERNEL = ../riscv64/zaxpy.c + +SCOPYKERNEL = ../riscv64/copy.c +DCOPYKERNEL = ../riscv64/copy.c +CCOPYKERNEL = ../riscv64/zcopy.c +ZCOPYKERNEL = ../riscv64/zcopy.c + +SDOTKERNEL = ../riscv64/dot.c +DDOTKERNEL = ../riscv64/dot.c +CDOTKERNEL = ../riscv64/zdot.c +ZDOTKERNEL = ../riscv64/zdot.c + +SNRM2KERNEL = ../riscv64/nrm2.c +DNRM2KERNEL = ../riscv64/nrm2.c +CNRM2KERNEL = ../riscv64/znrm2.c +ZNRM2KERNEL = ../riscv64/znrm2.c + +SROTKERNEL = ../riscv64/rot.c +DROTKERNEL = ../riscv64/rot.c +CROTKERNEL = ../riscv64/zrot.c +ZROTKERNEL = ../riscv64/zrot.c + +SSCALKERNEL = ../riscv64/scal.c +DSCALKERNEL = 
../riscv64/scal.c +CSCALKERNEL = ../riscv64/zscal.c +ZSCALKERNEL = ../riscv64/zscal.c + +SSWAPKERNEL = ../riscv64/swap.c +DSWAPKERNEL = ../riscv64/swap.c +CSWAPKERNEL = ../riscv64/zswap.c +ZSWAPKERNEL = ../riscv64/zswap.c + +SGEMVNKERNEL = ../riscv64/gemv_n.c +DGEMVNKERNEL = ../riscv64/gemv_n.c +CGEMVNKERNEL = ../riscv64/zgemv_n.c +ZGEMVNKERNEL = ../riscv64/zgemv_n.c + +SGEMVTKERNEL = ../riscv64/gemv_t.c +DGEMVTKERNEL = ../riscv64/gemv_t.c +CGEMVTKERNEL = ../riscv64/zgemv_t.c +ZGEMVTKERNEL = ../riscv64/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT 
= ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif diff --git a/kernel/riscv64/amax.c b/kernel/riscv64/amax.c new file mode 100644 index 000000000..792e68bd9 --- /dev/null +++ b/kernel/riscv64/amax.c @@ -0,0 +1,75 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/amin.c b/kernel/riscv64/amin.c new file mode 100644 index 000000000..78495a8e3 --- /dev/null +++ b/kernel/riscv64/amin.c @@ -0,0 +1,75 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/asum.c b/kernel/riscv64/asum.c new file mode 100644 index 000000000..b284ae3fc --- /dev/null +++ b/kernel/riscv64/asum.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + return(sumf); +} + + diff --git a/kernel/riscv64/axpby.c b/kernel/riscv64/axpby.c new file mode 100644 index 000000000..278747f75 --- /dev/null +++ b/kernel/riscv64/axpby.c @@ -0,0 +1,96 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT alpha, FLOAT *x, BLASLONG inc_x, FLOAT beta, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + + ix = 0; + iy = 0; + + if ( beta == 0.0 ) + { + + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = 0.0 ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + + + } + + } + else + { + + if ( alpha == 0.0 ) + { + while(i < n) + { + y[iy] = beta * y[iy] ; + iy += inc_y ; + i++ ; + } + } + else + { + while(i < n) + { + y[iy] = alpha * x[ix] + beta * y[iy] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + } + } + + } + + return(0); + +} + + diff --git a/kernel/riscv64/axpy.c b/kernel/riscv64/axpy.c new file mode 100644 index 000000000..fb1094dd9 --- /dev/null +++ b/kernel/riscv64/axpy.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + if ( da == 0.0 ) return(0); + + ix = 0; + iy = 0; + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/copy.c b/kernel/riscv64/copy.c new file mode 100644 index 000000000..7b4f04f30 --- /dev/null +++ b/kernel/riscv64/copy.c @@ -0,0 +1,59 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n < 0 ) return(0); + + while(i < n) + { + + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/dot.c b/kernel/riscv64/dot.c new file mode 100644 index 000000000..46a84ad18 --- /dev/null +++ b/kernel/riscv64/dot.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +#if defined(DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + double dot = 0.0 ; + + if ( n < 0 ) return(dot); + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/riscv64/gemv_n.c b/kernel/riscv64/gemv_n.c new file mode 100644 index 000000000..ef61b245b --- /dev/null +++ b/kernel/riscv64/gemv_n.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/************************************************************************************** + * * 2013/09/14 Saar + * * BLASTEST float : OK + * * BLASTEST double : OK + * CTEST : OK + * TEST : OK + * * + * **************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + max = i; + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/iamin.c b/kernel/riscv64/iamin.c new file mode 100644 index 000000000..155292bd5 --- /dev/null +++ b/kernel/riscv64/iamin.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < ABS(minf) ) + { + min = i; + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/riscv64/imax.c b/kernel/riscv64/imax.c new file mode 100644 index 000000000..5072dd16e --- /dev/null +++ b/kernel/riscv64/imax.c @@ -0,0 +1,69 @@ 
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + max = i; + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/imin.c b/kernel/riscv64/imin.c new file mode 100644 index 000000000..598cba387 --- /dev/null +++ b/kernel/riscv64/imin.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/************************************************************************************** +* 2013/08/19 Saar +* BLASTEST float +* BLASTEST double +* +**************************************************************************************/ + +#include "common.h" +#include + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > minf ) + { + min = i; + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/riscv64/izamax.c b/kernel/riscv64/izamax.c new file mode 100644 index 000000000..8fe33e95b --- /dev/null +++ b/kernel/riscv64/izamax.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/izamin.c b/kernel/riscv64/izamin.c new file mode 100644 index 000000000..fb5a0d4cb --- /dev/null +++ b/kernel/riscv64/izamin.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(min+1); +} + + diff --git a/kernel/riscv64/max.c b/kernel/riscv64/max.c new file mode 100644 index 000000000..2ad956bc0 --- /dev/null +++ b/kernel/riscv64/max.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/min.c b/kernel/riscv64/min.c new file mode 100644 index 000000000..2812fe397 --- /dev/null +++ b/kernel/riscv64/min.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/nrm2.c b/kernel/riscv64/nrm2.c new file mode 100644 index 000000000..fcff09337 --- /dev/null +++ b/kernel/riscv64/nrm2.c @@ -0,0 +1,88 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/13 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + FLOAT absxi = 0.0; + + + if (n <= 0 || inc_x <= 0) return(0.0); + if ( n == 1 ) return( ABS(x[0]) ); + + n *= inc_x; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + absxi = ABS( x[i] ); + if ( scale < absxi ) + { + ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); + scale = absxi ; + } + else + { + ssq += ( absxi/scale ) * ( absxi/scale ); + } + + } + i += inc_x; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/riscv64/omatcopy_cn.c b/kernel/riscv64/omatcopy_cn.c new file mode 100644 index 000000000..4d11b9125 --- /dev/null +++ b/kernel/riscv64/omatcopy_cn.c @@ -0,0 +1,90 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +/***************************************************** + * 2014/06/09 Saar + * + * Order ColMajor + * No Trans + * +******************************************************/ + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i,j; + FLOAT *aptr,*bptr; + + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + + aptr = a; + bptr = b; + + if ( alpha == 0.0 ) + { + for ( i=0; i + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n < 0 ) return(0); + + while(i < n) + { + + temp = x[ix] ; + x[ix] = y[iy] ; + y[iy] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/symv_L.c b/kernel/riscv64/symv_L.c new file mode 100644 index 
000000000..8f48d03f5 --- /dev/null +++ b/kernel/riscv64/symv_L.c @@ -0,0 +1,70 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG jx,jy; + BLASLONG j; + FLOAT temp1; + FLOAT temp2; + +#if 0 + if ( m != offset ) + printf("Symv_L: m=%d offset=%d\n",m,offset); +#endif + + jx = 0; + jy = 0; + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/zamin.c b/kernel/riscv64/zamin.c new file mode 100644 index 000000000..02eab3e75 --- /dev/null +++ b/kernel/riscv64/zamin.c @@ -0,0 +1,79 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/zasum.c b/kernel/riscv64/zasum.c new file mode 100644 index 000000000..61e85cae6 --- /dev/null +++ b/kernel/riscv64/zasum.c @@ -0,0 +1,72 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CABS1(x,i); + i += inc_x2; + } + return(sumf); +} + + diff --git a/kernel/riscv64/zaxpby.c b/kernel/riscv64/zaxpby.c new file mode 100644 index 000000000..445354416 --- /dev/null +++ b/kernel/riscv64/zaxpby.c @@ -0,0 +1,118 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/*************************************************************************** +* 2014/06/07 Saar +* +***************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FLOAT beta_r, FLOAT beta_i,FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix,iy; + FLOAT temp; + BLASLONG inc_x2, inc_y2; + + if ( n <= 0 ) return(0); + + ix = 0; + iy = 0; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + if ( beta_r == 0.0 && beta_i == 0.0) + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + y[iy] = 0.0 ; + y[iy+1] = 0.0 ; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { + y[iy] = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + } + else + { + if ( alpha_r == 0.0 && alpha_i == 0.0 ) + { + + while(i < n) + { + temp = ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + iy += inc_y2 ; + i++ ; + } + + } + else + { + + while(i < n) + { 
+ temp = ( alpha_r * x[ix] - alpha_i * x[ix+1] ) + ( beta_r * y[iy] - beta_i * y[iy+1] ) ; + y[iy+1] = ( alpha_r * x[ix+1] + alpha_i * x[ix] ) + ( beta_r * y[iy+1] + beta_i * y[iy] ) ; + y[iy] = temp; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + + } + + + + } + return(0); + +} + + diff --git a/kernel/riscv64/zaxpy.c b/kernel/riscv64/zaxpy.c new file mode 100644 index 000000000..1dcaeac27 --- /dev/null +++ b/kernel/riscv64/zaxpy.c @@ -0,0 +1,74 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/15 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + if ( da_r == 0.0 && da_i == 0.0 ) return(0); + + ix = 0; + iy = 0; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/zcopy.c b/kernel/riscv64/zcopy.c new file mode 100644 index 000000000..07fe584c5 --- /dev/null +++ b/kernel/riscv64/zcopy.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2; + iy += inc_y2; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/zdot.c b/kernel/riscv64/zdot.c new file mode 100644 index 000000000..733c235c6 --- /dev/null +++ b/kernel/riscv64/zdot.c @@ -0,0 +1,80 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : FAIL +* BLASTEST double : FAIL +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) + +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT dot[2]; + OPENBLAS_COMPLEX_FLOAT result; + BLASLONG inc_x2; + BLASLONG inc_y2; + + dot[0]=0.0; + dot[1]=0.0; + + CREAL(result) = 0.0 ; + CIMAG(result) = 0.0 ; + + if ( n < 1 ) return(result); + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { +#if !defined(CONJ) + dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; + dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; +#else + dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; + dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + CREAL(result) = dot[0]; + CIMAG(result) = dot[1]; + return(result); + +} + + diff --git a/kernel/riscv64/zgemv_n.c b/kernel/riscv64/zgemv_n.c new file mode 100644 index 000000000..b9b03f792 --- /dev/null +++ b/kernel/riscv64/zgemv_n.c @@ -0,0 +1,157 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** + * * 2013/11/23 Saar + * * BLASTEST float : OK + * * BLASTEST double : OK + * CTEST : OK + * TEST : OK + * * + * **************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp_r,temp_i; + BLASLONG inc_x2,inc_y2; + BLASLONG lda2; + BLASLONG i2; + + lda2 = 2*lda; + + ix = 0; + a_ptr = a; + + if ( inc_x == 1 && inc_y == 1 ) + { + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + BLASLONG inc_x2; + FLOAT temp; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + temp = ABS( x[i] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + if ( x[i+1] != 0.0 ) + { + temp = ABS( x[i+1] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + + i += inc_x2; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/riscv64/zomatcopy_cn.c b/kernel/riscv64/zomatcopy_cn.c new file mode 100644 index 000000000..f5a7a6284 --- /dev/null +++ b/kernel/riscv64/zomatcopy_cn.c @@ -0,0 +1,70 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights 
reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +/***************************************************** + * 2014/06/09 Saar + * + * Order ColMajor + * No Trans + * +******************************************************/ + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i,j,ia; + FLOAT *aptr,*bptr; + + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + + aptr = a; + bptr = b; + + lda *= 2; + ldb *= 2; + + for ( i=0; i + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/lapack/laswp/riscv64/Makefile b/lapack/laswp/riscv64/Makefile new file mode 100644 index 000000000..75411deb5 --- /dev/null +++ b/lapack/laswp/riscv64/Makefile @@ -0,0 +1,13 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile + diff --git a/param.h b/param.h index 189cdc4a0..52675bc25 100644 --- a/param.h +++ b/param.h @@ -2343,6 +2343,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 #endif +#ifdef RISCV64 +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif + #ifdef ARMV7 #define SNUMOPT 2 #define DNUMOPT 2 From 0ee395db35ee824aff77d4d2b812aaedb111addd Mon Sep 17 00:00:00 2001 From: Jerry Zhao Date: Wed, 18 Apr 2018 18:03:32 -0700 Subject: [PATCH 002/121] Fixed TRMM and SYMM for RISCV --- kernel/Makefile.L3 | 4 ++++ kernel/riscv64/KERNEL | 10 ++++++++++ param.h | 8 ++++---- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 4284fbfa0..63e09a56d 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -20,6 +20,10 @@ ifeq ($(ARCH), arm64) USE_TRMM = 1 endif +ifeq ($(ARCH), riscv64) +USE_TRMM = 1 +endif + ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif diff --git a/kernel/riscv64/KERNEL b/kernel/riscv64/KERNEL index 7d854ced6..04d82b4ce 100644 --- a/kernel/riscv64/KERNEL +++ b/kernel/riscv64/KERNEL @@ -129,6 +129,16 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = 
../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c + + LSAME_KERNEL = ../generic/lsame.c SCABS_KERNEL = ../generic/cabs.c diff --git a/param.h b/param.h index 52675bc25..22d837960 100644 --- a/param.h +++ b/param.h @@ -2348,11 +2348,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 4 -#define SGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 -#define DGEMM_DEFAULT_UNROLL_M 4 -#define DGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 From db17ce896fbbf53cbef34f81e1f1ec6887965ec4 Mon Sep 17 00:00:00 2001 From: Dumi Loghin Date: Wed, 5 Sep 2018 12:49:37 +0800 Subject: [PATCH 003/121] replace ARCH with AR in lapack-netlib --- Makefile | 4 +-- c_check | 4 +++ lapack-netlib/BLAS/SRC/Makefile | 10 +++--- lapack-netlib/CBLAS/src/Makefile | 32 +++++++++---------- lapack-netlib/DOCS/lawn81.tex | 2 +- lapack-netlib/INSTALL/make.inc.ALPHA | 4 +-- lapack-netlib/INSTALL/make.inc.HPPA | 4 +-- lapack-netlib/INSTALL/make.inc.IRIX64 | 4 +-- lapack-netlib/INSTALL/make.inc.O2K | 4 +-- lapack-netlib/INSTALL/make.inc.SGI5 | 4 +-- lapack-netlib/INSTALL/make.inc.SUN4 | 4 +-- lapack-netlib/INSTALL/make.inc.SUN4SOL2 | 4 +-- lapack-netlib/INSTALL/make.inc.XLF | 4 +-- lapack-netlib/INSTALL/make.inc.gfortran | 4 +-- lapack-netlib/INSTALL/make.inc.gfortran_debug | 4 +-- lapack-netlib/INSTALL/make.inc.ifort | 4 +-- lapack-netlib/INSTALL/make.inc.pgf95 | 4 +-- lapack-netlib/INSTALL/make.inc.pghpf | 4 +-- lapack-netlib/LAPACKE/src/Makefile | 10 +++--- lapack-netlib/LAPACKE/utils/Makefile | 2 +- lapack-netlib/SRC/Makefile | 10 +++--- lapack-netlib/SRC/VARIANTS/Makefile | 12 +++---- 
lapack-netlib/TESTING/MATGEN/Makefile | 10 +++--- lapack-netlib/make.inc.example | 4 +-- make.inc | 2 +- 25 files changed, 79 insertions(+), 75 deletions(-) diff --git a/Makefile b/Makefile index c0e5fbcf8..aaeb0c498 100644 --- a/Makefile +++ b/Makefile @@ -237,8 +237,8 @@ ifndef NOFORTRAN -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/c_check b/c_check index c564855f3..f86a37b5b 100644 --- a/c_check +++ b/c_check @@ -121,6 +121,10 @@ if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) { $binary =32; } +if ($architecture eq "riscv64") { + $defined = 1; +} + if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); diff --git a/lapack-netlib/BLAS/SRC/Makefile b/lapack-netlib/BLAS/SRC/Makefile index a436365aa..f7236318b 100644 --- a/lapack-netlib/BLAS/SRC/Makefile +++ b/lapack-netlib/BLAS/SRC/Makefile @@ -138,23 +138,23 @@ ALLOBJ = $(SBLAS1) $(SBLAS2) $(SBLAS3) $(DBLAS1) $(DBLAS2) $(DBLAS3) \ $(ZBLAS2) $(ZBLAS3) $(ALLBLAS) $(BLASLIB): $(ALLOBJ) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ single: $(SBLAS1) $(ALLBLAS) $(SBLAS2) $(SBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ + $(AR) $(ARFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) double: $(DBLAS1) $(ALLBLAS) $(DBLAS2) $(DBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ + $(AR) $(ARFLAGS) $(BLASLIB) $^ $(RANLIB) 
$(BLASLIB) complex: $(CBLAS1) $(CB1AUX) $(ALLBLAS) $(CBLAS2) $(CBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ + $(AR) $(ARFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) complex16: $(ZBLAS1) $(ZB1AUX) $(ALLBLAS) $(ZBLAS2) $(ZBLAS3) - $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ + $(AR) $(ARFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) FRC: diff --git a/lapack-netlib/CBLAS/src/Makefile b/lapack-netlib/CBLAS/src/Makefile index 6c0518ac7..9b9063d8d 100644 --- a/lapack-netlib/CBLAS/src/Makefile +++ b/lapack-netlib/CBLAS/src/Makefile @@ -45,22 +45,22 @@ sclev1 = cblas_scasum.o scasumsub.o cblas_scnrm2.o scnrm2sub.o # Single precision real slib1: $(slev1) $(sclev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision real dlib1: $(dlev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Single precision complex clib1: $(clev1) $(sclev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision complex zlib1: $(zlev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # @@ -97,22 +97,22 @@ zlev2 = cblas_zgemv.o cblas_zgbmv.o cblas_zhemv.o cblas_zhbmv.o cblas_zhpmv.o \ # Single precision real slib2: $(slev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision real dlib2: $(dlev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Single precision complex clib2: $(clev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision complex zlib2: $(zlev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # @@ -143,22 +143,22 @@ zlev3 = cblas_zgemm.o cblas_zsymm.o cblas_zhemm.o cblas_zherk.o \ # Single precision real slib3: $(slev3) $(errhand) - $(ARCH) 
$(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision real dlib3: $(dlev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Single precision complex clib3: $(clev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision complex zlib3: $(zlev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) @@ -168,22 +168,22 @@ alev3 = $(slev3) $(dlev3) $(clev3) $(zlev3) # All level 1 all1: $(alev1) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # All level 2 all2: $(alev2) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # All level 3 all3: $(alev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ + $(AR) $(ARFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # All levels and precisions $(CBLASLIB): $(alev1) $(alev2) $(alev3) $(errhand) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ FRC: diff --git a/lapack-netlib/DOCS/lawn81.tex b/lapack-netlib/DOCS/lawn81.tex index 291735299..01c7c39e2 100644 --- a/lapack-netlib/DOCS/lawn81.tex +++ b/lapack-netlib/DOCS/lawn81.tex @@ -466,7 +466,7 @@ TIMER = EXT_ETIME Refer to the section~\ref{second} to get more information. -Next, you will need to modify \texttt{ARCH}, \texttt{ARCHFLAGS}, and \texttt{RANLIB} to specify archiver, +Next, you will need to modify \texttt{AR}, \texttt{ARFLAGS}, and \texttt{RANLIB} to specify archiver, archiver options, and ranlib for your machine. 
If your architecture does not require \texttt{ranlib} to be run after each archive command (as is the case with CRAY computers running UNICOS, Hewlett Packard diff --git a/lapack-netlib/INSTALL/make.inc.ALPHA b/lapack-netlib/INSTALL/make.inc.ALPHA index 0ceeaa155..049cf0b13 100644 --- a/lapack-netlib/INSTALL/make.inc.ALPHA +++ b/lapack-netlib/INSTALL/make.inc.ALPHA @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.HPPA b/lapack-netlib/INSTALL/make.inc.HPPA index 8eabbbdf4..2bd8ee16e 100644 --- a/lapack-netlib/INSTALL/make.inc.HPPA +++ b/lapack-netlib/INSTALL/make.inc.HPPA @@ -29,8 +29,8 @@ LOADOPTS = -Aa +U77 # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.IRIX64 b/lapack-netlib/INSTALL/make.inc.IRIX64 index d9e71e1bf..0f57941b5 100644 --- a/lapack-netlib/INSTALL/make.inc.IRIX64 +++ b/lapack-netlib/INSTALL/make.inc.IRIX64 @@ -32,8 +32,8 @@ LOADOPTS = -O3 -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.O2K b/lapack-netlib/INSTALL/make.inc.O2K index 3ffcadacc..d99beca41 100644 --- a/lapack-netlib/INSTALL/make.inc.O2K +++ b/lapack-netlib/INSTALL/make.inc.O2K @@ -32,8 +32,8 @@ LOADOPTS = -O3 -64 -mips4 -r10000 # The archiver and the flag(s) to use when building an archive # (library). 
If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.SGI5 b/lapack-netlib/INSTALL/make.inc.SGI5 index c7019ac16..c4a702d48 100644 --- a/lapack-netlib/INSTALL/make.inc.SGI5 +++ b/lapack-netlib/INSTALL/make.inc.SGI5 @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.SUN4 b/lapack-netlib/INSTALL/make.inc.SUN4 index 4e44f1beb..6a78e9576 100644 --- a/lapack-netlib/INSTALL/make.inc.SUN4 +++ b/lapack-netlib/INSTALL/make.inc.SUN4 @@ -29,8 +29,8 @@ LOADOPTS = -dalign -O4 -fast # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.SUN4SOL2 b/lapack-netlib/INSTALL/make.inc.SUN4SOL2 index e6d79add3..0ac3cc4e4 100644 --- a/lapack-netlib/INSTALL/make.inc.SUN4SOL2 +++ b/lapack-netlib/INSTALL/make.inc.SUN4SOL2 @@ -33,8 +33,8 @@ LOADOPTS = -f -dalign -native -xO2 -xarch=v8plusa # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.XLF b/lapack-netlib/INSTALL/make.inc.XLF index 9466ee332..27e22cac9 100644 --- a/lapack-netlib/INSTALL/make.inc.XLF +++ b/lapack-netlib/INSTALL/make.inc.XLF @@ -30,8 +30,8 @@ LOADOPTS = -qnosave # The archiver and the flag(s) to use when building an archive # (library). 
If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.gfortran b/lapack-netlib/INSTALL/make.inc.gfortran index 39d98d4d4..b342b18a8 100644 --- a/lapack-netlib/INSTALL/make.inc.gfortran +++ b/lapack-netlib/INSTALL/make.inc.gfortran @@ -33,8 +33,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.gfortran_debug b/lapack-netlib/INSTALL/make.inc.gfortran_debug index 10e6381df..1eaed2102 100644 --- a/lapack-netlib/INSTALL/make.inc.gfortran_debug +++ b/lapack-netlib/INSTALL/make.inc.gfortran_debug @@ -33,8 +33,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.ifort b/lapack-netlib/INSTALL/make.inc.ifort index b067bd484..a3c37428e 100644 --- a/lapack-netlib/INSTALL/make.inc.ifort +++ b/lapack-netlib/INSTALL/make.inc.ifort @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.pgf95 b/lapack-netlib/INSTALL/make.inc.pgf95 index a9a5cec98..931ff378f 100644 --- a/lapack-netlib/INSTALL/make.inc.pgf95 +++ b/lapack-netlib/INSTALL/make.inc.pgf95 @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. 
# -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.pghpf b/lapack-netlib/INSTALL/make.inc.pghpf index 1d9bf549c..0dfe8c683 100644 --- a/lapack-netlib/INSTALL/make.inc.pghpf +++ b/lapack-netlib/INSTALL/make.inc.pghpf @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 44884d4a5..03c140bf7 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -2455,16 +2455,16 @@ endif all: ../../$(LAPACKELIB) ../../$(LAPACKELIB): $(OBJ_A) $(OBJ_B) $(DEPRECATED) $(EXTENDED) $(MATGEN) - $(ARCH) $(ARCHFLAGS) $@ $(OBJ_A) - $(ARCH) $(ARCHFLAGS) $@ $(OBJ_B) + $(AR) $(ARFLAGS) $@ $(OBJ_A) + $(AR) $(ARFLAGS) $@ $(OBJ_B) ifdef BUILD_DEPRECATED - $(ARCH) $(ARCHFLAGS) $@ $(DEPRECATED) + $(AR) $(ARFLAGS) $@ $(DEPRECATED) endif ifdef (USEXBLAS) - $(ARCH) $(ARCHFLAGS) $@ $(EXTENDED) + $(AR) $(ARFLAGS) $@ $(EXTENDED) endif ifdef LAPACKE_WITH_TMG - $(ARCH) $(ARCHFLAGS) $@ $(MATGEN) + $(AR) $(ARFLAGS) $@ $(MATGEN) endif $(RANLIB) $@ diff --git a/lapack-netlib/LAPACKE/utils/Makefile b/lapack-netlib/LAPACKE/utils/Makefile index 1f639c6ea..c6204ee3b 100644 --- a/lapack-netlib/LAPACKE/utils/Makefile +++ b/lapack-netlib/LAPACKE/utils/Makefile @@ -186,7 +186,7 @@ OBJ = lapacke_cgb_nancheck.o \ all: lib lib: $(OBJ) - $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $^ + $(AR) $(ARFLAGS) ../../$(LAPACKELIB) $^ $(RANLIB) ../../$(LAPACKELIB) clean: cleanobj diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 531cb51fc..e5bb7a3db 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -553,26 +553,26 @@ endif all: ../$(LAPACKLIB) 
../$(LAPACKLIB): $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) - $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) + $(AR) $(ARFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) $(RANLIB) $@ single: $(SLASRC) $(DSLASRC) $(SXLASRC) $(SCLAUX) $(ALLAUX) - $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \ + $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \ $(SXLASRC) $(SCLAUX) $(ALLAUX) $(RANLIB) ../$(LAPACKLIB) complex: $(CLASRC) $(ZCLASRC) $(CXLASRC) $(SCLAUX) $(ALLAUX) - $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(CLASRC) $(ZCLASRC) \ + $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(CLASRC) $(ZCLASRC) \ $(CXLASRC) $(SCLAUX) $(ALLAUX) $(RANLIB) ../$(LAPACKLIB) double: $(DLASRC) $(DSLASRC) $(DXLASRC) $(DZLAUX) $(ALLAUX) - $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(DLASRC) $(DSLASRC) \ + $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(DLASRC) $(DSLASRC) \ $(DXLASRC) $(DZLAUX) $(ALLAUX) $(RANLIB) ../$(LAPACKLIB) complex16: $(ZLASRC) $(ZCLASRC) $(ZXLASRC) $(DZLAUX) $(ALLAUX) - $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(ZLASRC) $(ZCLASRC) \ + $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(ZLASRC) $(ZCLASRC) \ $(ZXLASRC) $(DZLAUX) $(ALLAUX) $(RANLIB) ../$(LAPACKLIB) diff --git a/lapack-netlib/SRC/VARIANTS/Makefile b/lapack-netlib/SRC/VARIANTS/Makefile index 9f1410755..7d0e8824c 100644 --- a/lapack-netlib/SRC/VARIANTS/Makefile +++ b/lapack-netlib/SRC/VARIANTS/Makefile @@ -33,27 +33,27 @@ QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o qr/LL/sceil.o all: cholrl.a choltop.a lucr.a lull.a lurec.a qrll.a cholrl.a: $(CHOLRL) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ choltop.a: $(CHOLTOP) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ lucr.a: $(LUCR) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ lull.a: $(LULL) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ lurec.a: $(LUREC) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ qrll.a: $(QRLL) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ 
$(RANLIB) $@ clean: cleanobj cleanlib diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index e20004c2f..f5ea5a8c0 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ b/lapack-netlib/TESTING/MATGEN/Makefile @@ -58,23 +58,23 @@ ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \ $(DZATGEN) ../../$(TMGLIB): $(ALLOBJ) - $(ARCH) $(ARCHFLAGS) $@ $^ + $(AR) $(ARFLAGS) $@ $^ $(RANLIB) $@ single: $(SMATGEN) $(SCATGEN) - $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ + $(AR) $(ARFLAGS) ../../$(TMGLIB) $^ $(RANLIB) ../../$(TMGLIB) complex: $(CMATGEN) $(SCATGEN) - $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ + $(AR) $(ARFLAGS) ../../$(TMGLIB) $^ $(RANLIB) ../../$(TMGLIB) double: $(DMATGEN) $(DZATGEN) - $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ + $(AR) $(ARFLAGS) ../../$(TMGLIB) $^ $(RANLIB) ../../$(TMGLIB) complex16: $(ZMATGEN) $(DZATGEN) - $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ + $(AR) $(ARFLAGS) ../../$(TMGLIB) $^ $(RANLIB) ../../$(TMGLIB) $(SCATGEN): $(FRC) diff --git a/lapack-netlib/make.inc.example b/lapack-netlib/make.inc.example index d780c3a23..3ddb9eafc 100644 --- a/lapack-netlib/make.inc.example +++ b/lapack-netlib/make.inc.example @@ -33,8 +33,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -ARCH = ar -ARCHFLAGS = cr +AR = ar +ARFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/make.inc b/make.inc index b6ed098c0..93b355103 100644 --- a/make.inc +++ b/make.inc @@ -1,6 +1,6 @@ SHELL = /bin/sh PLAT = _LINUX DRVOPTS = $(NOOPT) -#ARCHFLAGS= $(ARFLAGS) -ru +#ARFLAGS= $(ARFLAGS) -ru #RANLIB = ranlib From 0b7ccb9e381d4bc3d0149c158631389c2c2d411c Mon Sep 17 00:00:00 2001 From: Dumi Loghin Date: Thu, 6 Sep 2018 13:08:30 +0800 Subject: [PATCH 004/121] Revert "replace ARCH with AR in lapack-netlib" This reverts commit db17ce896fbbf53cbef34f81e1f1ec6887965ec4. 
--- Makefile | 4 +-- c_check | 4 --- lapack-netlib/BLAS/SRC/Makefile | 10 +++--- lapack-netlib/CBLAS/src/Makefile | 32 +++++++++---------- lapack-netlib/DOCS/lawn81.tex | 2 +- lapack-netlib/INSTALL/make.inc.ALPHA | 4 +-- lapack-netlib/INSTALL/make.inc.HPPA | 4 +-- lapack-netlib/INSTALL/make.inc.IRIX64 | 4 +-- lapack-netlib/INSTALL/make.inc.O2K | 4 +-- lapack-netlib/INSTALL/make.inc.SGI5 | 4 +-- lapack-netlib/INSTALL/make.inc.SUN4 | 4 +-- lapack-netlib/INSTALL/make.inc.SUN4SOL2 | 4 +-- lapack-netlib/INSTALL/make.inc.XLF | 4 +-- lapack-netlib/INSTALL/make.inc.gfortran | 4 +-- lapack-netlib/INSTALL/make.inc.gfortran_debug | 4 +-- lapack-netlib/INSTALL/make.inc.ifort | 4 +-- lapack-netlib/INSTALL/make.inc.pgf95 | 4 +-- lapack-netlib/INSTALL/make.inc.pghpf | 4 +-- lapack-netlib/LAPACKE/src/Makefile | 10 +++--- lapack-netlib/LAPACKE/utils/Makefile | 2 +- lapack-netlib/SRC/Makefile | 10 +++--- lapack-netlib/SRC/VARIANTS/Makefile | 12 +++---- lapack-netlib/TESTING/MATGEN/Makefile | 10 +++--- lapack-netlib/make.inc.example | 4 +-- make.inc | 2 +- 25 files changed, 75 insertions(+), 79 deletions(-) diff --git a/Makefile b/Makefile index aaeb0c498..c0e5fbcf8 100644 --- a/Makefile +++ b/Makefile @@ -237,8 +237,8 @@ ifndef NOFORTRAN -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "ARFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "TMGLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/c_check b/c_check index f86a37b5b..c564855f3 100644 --- a/c_check +++ b/c_check @@ -121,10 
+121,6 @@ if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) { $binary =32; } -if ($architecture eq "riscv64") { - $defined = 1; -} - if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); diff --git a/lapack-netlib/BLAS/SRC/Makefile b/lapack-netlib/BLAS/SRC/Makefile index f7236318b..a436365aa 100644 --- a/lapack-netlib/BLAS/SRC/Makefile +++ b/lapack-netlib/BLAS/SRC/Makefile @@ -138,23 +138,23 @@ ALLOBJ = $(SBLAS1) $(SBLAS2) $(SBLAS3) $(DBLAS1) $(DBLAS2) $(DBLAS3) \ $(ZBLAS2) $(ZBLAS3) $(ALLBLAS) $(BLASLIB): $(ALLOBJ) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ single: $(SBLAS1) $(ALLBLAS) $(SBLAS2) $(SBLAS3) - $(AR) $(ARFLAGS) $(BLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) double: $(DBLAS1) $(ALLBLAS) $(DBLAS2) $(DBLAS3) - $(AR) $(ARFLAGS) $(BLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) complex: $(CBLAS1) $(CB1AUX) $(ALLBLAS) $(CBLAS2) $(CBLAS3) - $(AR) $(ARFLAGS) $(BLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) complex16: $(ZBLAS1) $(ZB1AUX) $(ALLBLAS) $(ZBLAS2) $(ZBLAS3) - $(AR) $(ARFLAGS) $(BLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(BLASLIB) $^ $(RANLIB) $(BLASLIB) FRC: diff --git a/lapack-netlib/CBLAS/src/Makefile b/lapack-netlib/CBLAS/src/Makefile index 9b9063d8d..6c0518ac7 100644 --- a/lapack-netlib/CBLAS/src/Makefile +++ b/lapack-netlib/CBLAS/src/Makefile @@ -45,22 +45,22 @@ sclev1 = cblas_scasum.o scasumsub.o cblas_scnrm2.o scnrm2sub.o # Single precision real slib1: $(slev1) $(sclev1) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision real dlib1: $(dlev1) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Single precision complex clib1: $(clev1) $(sclev1) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision complex 
zlib1: $(zlev1) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # @@ -97,22 +97,22 @@ zlev2 = cblas_zgemv.o cblas_zgbmv.o cblas_zhemv.o cblas_zhbmv.o cblas_zhpmv.o \ # Single precision real slib2: $(slev2) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision real dlib2: $(dlev2) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Single precision complex clib2: $(clev2) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision complex zlib2: $(zlev2) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # @@ -143,22 +143,22 @@ zlev3 = cblas_zgemm.o cblas_zsymm.o cblas_zhemm.o cblas_zherk.o \ # Single precision real slib3: $(slev3) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision real dlib3: $(dlev3) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Single precision complex clib3: $(clev3) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # Double precision complex zlib3: $(zlev3) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) @@ -168,22 +168,22 @@ alev3 = $(slev3) $(dlev3) $(clev3) $(zlev3) # All level 1 all1: $(alev1) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # All level 2 all2: $(alev2) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # All level 3 all3: $(alev3) $(errhand) - $(AR) $(ARFLAGS) $(CBLASLIB) $^ + $(ARCH) $(ARCHFLAGS) $(CBLASLIB) $^ $(RANLIB) $(CBLASLIB) # All levels and precisions $(CBLASLIB): $(alev1) $(alev2) $(alev3) 
$(errhand) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ FRC: diff --git a/lapack-netlib/DOCS/lawn81.tex b/lapack-netlib/DOCS/lawn81.tex index 01c7c39e2..291735299 100644 --- a/lapack-netlib/DOCS/lawn81.tex +++ b/lapack-netlib/DOCS/lawn81.tex @@ -466,7 +466,7 @@ TIMER = EXT_ETIME Refer to the section~\ref{second} to get more information. -Next, you will need to modify \texttt{AR}, \texttt{ARFLAGS}, and \texttt{RANLIB} to specify archiver, +Next, you will need to modify \texttt{ARCH}, \texttt{ARCHFLAGS}, and \texttt{RANLIB} to specify archiver, archiver options, and ranlib for your machine. If your architecture does not require \texttt{ranlib} to be run after each archive command (as is the case with CRAY computers running UNICOS, Hewlett Packard diff --git a/lapack-netlib/INSTALL/make.inc.ALPHA b/lapack-netlib/INSTALL/make.inc.ALPHA index 049cf0b13..0ceeaa155 100644 --- a/lapack-netlib/INSTALL/make.inc.ALPHA +++ b/lapack-netlib/INSTALL/make.inc.ALPHA @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.HPPA b/lapack-netlib/INSTALL/make.inc.HPPA index 2bd8ee16e..8eabbbdf4 100644 --- a/lapack-netlib/INSTALL/make.inc.HPPA +++ b/lapack-netlib/INSTALL/make.inc.HPPA @@ -29,8 +29,8 @@ LOADOPTS = -Aa +U77 # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. 
# -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.IRIX64 b/lapack-netlib/INSTALL/make.inc.IRIX64 index 0f57941b5..d9e71e1bf 100644 --- a/lapack-netlib/INSTALL/make.inc.IRIX64 +++ b/lapack-netlib/INSTALL/make.inc.IRIX64 @@ -32,8 +32,8 @@ LOADOPTS = -O3 -64 -mips4 -r10000 -OPT:IEEE_NaN_inf=ON # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.O2K b/lapack-netlib/INSTALL/make.inc.O2K index d99beca41..3ffcadacc 100644 --- a/lapack-netlib/INSTALL/make.inc.O2K +++ b/lapack-netlib/INSTALL/make.inc.O2K @@ -32,8 +32,8 @@ LOADOPTS = -O3 -64 -mips4 -r10000 # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.SGI5 b/lapack-netlib/INSTALL/make.inc.SGI5 index c4a702d48..c7019ac16 100644 --- a/lapack-netlib/INSTALL/make.inc.SGI5 +++ b/lapack-netlib/INSTALL/make.inc.SGI5 @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.SUN4 b/lapack-netlib/INSTALL/make.inc.SUN4 index 6a78e9576..4e44f1beb 100644 --- a/lapack-netlib/INSTALL/make.inc.SUN4 +++ b/lapack-netlib/INSTALL/make.inc.SUN4 @@ -29,8 +29,8 @@ LOADOPTS = -dalign -O4 -fast # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. 
# -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.SUN4SOL2 b/lapack-netlib/INSTALL/make.inc.SUN4SOL2 index 0ac3cc4e4..e6d79add3 100644 --- a/lapack-netlib/INSTALL/make.inc.SUN4SOL2 +++ b/lapack-netlib/INSTALL/make.inc.SUN4SOL2 @@ -33,8 +33,8 @@ LOADOPTS = -f -dalign -native -xO2 -xarch=v8plusa # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.XLF b/lapack-netlib/INSTALL/make.inc.XLF index 27e22cac9..9466ee332 100644 --- a/lapack-netlib/INSTALL/make.inc.XLF +++ b/lapack-netlib/INSTALL/make.inc.XLF @@ -30,8 +30,8 @@ LOADOPTS = -qnosave # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.gfortran b/lapack-netlib/INSTALL/make.inc.gfortran index b342b18a8..39d98d4d4 100644 --- a/lapack-netlib/INSTALL/make.inc.gfortran +++ b/lapack-netlib/INSTALL/make.inc.gfortran @@ -33,8 +33,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.gfortran_debug b/lapack-netlib/INSTALL/make.inc.gfortran_debug index 1eaed2102..10e6381df 100644 --- a/lapack-netlib/INSTALL/make.inc.gfortran_debug +++ b/lapack-netlib/INSTALL/make.inc.gfortran_debug @@ -33,8 +33,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. 
# -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.ifort b/lapack-netlib/INSTALL/make.inc.ifort index a3c37428e..b067bd484 100644 --- a/lapack-netlib/INSTALL/make.inc.ifort +++ b/lapack-netlib/INSTALL/make.inc.ifort @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.pgf95 b/lapack-netlib/INSTALL/make.inc.pgf95 index 931ff378f..a9a5cec98 100644 --- a/lapack-netlib/INSTALL/make.inc.pgf95 +++ b/lapack-netlib/INSTALL/make.inc.pgf95 @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/INSTALL/make.inc.pghpf b/lapack-netlib/INSTALL/make.inc.pghpf index 0dfe8c683..1d9bf549c 100644 --- a/lapack-netlib/INSTALL/make.inc.pghpf +++ b/lapack-netlib/INSTALL/make.inc.pghpf @@ -29,8 +29,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. 
# -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = echo # Timer for the SECOND and DSECND routines diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile index 03c140bf7..44884d4a5 100644 --- a/lapack-netlib/LAPACKE/src/Makefile +++ b/lapack-netlib/LAPACKE/src/Makefile @@ -2455,16 +2455,16 @@ endif all: ../../$(LAPACKELIB) ../../$(LAPACKELIB): $(OBJ_A) $(OBJ_B) $(DEPRECATED) $(EXTENDED) $(MATGEN) - $(AR) $(ARFLAGS) $@ $(OBJ_A) - $(AR) $(ARFLAGS) $@ $(OBJ_B) + $(ARCH) $(ARCHFLAGS) $@ $(OBJ_A) + $(ARCH) $(ARCHFLAGS) $@ $(OBJ_B) ifdef BUILD_DEPRECATED - $(AR) $(ARFLAGS) $@ $(DEPRECATED) + $(ARCH) $(ARCHFLAGS) $@ $(DEPRECATED) endif ifdef (USEXBLAS) - $(AR) $(ARFLAGS) $@ $(EXTENDED) + $(ARCH) $(ARCHFLAGS) $@ $(EXTENDED) endif ifdef LAPACKE_WITH_TMG - $(AR) $(ARFLAGS) $@ $(MATGEN) + $(ARCH) $(ARCHFLAGS) $@ $(MATGEN) endif $(RANLIB) $@ diff --git a/lapack-netlib/LAPACKE/utils/Makefile b/lapack-netlib/LAPACKE/utils/Makefile index c6204ee3b..1f639c6ea 100644 --- a/lapack-netlib/LAPACKE/utils/Makefile +++ b/lapack-netlib/LAPACKE/utils/Makefile @@ -186,7 +186,7 @@ OBJ = lapacke_cgb_nancheck.o \ all: lib lib: $(OBJ) - $(AR) $(ARFLAGS) ../../$(LAPACKELIB) $^ + $(ARCH) $(ARCHFLAGS) ../../$(LAPACKELIB) $^ $(RANLIB) ../../$(LAPACKELIB) clean: cleanobj diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index e5bb7a3db..531cb51fc 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -553,26 +553,26 @@ endif all: ../$(LAPACKLIB) ../$(LAPACKLIB): $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) - $(AR) $(ARFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) + $(ARCH) $(ARCHFLAGS) $@ $(ALLOBJ) $(ALLXOBJ) $(DEPRECATED) $(RANLIB) $@ single: $(SLASRC) $(DSLASRC) $(SXLASRC) $(SCLAUX) $(ALLAUX) - $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \ + $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(SLASRC) $(DSLASRC) \ $(SXLASRC) $(SCLAUX) $(ALLAUX) $(RANLIB) ../$(LAPACKLIB) complex: $(CLASRC) $(ZCLASRC) $(CXLASRC) $(SCLAUX) $(ALLAUX) - 
$(AR) $(ARFLAGS) ../$(LAPACKLIB) $(CLASRC) $(ZCLASRC) \ + $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(CLASRC) $(ZCLASRC) \ $(CXLASRC) $(SCLAUX) $(ALLAUX) $(RANLIB) ../$(LAPACKLIB) double: $(DLASRC) $(DSLASRC) $(DXLASRC) $(DZLAUX) $(ALLAUX) - $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(DLASRC) $(DSLASRC) \ + $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(DLASRC) $(DSLASRC) \ $(DXLASRC) $(DZLAUX) $(ALLAUX) $(RANLIB) ../$(LAPACKLIB) complex16: $(ZLASRC) $(ZCLASRC) $(ZXLASRC) $(DZLAUX) $(ALLAUX) - $(AR) $(ARFLAGS) ../$(LAPACKLIB) $(ZLASRC) $(ZCLASRC) \ + $(ARCH) $(ARCHFLAGS) ../$(LAPACKLIB) $(ZLASRC) $(ZCLASRC) \ $(ZXLASRC) $(DZLAUX) $(ALLAUX) $(RANLIB) ../$(LAPACKLIB) diff --git a/lapack-netlib/SRC/VARIANTS/Makefile b/lapack-netlib/SRC/VARIANTS/Makefile index 7d0e8824c..9f1410755 100644 --- a/lapack-netlib/SRC/VARIANTS/Makefile +++ b/lapack-netlib/SRC/VARIANTS/Makefile @@ -33,27 +33,27 @@ QRLL = qr/LL/cgeqrf.o qr/LL/dgeqrf.o qr/LL/sgeqrf.o qr/LL/zgeqrf.o qr/LL/sceil.o all: cholrl.a choltop.a lucr.a lull.a lurec.a qrll.a cholrl.a: $(CHOLRL) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ choltop.a: $(CHOLTOP) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ lucr.a: $(LUCR) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ lull.a: $(LULL) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ lurec.a: $(LUREC) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ qrll.a: $(QRLL) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ clean: cleanobj cleanlib diff --git a/lapack-netlib/TESTING/MATGEN/Makefile b/lapack-netlib/TESTING/MATGEN/Makefile index f5ea5a8c0..e20004c2f 100644 --- a/lapack-netlib/TESTING/MATGEN/Makefile +++ b/lapack-netlib/TESTING/MATGEN/Makefile @@ -58,23 +58,23 @@ ALLOBJ = $(SMATGEN) $(CMATGEN) $(SCATGEN) $(DMATGEN) $(ZMATGEN) \ $(DZATGEN) ../../$(TMGLIB): $(ALLOBJ) - $(AR) $(ARFLAGS) $@ $^ + $(ARCH) $(ARCHFLAGS) $@ $^ $(RANLIB) $@ single: $(SMATGEN) $(SCATGEN) - $(AR) $(ARFLAGS) 
../../$(TMGLIB) $^ + $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ $(RANLIB) ../../$(TMGLIB) complex: $(CMATGEN) $(SCATGEN) - $(AR) $(ARFLAGS) ../../$(TMGLIB) $^ + $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ $(RANLIB) ../../$(TMGLIB) double: $(DMATGEN) $(DZATGEN) - $(AR) $(ARFLAGS) ../../$(TMGLIB) $^ + $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ $(RANLIB) ../../$(TMGLIB) complex16: $(ZMATGEN) $(DZATGEN) - $(AR) $(ARFLAGS) ../../$(TMGLIB) $^ + $(ARCH) $(ARCHFLAGS) ../../$(TMGLIB) $^ $(RANLIB) ../../$(TMGLIB) $(SCATGEN): $(FRC) diff --git a/lapack-netlib/make.inc.example b/lapack-netlib/make.inc.example index 3ddb9eafc..d780c3a23 100644 --- a/lapack-netlib/make.inc.example +++ b/lapack-netlib/make.inc.example @@ -33,8 +33,8 @@ LOADOPTS = # The archiver and the flag(s) to use when building an archive # (library). If your system has no ranlib, set RANLIB = echo. # -AR = ar -ARFLAGS = cr +ARCH = ar +ARCHFLAGS = cr RANLIB = ranlib # Timer for the SECOND and DSECND routines diff --git a/make.inc b/make.inc index 93b355103..b6ed098c0 100644 --- a/make.inc +++ b/make.inc @@ -1,6 +1,6 @@ SHELL = /bin/sh PLAT = _LINUX DRVOPTS = $(NOOPT) -#ARFLAGS= $(ARFLAGS) -ru +#ARCHFLAGS= $(ARFLAGS) -ru #RANLIB = ranlib From a1bdc308b8d4dcb924f339ca5018c12a455d2652 Mon Sep 17 00:00:00 2001 From: Dumi Loghin Date: Thu, 6 Sep 2018 13:13:36 +0800 Subject: [PATCH 005/121] override ARCH (archiver) in lapack-netlib/make.inc --- Makefile | 2 +- c_check | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c0e5fbcf8..547feb0d2 100644 --- a/Makefile +++ b/Makefile @@ -237,7 +237,7 @@ ifndef NOFORTRAN -@echo "LOADOPTS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc - -@echo "ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc + -@echo "override ARCH = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "ARCHFLAGS = $(ARFLAGS) -ru" >> 
$(NETLIB_LAPACK_DIR)/make.inc -@echo "RANLIB = $(RANLIB)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "LAPACKLIB = ../$(LIBNAME)" >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/c_check b/c_check index c564855f3..eb302b71a 100644 --- a/c_check +++ b/c_check @@ -121,6 +121,11 @@ if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) { $binary =32; } +if ($architecture eq "riscv64") { + $defined = 1; + $binary = 64; +} + if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); From 44020a42a453579740fd16cd23e76f4267c41b65 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 27 Feb 2020 14:29:42 +0800 Subject: [PATCH 006/121] Fixed compile bug for RV64. --- kernel/riscv64/KERNEL | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/riscv64/KERNEL b/kernel/riscv64/KERNEL index 04d82b4ce..ea6a8cf21 100644 --- a/kernel/riscv64/KERNEL +++ b/kernel/riscv64/KERNEL @@ -35,6 +35,11 @@ DASUMKERNEL = ../riscv64/asum.c CASUMKERNEL = ../riscv64/zasum.c ZASUMKERNEL = ../riscv64/zasum.c +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + SAXPYKERNEL = ../riscv64/axpy.c DAXPYKERNEL = ../riscv64/axpy.c CAXPYKERNEL = ../riscv64/zaxpy.c From 265ab484c89d10dfff2d5df678221918d7a880e3 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 27 Feb 2020 14:46:15 +0800 Subject: [PATCH 007/121] Change default RISC-V 64-bit corename to RISCV64_GENERIC e.g. 
make CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran TARGET=RISCV64_GENERIC HOSTCC=gcc --- TargetList.txt | 3 + getarch.c | 10 +- kernel/riscv64/KERNEL | 162 +++---------------------- kernel/riscv64/KERNEL.RISCV64_GENERIC | 164 ++++++++++++++++++++++++++ param.h | 2 +- 5 files changed, 187 insertions(+), 154 deletions(-) create mode 100644 kernel/riscv64/KERNEL.RISCV64_GENERIC diff --git a/TargetList.txt b/TargetList.txt index 6a57bf1af..3b018e17a 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -97,3 +97,6 @@ TSV110 ZARCH_GENERIC Z13 Z14 + +10.RISC-V 64: +RISCV64_GENERIC diff --git a/getarch.c b/getarch.c index d0d260577..58706c452 100644 --- a/getarch.c +++ b/getarch.c @@ -906,17 +906,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif -#ifdef FORCE_RISCV64 +#ifdef FORCE_RISCV64_GENERIC #define FORCE #define ARCHITECTURE "RISCV64" -#define SUBARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_GENERIC" #define SUBDIRNAME "riscv64" -#define ARCHCONFIG "-DRISCV64 " \ +#define ARCHCONFIG "-DRISCV64_GENERIC " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "riscv64" -#define CORENAME "RISCV64" +#define LIBNAME "riscv64_generic" +#define CORENAME "RISCV64_GENERIC" #else #endif diff --git a/kernel/riscv64/KERNEL b/kernel/riscv64/KERNEL index ea6a8cf21..68d68b5f8 100644 --- a/kernel/riscv64/KERNEL +++ b/kernel/riscv64/KERNEL @@ -1,154 +1,18 @@ -SAMAXKERNEL = ../riscv64/amax.c -DAMAXKERNEL = ../riscv64/amax.c -CAMAXKERNEL = ../riscv64/zamax.c -ZAMAXKERNEL = ../riscv64/zamax.c - -SAMINKERNEL = ../riscv64/amin.c -DAMINKERNEL = ../riscv64/amin.c -CAMINKERNEL = ../riscv64/zamin.c -ZAMINKERNEL = ../riscv64/zamin.c - -SMAXKERNEL = ../riscv64/max.c -DMAXKERNEL = ../riscv64/max.c - -SMINKERNEL = ../riscv64/min.c -DMINKERNEL = ../riscv64/min.c - -ISAMAXKERNEL = ../riscv64/iamax.c 
-IDAMAXKERNEL = ../riscv64/iamax.c -ICAMAXKERNEL = ../riscv64/izamax.c -IZAMAXKERNEL = ../riscv64/izamax.c - -ISAMINKERNEL = ../riscv64/iamin.c -IDAMINKERNEL = ../riscv64/iamin.c -ICAMINKERNEL = ../riscv64/izamin.c -IZAMINKERNEL = ../riscv64/izamin.c - -ISMAXKERNEL = ../riscv64/imax.c -IDMAXKERNEL = ../riscv64/imax.c - -ISMINKERNEL = ../riscv64/imin.c -IDMINKERNEL = ../riscv64/imin.c - -SASUMKERNEL = ../riscv64/asum.c -DASUMKERNEL = ../riscv64/asum.c -CASUMKERNEL = ../riscv64/zasum.c -ZASUMKERNEL = ../riscv64/zasum.c - -SSUMKERNEL = ../arm/sum.c -DSUMKERNEL = ../arm/sum.c -CSUMKERNEL = ../arm/zsum.c -ZSUMKERNEL = ../arm/zsum.c - -SAXPYKERNEL = ../riscv64/axpy.c -DAXPYKERNEL = ../riscv64/axpy.c -CAXPYKERNEL = ../riscv64/zaxpy.c -ZAXPYKERNEL = ../riscv64/zaxpy.c - -SCOPYKERNEL = ../riscv64/copy.c -DCOPYKERNEL = ../riscv64/copy.c -CCOPYKERNEL = ../riscv64/zcopy.c -ZCOPYKERNEL = ../riscv64/zcopy.c - -SDOTKERNEL = ../riscv64/dot.c -DDOTKERNEL = ../riscv64/dot.c -CDOTKERNEL = ../riscv64/zdot.c -ZDOTKERNEL = ../riscv64/zdot.c - -SNRM2KERNEL = ../riscv64/nrm2.c -DNRM2KERNEL = ../riscv64/nrm2.c -CNRM2KERNEL = ../riscv64/znrm2.c -ZNRM2KERNEL = ../riscv64/znrm2.c - -SROTKERNEL = ../riscv64/rot.c -DROTKERNEL = ../riscv64/rot.c -CROTKERNEL = ../riscv64/zrot.c -ZROTKERNEL = ../riscv64/zrot.c - -SSCALKERNEL = ../riscv64/scal.c -DSCALKERNEL = ../riscv64/scal.c -CSCALKERNEL = ../riscv64/zscal.c -ZSCALKERNEL = ../riscv64/zscal.c - -SSWAPKERNEL = ../riscv64/swap.c -DSWAPKERNEL = ../riscv64/swap.c -CSWAPKERNEL = ../riscv64/zswap.c -ZSWAPKERNEL = ../riscv64/zswap.c - -SGEMVNKERNEL = ../riscv64/gemv_n.c -DGEMVNKERNEL = ../riscv64/gemv_n.c -CGEMVNKERNEL = ../riscv64/zgemv_n.c -ZGEMVNKERNEL = ../riscv64/zgemv_n.c - -SGEMVTKERNEL = ../riscv64/gemv_t.c -DGEMVTKERNEL = ../riscv64/gemv_t.c -CGEMVTKERNEL = ../riscv64/zgemv_t.c -ZGEMVTKERNEL = ../riscv64/zgemv_t.c - -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c 
-ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SSYMV_U_KERNEL = ../generic/symv_k.c -SSYMV_L_KERNEL = ../generic/symv_k.c -DSYMV_U_KERNEL = ../generic/symv_k.c -DSYMV_L_KERNEL = ../generic/symv_k.c -CSYMV_U_KERNEL = ../generic/zsymv_k.c -CSYMV_L_KERNEL = ../generic/zsymv_k.c -ZSYMV_U_KERNEL = ../generic/zsymv_k.c -ZSYMV_L_KERNEL = ../generic/zsymv_k.c - - -LSAME_KERNEL = ../generic/lsame.c - +ifndef SCABS_KERNEL SCABS_KERNEL = ../generic/cabs.c +endif + +ifndef 
DCABS_KERNEL DCABS_KERNEL = ../generic/cabs.c +endif + +ifndef QCABS_KERNEL QCABS_KERNEL = ../generic/cabs.c +endif + +ifndef LSAME_KERNEL +LSAME_KERNEL = ../generic/lsame.c +endif ifndef SGEMM_BETA SGEMM_BETA = ../generic/gemm_beta.c @@ -162,3 +26,5 @@ endif ifndef ZGEMM_BETA ZGEMM_BETA = ../generic/zgemm_beta.c endif + + diff --git a/kernel/riscv64/KERNEL.RISCV64_GENERIC b/kernel/riscv64/KERNEL.RISCV64_GENERIC new file mode 100644 index 000000000..ea6a8cf21 --- /dev/null +++ b/kernel/riscv64/KERNEL.RISCV64_GENERIC @@ -0,0 +1,164 @@ +SAMAXKERNEL = ../riscv64/amax.c +DAMAXKERNEL = ../riscv64/amax.c +CAMAXKERNEL = ../riscv64/zamax.c +ZAMAXKERNEL = ../riscv64/zamax.c + +SAMINKERNEL = ../riscv64/amin.c +DAMINKERNEL = ../riscv64/amin.c +CAMINKERNEL = ../riscv64/zamin.c +ZAMINKERNEL = ../riscv64/zamin.c + +SMAXKERNEL = ../riscv64/max.c +DMAXKERNEL = ../riscv64/max.c + +SMINKERNEL = ../riscv64/min.c +DMINKERNEL = ../riscv64/min.c + +ISAMAXKERNEL = ../riscv64/iamax.c +IDAMAXKERNEL = ../riscv64/iamax.c +ICAMAXKERNEL = ../riscv64/izamax.c +IZAMAXKERNEL = ../riscv64/izamax.c + +ISAMINKERNEL = ../riscv64/iamin.c +IDAMINKERNEL = ../riscv64/iamin.c +ICAMINKERNEL = ../riscv64/izamin.c +IZAMINKERNEL = ../riscv64/izamin.c + +ISMAXKERNEL = ../riscv64/imax.c +IDMAXKERNEL = ../riscv64/imax.c + +ISMINKERNEL = ../riscv64/imin.c +IDMINKERNEL = ../riscv64/imin.c + +SASUMKERNEL = ../riscv64/asum.c +DASUMKERNEL = ../riscv64/asum.c +CASUMKERNEL = ../riscv64/zasum.c +ZASUMKERNEL = ../riscv64/zasum.c + +SSUMKERNEL = ../arm/sum.c +DSUMKERNEL = ../arm/sum.c +CSUMKERNEL = ../arm/zsum.c +ZSUMKERNEL = ../arm/zsum.c + +SAXPYKERNEL = ../riscv64/axpy.c +DAXPYKERNEL = ../riscv64/axpy.c +CAXPYKERNEL = ../riscv64/zaxpy.c +ZAXPYKERNEL = ../riscv64/zaxpy.c + +SCOPYKERNEL = ../riscv64/copy.c +DCOPYKERNEL = ../riscv64/copy.c +CCOPYKERNEL = ../riscv64/zcopy.c +ZCOPYKERNEL = ../riscv64/zcopy.c + +SDOTKERNEL = ../riscv64/dot.c +DDOTKERNEL = ../riscv64/dot.c +CDOTKERNEL = ../riscv64/zdot.c +ZDOTKERNEL = 
../riscv64/zdot.c + +SNRM2KERNEL = ../riscv64/nrm2.c +DNRM2KERNEL = ../riscv64/nrm2.c +CNRM2KERNEL = ../riscv64/znrm2.c +ZNRM2KERNEL = ../riscv64/znrm2.c + +SROTKERNEL = ../riscv64/rot.c +DROTKERNEL = ../riscv64/rot.c +CROTKERNEL = ../riscv64/zrot.c +ZROTKERNEL = ../riscv64/zrot.c + +SSCALKERNEL = ../riscv64/scal.c +DSCALKERNEL = ../riscv64/scal.c +CSCALKERNEL = ../riscv64/zscal.c +ZSCALKERNEL = ../riscv64/zscal.c + +SSWAPKERNEL = ../riscv64/swap.c +DSWAPKERNEL = ../riscv64/swap.c +CSWAPKERNEL = ../riscv64/zswap.c +ZSWAPKERNEL = ../riscv64/zswap.c + +SGEMVNKERNEL = ../riscv64/gemv_n.c +DGEMVNKERNEL = ../riscv64/gemv_n.c +CGEMVNKERNEL = ../riscv64/zgemv_n.c +ZGEMVNKERNEL = ../riscv64/zgemv_n.c + +SGEMVTKERNEL = ../riscv64/gemv_t.c +DGEMVTKERNEL = ../riscv64/gemv_t.c +CGEMVTKERNEL = ../riscv64/zgemv_t.c +ZGEMVTKERNEL = ../riscv64/zgemv_t.c + +STRMMKERNEL = ../generic/trmmkernel_2x2.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +SGEMMKERNEL = ../generic/gemmkernel_2x2.c +SGEMMONCOPY = ../generic/gemm_ncopy_2.c +SGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + +DGEMMKERNEL = ../generic/gemmkernel_2x2.c +DGEMMONCOPY = ../generic/gemm_ncopy_2.c +DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ../generic/zgemmkernel_2x2.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + 
+DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SSYMV_U_KERNEL = ../generic/symv_k.c +SSYMV_L_KERNEL = ../generic/symv_k.c +DSYMV_U_KERNEL = ../generic/symv_k.c +DSYMV_L_KERNEL = ../generic/symv_k.c +CSYMV_U_KERNEL = ../generic/zsymv_k.c +CSYMV_L_KERNEL = ../generic/zsymv_k.c +ZSYMV_U_KERNEL = ../generic/zsymv_k.c +ZSYMV_L_KERNEL = ../generic/zsymv_k.c + + +LSAME_KERNEL = ../generic/lsame.c + +SCABS_KERNEL = ../generic/cabs.c +DCABS_KERNEL = ../generic/cabs.c +QCABS_KERNEL = ../generic/cabs.c + +ifndef SGEMM_BETA +SGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef DGEMM_BETA +DGEMM_BETA = ../generic/gemm_beta.c +endif +ifndef CGEMM_BETA +CGEMM_BETA = ../generic/zgemm_beta.c +endif +ifndef ZGEMM_BETA +ZGEMM_BETA = ../generic/zgemm_beta.c +endif diff --git a/param.h b/param.h index d42724a57..4a7765012 100644 --- a/param.h +++ b/param.h @@ -2509,7 +2509,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef RISCV64 +#ifdef RISCV64_GENERIC #define GEMM_DEFAULT_OFFSET_A 0 #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL From ef8e7d0279dfd1f9d9bec32b514a853d10bfdda7 Mon Sep 17 00:00:00 2001 From: damonyu Date: Thu, 15 Oct 2020 16:05:37 +0800 Subject: [PATCH 008/121] Add the support for RISC-V Vector. 
Change-Id: Iae7800a32f5af3903c330882cdf6f292d885f266 --- Makefile.prebuild | 4 + Makefile.riscv64 | 4 + Makefile.system | 5 +- TargetList.txt | 3 + c_check | 6 + common.h | 4 + common_riscv64.h | 98 + cpuid_riscv64.c | 113 ++ ctest.c | 4 + getarch.c | 33 + kernel/Makefile.L3 | 4 + kernel/generic/trmmkernel_16x4.c | 2092 ++++++++++++++++++++ kernel/generic/trmmkernel_8x4.c | 1317 +++++++++++++ kernel/generic/trmmkernel_8x8.c | 2207 ++++++++++++++++++++++ kernel/riscv64/KERNEL | 30 + kernel/riscv64/KERNEL.C910V | 190 ++ kernel/riscv64/KERNEL.RISCV64_GENERIC | 164 ++ kernel/riscv64/amax.c | 75 + kernel/riscv64/amax_vector.c | 245 +++ kernel/riscv64/amin.c | 75 + kernel/riscv64/amin_vector.c | 241 +++ kernel/riscv64/asum.c | 67 + kernel/riscv64/asum_vector.c | 131 ++ kernel/riscv64/axpby.c | 96 + kernel/riscv64/axpby_vector.c | 378 ++++ kernel/riscv64/axpy.c | 64 + kernel/riscv64/axpy_vector.c | 179 ++ kernel/riscv64/copy.c | 59 + kernel/riscv64/copy_vector.c | 148 ++ kernel/riscv64/dgemm_kernel_8x4_c910v.c | 977 ++++++++++ kernel/riscv64/dot.c | 64 + kernel/riscv64/dot_vector.c | 172 ++ kernel/riscv64/gemv_n.c | 67 + kernel/riscv64/gemv_n_vector.c | 146 ++ kernel/riscv64/gemv_t.c | 68 + kernel/riscv64/gemv_t_vector.c | 126 ++ kernel/riscv64/iamax.c | 77 + kernel/riscv64/iamax_vector.c | 191 ++ kernel/riscv64/iamin.c | 77 + kernel/riscv64/iamin_vector.c | 192 ++ kernel/riscv64/imax.c | 69 + kernel/riscv64/imax_vector.c | 176 ++ kernel/riscv64/imin.c | 67 + kernel/riscv64/imin_vector.c | 212 +++ kernel/riscv64/izamax.c | 81 + kernel/riscv64/izamax_vector.c | 246 +++ kernel/riscv64/izamin.c | 81 + kernel/riscv64/izamin_vector.c | 247 +++ kernel/riscv64/max.c | 65 + kernel/riscv64/max_vector.c | 116 ++ kernel/riscv64/min.c | 65 + kernel/riscv64/min_vector.c | 116 ++ kernel/riscv64/nrm2.c | 88 + kernel/riscv64/nrm2_vector.c | 220 +++ kernel/riscv64/nrm2_vector_dot.c | 128 ++ kernel/riscv64/omatcopy_cn.c | 90 + kernel/riscv64/omatcopy_ct.c | 89 + 
kernel/riscv64/omatcopy_rn.c | 90 + kernel/riscv64/omatcopy_rt.c | 62 + kernel/riscv64/rot.c | 62 + kernel/riscv64/rot_vector.c | 196 ++ kernel/riscv64/scal.c | 63 + kernel/riscv64/scal_vector.c | 133 ++ kernel/riscv64/sgemm_kernel_16x4_c910v.c | 1575 +++++++++++++++ kernel/riscv64/swap.c | 62 + kernel/riscv64/swap_vector.c | 173 ++ kernel/riscv64/symv_L.c | 70 + kernel/riscv64/symv_L_vector.c | 265 +++ kernel/riscv64/symv_U.c | 71 + kernel/riscv64/symv_U_vector.c | 264 +++ kernel/riscv64/zamax.c | 79 + kernel/riscv64/zamax_vector.c | 104 + kernel/riscv64/zamin.c | 79 + kernel/riscv64/zamin_vector.c | 104 + kernel/riscv64/zasum.c | 72 + kernel/riscv64/zasum_vector.c | 136 ++ kernel/riscv64/zaxpby.c | 118 ++ kernel/riscv64/zaxpby_vector.c | 197 ++ kernel/riscv64/zaxpy.c | 74 + kernel/riscv64/zaxpy_vector.c | 107 ++ kernel/riscv64/zcopy.c | 65 + kernel/riscv64/zcopy_vector.c | 92 + kernel/riscv64/zdot.c | 80 + kernel/riscv64/zdot_vector.c | 135 ++ kernel/riscv64/zgemv_n.c | 157 ++ kernel/riscv64/zgemv_n_vector.c | 175 ++ kernel/riscv64/zgemv_t.c | 140 ++ kernel/riscv64/zgemv_t_vector.c | 134 ++ kernel/riscv64/zhemv_LM_vector.c | 191 ++ kernel/riscv64/zhemv_UV_vector.c | 192 ++ kernel/riscv64/znrm2.c | 106 ++ kernel/riscv64/znrm2_vector.c | 278 +++ kernel/riscv64/zomatcopy_cn.c | 70 + kernel/riscv64/zomatcopy_cnc.c | 69 + kernel/riscv64/zomatcopy_ct.c | 71 + kernel/riscv64/zomatcopy_ctc.c | 71 + kernel/riscv64/zomatcopy_rn.c | 70 + kernel/riscv64/zomatcopy_rnc.c | 69 + kernel/riscv64/zomatcopy_rt.c | 72 + kernel/riscv64/zomatcopy_rtc.c | 72 + kernel/riscv64/zrot.c | 70 + kernel/riscv64/zrot_vector.c | 162 ++ kernel/riscv64/zscal.c | 88 + kernel/riscv64/zscal_vector.c | 152 ++ kernel/riscv64/zswap.c | 72 + kernel/riscv64/zswap_vector.c | 117 ++ lapack/laswp/riscv64/Makefile | 13 + param.h | 78 + test/Makefile | 6 + 109 files changed, 19571 insertions(+), 1 deletion(-) create mode 100644 Makefile.riscv64 create mode 100644 common_riscv64.h create mode 100644 
cpuid_riscv64.c create mode 100644 kernel/generic/trmmkernel_16x4.c create mode 100644 kernel/generic/trmmkernel_8x4.c create mode 100644 kernel/generic/trmmkernel_8x8.c create mode 100644 kernel/riscv64/KERNEL create mode 100644 kernel/riscv64/KERNEL.C910V create mode 100644 kernel/riscv64/KERNEL.RISCV64_GENERIC create mode 100644 kernel/riscv64/amax.c create mode 100644 kernel/riscv64/amax_vector.c create mode 100644 kernel/riscv64/amin.c create mode 100644 kernel/riscv64/amin_vector.c create mode 100644 kernel/riscv64/asum.c create mode 100644 kernel/riscv64/asum_vector.c create mode 100644 kernel/riscv64/axpby.c create mode 100644 kernel/riscv64/axpby_vector.c create mode 100644 kernel/riscv64/axpy.c create mode 100644 kernel/riscv64/axpy_vector.c create mode 100644 kernel/riscv64/copy.c create mode 100644 kernel/riscv64/copy_vector.c create mode 100644 kernel/riscv64/dgemm_kernel_8x4_c910v.c create mode 100644 kernel/riscv64/dot.c create mode 100644 kernel/riscv64/dot_vector.c create mode 100644 kernel/riscv64/gemv_n.c create mode 100644 kernel/riscv64/gemv_n_vector.c create mode 100644 kernel/riscv64/gemv_t.c create mode 100644 kernel/riscv64/gemv_t_vector.c create mode 100644 kernel/riscv64/iamax.c create mode 100644 kernel/riscv64/iamax_vector.c create mode 100644 kernel/riscv64/iamin.c create mode 100644 kernel/riscv64/iamin_vector.c create mode 100644 kernel/riscv64/imax.c create mode 100644 kernel/riscv64/imax_vector.c create mode 100644 kernel/riscv64/imin.c create mode 100644 kernel/riscv64/imin_vector.c create mode 100644 kernel/riscv64/izamax.c create mode 100644 kernel/riscv64/izamax_vector.c create mode 100644 kernel/riscv64/izamin.c create mode 100644 kernel/riscv64/izamin_vector.c create mode 100644 kernel/riscv64/max.c create mode 100644 kernel/riscv64/max_vector.c create mode 100644 kernel/riscv64/min.c create mode 100644 kernel/riscv64/min_vector.c create mode 100644 kernel/riscv64/nrm2.c create mode 100644 kernel/riscv64/nrm2_vector.c create 
mode 100644 kernel/riscv64/nrm2_vector_dot.c create mode 100644 kernel/riscv64/omatcopy_cn.c create mode 100644 kernel/riscv64/omatcopy_ct.c create mode 100644 kernel/riscv64/omatcopy_rn.c create mode 100644 kernel/riscv64/omatcopy_rt.c create mode 100644 kernel/riscv64/rot.c create mode 100644 kernel/riscv64/rot_vector.c create mode 100644 kernel/riscv64/scal.c create mode 100644 kernel/riscv64/scal_vector.c create mode 100644 kernel/riscv64/sgemm_kernel_16x4_c910v.c create mode 100644 kernel/riscv64/swap.c create mode 100644 kernel/riscv64/swap_vector.c create mode 100644 kernel/riscv64/symv_L.c create mode 100644 kernel/riscv64/symv_L_vector.c create mode 100644 kernel/riscv64/symv_U.c create mode 100644 kernel/riscv64/symv_U_vector.c create mode 100644 kernel/riscv64/zamax.c create mode 100644 kernel/riscv64/zamax_vector.c create mode 100644 kernel/riscv64/zamin.c create mode 100644 kernel/riscv64/zamin_vector.c create mode 100644 kernel/riscv64/zasum.c create mode 100644 kernel/riscv64/zasum_vector.c create mode 100644 kernel/riscv64/zaxpby.c create mode 100644 kernel/riscv64/zaxpby_vector.c create mode 100644 kernel/riscv64/zaxpy.c create mode 100644 kernel/riscv64/zaxpy_vector.c create mode 100644 kernel/riscv64/zcopy.c create mode 100644 kernel/riscv64/zcopy_vector.c create mode 100644 kernel/riscv64/zdot.c create mode 100644 kernel/riscv64/zdot_vector.c create mode 100644 kernel/riscv64/zgemv_n.c create mode 100644 kernel/riscv64/zgemv_n_vector.c create mode 100644 kernel/riscv64/zgemv_t.c create mode 100644 kernel/riscv64/zgemv_t_vector.c create mode 100644 kernel/riscv64/zhemv_LM_vector.c create mode 100644 kernel/riscv64/zhemv_UV_vector.c create mode 100644 kernel/riscv64/znrm2.c create mode 100644 kernel/riscv64/znrm2_vector.c create mode 100644 kernel/riscv64/zomatcopy_cn.c create mode 100644 kernel/riscv64/zomatcopy_cnc.c create mode 100644 kernel/riscv64/zomatcopy_ct.c create mode 100644 kernel/riscv64/zomatcopy_ctc.c create mode 100644 
kernel/riscv64/zomatcopy_rn.c create mode 100644 kernel/riscv64/zomatcopy_rnc.c create mode 100644 kernel/riscv64/zomatcopy_rt.c create mode 100644 kernel/riscv64/zomatcopy_rtc.c create mode 100644 kernel/riscv64/zrot.c create mode 100644 kernel/riscv64/zrot_vector.c create mode 100644 kernel/riscv64/zscal.c create mode 100644 kernel/riscv64/zscal_vector.c create mode 100644 kernel/riscv64/zswap.c create mode 100644 kernel/riscv64/zswap_vector.c create mode 100644 lapack/laswp/riscv64/Makefile diff --git a/Makefile.prebuild b/Makefile.prebuild index 48fb5e991..d6395da7b 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -41,6 +41,10 @@ ifeq ($(TARGET), I6500) TARGET_FLAGS = -mips64r6 endif +ifeq ($(TARGET), C910V) +TARGET_FLAGS = -march=rv64gcvxthead -mabi=lp64v +endif + all: getarch_2nd ./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 1 >> $(TARGET_CONF) diff --git a/Makefile.riscv64 b/Makefile.riscv64 new file mode 100644 index 000000000..15d7b059c --- /dev/null +++ b/Makefile.riscv64 @@ -0,0 +1,4 @@ +ifeq ($(CORE), C910V) +CCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v +FCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v -static +endif diff --git a/Makefile.system b/Makefile.system index 461f7370b..fe2aecd82 100644 --- a/Makefile.system +++ b/Makefile.system @@ -724,7 +724,10 @@ endif endif endif - +ifeq ($(ARCH), riscv64) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif # diff --git a/TargetList.txt b/TargetList.txt index 66eca4506..86177ebca 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -104,3 +104,6 @@ VORTEX ZARCH_GENERIC Z13 Z14 + +10.RISC-V 64: +RISCV64_GENERIC diff --git a/c_check b/c_check index 5ea93b75c..405963ae6 100644 --- a/c_check +++ b/c_check @@ -92,6 +92,7 @@ $architecture = ia64 if ($data =~ /ARCH_IA64/); $architecture = arm if ($data =~ /ARCH_ARM/); $architecture = arm64 if ($data =~ /ARCH_ARM64/); $architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); $defined = 0; @@ -136,6 +137,11 @@ 
if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) { $binary =32; } +if ($architecture eq "riscv64") { + $defined = 1; + $binary = 64; +} + if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); diff --git a/common.h b/common.h index a3ef99b59..faa75c447 100644 --- a/common.h +++ b/common.h @@ -437,6 +437,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips.h" #endif +#ifdef ARCH_RISCV64 +#include "common_riscv64.h" +#endif + #ifdef ARCH_MIPS64 #include "common_mips64.h" #endif diff --git a/common_riscv64.h b/common_riscv64.h new file mode 100644 index 000000000..49368c613 --- /dev/null +++ b/common_riscv64.h @@ -0,0 +1,98 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_RISCV64 +#define COMMON_RISCV64 + +#define MB __sync_synchronize() +#define WMB __sync_synchronize() +#define RMB __sync_synchronize() + +#define INLINE inline + +#ifndef ASSEMBLER + + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#endif + + + +#define BUFFER_SIZE ( 32 << 20) +#define SEEK_ADDRESS + +#if defined(C910V) +#include +#endif + +#endif diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c new file mode 100644 index 000000000..8a3209cb3 --- /dev/null +++ b/cpuid_riscv64.c @@ -0,0 +1,113 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define CPU_UNKNOWN 0 +#define CPU_C910V 1 + +static char *cpuname[] = { + "UNKOWN", + "C910V" +}; + +int detect(void){ + return CPU_UNKNOWN; +} + +char *get_corename(void){ + return cpuname[detect()]; +} + +void get_architecture(void){ + printf("RISCV64"); +} + +void get_subarchitecture(void){ +} + +void get_subdirname(void){ + printf("riscv64"); +} + +void get_cpuconfig(void){ + printf("#define UNKNOWN\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); +} + +void get_libname(void){ + printf("riscv64\n"); +} diff --git a/ctest.c b/ctest.c index cd84ab1bb..83a3b7d6c 100644 --- a/ctest.c +++ b/ctest.c @@ -153,6 +153,10 @@ ARCH_ARM ARCH_ARM64 #endif +#if defined(__riscv) +ARCH_RISCV64 +#endif + #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) HAVE_C11 #endif diff --git a/getarch.c b/getarch.c index e2c22d3a0..58465fb56 100644 --- a/getarch.c +++ b/getarch.c @@ -981,6 +981,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_RISCV64_GENERIC +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_GENERIC" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_GENERIC " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_generic" +#define CORENAME "RISCV64_GENERIC" +#else +#endif + #ifdef FORCE_CORTEXA15 #define FORCE #define ARCHITECTURE "ARM" @@ -1252,6 +1266,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "Z14" #endif +#ifdef FORCE_C910V +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "C910V" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DC910V " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "c910v" +#define CORENAME "C910V" +#else +#endif + + #ifndef FORCE #ifdef USER_TARGET @@ -1306,6 +1335,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __riscv +#include "cpuid_riscv64.c" +#endif + #ifdef __arm__ #include "cpuid_arm.c" #define OPENBLAS_SUPPORTED diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 2ba593c2e..893713769 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -25,6 +25,10 @@ ifeq ($(ARCH), arm64) USE_TRMM = 1 endif +ifeq ($(ARCH), riscv64) +USE_TRMM = 1 +endif + ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif diff --git a/kernel/generic/trmmkernel_16x4.c b/kernel/generic/trmmkernel_16x4.c new file mode 100644 index 000000000..7ea4e108c --- /dev/null +++ b/kernel/generic/trmmkernel_16x4.c @@ -0,0 +1,2092 @@ +#include "common.h" + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + FLOAT res0_4; + FLOAT res0_5; + FLOAT res0_6; + FLOAT res0_7; + + FLOAT res0_8; + FLOAT res0_9; + FLOAT res0_10; + FLOAT res0_11; + FLOAT res0_12; + FLOAT res0_13; + FLOAT res0_14; + FLOAT res0_15; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + FLOAT res1_4; + FLOAT res1_5; + FLOAT res1_6; + FLOAT res1_7; + + FLOAT res1_8; + FLOAT res1_9; + FLOAT res1_10; + FLOAT res1_11; + FLOAT res1_12; + FLOAT res1_13; + FLOAT res1_14; + FLOAT res1_15; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; 
+ FLOAT res2_4; + FLOAT res2_5; + FLOAT res2_6; + FLOAT res2_7; + + FLOAT res2_8; + FLOAT res2_9; + FLOAT res2_10; + FLOAT res2_11; + FLOAT res2_12; + FLOAT res2_13; + FLOAT res2_14; + FLOAT res2_15; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + FLOAT res3_4; + FLOAT res3_5; + FLOAT res3_6; + FLOAT res3_7; + + FLOAT res3_8; + FLOAT res3_9; + FLOAT res3_10; + FLOAT res3_11; + FLOAT res3_12; + FLOAT res3_13; + FLOAT res3_14; + FLOAT res3_15; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c new file mode 100644 index 000000000..b6aec131e --- /dev/null +++ b/kernel/riscv64/amax_vector.c @@ -0,0 +1,245 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + 
BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + if (n <= 0 || inc_x <= 0) return(maxf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_max; + + MASK_T mask0, mask1; + FLOAT zero = 0.0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_max = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + BLASLONG stride_x = inc_x * sizeof(FLOAT); + if(gvl <= n/2){ + BLASLONG inc_xv = inc_x * gvl; + v_max = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + } + return(maxf); +} + + diff --git a/kernel/riscv64/amin.c b/kernel/riscv64/amin.c new file mode 100644 index 000000000..78495a8e3 --- /dev/null +++ b/kernel/riscv64/amin.c @@ -0,0 +1,75 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c new file mode 100644 index 000000000..53243ad56 --- /dev/null +++ b/kernel/riscv64/amin_vector.c @@ -0,0 +1,241 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT minf=FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_min; + + MASK_T mask0, mask1; + FLOAT zero = 0.0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + for(i=0,j=0; i + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + return(sumf); +} + + diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c new file mode 100644 index 000000000..7ab7484e8 --- /dev/null +++ b/kernel/riscv64/asum_vector.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights 
reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_zero,v_sum; + + MASK_T mask0, mask1; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_zero = VFMVVF_FLOAT(0, gvl); + if(gvl <= n/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i + +#define KERNEL8x4_I \ + "addi t1, %[PB], 1*8 \n\t"\ + "addi t2, %[PB], 2*8 \n\t"\ + "addi t3, %[PB], 3*8 \n\t"\ + "fld ft0, (%[PB]) \n\t"\ + "fld ft1, (t1) \n\t"\ + "fld ft2, (t2) \n\t"\ + "fld ft3, (t3) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 2*8 \n\t"\ + "addi t5, %[PA], 4*8 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t6, %[PA], 6*8 \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "addi %[PB], %[PB], 
4*8 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vfmv.v.f v11, ft3 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "fld ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "fld ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "fld ft6, (t2) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "fld ft7, (t3) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "vfmv.v.f v15, ft7 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t" + +#define KERNEL8x4_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "fld ft4, (%[PB]) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "fld ft5, (t1) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "fld ft6, (t2) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "fld ft7, (t3) \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + 
"vfmacc.vv v27, v10, v3 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t"\ + "vfmv.v.f v15, ft7 \n\t" + +#define KERNEL8x4_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "fld ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "fld ft1, (t1) \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "fld ft2, (t2) \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "fld ft3, (t3) \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t"\ + "vfmv.v.f v11, ft3 \n\t" + +#define KERNEL8x4_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t" + + + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* 
ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3; + FLOAT *ptrba,*ptrbb; + + FLOAT loadb0,loadb1,loadb2,loadb3; + FLOAT load0,load1,load2,load3,load4,load5,load6,load7; + + FLOAT res0,res1,res2,res3; + FLOAT res4,res5,res6,res7; + FLOAT res8,res9,res10,res11; + FLOAT res12,res13,res14,res15; + + for (j=0; j 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLEV_FLOAT(&x[j], gvl); + vy = VLEV_FLOAT(&y[j], gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + }else if(inc_y == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + unsigned int stride_x = inc_x * sizeof(FLOAT); + for(i=0,j=0; i 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); + vy = VLEV_FLOAT(&y[j], gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + }else if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + unsigned int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLEV_FLOAT(&x[j], gvl); + vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + unsigned int stride_x 
= inc_x * sizeof(FLOAT); + unsigned int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); + vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + } + return(dot); +} + + diff --git a/kernel/riscv64/gemv_n.c b/kernel/riscv64/gemv_n.c new file mode 100644 index 000000000..ef61b245b --- /dev/null +++ b/kernel/riscv64/gemv_n.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/************************************************************************************** + * * 2013/09/14 Saar + * * BLASTEST float : OK + * * BLASTEST double : OK + * CTEST : OK + * TEST : OK + * * + * **************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + max = i; + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c new file mode 100644 index 000000000..3aa64afc9 --- /dev/null +++ b/kernel/riscv64/iamax_vector.c @@ -0,0 +1,191 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT maxf=0.0; + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + unsigned int 
gvl = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_max_index = VMVVX_UINT(0, gvl); + v_max = VFMVVF_FLOAT(-1, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLEV_FLOAT(&x[j], gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); + j += gvl; + } + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + maxf = vx[0]; + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLEV_FLOAT(&x[j], gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + FLOAT cur_maxf = vx[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + } + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + unsigned int stride_x = inc_x * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x; + + v_max_index = VMVVX_UINT(0, gvl); + v_max = VFMVVF_FLOAT(-1, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + + //update v_max and start_index j + 
v_max = VFMAXVV_FLOAT(v_max, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + maxf = vx[0]; + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + FLOAT cur_maxf = vx[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + } + } + } + return(max_index+1); +} + + diff --git a/kernel/riscv64/iamin.c b/kernel/riscv64/iamin.c new file mode 100644 index 000000000..155292bd5 --- /dev/null +++ b/kernel/riscv64/iamin.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < ABS(minf) ) + { + min = i; + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c new file mode 100644 index 000000000..608f19a00 --- /dev/null +++ b/kernel/riscv64/iamin_vector.c @@ -0,0 +1,192 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include +#include + +#if defined(DOUBLE) + +#define ABS fabs +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT minf=FLT_MAX; + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; 
+ unsigned int gvl = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLEV_FLOAT(&x[j], gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLEV_FLOAT(&x[j], gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + unsigned int stride_x = inc_x * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x; + + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update 
v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + } + return(min_index+1); +} + + diff --git a/kernel/riscv64/imax.c b/kernel/riscv64/imax.c new file mode 100644 index 000000000..5072dd16e --- /dev/null +++ b/kernel/riscv64/imax.c @@ -0,0 +1,69 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + max = i; + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c new file mode 100644 index 000000000..44af7101b --- /dev/null +++ b/kernel/riscv64/imax_vector.c @@ -0,0 +1,176 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include +#include + +#if defined(DOUBLE) + +#define ABS fabs +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + unsigned int gvl = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_max_index = VMVVX_UINT(0, gvl); + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = 
VLEV_FLOAT(&x[j], gvl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); + j += gvl; + } + vx = VFMVVF_FLOAT(-FLT_MAX, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + maxf = vx[0]; + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_max = VLEV_FLOAT(&x[j], gvl); + + vx = VFMVVF_FLOAT(-FLT_MAX, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + FLOAT cur_maxf = vx[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + } + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + unsigned int stride_x = inc_x * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x; + + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); + v_max_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(-FLT_MAX, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + maxf = vx[0]; + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); + + vx = VFMVVF_FLOAT(-FLT_MAX, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + FLOAT 
cur_maxf = vx[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + } + } + } + return(max_index+1); +} + + diff --git a/kernel/riscv64/imin.c b/kernel/riscv64/imin.c new file mode 100644 index 000000000..ffc65226e --- /dev/null +++ b/kernel/riscv64/imin.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/************************************************************************************** +* 2013/08/19 Saar +* BLASTEST float +* BLASTEST double +* +**************************************************************************************/ + +#include "common.h" +#include + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + min = i; + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c new file mode 100644 index 000000000..e6e0e9f9f --- /dev/null +++ b/kernel/riscv64/imin_vector.c @@ -0,0 +1,212 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include +#include + +#if defined(DOUBLE) + +#define ABS fabs +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT minf=FLT_MAX; + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + unsigned int gvl = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = 
VLEV_FLOAT(&x[j], gvl); + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#endif +*/ + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_min = VLEV_FLOAT(&x[j], gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + unsigned int stride_x = inc_x * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x; + + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, 
e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#endif +*/ + + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + } + return(min_index+1); +} + + diff --git a/kernel/riscv64/izamax.c b/kernel/riscv64/izamax.c new file mode 100644 index 000000000..8fe33e95b --- /dev/null +++ b/kernel/riscv64/izamax.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c new file mode 100644 index 000000000..62c95d973 --- /dev/null +++ b/kernel/riscv64/izamax_vector.c @@ -0,0 +1,246 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project 
+All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define RVV_EFLOAT RVV_E64 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + +#define RVV_M RVV_M8 + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT maxf=0.0; + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx0, vx1, v_max; + UINT_V_T v_max_index; + MASK_T mask0, mask1; + unsigned int gvl = 0; + 
gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_max_index = VMVVX_UINT(0, gvl); + v_max = VFMVVF_FLOAT(-1, gvl); + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG inc_xv = gvl * inc_x * 2; + BLASLONG ix = 0; + for(i=0,j=0; i < n/gvl; i++){ + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + + //index where element greater than v_max + mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_max_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_max_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#endif +*/ + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask0, gvl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); + j += 
gvl; + ix += inc_xv; + } + vx0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); + maxf = vx0[0]; + mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask0,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_max_index = VMVVX_UINT(0, gvl); + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + v_max = VFADDVV_FLOAT(vx0, vx1, gvl); + vx0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); + FLOAT cur_maxf = vx0[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask0,gvl); + max_index = v_max_index[max_index]; + } + } + return(max_index+1); +} + + diff --git a/kernel/riscv64/izamin.c b/kernel/riscv64/izamin.c new file mode 100644 index 000000000..fb5a0d4cb --- /dev/null +++ b/kernel/riscv64/izamin.c @@ -0,0 +1,81 @@ 
+/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(min+1); +} + + diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c new file mode 100644 index 000000000..38eccf1b5 --- /dev/null +++ b/kernel/riscv64/izamin_vector.c @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include + +#if defined(DOUBLE) + +#define RVV_EFLOAT RVV_E64 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT 
vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + +#define RVV_M RVV_M8 + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT minf=FLT_MAX; + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx0, vx1, v_min; + UINT_V_T v_min_index; + MASK_T mask0, mask1; + unsigned int gvl = 0; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min_index = VMVVX_UINT(0, gvl); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG inc_xv = gvl * inc_x * 2; + BLASLONG ix = 0; + for(i=0,j=0; i < n/gvl; i++){ + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + 
"vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + + //index where element less than v_min + mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#endif +*/ + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask0, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx0, gvl); + j += gvl; + ix += inc_xv; + } + vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); + vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); + minf = vx0[0]; + mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask0,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_min_index = VMVVX_UINT(0, gvl); + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + 
:"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + v_min = VFADDVV_FLOAT(vx0, vx1, gvl); + vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); + vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); + FLOAT cur_minf = vx0[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + + mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask0,gvl); + min_index = v_min_index[min_index]; + } + } + return(min_index+1); +} + + diff --git a/kernel/riscv64/max.c b/kernel/riscv64/max.c new file mode 100644 index 000000000..2ad956bc0 --- /dev/null +++ b/kernel/riscv64/max.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c new file mode 100644 index 000000000..4ef75452d --- /dev/null +++ b/kernel/riscv64/max_vector.c @@ -0,0 +1,116 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT maxf=-FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_max; + + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + BLASLONG stride_x = inc_x * sizeof(FLOAT); + if(gvl <= n/2){ + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); + BLASLONG idx = 0, inc_xv = inc_x * gvl; + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + } + return(maxf); +} + + diff --git a/kernel/riscv64/min.c b/kernel/riscv64/min.c new file mode 100644 index 000000000..2812fe397 --- /dev/null +++ b/kernel/riscv64/min.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c new file mode 100644 index 000000000..83c965bfa --- /dev/null +++ b/kernel/riscv64/min_vector.c @@ -0,0 +1,116 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT minf=FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_min; + + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + for(i=0,j=0; i + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + FLOAT absxi = 0.0; + + + if (n <= 0 || inc_x <= 0) return(0.0); + if ( n == 1 ) return( ABS(x[0]) ); + + n *= inc_x; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + absxi = ABS( x[i] ); + if ( scale < absxi ) + { + ssq = 1 
+ ssq * ( scale / absxi ) * ( scale / absxi ); + scale = absxi ; + } + else + { + ssq += ( absxi/scale ) * ( absxi/scale ); + } + + } + i += inc_x; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c new file mode 100644 index 000000000..785c0d2f8 --- /dev/null +++ b/kernel/riscv64/nrm2_vector.c @@ -0,0 +1,220 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M4 +#define FLOAT_V_T float32xm4_t +#define VLEV_FLOAT vlev_float32xm4 +#define VLSEV_FLOAT vlsev_float32xm4 +#define VFREDSUM_FLOAT vfredsumvs_float32xm4 +#define VFMACCVV_FLOAT vfmaccvv_float32xm4 +#define VFMVVF_FLOAT vfmvvf_float32xm4 +#define VFDOTVV_FLOAT vfdotvv_float32xm4 +#define ABS fabsf +#define MASK_T e32xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 +#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 +#define VMFIRSTM vmfirstm_e32xm4 +#define VFDIVVF_FLOAT vfdivvf_float32xm4 +#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M4 +#define FLOAT_V_T float64xm4_t +#define VLEV_FLOAT vlev_float64xm4 +#define VLSEV_FLOAT vlsev_float64xm4 +#define VFREDSUM_FLOAT vfredsumvs_float64xm4 +#define VFMACCVV_FLOAT vfmaccvv_float64xm4 +#define VFMVVF_FLOAT vfmvvf_float64xm4 +#define VFDOTVV_FLOAT vfdotvv_float64xm4 +#define ABS fabs +#define MASK_T e64xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 +#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 +#define VMFIRSTM vmfirstm_e64xm4 +#define VFDIVVF_FLOAT vfdivvf_float64xm4 +#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + + if ( n < 0 ) return(0.0); + if(n == 1) return (ABS(x[0])); + + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT scale = 0.0, ssq = 0.0; + MASK_T mask; + BLASLONG index = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i + +#define KERNEL16x4_I \ + "addi t1, %[PB], 1*4 \n\t"\ + "addi t2, %[PB], 2*4 \n\t"\ + "addi t3, %[PB], 3*4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "flw ft1, (t1) 
\n\t"\ + "flw ft2, (t2) \n\t"\ + "flw ft3, (t3) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 4*4 \n\t"\ + "addi t5, %[PA], 8*4 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t6, %[PA], 12*4 \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmv.v.f v11, ft3 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "flw ft7, (t3) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "vfmv.v.f v15, ft7 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t" + +#define KERNEL16x4_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "vle.v v7, (t6) 
\n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "flw ft7, (t3) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t"\ + "vfmv.v.f v15, ft7 \n\t" + +#define KERNEL16x4_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "flw ft1, (t1) \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "flw ft2, (t2) \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "flw ft3, (t3) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t"\ + "vfmv.v.f v11, ft3 \n\t" + +#define KERNEL16x4_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vfmacc.vv v19, v12, v7 
\n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t" + + +#define KERNEL8x4_I \ + "addi t1, %[PB], 1*4 \n\t"\ + "addi t2, %[PB], 2*4 \n\t"\ + "addi t3, %[PB], 3*4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "flw ft1, (t1) \n\t"\ + "flw ft2, (t2) \n\t"\ + "flw ft3, (t3) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 4*4 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vfmv.v.f v11, ft3 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "flw ft7, (t3) \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmv.v.f v15, ft7 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "addi t3, t3, 4*4 \n\t" + + +#define KERNEL8x4_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft5, (t1) 
\n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "flw ft7, (t3) \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmv.v.f v15, ft7 \n\t" + +#define KERNEL8x4_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "flw ft1, (t1) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "flw ft2, (t2) \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "flw ft3, (t3) \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "vfmv.v.f v11, ft3 \n\t" + +#define KERNEL8x4_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t" + + +#define KERNEL16x2_I \ + "addi t1, %[PB], 1*4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "flw ft1, (t1) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 4*4 \n\t"\ + "addi t5, %[PA], 8*4 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t6, %[PA], 12*4 \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + 
"vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmv.v.f v13, ft5 \n\t" + + +#define KERNEL16x2_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "vfmv.v.f v13, ft5 \n\t" + + +#define KERNEL16x2_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "flw ft1, (t1) \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t" + + +#define KERNEL16x2_E \ + "vfmacc.vv 
v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t" + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3; + FLOAT *ptrba,*ptrbb; + + FLOAT loadb0,loadb1,loadb2,loadb3; + FLOAT load0,load1,load2,load3,load4,load5,load6,load7; + + FLOAT res0,res1,res2,res3; + FLOAT res4,res5,res6,res7; + FLOAT res8,res9,res10,res11; + FLOAT res12,res13,res14,res15; + + for (j=0; j + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n < 0 ) return(0); + + while(i < n) + { + + temp = x[ix] ; + x[ix] = y[iy] ; + y[iy] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c new file mode 100644 index 000000000..9377bf4b9 --- /dev/null +++ b/kernel/riscv64/swap_vector.c @@ -0,0 +1,173 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VSEV_FLOAT vsev_float32xm8 +#define VSSEV_FLOAT vssev_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VSEV_FLOAT vsev_float64xm8 +#define VSSEV_FLOAT vssev_float64xm8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i = 0, j = 0; + BLASLONG ix = 0,iy = 0; + BLASLONG stride_x, stride_y; + FLOAT_V_T vx0, vx1, vy0, vy1; + unsigned int gvl = 0; + + if (n < 0) return(0); + if(inc_x == 1 && inc_y == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + for(i=0,j=0; i 0){ 
+ gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLEV_FLOAT(&y[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += alpha * temp2; + a_ptr += lda; + } + }else if(inc_x == 1){ + jy = 0; + stride_y = inc_y * sizeof(FLOAT); + for (j=0; j 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += alpha * temp2; + jy += inc_y; + a_ptr += lda; + } + }else if(inc_y == 1){ + jx = 0; + stride_x = inc_x * sizeof(FLOAT); + for (j=0; j 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + inc_xv = inc_x * gvl; 
+ for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLEV_FLOAT(&y[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += alpha * temp2; + jx += inc_x; + a_ptr += lda; + } + }else{ + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + jx = 0; + jy = 0; + for (j=0; j 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += alpha * temp2; + jx += inc_x; + jy += inc_y; + a_ptr += lda; + } + } + return(0); +} + diff --git a/kernel/riscv64/symv_U.c b/kernel/riscv64/symv_U.c 
new file mode 100644 index 000000000..b5a0c96e9 --- /dev/null +++ b/kernel/riscv64/symv_U.c @@ -0,0 +1,71 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG jx,jy; + BLASLONG j; + FLOAT temp1; + FLOAT temp2; + +#if 0 + if( m != offset ) + printf("Symv_U: m=%d offset=%d\n",m,offset); +#endif + + BLASLONG m1 = m - offset; + + jx = m1 * inc_x; + jy = m1 * inc_y; + + for (j=m1; j 0){ + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + } + }else if(inc_x == 1){ + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0){ + iy = 0; + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = 
vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + jy += inc_y; + } + }else if(inc_y == 1){ + jx = m1 * inc_x; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + for (j=m1; j 0){ + ix = 0; + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_xv = inc_x * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + jx += inc_x; + } + }else{ + jx = m1 * inc_x; + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0){ + ix = 0; + iy = 0; + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = 
VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + jx += inc_x; + jy += inc_y; + } + } + return(0); +} + diff --git a/kernel/riscv64/zamax.c b/kernel/riscv64/zamax.c new file mode 100644 index 000000000..a39bd7821 --- /dev/null +++ b/kernel/riscv64/zamax.c @@ -0,0 +1,79 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c new file mode 100644 index 000000000..a6c742b14 --- /dev/null +++ b/kernel/riscv64/zamax_vector.c @@ -0,0 +1,104 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + if (n <= 0 || inc_x <= 0) return(maxf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_max; + + MASK_T mask0, mask1; + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_max = VFMVVF_FLOAT(0, gvl); + BLASLONG inc_xv = inc_x * gvl * 2; + for(; i maxf) + maxf = v_max[0]; + } + return(maxf); +} diff --git a/kernel/riscv64/zamin.c b/kernel/riscv64/zamin.c new file mode 100644 index 000000000..02eab3e75 --- /dev/null +++ b/kernel/riscv64/zamin.c @@ -0,0 +1,79 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c new file mode 100644 index 000000000..44a7cf1dc --- /dev/null +++ b/kernel/riscv64/zamin_vector.c @@ -0,0 +1,104 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT minf=FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_min; + MASK_T mask0, mask1; 
+ BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + BLASLONG inc_xv = inc_x * gvl * 2; + for(; i + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CABS1(x,i); + i += inc_x2; + } + return(sumf); +} + + diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c new file mode 100644 index 000000000..d9fa88971 --- /dev/null +++ b/kernel/riscv64/zasum_vector.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_zero,v_sum; + + MASK_T mask0, mask1; + if(inc_x == 1){ + BLASLONG n2 = n * 2; + gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + v_zero = VFMVVF_FLOAT(0, gvl); + if(gvl <= n2/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + inc_xv = incx * gvl * 2; + 
inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 = vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 = vx1[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = 
VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); +#endif + + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 += vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 += vx1[0]; + } + } + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c new file mode 100644 index 000000000..6fe12c76c --- /dev/null +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -0,0 +1,192 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M4 +#define FLOAT_V_T float32xm4_t +#define VLSEV_FLOAT vlsev_float32xm4 +#define VSSEV_FLOAT vssev_float32xm4 +#define VFREDSUM_FLOAT vfredsumvs_float32xm4 +#define VFMACCVV_FLOAT vfmaccvv_float32xm4 +#define VFMACCVF_FLOAT vfmaccvf_float32xm4 +#define VFMVVF_FLOAT vfmvvf_float32xm4 +#define VFMULVV_FLOAT vfmulvv_float32xm4 +#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 +#define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M4 +#define FLOAT_V_T float64xm4_t +#define VLSEV_FLOAT vlsev_float64xm4 +#define VSSEV_FLOAT vssev_float64xm4 +#define VFREDSUM_FLOAT vfredsumvs_float64xm4 +#define VFMACCVV_FLOAT vfmaccvv_float64xm4 +#define VFMACCVF_FLOAT vfmaccvf_float64xm4 +#define VFMVVF_FLOAT vfmvvf_float64xm4 +#define VFMULVV_FLOAT vfmulvv_float64xm4 +#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 +#define VFNMSACVV_FLOAT 
vfnmsacvv_float64xm4 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + BLASLONG i, j, k; + BLASLONG ix, iy, ia; + BLASLONG jx, jy, ja; + FLOAT temp_r1, temp_i1; + FLOAT temp_r2, temp_i2; + FLOAT *a_ptr = a; + unsigned int gvl = 0; + + + FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; + BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; + + BLASLONG inc_x2 = incx * 2; + BLASLONG inc_y2 = incy * 2; + stride_x = inc_x2 * sizeof(FLOAT); + stride_y = inc_y2 * sizeof(FLOAT); + stride_a = 2 * sizeof(FLOAT); + lda2 = lda * 2; + + BLASLONG m1 = m - offset; + a_ptr = a + m1 * lda2; + jx = m1 * inc_x2; + jy = m1 * inc_y2; + ja = m1 * 2; + for(j = m1; j < m; j++){ + temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];; + temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx]; + temp_r2 = 0; + temp_i2 = 0; + ix = 0; + iy = 0; + ia = 0; + i = 0; + if(j > 0){ + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = 
VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 = vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 = vx1[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); +#endif + + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 += vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 += vx1[0]; + } + } + y[jy] += temp_r1 * a_ptr[ja]; + y[jy+1] += temp_i1 * 
a_ptr[ja]; + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/znrm2.c b/kernel/riscv64/znrm2.c new file mode 100644 index 000000000..fc1c8b54a --- /dev/null +++ b/kernel/riscv64/znrm2.c @@ -0,0 +1,106 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +/************************************************************************************** +* 2013/09/13 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + BLASLONG inc_x2; + FLOAT temp; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + temp = ABS( x[i] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + if ( x[i+1] != 0.0 ) + { + temp = ABS( x[i+1] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + + i += inc_x2; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c new file mode 100644 index 000000000..b0ebfa5f4 --- /dev/null +++ b/kernel/riscv64/znrm2_vector.c @@ -0,0 +1,278 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M4 +#define FLOAT_V_T float32xm4_t +#define VLEV_FLOAT vlev_float32xm4 +#define VLSEV_FLOAT vlsev_float32xm4 +#define VFREDSUM_FLOAT vfredsumvs_float32xm4 +#define VFMACCVV_FLOAT vfmaccvv_float32xm4 +#define VFMVVF_FLOAT vfmvvf_float32xm4 +#define VFDOTVV_FLOAT vfdotvv_float32xm4 +#define ABS fabsf +#define MASK_T e32xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 +#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 +#define VMFIRSTM vmfirstm_e32xm4 +#define VFDIVVF_FLOAT vfdivvf_float32xm4 +#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M4 +#define FLOAT_V_T float64xm4_t +#define VLEV_FLOAT vlev_float64xm4 +#define VLSEV_FLOAT vlsev_float64xm4 +#define VFREDSUM_FLOAT vfredsumvs_float64xm4 +#define VFMACCVV_FLOAT vfmaccvv_float64xm4 +#define VFMVVF_FLOAT vfmvvf_float64xm4 +#define VFDOTVV_FLOAT vfdotvv_float64xm4 +#define ABS fabs +#define MASK_T e64xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 +#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 +#define VMFIRSTM vmfirstm_e64xm4 +#define VFDIVVF_FLOAT vfdivvf_float64xm4 +#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + + if ( n < 0 ) return(0.0); +// if(n == 1) return (ABS(x[0])); + + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT scale = 0.0, ssq = 0.0; + MASK_T mask; + BLASLONG index = 0; + if(inc_x == 1){ + BLASLONG n2 = n * 2; + gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, 
FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c new file mode 100644 index 000000000..b655a968c --- /dev/null +++ b/kernel/riscv64/zswap_vector.c @@ -0,0 +1,117 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VSEV_FLOAT vsev_float32xm8 +#define VSSEV_FLOAT vssev_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VSEV_FLOAT vsev_float64xm8 +#define VSSEV_FLOAT vssev_float64xm8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i = 0, j = 0; + BLASLONG ix = 0,iy = 0; + BLASLONG stride_x, stride_y; + FLOAT_V_T vx0, vx1, vy0, vy1; + unsigned int gvl = 0; + + if (n < 0) return(0); + if(inc_x == 1 && inc_y == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + BLASLONG n2 = n * 2; + if(gvl <= n2/2){ + for(i=0,j=0; i Date: Thu, 22 Oct 2020 22:00:00 -0400 Subject: [PATCH 009/121] reuse variables defined in Makefile.system --- Makefile.x86_64 | 7 +------ kernel/Makefile | 15 ++------------- 2 files changed, 3 insertions(+), 19 deletions(-) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index a849f0b01..49a9a0a23 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -47,8 +47,6 @@ ifndef DYNAMIC_ARCH ifndef 
NO_AVX512 ifeq ($(C_COMPILER), GCC) # cooperlake support was added in 10.1 -GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) -GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1) ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) CCOMMON_OPT += -march=cooperlake FCOMMON_OPT += -march=cooperlake @@ -73,10 +71,7 @@ ifndef DYNAMIC_ARCH ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 -GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) -GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) -GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) -GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) +GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) CCOMMON_OPT += -mavx2 endif diff --git a/kernel/Makefile b/kernel/Makefile index e52781c6d..e811ed43d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,11 +12,6 @@ ifdef HAVE_SSSE3 CFLAGS += -mssse3 endif -ifeq ($(C_COMPILER), GCC) -GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) -GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) -endif - ifeq ($(ARCH), power) ifeq ($(C_COMPILER), CLANG) override CFLAGS += -fno-integrated-as @@ -26,20 +21,14 @@ endif AVX2OPT = ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 -GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) -GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) -GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) -GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) +GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) AVX2OPT = -mavx2 endif endif ifeq 
($(C_COMPILER), CLANG) # Any clang posing as gcc 4.2 should be new enough (3.4 or later) - GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) - GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) - GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) - GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) + GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) AVX2OPT = -mavx2 endif From dd6ebdfdab65e5235da4887c943f7639639d19af Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Fri, 23 Oct 2020 10:32:03 +0800 Subject: [PATCH 010/121] Refactor the performance measurement system --- benchmark/amax.c | 170 ++++++++++++++-------------------------- benchmark/amin.c | 166 ++++++++++++++------------------------- benchmark/asum.c | 180 +++++++++++++------------------------------ benchmark/axpby.c | 86 +-------------------- benchmark/axpy.c | 81 +------------------ benchmark/bench.h | 103 +++++++++++++++++++++++++ benchmark/cholesky.c | 50 +----------- benchmark/copy.c | 86 +-------------------- benchmark/dot.c | 84 +------------------- benchmark/geev.c | 80 +------------------ benchmark/gemm.c | 80 +------------------ benchmark/gemm3m.c | 83 +------------------- 12 files changed, 302 insertions(+), 947 deletions(-) create mode 100644 benchmark/bench.h diff --git a/benchmark/amax.c b/benchmark/amax.c index 19ae95c8b..29310dd71 100644 --- a/benchmark/amax.c +++ b/benchmark/amax.c @@ -25,125 +25,73 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AMAX #ifdef COMPLEX #ifdef DOUBLE -#define AMAX BLASFUNC(dzamax) +#define AMAX BLASFUNC(dzamax) #else -#define AMAX BLASFUNC(scamax) +#define AMAX BLASFUNC(scamax) #endif #else #ifdef DOUBLE -#define AMAX BLASFUNC(damax) +#define AMAX BLASFUNC(damax) #else -#define AMAX BLASFUNC(samax) +#define AMAX BLASFUNC(samax) #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) +{ FLOAT *x; blasint m, i; - blasint inc_x=1; + blasint inc_x = 1; int loops = 1; int l; char *p; + int from = 1; + int to 
= 200; + int step = 1; - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg; + double time1, timeg; - argc--;argv++; + argc--; + argv++; - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); } #ifdef __linux @@ -152,37 +100,31 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + for (m = from; m <= to; m += step) { - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); + timeg = 0; + fprintf(stderr, " %6d : ", (int)m); + for (l = 0; l < loops; l++) + { - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AMIN #ifdef COMPLEX #ifdef DOUBLE -#define AMIN BLASFUNC(dzamin) +#define AMIN BLASFUNC(dzamin) #else -#define AMIN BLASFUNC(scamin) +#define AMIN BLASFUNC(scamin) #endif #else #ifdef DOUBLE -#define AMIN BLASFUNC(damin) +#define AMIN BLASFUNC(damin) #else -#define AMIN BLASFUNC(samin) -#endif -#endif 
- -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - +#define AMIN BLASFUNC(samin) #endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 #endif - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) +{ FLOAT *x; blasint m, i; - blasint inc_x=1; + blasint inc_x = 1; int loops = 1; int l; char *p; - int from = 1; - int to = 200; - int step = 1; + int from = 1; + int to = 200; + int step = 1; struct timeval start, stop; - double time1,timeg; + double time1, timeg; - argc--;argv++; + argc--; + argv++; - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; 
+ } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); } #ifdef __linux @@ -151,39 +101,35 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + for (m = from; m <= to; m += step) { - timeg=0; + timeg = 0; - fprintf(stderr, " %6d : ", (int)m); + fprintf(stderr, " %6d : ", (int)m); + for (l = 0; l < loops; l++) + { - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef ASUM #ifdef COMPLEX #ifdef DOUBLE -#define ASUM BLASFUNC(dzasum) +#define ASUM BLASFUNC(dzasum) #else -#define ASUM BLASFUNC(scasum) +#define ASUM BLASFUNC(scasum) #endif #else #ifdef DOUBLE -#define ASUM BLASFUNC(dasum) +#define ASUM BLASFUNC(dasum) #else -#define ASUM BLASFUNC(sasum) +#define ASUM BLASFUNC(sasum) #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert 
into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) +{ FLOAT *x; FLOAT result; blasint m, i; - blasint inc_x=1; + blasint inc_x = 1; int loops = 1; int l; char *p; - int from = 1; - int to = 200; - int step = 1; - -#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) - struct timeval start, stop; - double time1,timeg; -#else - struct timespec start = { 0, 0 }, stop = { 0, 0 }; + int from = 1; + int to = 200; + int step = 1; double time1, timeg; -#endif - argc--;argv++; + argc--; + argv++; - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); - fprintf(stderr, "From : %3d To : %3d Step = 
%3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); } - #ifdef __linux srandom(getpid()); #endif fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + for (m = from; m <= to; m += step) { - timeg=0; - - fprintf(stderr, " %6d : ", (int)m); + timeg = 0; - for (l=0; l1) - timeg /= loops; + if (loops > 1) + timeg /= loops; #ifdef COMPLEX fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg); #else fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg); #endif - } return 0; diff --git a/benchmark/axpby.c b/benchmark/axpby.c index 793ee7e40..d02d9a889 100644 --- a/benchmark/axpby.c +++ b/benchmark/axpby.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AXPBY @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -129,7 +58,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -176,16 +104,10 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AXPY @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -127,8 +56,6 @@ int main(int argc, char *argv[]){ int from = 1; int to = 200; int step = 1; - - struct timespec start, stop; double time1,timeg; argc--;argv++; @@ -175,13 +102,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - clock_gettime( CLOCK_REALTIME, &start); + begin(); AXPY (&m, alpha, x, &inc_x, y, &inc_y ); - clock_gettime( CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9; + time1 = getsec(); timeg += time1; diff --git a/benchmark/bench.h b/benchmark/bench.h 
new file mode 100644 index 000000000..9055beaa7 --- /dev/null +++ b/benchmark/bench.h @@ -0,0 +1,103 @@ +#include +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) + struct timeval start, stop; +#else + struct timespec start = { 0, 0 }, stop = { 0, 0 }; +#endif + +double getsec() +{ +#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) + return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; +#else + return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9; +#endif +} + +void begin() { +#if 
defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) + gettimeofday( &start, (struct timezone *)0); +#else + clock_gettime(CLOCK_REALTIME, &start); +#endif +} + +void end() { +#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) + gettimeofday( &stop, (struct timezone *)0); +#else + clock_gettime(CLOCK_REALTIME, &stop); +#endif +} \ No newline at end of file diff --git a/benchmark/cholesky.c b/benchmark/cholesky.c index 5908b6085..65b20d039 100644 --- a/benchmark/cholesky.c +++ b/benchmark/cholesky.c @@ -36,12 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -71,41 +66,6 @@ double fabs(double); #endif #endif - - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - - static __inline double getmflops(int ratio, int m, double secs){ double mm = (double)m; @@ -145,7 +105,6 @@ int main(int argc, char *argv[]){ FLOAT maxerr; - struct timeval start, stop; double time1; argc--;argv++; @@ -220,20 +179,19 @@ int main(int argc, char *argv[]){ SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m); - gettimeofday( &start, (struct timezone *)0); + begin(); POTRF(uplo[uplos], &m, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + 
end(); if (info != 0) { fprintf(stderr, "Info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); - maxerr = 0.; if (!(uplos & 1)) { for (j = 0; j < m; j++) { diff --git a/benchmark/copy.c b/benchmark/copy.c index eb5148fff..c5e447521 100644 --- a/benchmark/copy.c +++ b/benchmark/copy.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef COPY @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory 
allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -128,11 +57,9 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1 = 0.0, timeg = 0.0; long nanos = 0; time_t seconds = 0; - struct timespec time_start = { 0, 0 }, time_end = { 0, 0 }; argc--;argv++; @@ -176,15 +103,10 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef DOT - #ifdef DOUBLE #define DOT BLASFUNC(ddot) #else #define DOT BLASFUNC(sdot) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int 
main(int argc, char *argv[]){ FLOAT *x, *y; @@ -169,15 +96,12 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); result = DOT (&m, x, &inc_x, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - timeg += time1; + end(); + timeg += getsec(); } diff --git a/benchmark/geev.c b/benchmark/geev.c index 4fd2c8d6f..6e22cdfb6 100644 --- a/benchmark/geev.c +++ b/benchmark/geev.c @@ -36,13 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GEEV @@ -74,71 +68,6 @@ extern void GEEV( char* jobvl, char* jobvr, blasint* n, FLOAT* a, FLOAT* vr, blasint* ldvr, FLOAT* work, blasint* lwork, FLOAT *rwork, blasint* info ); #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & 
~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork; @@ -154,7 +83,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -223,7 +151,7 @@ int main(int argc, char *argv[]){ for(m = from; m <= to; m += step){ fprintf(stderr, " %6d : ", (int)m); - gettimeofday( &start, (struct timezone *)0); + begin(); lwork = -1; #ifndef COMPLEX @@ -239,14 +167,14 @@ int main(int argc, char *argv[]){ GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info); #endif - gettimeofday( &stop, (struct timezone *)0); + end(); if (info) { fprintf(stderr, "failed to compute eigenvalues .. %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops : %10.2f Sec : %d\n", diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 8cd14bbed..35f5096f3 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GEMM @@ -55,71 +49,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ IFLOAT *a, *b; @@ -139,7 +68,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1, timeg; argc--;argv++; @@ -228,14 +156,14 @@ int main(int argc, char *argv[]){ ldc = m; fprintf(stderr, " M=%4d, N=%4d, K=%4d : ", (int)m, (int)n, (int)k); - gettimeofday( &start, (struct timezone *)0); + begin(); for (j=0; j -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GEMM @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -187,16 +116,12 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - timeg += time1; - + end(); + timeg += getsec(); } timeg /= loops; From 81fcfd5ed3ecc3a5f1aefec9ab202d487af85da0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 24 Oct 2020 23:28:29 +0200 Subject: [PATCH 011/121] Update version to 0.3.12.dev --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 53c1709a8..aeb4399e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 12) +set(OpenBLAS_PATCH_VERSION 12.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions From 2f9fc9be30e33efb21b7873c8ee060af190aabd8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 24 Oct 2020 23:29:05 +0200 Subject: [PATCH 012/121] Update version to 0.3.12.dev --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index a4d11dc7c..1a0965d08 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.12 +VERSION = 0.3.12.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library From fd7da56965a5af99f7ec2af161f0057f8b9d6bdb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 25 Oct 2020 12:01:50 +0100 Subject: [PATCH 013/121] Move definitions that are neither needed nor supported on SUNOS --- driver/others/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index ba2bb55b9..f0521ab2d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1767,11 +1767,11 @@ int get_num_procs(void); int get_num_procs(void) { static int nums = 0; + +#if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; int ret; - -#if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) From eec517af0eb1bea187236ccd1072741fbabce01c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Oct 2020 00:21:56 +0100 Subject: [PATCH 014/121] Expressly enable neon for use with intrinsics if available --- Makefile.arm | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile.arm b/Makefile.arm index fac6b56824..a27b58e84 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -12,3 +12,8 @@ ifeq ($(CORE), ARMV6) CCOMMON_OPT += -mfpu=vfp FCOMMON_OPT += -mfpu=vfp endif + +ifdef HAVE_NEON +CCOMMON_OPT += -mfpu=neon +FCOMMON_OPT += -mfpu=neon +endif From f917c26e83e040270cb98488b296a5c85cbb5ffb Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 26 Oct 2020 10:25:05 +0800 Subject: [PATCH 015/121] Refractoring remaining benchmark cases. 
--- benchmark/amin.c | 1 - benchmark/bench.h | 1 + benchmark/dot.c | 1 - benchmark/gemm3m.c | 1 - benchmark/gemv.c | 86 ++++------------------------------------- benchmark/ger.c | 86 +++-------------------------------------- benchmark/gesv.c | 83 ++-------------------------------------- benchmark/getri.c | 79 ++------------------------------------ benchmark/hbmv.c | 84 ++-------------------------------------- benchmark/hemm.c | 81 ++------------------------------------- benchmark/hemv.c | 82 ++------------------------------------- benchmark/her.c | 85 ++--------------------------------------- benchmark/her2.c | 85 ++--------------------------------------- benchmark/her2k.c | 81 ++------------------------------------- benchmark/herk.c | 83 ++-------------------------------------- benchmark/hpmv.c | 82 ++------------------------------------- benchmark/iamax.c | 80 ++------------------------------------ benchmark/iamin.c | 80 ++------------------------------------ benchmark/imax.c | 80 ++------------------------------------ benchmark/imin.c | 80 ++------------------------------------ benchmark/linpack.c | 85 ++++------------------------------------- benchmark/max.c | 80 ++------------------------------------ benchmark/min.c | 80 ++------------------------------------ benchmark/nrm2.c | 80 ++------------------------------------ benchmark/potrf.c | 56 +++++---------------------- benchmark/rot.c | 79 ++------------------------------------ benchmark/rotm.c | 82 +++------------------------------------ benchmark/scal.c | 80 ++------------------------------------ benchmark/spmv.c | 81 ++------------------------------------- benchmark/spr.c | 82 ++------------------------------------- benchmark/spr2.c | 80 ++------------------------------------ benchmark/swap.c | 79 ++------------------------------------ benchmark/symm.c | 80 ++------------------------------------ benchmark/symv.c | 80 ++------------------------------------ benchmark/syr.c | 80 
++------------------------------------ benchmark/syr2.c | 81 ++------------------------------------- benchmark/syr2k.c | 79 ++------------------------------------ benchmark/syrk.c | 80 ++------------------------------------ benchmark/tpmv.c | 48 ++--------------------- benchmark/tpsv.c | 48 ++--------------------- benchmark/trmm.c | 79 ++------------------------------------ benchmark/trmv.c | 48 ++--------------------- benchmark/trsm.c | 79 ++------------------------------------ benchmark/trsv.c | 87 ++---------------------------------------- benchmark/zdot-intel.c | 83 +++------------------------------------- benchmark/zdot.c | 81 ++------------------------------------- 46 files changed, 184 insertions(+), 3114 deletions(-) diff --git a/benchmark/amin.c b/benchmark/amin.c index 4bcff9bba..54a1d266a 100644 --- a/benchmark/amin.c +++ b/benchmark/amin.c @@ -57,7 +57,6 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timeval start, stop; double time1, timeg; argc--; diff --git a/benchmark/bench.h b/benchmark/bench.h index 9055beaa7..1f9b8986c 100644 --- a/benchmark/bench.h +++ b/benchmark/bench.h @@ -67,6 +67,7 @@ static void *huge_malloc(BLASLONG size){ return address; } + #define malloc huge_malloc #endif diff --git a/benchmark/dot.c b/benchmark/dot.c index 86f4e3828..72a756249 100644 --- a/benchmark/dot.c +++ b/benchmark/dot.c @@ -49,7 +49,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; diff --git a/benchmark/gemm3m.c b/benchmark/gemm3m.c index 76b8176b2..f505ca049 100644 --- a/benchmark/gemm3m.c +++ b/benchmark/gemm3m.c @@ -62,7 +62,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; diff --git a/benchmark/gemv.c b/benchmark/gemv.c index fb1f541d3..a0001277a 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING 
IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef GEMV @@ -52,72 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -137,7 +66,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -211,10 +139,10 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) 
rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + end(); + time1 = getsec(); timeg += time1; } @@ -248,10 +176,10 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + end(); + time1 = getsec(); timeg += time1; } diff --git a/benchmark/ger.c b/benchmark/ger.c index d53d328f0..7ce08c3ad 100644 --- a/benchmark/ger.c +++ b/benchmark/ger.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GER @@ -49,72 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -131,7 +59,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -198,16 +125,13 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -66,71 +61,6 @@ double fabs(double); #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; 
- - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -142,7 +72,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -194,22 +123,18 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); GESV (&m, &m, a, &m, ipiv, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); - - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - + end(); + time1 = getsec(); fprintf(stderr, "%10.2f MFlops %10.6f s\n", COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m * (double)m ) / (time1) * 1.e-6 , time1); - } return 0; diff --git a/benchmark/getri.c b/benchmark/getri.c index a07014768..98a860906 100644 --- a/benchmark/getri.c +++ b/benchmark/getri.c @@ -36,12 +36,7 @@ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef GETRF #undef GETRI @@ -72,71 +67,6 @@ extern void GETRI(blasint *m, FLOAT *a, blasint *lda, blasint *ipiv, FLOAT *work, blasint *lwork, blasint *info); -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*work; @@ -148,7 +78,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -205,21 +134,21 @@ int main(int argc, char *argv[]){ exit(1); } - gettimeofday( &start, (struct timezone *)0); + begin(); lwork = -1; GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info); 
lwork = (blasint)wkopt[0]; GETRI(&m, a, &m, ipiv, work, &lwork, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info) { fprintf(stderr, "failed compute inverse matrix .. %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops : %10.2f Sec : %d\n", diff --git a/benchmark/hbmv.c b/benchmark/hbmv.c index 60ba9fb89..35249bdf9 100644 --- a/benchmark/hbmv.c +++ b/benchmark/hbmv.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HBMV - #ifdef DOUBLE #define HBMV BLASFUNC(zhbmv) #else #define HBMV BLASFUNC(chbmv) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz) { - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) { - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation 
failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -125,7 +52,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -186,15 +112,13 @@ int main(int argc, char *argv[]){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + end(); - timeg += time1; + timeg += getsec(); } diff --git a/benchmark/hemm.c b/benchmark/hemm.c index 2bc165458..a0a9985ad 100644 --- a/benchmark/hemm.c +++ b/benchmark/hemm.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HEMM @@ -41,72 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define HEMM BLASFUNC(chemm) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -126,7 +54,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -170,13 +97,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); HEMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/hemv.c b/benchmark/hemv.c index 98618a04e..ad130ddd0 100644 --- 
a/benchmark/hemv.c +++ b/benchmark/hemv.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HEMV - #ifdef DOUBLE #define HEMV BLASFUNC(zhemv) #else #define HEMV BLASFUNC(chemv) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -124,7 +51,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -182,13 +108,13 @@ 
int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HEMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/her.c b/benchmark/her.c index 010f8120d..cd1fb7f48 100644 --- a/benchmark/her.c +++ b/benchmark/her.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HER - #ifdef DOUBLE #define HER BLASFUNC(zher) #else #define HER BLASFUNC(cher) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - 
printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x; @@ -126,8 +53,6 @@ int main(int argc, char *argv[]){ int from = 1; int to = 200; int step = 1; - - struct timeval start, stop; double time1; argc--;argv++; @@ -166,15 +91,13 @@ int main(int argc, char *argv[]){ x[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HER (&uplo, &m, alpha, x, &incx, a, &m ); - gettimeofday( &stop, (struct timezone *)0); - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + end(); - gettimeofday( &start, (struct timezone *)0); + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/her2.c b/benchmark/her2.c index 0f80f3ed9..d87bfd466 100644 --- a/benchmark/her2.c +++ b/benchmark/her2.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HER2 - #ifdef DOUBLE #define HER2 BLASFUNC(zher2) #else #define HER2 BLASFUNC(cher2) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -127,7 +54,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -169,16 +95,13 @@ int main(int argc, char *argv[]){ y[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); - + begin(); HER2 (&uplo, &m, alpha, x, &inc, y, &inc, a, &m ); - 
gettimeofday( &stop, (struct timezone *)0); - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + end(); - gettimeofday( &start, (struct timezone *)0); + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/her2k.c b/benchmark/her2k.c index 021873beb..d3cdce696 100644 --- a/benchmark/her2k.c +++ b/benchmark/her2k.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HER2K #ifdef DOUBLE @@ -40,72 +34,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define HER2K BLASFUNC(cher2k) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if 
((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -125,7 +53,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -169,13 +96,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); HER2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/herk.c b/benchmark/herk.c index c09d35c1f..628dc2c11 100644 --- a/benchmark/herk.c +++ b/benchmark/herk.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HERK - #ifdef DOUBLE #define HERK BLASFUNC(zherk) #else #define HERK BLASFUNC(cherk) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *c; @@ -127,7 +54,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -167,18 +93,17 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); HERK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = 
(double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); - } return 0; diff --git a/benchmark/hpmv.c b/benchmark/hpmv.c index b0157094e..907e2adc4 100644 --- a/benchmark/hpmv.c +++ b/benchmark/hpmv.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HPMV - #ifdef DOUBLE #define HPMV BLASFUNC(zhpmv) #else #define HPMV BLASFUNC(chpmv) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz) { - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) { - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - 
exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -124,7 +51,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -183,13 +109,13 @@ int main(int argc, char *argv[]){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/iamax.c b/benchmark/iamax.c index c87044ab4..15618cbcc 100644 --- a/benchmark/iamax.c +++ b/benchmark/iamax.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IAMAX @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -127,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -166,13 +94,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IAMAX (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/iamin.c b/benchmark/iamin.c index e7c8e59e4..a57638ecc 100644 --- a/benchmark/iamin.c +++ 
b/benchmark/iamin.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IAMIN @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -127,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -166,13 +94,13 @@ int main(int 
argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IAMIN (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/imax.c b/benchmark/imax.c index b56ef64ba..b96b17167 100644 --- a/benchmark/imax.c +++ b/benchmark/imax.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IMAX @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address 
= shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IMAX (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/imin.c b/benchmark/imin.c index 4a92c8bd0..095eacca9 100644 --- a/benchmark/imin.c +++ b/benchmark/imin.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IMIN @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IMIN (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/linpack.c b/benchmark/linpack.c index 661a44175..202035245 100644 --- a/benchmark/linpack.c +++ 
b/benchmark/linpack.c @@ -36,12 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -72,71 +67,6 @@ double fabs(double); #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -151,7 +81,6 @@ int main(int argc, char *argv[]){ FLOAT maxerr; - struct timeval start, stop; double time1, time2; argc--;argv++; @@ -198,31 +127,31 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); GETRF (&m, &m, a, &m, ipiv, &info); - gettimeofday( &stop, (struct 
timezone *)0); + end(); if (info) { fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); - gettimeofday( &start, (struct timezone *)0); + begin(); GETRS("N", &m, &unit, a, &m, ipiv, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info) { fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } - time2 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time2 = getsec(); maxerr = 0.; diff --git a/benchmark/max.c b/benchmark/max.c index a19a386a2..301b943a5 100644 --- a/benchmark/max.c +++ b/benchmark/max.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef NAMAX @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); NAMAX (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/min.c b/benchmark/min.c index 4df8fb0fd..39df37a29 100644 --- a/benchmark/min.c +++ b/benchmark/min.c @@ 
-25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef NAMIN @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ 
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); NAMIN (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/nrm2.c b/benchmark/nrm2.c index 0f416621a..cd64d564a 100644 --- a/benchmark/nrm2.c +++ b/benchmark/nrm2.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef NRM2 @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, 
SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -127,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -166,13 +94,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); NRM2 (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/potrf.c b/benchmark/potrf.c index cb4c23bab..116d0cca5 100644 --- a/benchmark/potrf.c +++ b/benchmark/potrf.c @@ -36,12 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -86,37 +81,7 @@ double fabs(double); // extern void POTRI(char *uplo, blasint *m, FLOAT *a, blasint *lda, blasint *info); // extern void POTRS(char *uplo, blasint *m, blasint *n, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb, blasint *info); -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - 
tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif int main(int argc, char *argv[]){ @@ -141,7 +106,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -217,18 +181,18 @@ int main(int argc, char *argv[]){ SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m); - gettimeofday( &start, (struct timezone *)0); + begin(); POTRF(uplo[uplos], &m, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Potrf info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; if ( btest == 'S' ) @@ -240,17 +204,17 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); POTRS(uplo[uplos], &m, &m, b, &m, a, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Potrs info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; } @@ -258,18 +222,18 @@ int main(int argc, char *argv[]){ if ( btest == 'I' ) { - gettimeofday( &start, (struct timezone *)0); + begin(); POTRI(uplo[uplos], &m, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Potri info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; } diff --git a/benchmark/rot.c b/benchmark/rot.c 
index 69698988d..15b630e36 100644 --- a/benchmark/rot.c +++ b/benchmark/rot.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef ROT @@ -52,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -133,7 +63,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; 
argc--;argv++; @@ -179,13 +108,13 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef ROTM @@ -40,72 +35,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ROTM BLASFUNC(srotm) #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz) -{ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid = - shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT | 0600)) < 0) { - printf("Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf("Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -122,7 +51,7 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timeval start, stop; + double time1, timeg; argc--; @@ -188,14 +117,13 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - gettimeofday(&start, (struct timezone *)0); + begin(); ROTM(&m, x, &inc_x, y, &inc_y, param); - gettimeofday(&stop, (struct 
timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + - (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/scal.c b/benchmark/scal.c index 8bd62c77c..8de6cfd04 100644 --- a/benchmark/scal.c +++ b/benchmark/scal.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SCAL @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; 
-} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -128,7 +57,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -174,13 +102,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SCAL (&m, alpha, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/spmv.c b/benchmark/spmv.c index cff504d3b..e4dcbf4ae 100644 --- a/benchmark/spmv.c +++ b/benchmark/spmv.c @@ -25,17 +25,10 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SPMV - #ifndef COMPLEX #ifdef DOUBLE @@ -54,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -135,7 +63,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -193,13 +120,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/spr.c 
b/benchmark/spr.c index 5dcaa4f8b..2fc9994f8 100755 --- a/benchmark/spr.c +++ b/benchmark/spr.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SPR @@ -41,73 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SPR BLASFUNC(sspr) #endif - - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*c; @@ -129,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; 
- struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -173,13 +99,13 @@ int main(int argc, char *argv[]){ c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SPR (&uplo, &m, alpha, c, &inc_x, a); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/spr2.c b/benchmark/spr2.c index a5f2791f7..8f194e83a 100755 --- a/benchmark/spr2.c +++ b/benchmark/spr2.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SPR2 @@ -42,72 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*b,*c; @@ -129,7 +58,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -182,13 +110,13 @@ int main(int argc, char *argv[]){ c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SPR2 (&uplo, &m, alpha, c, &inc_x, b, &inc_y, a); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/swap.c b/benchmark/swap.c index 76d545995..64ebe5e9b 100644 --- 
a/benchmark/swap.c +++ b/benchmark/swap.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SWAP @@ -49,71 +44,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -128,7 +58,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -175,13 
+104,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SWAP (&m, x, &inc_x, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/symm.c b/benchmark/symm.c index bb9849eb5..1c6d91d00 100644 --- a/benchmark/symm.c +++ b/benchmark/symm.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYMM @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -137,7 +66,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -181,13 +109,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/symv.c b/benchmark/symv.c index e4c892b5a..0a35aaef0 100644 --- a/benchmark/symv.c +++ 
b/benchmark/symv.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYMV @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -134,7 +63,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -192,13 +120,13 @@ int main(int 
argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SYMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/syr.c b/benchmark/syr.c index a9dd293e6..ebbf2bd3c 100644 --- a/benchmark/syr.c +++ b/benchmark/syr.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SYR @@ -42,72 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x,*a; @@ -124,7 +53,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -165,13 +93,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYR (&uplo, &m, alpha, x, &inc_x, a, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/syr2.c b/benchmark/syr2.c index 9efbca315..acbc86987 100644 --- a/benchmark/syr2.c +++ b/benchmark/syr2.c @@ -25,13 +25,7 
@@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYR2 @@ -42,72 +36,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYR2 BLASFUNC(ssyr2) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y, *a; @@ -125,7 +53,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -174,13 +101,13 @@ int main(int argc, char 
*argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYR2 (&uplo, &m, alpha, x, &inc_x, y, &inc_y, a, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/syr2k.c b/benchmark/syr2k.c index a906559eb..3895c2861 100644 --- a/benchmark/syr2k.c +++ b/benchmark/syr2k.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SYR2K @@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = 
shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -137,7 +67,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -181,13 +110,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYR2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/syrk.c b/benchmark/syrk.c index 0fbb943f6..82606a21a 100644 --- a/benchmark/syrk.c +++ b/benchmark/syrk.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYRK @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *c; @@ -137,7 +66,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -177,13 +105,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYRK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/tpmv.c b/benchmark/tpmv.c index fe9d07534..41f2e0fb8 100644 --- a/benchmark/tpmv.c +++ b/benchmark/tpmv.c 
@@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TPMV @@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -112,7 +73,6 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timespec start = { 0, 0 }, stop = { 0, 0 }; double time1, timeg; argc--;argv++; @@ -153,11 +113,11 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - clock_gettime(CLOCK_REALTIME, &start); + begin(); TPMV (&uplo, &trans, &diag, &n, a, x, &inc_x); - clock_gettime(CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/tpsv.c b/benchmark/tpsv.c index 8472ac261..ebfa29692 100644 --- a/benchmark/tpsv.c +++ b/benchmark/tpsv.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TPSV @@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -112,7 +73,6 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timespec start = { 0, 0 }, stop = { 0, 0 }; double time1, timeg; argc--;argv++; @@ -153,11 +113,11 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - clock_gettime(CLOCK_REALTIME, &start); + begin(); TPSV (&uplo, &trans, &diag, &n, a, x, &inc_x); - clock_gettime(CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/trmm.c b/benchmark/trmm.c index 23af122b4..3ab9fc255 100644 --- a/benchmark/trmm.c +++ b/benchmark/trmm.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TRMM @@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -141,7 +71,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -180,13 +109,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); TRMM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m); - gettimeofday( &stop, (struct timezone *)0); + 
end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops %10.6f sec\n", diff --git a/benchmark/trmv.c b/benchmark/trmv.c index 46641b3e4..0e8088b54 100644 --- a/benchmark/trmv.c +++ b/benchmark/trmv.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TRMV @@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -112,7 +73,6 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timespec start = { 0, 0 }, stop = { 0, 0 }; double time1, timeg; argc--;argv++; @@ -153,11 +113,11 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - clock_gettime(CLOCK_REALTIME, &start); + begin(); TRMV (&uplo, &trans, &diag, &n, a, &n, x, &inc_x); - clock_gettime(CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + time1 = getsec(); timeg += time1; } diff --git 
a/benchmark/trsm.c b/benchmark/trsm.c index 17676946a..d2ebd7f54 100644 --- a/benchmark/trsm.c +++ b/benchmark/trsm.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TRSM @@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -151,7 +81,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct 
timeval start, stop; double time1; argc--;argv++; @@ -196,13 +125,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); TRSM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/trsv.c b/benchmark/trsv.c index 1734e2adb..66ac3a3c7 100644 --- a/benchmark/trsv.c +++ b/benchmark/trsv.c @@ -25,14 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include -#include "common.h" - +#include "bench.h" #undef GEMV #undef TRSV @@ -55,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x; @@ -133,7 +61,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timespec time_start, time_end; time_t seconds = 0; double time1,timeg; @@ -189,19 +116,13 @@ int main(int argc, char *argv[]){ for(l =0;l< loops;l++){ - clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_start); - + begin(); TRSV(&uplo,&transa,&diag,&n,a,&n,x,&inc_x); - - clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_end); - nanos = time_end.tv_nsec - time_start.tv_nsec; - seconds = time_end.tv_sec - time_start.tv_sec; - - time1 = seconds + nanos /1.e9; + end(); + time1 = getsec(); timeg += time1; } - timeg /= loops; long long muls = n*(n+1)/2.0; long long adds = (n 
- 1.0)*n/2.0; diff --git a/benchmark/zdot-intel.c b/benchmark/zdot-intel.c index ba1515365..06cdde13a 100644 --- a/benchmark/zdot-intel.c +++ b/benchmark/zdot-intel.c @@ -25,90 +25,18 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#define RETURN_BY_STACK 1 -#include "common.h" +#include "bench.h" +#define RETURN_BY_STACK 1 #undef DOT - #ifdef DOUBLE #define DOT BLASFUNC(zdotu) #else #define DOT BLASFUNC(cdotu) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, 
*y; @@ -123,7 +51,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -170,13 +97,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); DOT (&result, &m, x, &inc_x, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/zdot.c b/benchmark/zdot.c index fa624e859..23b3efcad 100644 --- a/benchmark/zdot.c +++ b/benchmark/zdot.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef DOT @@ -42,72 +36,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define DOT BLASFUNC(cdotu) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -122,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -169,15 +96,15 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); #ifdef RETURN_BY_STACK DOT (&result , &m, x, &inc_x, y, &inc_y ); #else result = DOT (&m, x, &inc_x, y, &inc_y ); #endif - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 
1.e-6; + time1 = getsec(); timeg += time1; From c24ba8b1dd155b30eb5b7c4e7dc7b38c9e6597e3 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Mon, 26 Oct 2020 13:24:59 -0500 Subject: [PATCH 016/121] Optimize saxpy for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. --- kernel/power/KERNEL.POWER10 | 2 +- kernel/power/saxpy_microk_power10.c | 181 ++++++++++++++++++++++++++++ kernel/power/saxpy_power10.c | 119 ++++++++++++++++++ 3 files changed, 301 insertions(+), 1 deletion(-) create mode 100644 kernel/power/saxpy_microk_power10.c create mode 100644 kernel/power/saxpy_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 86df7e3a2..1e514fcc9 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -141,7 +141,7 @@ DASUMKERNEL = dasum.c CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c # -SAXPYKERNEL = saxpy.c +SAXPYKERNEL = saxpy_power10.c DAXPYKERNEL = daxpy_power10.c ifneq ($(GCCVERSIONGTEQ9),1) CAXPYKERNEL = caxpy_power9.S diff --git a/kernel/power/saxpy_microk_power10.c b/kernel/power/saxpy_microk_power10.c new file mode 100644 index 000000000..6ede1dcdd --- /dev/null +++ b/kernel/power/saxpy_microk_power10.c @@ -0,0 +1,181 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void saxpy_kernel_64(long n, float *x, float *y, float alpha) +{ + __vector float t0 = {alpha, alpha,alpha, alpha}; + + __asm__ + ( + + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 40, 64(%2) \n\t" + "lxvp 42, 96(%2) \n\t" + "lxvp 48, 128(%2) \n\t" + "lxvp 50, 160(%2) \n\t" + "lxvp 52, 192(%2) \n\t" + "lxvp 54, 224(%2) \n\t" + + "lxvp 36, 0(%3) \n\t" + "lxvp 38, 32(%3) \n\t" + "lxvp 44, 64(%3) \n\t" + "lxvp 46, 96(%3) \n\t" + "lxvp 56, 128(%3) \n\t" + "lxvp 58, 160(%3) \n\t" + "lxvp 60, 192(%3) \n\t" + "lxvp 62, 224(%3) \n\t" + + "addi %2, %2, 256 \n\t" + + "addic. 
%1, %1, -64 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 36, 32, %x4 \n\t" + "xvmaddasp 37, 33, %x4 \n\t" + + "lxvp 32, 0(%2) \n\t" + "stxvp 36, 0(%3) \n\t" + + "xvmaddasp 38, 34, %x4 \n\t" + "xvmaddasp 39, 35, %x4 \n\t" + + "lxvp 34, 32(%2) \n\t" + "stxvp 38, 32(%3) \n\t" + + "lxvp 36, 256(%3) \n\t" + "lxvp 38, 288(%3) \n\t" + + "xvmaddasp 44, 40, %x4 \n\t" + "xvmaddasp 45, 41, %x4 \n\t" + + "lxvp 40, 64(%2) \n\t" + "stxvp 44, 64(%3) \n\t" + + "xvmaddasp 46, 42, %x4 \n\t" + "xvmaddasp 47, 43, %x4 \n\t" + + "lxvp 42, 96(%2) \n\t" + "stxvp 46, 96(%3) \n\t" + + "lxvp 44, 320(%3) \n\t" + "lxvp 46, 352(%3) \n\t" + + "xvmaddasp 56, 48, %x4 \n\t" + "xvmaddasp 57, 49, %x4 \n\t" + + "lxvp 48, 128(%2) \n\t" + "stxvp 56, 128(%3) \n\t" + + "xvmaddasp 58, 50, %x4 \n\t" + "xvmaddasp 59, 51, %x4 \n\t" + + "lxvp 50, 160(%2) \n\t" + "stxvp 58, 160(%3) \n\t" + + "lxvp 56, 384(%3) \n\t" + "lxvp 58, 416(%3) \n\t" + + "xvmaddasp 60, 52, %x4 \n\t" + "xvmaddasp 61, 53, %x4 \n\t" + + "lxvp 52, 192(%2) \n\t" + "stxvp 60, 192(%3) \n\t" + + "xvmaddasp 62, 54, %x4 \n\t" + "xvmaddasp 63, 55, %x4 \n\t" + + "lxvp 54, 224(%2) \n\t" + "stxvp 62, 224(%3) \n\t" + + "lxvp 60, 448(%3) \n\t" + "lxvp 62, 480(%3) \n\t" + + "addi %2, %2, 256 \n\t" + "addi %3, %3, 256 \n\t" + + "addic. 
%1, %1, -64 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddasp 36, 32, %x4 \n\t" + "xvmaddasp 37, 33, %x4 \n\t" + "xvmaddasp 38, 34, %x4 \n\t" + "xvmaddasp 39, 35, %x4 \n\t" + + "xvmaddasp 44, 40, %x4 \n\t" + "xvmaddasp 45, 41, %x4 \n\t" + "xvmaddasp 46, 42, %x4 \n\t" + "xvmaddasp 47, 43, %x4 \n\t" + + "xvmaddasp 56, 48, %x4 \n\t" + "xvmaddasp 57, 49, %x4 \n\t" + "xvmaddasp 58, 50, %x4 \n\t" + "xvmaddasp 59, 51, %x4 \n\t" + + "xvmaddasp 60, 52, %x4 \n\t" + "xvmaddasp 61, 53, %x4 \n\t" + "xvmaddasp 62, 54, %x4 \n\t" + "xvmaddasp 63, 55, %x4 \n\t" + "stxvp 36, 0(%3) \n\t" + "stxvp 38, 32(%3) \n\t" + "stxvp 44, 64(%3) \n\t" + "stxvp 46, 96(%3) \n\t" + "stxvp 56, 128(%3) \n\t" + "stxvp 58, 160(%3) \n\t" + "stxvp 60, 192(%3) \n\t" + "stxvp 62, 224(%3) \n\t" + + "#n=%1 x=%5=%2 y=%0=%3 t0=%x4\n" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "wa" (t0), // 4 + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", "vs38", "vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); + +} + + diff --git a/kernel/power/saxpy_power10.c b/kernel/power/saxpy_power10.c new file mode 100644 index 000000000..8c7c22390 --- /dev/null +++ b/kernel/power/saxpy_power10.c @@ -0,0 +1,119 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "saxpy_microk_power10.c" +#endif + +#ifndef HAVE_KERNEL_8 +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) +{ + BLASLONG register i = 0; + + while(i < n) + { + y[i] += alpha * x[i]; + y[i+1] += alpha * x[i+1]; + y[i+2] += alpha * x[i+2]; + y[i+3] += alpha * x[i+3]; + y[i+4] += alpha * x[i+4]; + y[i+5] += alpha * x[i+5]; + y[i+6] += alpha * x[i+6]; + y[i+7] += alpha * x[i+7]; + i+=8 ; + + } + +} +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -64; + + if ( n1 ) + saxpy_kernel_64(n1, x, y, da); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return(0); + + + } + + BLASLONG n1 = n 
& -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + From 878b6d1f410c740372a9b5addf6c5033d893cc12 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Oct 2020 21:35:40 +0100 Subject: [PATCH 017/121] Remove spurious expr in flang version check --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 30d8f4ccf..6d985786d 100644 --- a/Makefile.system +++ b/Makefile.system @@ -855,7 +855,7 @@ CCOMMON_OPT += -DF_INTERFACE_FLANG FCOMMON_OPT += -Mrecursive -Kieee ifeq ($(OSNAME), Linux) ifeq ($(ARCH), x86_64) -FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`) +FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`) ifeq ($(FLANG_VENDOR),AOCC) FCOMMON_OPT += -fno-unroll-loops endif From 6a1f3e40af7bd018f47afbf8fc543327b6552e48 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 26 Oct 2020 21:37:04 +0100 Subject: [PATCH 018/121] Remove debug printout of object list --- interface/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interface/Makefile b/interface/Makefile index 6b247b49f..7b60111f9 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -507,7 +507,7 @@ ifneq ($(BUILD_COMPLEX16),1) endif FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) -$(info FUNCOBJS = {[$(FUNCOBJS)]} ) + ifdef EXPRECISION FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) endif From b937d78a6d87dbda61a14788c33d48b9c885c6ca Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Oct 2020 17:51:32 +0100 Subject: [PATCH 019/121] Try to read cpu 
information from /sys/devices/system/cpu/cpu0 if HWCAP_CPUID fails --- driver/others/dynamic_arm64.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index be22b247c..007a221db 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -139,19 +139,30 @@ static gotoblas_t *force_coretype(char *coretype) { static gotoblas_t *get_coretype(void) { int implementer, variant, part, arch, revision, midr_el1; + char coremsg[128]; + +#if (!defined OS_LINUX && !defined OS_ANDROID) + return NULL; +#endif -#if (defined OS_LINUX || defined OS_ANDROID) if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) { - char coremsg[128]; +#ifdef __linux + FILE *infile; + char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; + p = (char *) NULL ; + infile = fopen("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1","r"); + if (!infile) return NULL; + fgets(buffer, sizeof(buffer), infile); + midr_el1=strtoul(buffer,NULL,16); + fclose(infile); +#else snprintf(coremsg, 128, "Kernel lacks cpuid feature support. 
Auto detection of core type failed !!!\n"); openblas_warning(1, coremsg); return NULL; - } -#else - return NULL; #endif - - get_cpu_ftr(MIDR_EL1, midr_el1); + } else { + get_cpu_ftr(MIDR_EL1, midr_el1); + } /* * MIDR_EL1 * @@ -219,6 +230,9 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_FALKOR; } break; + default: + snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); + openblas_warning(1, coremsg); } return NULL; } From e8cbf0fc50547e5b50bc2f15549515f64767d104 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 27 Oct 2020 23:01:19 +0100 Subject: [PATCH 020/121] Output predefined HAVE_ entries to Makefile.conf for ARM with specified TARGET --- getarch.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/getarch.c b/getarch.c index 3f1448305..ab90f36d9 100644 --- a/getarch.c +++ b/getarch.c @@ -1405,8 +1405,41 @@ int main(int argc, char *argv[]){ printf("NUM_CORES=%d\n", get_num_cores()); -#if defined(__arm__) && !defined(FORCE) +#if defined(__arm__) +#if !defined(FORCE) + fprintf(stderr,"get features!\n"); get_features(); +#else + fprintf(stderr,"split archconfig!\n"); + sprintf(buffer, "%s", ARCHCONFIG); + + p = &buffer[0]; + + while (*p) { + if ((*p == '-') && (*(p + 1) == 'D')) { + p += 2; + if (*p != 'H') { + while( (*p != ' ') && (*p != '-') && (*p != '\0') && (*p != '\n')) {p++; } + if (*p == '-') continue; + } + while ((*p != ' ') && (*p != '\0')) { + + if (*p == '=') { + printf("="); + p ++; + while ((*p != ' ') && (*p != '\0')) { + printf("%c", *p); + p ++; + } + } else { + printf("%c", *p); + p ++; + if ((*p == ' ') || (*p =='\0')) printf("=1\n"); + } + } + } else p ++; + } +#endif #endif From a7b1f9b1bbbfefb3f8b9dae126afdf054be97eda Mon Sep 17 00:00:00 2001 From: "Chen, Guobing" Date: Wed, 28 Oct 2020 08:49:12 +0800 Subject: [PATCH 021/121] Implementation of BF16 based gemv 1. Add a new API -- sbgemv to support bfloat16 based gemv 2. 
Implement a generic kernel for sbgemv 3. Implement an avx512-bf16 based kernel for sbgemv Signed-off-by: Chen, Guobing --- cblas.h | 1 + cmake/kernel.cmake | 4 +- common_interface.h | 2 + common_level2.h | 4 + common_macro.h | 10 +- common_param.h | 4 +- common_sb.h | 4 + driver/level2/Makefile | 16 +- driver/level2/sbgemv_thread.c | 149 + driver/others/blas_server_omp.c | 1 - exports/gensymbol | 4 +- interface/Makefile | 17 +- interface/gemv.c | 1 - interface/sbgemv.c | 210 ++ kernel/Makefile.L2 | 22 + kernel/setparam-ref.c | 2 +- kernel/x86_64/KERNEL | 8 + kernel/x86_64/bf16_common_macros.h | 795 +++++ kernel/x86_64/sbgemv_n.c | 137 + kernel/x86_64/sbgemv_n_microk_cooperlake.c | 76 + .../sbgemv_n_microk_cooperlake_template.c | 234 ++ kernel/x86_64/sbgemv_t.c | 142 + kernel/x86_64/sbgemv_t_microk_cooperlake.c | 202 ++ .../sbgemv_t_microk_cooperlake_template.c | 3082 +++++++++++++++++ 24 files changed, 5111 insertions(+), 16 deletions(-) create mode 100644 driver/level2/sbgemv_thread.c create mode 100644 interface/sbgemv.c create mode 100644 kernel/x86_64/bf16_common_macros.h create mode 100644 kernel/x86_64/sbgemv_n.c create mode 100644 kernel/x86_64/sbgemv_n_microk_cooperlake.c create mode 100644 kernel/x86_64/sbgemv_n_microk_cooperlake_template.c create mode 100644 kernel/x86_64/sbgemv_t.c create mode 100644 kernel/x86_64/sbgemv_t_microk_cooperlake.c create mode 100644 kernel/x86_64/sbgemv_t_microk_cooperlake_template.c diff --git a/cblas.h b/cblas.h index bf310bed2..da00d46d6 100644 --- a/cblas.h +++ b/cblas.h @@ -393,6 +393,7 @@ void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout); /* dot production of BFLOAT16 input arrays, and output as float */ float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST 
blasint incy); +void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); #ifdef __cplusplus } diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 7d7f5ffda..0c102bae5 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -184,8 +184,8 @@ macro(SetDefaultL2) set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c) if (BUILD_BFLOAT16) - set(SBGEMVNKERNEL ../arm/gemv_n.c) - set(SBGEMVTKERNEL ../arm/gemv_t.c) + set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) + set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) set(SHGERKERNEL ../generic/ger.c) endif () endmacro () diff --git a/common_interface.h b/common_interface.h index 032877fe1..b9ebb2772 100644 --- a/common_interface.h +++ b/common_interface.h @@ -250,6 +250,8 @@ void BLASFUNC(xgeru)(blasint *, blasint *, xdouble *, xdouble *, blasint *, void BLASFUNC(xgerc)(blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(sbgemv)(char *, blasint *, blasint *, float *, bfloat16 *, blasint *, + bfloat16 *, blasint *, float *, float *, blasint *); void BLASFUNC(sgemv)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *, diff --git a/common_level2.h b/common_level2.h index 640d4a073..9a5ebb4d9 100644 --- a/common_level2.h +++ b/common_level2.h @@ -44,6 +44,10 @@ extern "C" { #endif +int sbgemv_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); +int sbgemv_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); +int 
sbgemv_thread_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int); +int sbgemv_thread_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int); int sger_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int dger_k (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int qger_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); diff --git a/common_macro.h b/common_macro.h index 54deed57c..c6ea1bfd9 100644 --- a/common_macro.h +++ b/common_macro.h @@ -646,10 +646,12 @@ #elif defined(BFLOAT16) -#define D_TO_BF16_K SBDTOBF16_K -#define D_BF16_TO_K DBF16TOD_K -#define S_TO_BF16_K SBSTOBF16_K -#define S_BF16_TO_K SBF16TOS_K +#define D_TO_BF16_K SBDTOBF16_K +#define D_BF16_TO_K DBF16TOD_K +#define S_TO_BF16_K SBSTOBF16_K +#define S_BF16_TO_K SBF16TOS_K +#define SBGEMV_N SBGEMV_N_K +#define SBGEMV_T SBGEMV_T_K #define AMAX_K SAMAX_K #define AMIN_K SAMIN_K diff --git a/common_param.h b/common_param.h index b50e4ff80..3e3ae06f8 100644 --- a/common_param.h +++ b/common_param.h @@ -78,8 +78,8 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sbswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*sbgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*sbgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sbgemv_n) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); + int (*sbgemv_t) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, 
BLASLONG, float, float *, BLASLONG); int (*sbger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); diff --git a/common_sb.h b/common_sb.h index 66968ab00..9976e812e 100644 --- a/common_sb.h +++ b/common_sb.h @@ -8,6 +8,8 @@ #define SBDTOBF16_K sbdtobf16_k #define SBF16TOS_K sbf16tos_k #define DBF16TOD_K dbf16tod_k +#define SBGEMV_N_K sbgemv_n +#define SBGEMV_T_K sbgemv_t #define SBGEMM_ONCOPY sbgemm_oncopy #define SBGEMM_OTCOPY sbgemm_otcopy @@ -29,6 +31,8 @@ #define SBDTOBF16_K gotoblas -> sbdtobf16_k #define SBF16TOS_K gotoblas -> sbf16tos_k #define DBF16TOD_K gotoblas -> dbf16tod_k +#define SBGEMV_N_K gotoblas -> sbgemv_n +#define SBGEMV_T_K gotoblas -> sbgemv_t #define SBGEMM_ONCOPY gotoblas -> sbgemm_oncopy #define SBGEMM_OTCOPY gotoblas -> sbgemm_otcopy diff --git a/driver/level2/Makefile b/driver/level2/Makefile index 7212d6662..caecf4f97 100644 --- a/driver/level2/Makefile +++ b/driver/level2/Makefile @@ -413,7 +413,13 @@ XBLASOBJS += \ xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUN.$(SUFFIX) \ xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLN.$(SUFFIX) \ xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUN.$(SUFFIX) \ - xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX) \ + xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX) + +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += \ + sbgemv_thread_n$(TSUFFIX).$(SUFFIX) \ + sbgemv_thread_t$(TSUFFIX).$(SUFFIX) +endif endif @@ -3693,4 +3699,12 @@ xtrsv_CUU.$(SUFFIX) xtrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h xtrsv_CUN.$(SUFFIX) xtrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) +ifeq ($(BUILD_BFLOAT16),1) +sbgemv_thread_n.$(SUFFIX) sbgemv_thread_n.$(PSUFFIX) : sbgemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) +sbgemv_thread_t.$(SUFFIX) 
sbgemv_thread_t.$(PSUFFIX) : sbgemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) +endif + + include ../../Makefile.tail diff --git a/driver/level2/sbgemv_thread.c b/driver/level2/sbgemv_thread.c new file mode 100644 index 000000000..534c60f95 --- /dev/null +++ b/driver/level2/sbgemv_thread.c @@ -0,0 +1,149 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include +#include "common.h" + +#ifndef TRANSA +#define SBGEMV SBGEMV_N +#else +#define SBGEMV SBGEMV_T +#endif + +static int sbgemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *dummy2, BLASLONG dummy3){ + + bfloat16 *a, *x; + float *y; + BLASLONG lda, incx, incy; + BLASLONG m_from, m_to, n_from, n_to; + + a = (bfloat16 *)args->a; + x = (bfloat16 *)args->b; + y = (float *)args->c; + + lda = args->lda; + incx = args->ldb; + incy = args->ldc; + +#ifndef TRANSA // N + m_from = *(range_m + 0); + m_to = *(range_m + 1); + n_from = 0; + n_to = args -> n; + a += m_from; + y += m_from * incy; +#else // T + m_from = 0; + m_to = args->m; + n_from = *(range_n + 0); + n_to = *(range_n + 1); + a += n_from * lda; + y += n_from * incy; +#endif + + SBGEMV(m_to - m_from, n_to - n_from, *((FLOAT *)(args->alpha)), a, lda, x, incx, *((FLOAT *)(args->beta)), y, incy); + + return 0; +} + +int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy, int threads) +{ + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range[MAX_CPU_NUMBER + 1]; + +#ifndef TRANSA + BLASLONG width_for_split = m; +#else + BLASLONG width_for_split = n; +#endif + + BLASLONG BLOCK_WIDTH = width_for_split/threads; + + int mode = BLAS_BFLOAT16 | BLAS_REAL; + + args.m = m; + args.n = n; + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)y; + args.lda = lda; + args.ldb = incx; + args.ldc = incy; + args.alpha = (void *)α + args.beta = (void *)β + + range[0] = 0; + + int thread_idx; + + for (thread_idx=0; thread_idxsb=sb; } } diff 
--git a/exports/gensymbol b/exports/gensymbol index 22e470da5..857a17a9e 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -51,7 +51,7 @@ zgeadd, dzsum); @blasobjs = (lsame, xerbla); -@bfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); +@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); @cblasobjsc = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -94,7 +94,7 @@ @cblasobjs = ( cblas_xerbla ); -@bfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); +@bfcblasobjs = (cblas_sbgemm, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, diff --git a/interface/Makefile b/interface/Makefile index 7b60111f9..7b0bf1792 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -48,6 +48,7 @@ SBLAS3OBJS = \ ifeq ($(BUILD_BFLOAT16),1) SBBLAS1OBJS = sbdot.$(SUFFIX) +SBBLAS2OBJS = sbgemv.$(SUFFIX) SBBLAS3OBJS = sbgemm.$(SUFFIX) SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) endif @@ -284,6 +285,7 @@ CSBLAS3OBJS = \ ifeq ($(BUILD_BFLOAT16),1) CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) +CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX) CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif @@ -382,6 +384,7 @@ SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS3OBJS += $(CSBLAS3OBJS) SBBLAS1OBJS += $(CSBBLAS1OBJS) +SBBLAS2OBJS += $(CSBBLAS2OBJS) SBBLAS3OBJS += $(CSBBLAS3OBJS) DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS2OBJS += $(CDBLAS2OBJS) @@ -399,7 +402,7 @@ CBAUXOBJS += $(CXERBLAOBJ) endif SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) -SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS3OBJS) 
+SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS2OBJS) $(SBBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) @@ -538,7 +541,7 @@ clean :: level1 : $(SBEXTOBJS) $(SBBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) +level2 : $(SBBLAS2OBJS) $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) @@ -929,6 +932,11 @@ xgeru.$(SUFFIX) xgeru.$(PSUFFIX) : zger.c xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) +ifeq ($(BUILD_BFLOAT16),1) +sbgemv.$(SUFFIX) sbgemv.$(PSUFFIX) : sbgemv.c + $(CC) $(CFLAGS) -c $< -o $(@F) +endif + ifndef USE_NETLIB_GEMV sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c $(CC) -c $(CFLAGS) -o $(@F) $< @@ -1656,6 +1664,11 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) +ifeq ($(BUILD_BFLOAT16),1) +cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +endif + cblas_sgemv.$(SUFFIX) cblas_sgemv.$(PSUFFIX): gemv.c $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< diff --git a/interface/gemv.c b/interface/gemv.c index c9d52cd69..d5d739fb1 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -191,7 +191,6 @@ void CNAME(enum CBLAS_ORDER order, } #endif - //printf("m=%d, n=%d, trans=%d, incx=%d, incy=%d, alpha=%f, beta=%f\n", m, n, trans, incx, incy, alpha, beta); if ((m==0) || (n==0)) return; lenx = n; diff --git a/interface/sbgemv.c b/interface/sbgemv.c new file mode 100644 index 000000000..89debe82d --- 
/dev/null +++ b/interface/sbgemv.c @@ -0,0 +1,210 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include +#include "common.h" +#include "l1param.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#define ERROR_NAME "SBGEMV " + +#ifdef SMP +static int (*sbgemv_thread[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG, int) = { + sbgemv_thread_n, sbgemv_thread_t, +}; +#endif + +#ifndef CBLAS + +void NAME(char *TRANS, blasint *M, blasint *N, float *ALPHA, bfloat16 *a, blasint *LDA, bfloat16 *x, blasint *INCX, float *BETA, float *y, blasint *INCY) +{ + char trans = *TRANS; + blasint m = *M; + blasint n = *N; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + float alpha = *ALPHA; + float beta = *BETA; +#ifdef SMP + int nthreads; +#endif + + int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG) = { + SBGEMV_N, SBGEMV_T, + }; + + blasint info; + blasint lenx, leny; + blasint i; + + PRINT_DEBUG_NAME; + + TOUPPER(trans); + + info = 0; + + i = -1; + + if (trans == 'N') {i = 0;} + if (trans == 'T') {i = 1;} + if (trans == 'R') {i = 0;} + if (trans == 'C') {i = 1;} + + if (incy == 0) {info = 11;} + if (incx == 0) {info = 8;} + if (lda < MAX(1, m)) {info = 6;} + if (n < 0) {info = 3;} + if (m < 0) {info = 2;} + if (i < 0) {info = 1;} + + trans = i; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, float alpha, bfloat16 *a, blasint lda, bfloat16 *x, blasint incx, float beta, float *y, blasint incy) +{ + blasint lenx, leny; + int trans; + blasint info, t; +#ifdef SMP + int nthreads; +#endif + + int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG) = { + SBGEMV_N, SBGEMV_T, + }; + + PRINT_DEBUG_CNAME; + + trans = -1; + info = 0; + + if (order == 
CblasColMajor) { // Column Major + if (TransA == CblasNoTrans || TransA == CblasConjNoTrans) { + trans = 0; + } else if (TransA == CblasTrans || TransA == CblasConjTrans) { + trans = 1; + } + } else { // Row Major + if (TransA == CblasNoTrans || TransA == CblasConjNoTrans) { + trans = 1; + } else if (TransA == CblasTrans || TransA == CblasConjTrans) { + trans = 0; + } + + t = n; + n = m; + m = t; + } + + info = -1; + + if (incy == 0) {info = 11;} + if (incx == 0) {info = 8;} + if (lda < MAX(1, m)) {info = 6;} + if (n < 0) {info = 3;} + if (m < 0) {info = 2;} + if (trans < 0) {info = 1;} + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if ((m==0) || (n==0)) return; + + if (trans) { + lenx = m; + leny = n; + } else { + lenx = n; + leny = m; + } + + if (alpha == ZERO) { + if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); + return; + } + + IDEBUG_START; + FUNCTION_PROFILE_START(); + + if (incx < 0) {x -= (lenx - 1) * incx;} + if (incy < 0) {y -= (leny - 1) * incy;} + +#ifdef SMP + int thread_thres_row = 20480; + if (trans) { + if (n <= thread_thres_row) { + nthreads = 1; + } else { + nthreads = num_cpu_avail(1); + } + } else { + if (m <= thread_thres_row) { + nthreads = 1; + } else { + nthreads = num_cpu_avail(1); + } + } + + + if (nthreads == 1) { +#endif + (sbgemv[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy); +#ifdef SMP + } else { + (sbgemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy, nthreads); + } +#endif + + FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); + IDEBUG_END; + + return; +} diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2 index 79399c342..888a9b959 100644 --- a/kernel/Makefile.L2 +++ b/kernel/Makefile.L2 @@ -48,6 +48,16 @@ ifndef XGEMVTKERNEL XGEMVTKERNEL = zgemv_t.S endif +ifeq ($(BUILD_BFLOAT16),1) +ifndef SBGEMVNKERNEL +SBGEMVNKERNEL = ../x86_64/sbgemv_n.c +endif + +ifndef SBGEMVTKERNEL +SBGEMVTKERNEL = ../x86_64/sbgemv_t.c 
+endif +endif + ### GER ### ifndef SGERKERNEL @@ -234,6 +244,12 @@ XBLASOBJS += \ xhemv_U$(TSUFFIX).$(SUFFIX) xhemv_L$(TSUFFIX).$(SUFFIX) xhemv_V$(TSUFFIX).$(SUFFIX) xhemv_M$(TSUFFIX).$(SUFFIX) \ xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += \ + sbgemv_n$(TSUFFIX).$(SUFFIX) \ + sbgemv_t$(TSUFFIX).$(SUFFIX) +endif + ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" "" $(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -UTRANS $< -o $@ @@ -483,4 +499,10 @@ $(KDIR)xhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_V$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ +ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)sbgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_n$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMVNKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ +$(KDIR)sbgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_t$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMVTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ +endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 849a4194a..d0317a745 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -69,7 +69,7 @@ gotoblas_t TABLE_NAME = { snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS, dsdot_kTS, srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, - sgemv_nTS, sgemv_tTS, sger_kTS, + sbgemv_nTS, sbgemv_tTS, sger_kTS, ssymv_LTS, ssymv_UTS, sbgemm_kernelTS, sbgemm_betaTS, diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 855e1ff8c..b92f480e9 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -384,6 +384,14 @@ endif GEMVDEP = ../l2param.h +ifndef SBGEMVNKERNEL +SBGEMVNKERNEL = sbgemv_n.c +endif + +ifndef 
SBGEMVTKERNEL +SBGEMVTKERNEL = sbgemv_t.c +endif + ifndef SGEMVNKERNEL SGEMVNKERNEL = sgemv_n.c endif diff --git a/kernel/x86_64/bf16_common_macros.h b/kernel/x86_64/bf16_common_macros.h new file mode 100644 index 000000000..1014ecc4d --- /dev/null +++ b/kernel/x86_64/bf16_common_macros.h @@ -0,0 +1,795 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#ifndef __BF16_COMMON_MACROS +#define __BF16_COMMON_MACROS + +#include + +#define EXTRACT_LOW_256_FROM_512_2X(reg256, reg512) \ + reg256##_0 = _mm512_castps512_ps256(reg512##_0); \ + reg256##_1 = _mm512_castps512_ps256(reg512##_1); + + +#define BF16_MATRIX_LOAD_8x32(regArray, a, lda, idx_m, idx_n) \ + regArray##_0 = _mm512_loadu_si512(&a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_loadu_si512(&a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm512_loadu_si512(&a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm512_loadu_si512(&a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm512_loadu_si512(&a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm512_loadu_si512(&a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm512_loadu_si512(&a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = _mm512_loadu_si512(&a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n) \ + regArray##_0 = _mm256_loadu_si256(&a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm256_loadu_si256(&a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm256_loadu_si256(&a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm256_loadu_si256(&a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm256_loadu_si256(&a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm256_loadu_si256(&a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm256_loadu_si256(&a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = _mm256_loadu_si256(&a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n) \ + regArray##_0 = _mm_loadu_si128(&a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm_loadu_si128(&a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm_loadu_si128(&a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm_loadu_si128(&a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm_loadu_si128(&a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm_loadu_si128(&a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm_loadu_si128(&a[(idx_m+6)*lda 
+ idx_n]); \ + regArray##_7 = _mm_loadu_si128(&a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n) \ + regArray = _mm512_loadu_si512(&a[idx_m*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_8x32(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_8x16(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_8x8(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 
= _mm_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_4x32(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_4x16(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_8x32_2(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \ + regArray##_4 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+8)*lda + idx_n]); \ + regArray##_5 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+10)*lda + idx_n]); \ + regArray##_6 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+12)*lda + idx_n]); \ + regArray##_7 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+14)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_4x32_2(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda 
+ idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); + +#define BF16_MATRIX_MASKZ_LOAD_1x32(regArray, a, lda, idx_m, idx_n, mask) \ + regArray = _mm512_maskz_loadu_epi16(mask, &a[idx_m*lda + idx_n]); + +#define BF16_VECTOR_LOAD_1x32(reg, x, idx_n) \ + reg = _mm512_loadu_si512(x + idx_n); + + +#define BF16_VECTOR_LOAD_1x16(reg, x, idx_n) \ + reg = _mm256_loadu_si256(x + idx_n); + + +#define BF16_VECTOR_LOAD_1x8(reg, x, idx_n) \ + reg = _mm_loadu_si128(x + idx_n); + + +#define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask) \ + reg = _mm512_maskz_loadu_epi16(mask, x + idx_n); + + +#define BF16_VECTOR_MASKZ_LOAD_1x16(reg, x, idx_n, mask) \ + reg = _mm256_maskz_loadu_epi16(mask, x + idx_n); + + +#define BF16_VECTOR_MASKZ_LOAD_1x8(reg, x, idx_n, mask) \ + reg = _mm_maskz_loadu_epi16(mask, x + idx_n); + + +/* 2-step interleave for matrix against 8 rows with 32 BF16 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11|a16|a17|b16|b17|a18|a19|b18|b19|a24|a25|b24|b25|a26|a27|b26|b27 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11|c16|c17|d16|d17|c18|c19|d18|d19|c24|c25|d24|d25|c26|c27|d26|d27 + |e0|e1|f0|f1|e2|e3|f2|f3|e8 |e9 |f8 |f9 |e10|e11|f10|f11|e16|e17|f16|f17|e18|e19|f18|f19|e24|e25|f24|f25|e26|e27|f26|f27 + |g0|g1|h0|h1|g2|g3|h2|h3|g8 |g9 |h8 |h9 |g10|g11|h10|h11|g16|g17|h16|h17|g18|g19|h18|h19|g24|g25|h24|h25|g26|g27|h26|h27 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15|a20|a21|b20|b21|a22|a23|b22|b23|a28|a29|b28|b29|a30|a31|b30|b31 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15|c20|c21|d20|d21|c22|c23|d22|d23|c28|c29|d28|d29|c30|c31|d30|d31 + |e4|e5|f4|f5|e6|e7|f6|f7|e12|e13|f12|f13|e14|e15|f14|f15|e20|e21|f20|f21|e22|e23|f22|f23|e28|e29|f28|f29|e30|e31|f30|f31 + 
|g4|g5|h4|h5|g6|g7|h6|h7|g12|g13|h12|h13|g14|g15|h14|h15|g20|g21|h20|h21|g22|g23|h22|h23|g28|g29|h28|h29|g30|g31|h30|h31 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 |a16|a17|b16|b17|c16|c17|d16|d17|a24|a25|b24|b25|c24|c25|d24|d25 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11|a18|a19|b18|b19|c18|c19|d18|d19|a26|a27|b26|b27|c26|c27|d26|d27 + |e0|e1|f0|f1|g0|g1|h0|h1|e8 |e9 |f8 |f9 |g8 |g9 |h8 |h9 |e16|e17|f16|f17|g16|g17|h16|h17|e24|e25|f24|f25|g24|g25|h24|h25 + |e2|e3|f2|f3|g2|g3|h2|h3|e10|e11|f10|f11|g10|g11|h10|h11|e18|e19|f18|f19|g18|g19|h18|h19|e26|e27|f26|f27|g26|g27|h26|h27 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13|a20|a21|b20|b21|c20|c21|d20|d21|a28|a29|b28|b29|c28|c29|d28|d29 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15|a22|a23|b22|b23|c22|c23|d22|d23|a30|a31|b30|b31|c30|c31|d30|d31 + |e4|e5|f4|f5|g4|g5|h4|h5|e12|e13|f12|f13|g12|g13|h12|h13|e20|e21|f20|f21|g20|g21|h20|h21|e28|e29|f28|f29|g28|g29|h28|h29 + |e6|e7|f6|f7|g6|g7|h6|h7|e14|e15|f14|f15|g14|g15|h14|h15|e22|e23|f22|f23|g22|g23|h22|h23|e30|e31|f30|f31|g30|g31|h30|h31 +*/ +#define BF16_INTERLEAVE_8x32(regArray) \ + regArray##_8 = _mm512_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_9 = _mm512_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_10 = _mm512_unpacklo_epi32(regArray##_4, regArray##_5); \ + regArray##_11 = _mm512_unpacklo_epi32(regArray##_6, regArray##_7); \ + regArray##_12 = _mm512_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_13 = _mm512_unpackhi_epi32(regArray##_2, regArray##_3); \ + regArray##_14 = _mm512_unpackhi_epi32(regArray##_4, regArray##_5); \ + regArray##_15 = _mm512_unpackhi_epi32(regArray##_6, regArray##_7); \ + \ + regArray##_0 = _mm512_unpacklo_epi64(regArray##_8, regArray##_9); \ + regArray##_1 = _mm512_unpackhi_epi64(regArray##_8, regArray##_9); \ + regArray##_2 = _mm512_unpacklo_epi64(regArray##_10, regArray##_11); \ + regArray##_3 = 
_mm512_unpackhi_epi64(regArray##_10, regArray##_11); \ + regArray##_4 = _mm512_unpacklo_epi64(regArray##_12, regArray##_13); \ + regArray##_5 = _mm512_unpackhi_epi64(regArray##_12, regArray##_13); \ + regArray##_6 = _mm512_unpacklo_epi64(regArray##_14, regArray##_15); \ + regArray##_7 = _mm512_unpackhi_epi64(regArray##_14, regArray##_15); + + +/* 2-step interleave for matrix against 8 rows with 16 BF16 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11 + |e0|e1|f0|f1|e2|e3|f2|f3|e8 |e9 |f8 |f9 |e10|e11|f10|f11 + |g0|g1|h0|h1|g2|g3|h2|h3|g8 |g9 |h8 |h9 |g10|g11|h10|h11 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15 + |e4|e5|f4|f5|e6|e7|f6|f7|e12|e13|f12|f13|e14|e15|f14|f15 + |g4|g5|h4|h5|g6|g7|h6|h7|g12|g13|h12|h13|g14|g15|h14|h15 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11 + |e0|e1|f0|f1|g0|g1|h0|h1|e8 |e9 |f8 |f9 |g8 |g9 |h8 |h9 + |e2|e3|f2|f3|g2|g3|h2|h3|e10|e11|f10|f11|g10|g11|h10|h11 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15 + |e4|e5|f4|f5|g4|g5|h4|h5|e12|e13|f12|f13|g12|g13|h12|h13 + |e6|e7|f6|f7|g6|g7|h6|h7|e14|e15|f14|f15|g14|g15|h14|h15 +*/ +#define BF16_INTERLEAVE_8x16(regArray) \ + regArray##_8 = _mm256_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_9 = _mm256_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_10 = _mm256_unpacklo_epi32(regArray##_4, regArray##_5); \ + regArray##_11 = _mm256_unpacklo_epi32(regArray##_6, regArray##_7); \ + regArray##_12 = _mm256_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_13 = _mm256_unpackhi_epi32(regArray##_2, 
regArray##_3); \ + regArray##_14 = _mm256_unpackhi_epi32(regArray##_4, regArray##_5); \ + regArray##_15 = _mm256_unpackhi_epi32(regArray##_6, regArray##_7); \ + \ + regArray##_0 = _mm256_unpacklo_epi64(regArray##_8, regArray##_9); \ + regArray##_1 = _mm256_unpackhi_epi64(regArray##_8, regArray##_9); \ + regArray##_2 = _mm256_unpacklo_epi64(regArray##_10, regArray##_11); \ + regArray##_3 = _mm256_unpackhi_epi64(regArray##_10, regArray##_11); \ + regArray##_4 = _mm256_unpacklo_epi64(regArray##_12, regArray##_13); \ + regArray##_5 = _mm256_unpackhi_epi64(regArray##_12, regArray##_13); \ + regArray##_6 = _mm256_unpacklo_epi64(regArray##_14, regArray##_15); \ + regArray##_7 = _mm256_unpackhi_epi64(regArray##_14, regArray##_15); + +/* 2-step interleave for matrix against 8 rows with 32 BF16 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11|a16|a17|b16|b17|a18|a19|b18|b19|a24|a25|b24|b25|a26|a27|b26|b27 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11|c16|c17|d16|d17|c18|c19|d18|d19|c24|c25|d24|d25|c26|c27|d26|d27 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15|a20|a21|b20|b21|a22|a23|b22|b23|a28|a29|b28|b29|a30|a31|b30|b31 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15|c20|c21|d20|d21|c22|c23|d22|d23|c28|c29|d28|d29|c30|c31|d30|d31 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 |a16|a17|b16|b17|c16|c17|d16|d17|a24|a25|b24|b25|c24|c25|d24|d25 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11|a18|a19|b18|b19|c18|c19|d18|d19|a26|a27|b26|b27|c26|c27|d26|d27 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13|a20|a21|b20|b21|c20|c21|d20|d21|a28|a29|b28|b29|c28|c29|d28|d29 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15|a22|a23|b22|b23|c22|c23|d22|d23|a30|a31|b30|b31|c30|c31|d30|d31 +*/ +#define 
BF16_INTERLEAVE_4x32(regArray) \ + regArray##_4 = _mm512_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_5 = _mm512_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_6 = _mm512_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_7 = _mm512_unpackhi_epi32(regArray##_2, regArray##_3); \ + \ + regArray##_0 = _mm512_unpacklo_epi64(regArray##_4, regArray##_5); \ + regArray##_1 = _mm512_unpackhi_epi64(regArray##_4, regArray##_5); \ + regArray##_2 = _mm512_unpacklo_epi64(regArray##_6, regArray##_7); \ + regArray##_3 = _mm512_unpackhi_epi64(regArray##_6, regArray##_7); + + +/* 2-step interleave for matrix against 8 rows with 16 BF16 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15 +*/ +#define BF16_INTERLEAVE_4x16(regArray) \ + regArray##_4 = _mm256_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_5 = _mm256_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_6 = _mm256_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_7 = _mm256_unpackhi_epi32(regArray##_2, regArray##_3); \ + \ + regArray##_0 = _mm256_unpacklo_epi64(regArray##_4, regArray##_5); \ + regArray##_1 = _mm256_unpackhi_epi64(regArray##_4, regArray##_5); \ + regArray##_2 = _mm256_unpacklo_epi64(regArray##_6, regArray##_7); \ + regArray##_3 = _mm256_unpackhi_epi64(regArray##_6, regArray##_7); + + +/* 2-step interleave for x with 32 BF16 elements + Input - 
original vector + Output - the output of Step 2 + + Step 1: 2-element interleave for x: + |x0|x1|x0|x1|x2|x3|x2|x3|x8 |x9 |x8 |x9 |x10|x11|x10|x11|x16|x17|x16|x17|x18|x19|x18|x19|x24|x25|x24|x25|x26|x27|x26|x27 + |x4|x5|x4|x5|x6|x7|x6|x7|x12|x13|x12|x13|x14|x15|x14|x15|x20|x21|x20|x21|x22|x23|x22|x23|x28|x29|x28|x29|x30|x31|x30|x31 + + Step 2: 4-element interleave for x: + |x0|x1|x0|x1|x0|x1|x0|x1|x8 |x9 |x8 |x9 |x8 |x9 |x8 |x9 |x16|x17|x16|x17|x16|x17|x16|x17|x24|x25|x24|x25|x24|x25|x24|x25 + |x2|x3|x2|x3|x2|x3|x2|x3|x10|x11|x10|x11|x10|x11|x10|x11|x18|x19|x18|x19|x18|x19|x18|x19|x26|x27|x26|x27|x26|x27|x26|x27 + |x4|x5|x4|x5|x4|x5|x4|x5|x12|x13|x12|x13|x12|x13|x12|x13|x20|x21|x20|x21|x20|x21|x20|x21|x28|x29|x28|x29|x28|x29|x28|x29 + |x6|x7|x6|x7|x6|x7|x6|x7|x14|x15|x14|x15|x14|x15|x14|x15|x22|x23|x22|x23|x22|x23|x22|x23|x30|x31|x30|x31|x30|x31|x30|x31 +*/ +#define BF16_INTERLEAVE_1x32(regArray) \ + regArray##_1 = _mm512_unpacklo_epi32(regArray##_0, regArray##_0); \ + regArray##_3 = _mm512_unpackhi_epi32(regArray##_0, regArray##_0); \ + \ + regArray##_0 = _mm512_unpacklo_epi64(regArray##_1, regArray##_1); \ + regArray##_1 = _mm512_unpackhi_epi64(regArray##_1, regArray##_1); \ + regArray##_2 = _mm512_unpacklo_epi64(regArray##_3, regArray##_3); \ + regArray##_3 = _mm512_unpackhi_epi64(regArray##_3, regArray##_3); + + +/* 2-step interleave for x with 16 BF16 elements + Input - original vector + Output - the output of Step 2 + + Step 1: 2-element interleave for x: + |x0|x1|x0|x1|x2|x3|x2|x3|x8 |x9 |x8 |x9 |x10|x11|x10|x11 + |x4|x5|x4|x5|x6|x7|x6|x7|x12|x13|x12|x13|x14|x15|x14|x15 + + Step 2: 4-element interleave for x: + |x0|x1|x0|x1|x0|x1|x0|x1|x8 |x9 |x8 |x9 |x8 |x9 |x8 |x9 + |x2|x3|x2|x3|x2|x3|x2|x3|x10|x11|x10|x11|x10|x11|x10|x11 + |x4|x5|x4|x5|x4|x5|x4|x5|x12|x13|x12|x13|x12|x13|x12|x13 + |x6|x7|x6|x7|x6|x7|x6|x7|x14|x15|x14|x15|x14|x15|x14|x15 +*/ +#define BF16_INTERLEAVE_1x16(regArray) \ + regArray##_1 = _mm256_unpacklo_epi32(regArray##_0, regArray##_0); \ + 
regArray##_3 = _mm256_unpackhi_epi32(regArray##_0, regArray##_0); \ + \ + regArray##_0 = _mm256_unpacklo_epi64(regArray##_1, regArray##_1); \ + regArray##_1 = _mm256_unpackhi_epi64(regArray##_1, regArray##_1); \ + regArray##_2 = _mm256_unpacklo_epi64(regArray##_3, regArray##_3); \ + regArray##_3 = _mm256_unpackhi_epi64(regArray##_3, regArray##_3); + +/* 1-step interleave to exchange the high-256s bit and low-256 bits of 4 pair of registers + |a0|a1|...|a14|a15|i0|i1|...|i14|i15| + |b0|b1|...|b14|b15|j0|j1|...|j14|j15| + |c0|c1|...|c14|c15|k0|k1|...|k14|k15| + |d0|d1|...|d14|d15|l0|l1|...|l14|l15| + |e0|e1|...|e14|e15|m0|m1|...|m14|m15| + |f0|f1|...|f14|f15|n0|n1|...|n14|n15| + |g0|g1|...|g14|g15|o0|o1|...|o14|o15| + |h0|h1|...|h14|h15|p0|p1|...|p14|p15| +*/ +#define BF16_INTERLEAVE256_8x32(regArray) \ + regArray##_0 = _mm512_shuffle_i32x4(regArray##_8, regArray##_12, 0x44); \ + regArray##_1 = _mm512_shuffle_i32x4(regArray##_8, regArray##_12, 0xee); \ + regArray##_2 = _mm512_shuffle_i32x4(regArray##_9, regArray##_13, 0x44); \ + regArray##_3 = _mm512_shuffle_i32x4(regArray##_9, regArray##_13, 0xee); \ + regArray##_4 = _mm512_shuffle_i32x4(regArray##_10, regArray##_14, 0x44); \ + regArray##_5 = _mm512_shuffle_i32x4(regArray##_10, regArray##_14, 0xee); \ + regArray##_6 = _mm512_shuffle_i32x4(regArray##_11, regArray##_15, 0x44); \ + regArray##_7 = _mm512_shuffle_i32x4(regArray##_11, regArray##_15, 0xee); + + +/* 1-step interleave to exchange the high-256s bit and low-256 bits of 2 pair of registers + |a0|a1|...|a14|a15|e0|e1|...|e14|e15| + |b0|b1|...|b14|b15|f0|f1|...|f14|f15| + |c0|c1|...|c14|c15|g0|g1|...|g14|g15| + |d0|d1|...|d14|d15|h0|h1|...|h14|h15| +*/ +#define BF16_INTERLEAVE256_4x32(regArray) \ + regArray##_0 = _mm512_shuffle_i32x4(regArray##_4, regArray##_6, 0x44); \ + regArray##_1 = _mm512_shuffle_i32x4(regArray##_4, regArray##_6, 0xee); \ + regArray##_2 = _mm512_shuffle_i32x4(regArray##_5, regArray##_7, 0x44); \ + regArray##_3 = 
_mm512_shuffle_i32x4(regArray##_5, regArray##_7, 0xee); + + +#define BF16_PERMUTE_8x32(idx, regArray) \ + regArray##_8 = _mm512_permutexvar_epi16(idx, regArray##_0); \ + regArray##_9 = _mm512_permutexvar_epi16(idx, regArray##_1); \ + regArray##_10 = _mm512_permutexvar_epi16(idx, regArray##_2); \ + regArray##_11 = _mm512_permutexvar_epi16(idx, regArray##_3); \ + regArray##_12 = _mm512_permutexvar_epi16(idx, regArray##_4); \ + regArray##_13 = _mm512_permutexvar_epi16(idx, regArray##_5); \ + regArray##_14 = _mm512_permutexvar_epi16(idx, regArray##_6); \ + regArray##_15 = _mm512_permutexvar_epi16(idx, regArray##_7); + + +#define BF16_PERMUTE_8x32_2(idx, regArray) \ + regArray##_8 = _mm512_permutexvar_epi32(idx, regArray##_0); \ + regArray##_9 = _mm512_permutexvar_epi32(idx, regArray##_1); \ + regArray##_10 = _mm512_permutexvar_epi32(idx, regArray##_2); \ + regArray##_11 = _mm512_permutexvar_epi32(idx, regArray##_3); \ + regArray##_12 = _mm512_permutexvar_epi32(idx, regArray##_4); \ + regArray##_13 = _mm512_permutexvar_epi32(idx, regArray##_5); \ + regArray##_14 = _mm512_permutexvar_epi32(idx, regArray##_6); \ + regArray##_15 = _mm512_permutexvar_epi32(idx, regArray##_7); + + +#define BF16_PERMUTE_4x32(idx, regArray) \ + regArray##_4 = _mm512_permutexvar_epi16(idx, regArray##_0); \ + regArray##_5 = _mm512_permutexvar_epi16(idx, regArray##_1); \ + regArray##_6 = _mm512_permutexvar_epi16(idx, regArray##_2); \ + regArray##_7 = _mm512_permutexvar_epi16(idx, regArray##_3); + + +#define BF16_PERMUTE_4x32_2(idx, regArray) \ + regArray##_4 = _mm512_permutexvar_epi32(idx, regArray##_0); \ + regArray##_5 = _mm512_permutexvar_epi32(idx, regArray##_1); \ + regArray##_6 = _mm512_permutexvar_epi32(idx, regArray##_2); \ + regArray##_7 = _mm512_permutexvar_epi32(idx, regArray##_3); + + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_8x32(accumArray, 
matArray, xArray) \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray##_0); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_2, (__m512bh) xArray##_0); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_1, (__m512bh) xArray##_1); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_3, (__m512bh) xArray##_1); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_4, (__m512bh) xArray##_2); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_6, (__m512bh) xArray##_2); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_5, (__m512bh) xArray##_3); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_7, (__m512bh) xArray##_3); + + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_8x16(accumArray, matArray, xArray) \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray##_0); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_2, (__m256bh) xArray##_0); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_1, (__m256bh) xArray##_1); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_3, (__m256bh) xArray##_1); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_4, (__m256bh) xArray##_2); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_6, (__m256bh) xArray##_2); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_5, (__m256bh) xArray##_3); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_7, (__m256bh) xArray##_3); + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for 
_mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_4x32(accumArray, matArray, xArray) \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray##_0); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_1, (__m512bh) xArray##_1); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_2, (__m512bh) xArray##_2); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_3, (__m512bh) xArray##_3); + + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_4x16(accumArray, matArray, xArray) \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray##_0); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_1, (__m256bh) xArray##_1); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_2, (__m256bh) xArray##_2); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_3, (__m256bh) xArray##_3); + + +/* Calculate the dot result for matrix and vector at 32 elements per row + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_DOT_8x32(accumArray, matArray, xArray) \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_1, (__m512bh) xArray); \ + accumArray##_2 = _mm512_dpbf16_ps(accumArray##_2, (__m512bh) matArray##_2, (__m512bh) xArray); \ + accumArray##_3 = _mm512_dpbf16_ps(accumArray##_3, (__m512bh) matArray##_3, (__m512bh) xArray); \ + accumArray##_4 = _mm512_dpbf16_ps(accumArray##_4, (__m512bh) matArray##_4, (__m512bh) xArray); \ + accumArray##_5 = _mm512_dpbf16_ps(accumArray##_5, (__m512bh) matArray##_5, (__m512bh) xArray); \ + accumArray##_6 = 
_mm512_dpbf16_ps(accumArray##_6, (__m512bh) matArray##_6, (__m512bh) xArray); \ + accumArray##_7 = _mm512_dpbf16_ps(accumArray##_7, (__m512bh) matArray##_7, (__m512bh) xArray); + +/* Calculate the dot result for matrix and vector at 32 elements per row + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_DOT_1x32(accumArray, matArray, xArray) \ + accumArray = _mm512_dpbf16_ps(accumArray, (__m512bh) matArray, (__m512bh) xArray); + +/* Calculate the dot result for matrix and vector at 16 elements per row + (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_DOT_8x16(accumArray, matArray, xArray) \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_1, (__m256bh) xArray); \ + accumArray##_2 = _mm256_dpbf16_ps(accumArray##_2, (__m256bh) matArray##_2, (__m256bh) xArray); \ + accumArray##_3 = _mm256_dpbf16_ps(accumArray##_3, (__m256bh) matArray##_3, (__m256bh) xArray); \ + accumArray##_4 = _mm256_dpbf16_ps(accumArray##_4, (__m256bh) matArray##_4, (__m256bh) xArray); \ + accumArray##_5 = _mm256_dpbf16_ps(accumArray##_5, (__m256bh) matArray##_5, (__m256bh) xArray); \ + accumArray##_6 = _mm256_dpbf16_ps(accumArray##_6, (__m256bh) matArray##_6, (__m256bh) xArray); \ + accumArray##_7 = _mm256_dpbf16_ps(accumArray##_7, (__m256bh) matArray##_7, (__m256bh) xArray); + + +/* 2-step interleave for matrix against 8 rows with 16 fp32 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|b0|a1|b1|a4|b4|a5|b5|a8 |b8 |a9 |b9 |a12|b12|a13|b13| + |c0|d0|c1|d1|c4|d4|c5|d5|c8 |d8 |c9 |d9 |c12|d12|c13|d13| + |e0|f0|e1|f1|e4|f4|e5|f5|e8 |f8 |e9 |f9 |e12|f12|e13|f13| + |g0|h0|g1|h1|g4|h4|g5|h5|g8 |h8 |g9 |h9 |g12|h12|g13|h13| + |a2|b2|a3|b3|a6|b6|a7|b7|a10|b10|a11|b11|a14|b14|a15|b15| + 
|c2|d2|c3|d3|c6|d6|c7|d7|c10|d10|c11|d11|c14|d14|c15|d15| + |e2|f2|e3|f3|e6|f6|e7|f7|e10|f10|e11|f11|e14|f14|e15|f15| + |g2|h2|g3|h3|g6|h6|g7|h7|g10|h10|g11|h11|g14|h14|g15|h15| + + Step 2: 4-element interleave for matrix + |a0|b0|c0|d0|a4|b4|c4|d4|a8 |b8 |c8 |d8 |a12|b12|c12|d12| + |a1|b1|c1|d1|a5|b5|c5|d5|a9 |b9 |c9 |d9 |a13|b13|c13|d13| + |e0|f0|g0|h0|e4|f4|g4|h4|e8 |f8 |g8 |h8 |e12|f12|g12|h12| + |e1|f1|g1|h1|e5|f5|g5|h5|e9 |f9 |g9 |h9 |e13|f13|g13|h13| + |a2|b2|c2|d2|a6|b6|c6|d6|a10|b10|c10|d10|a14|b14|c14|d14| + |a3|b3|c3|d3|a7|b7|c7|d7|a11|b11|c11|d11|a15|b15|c15|d15| + |e2|f2|g2|h2|e6|f6|g6|h6|e10|f10|g10|h10|e14|f14|g14|h14| + |e3|f3|g3|h3|e7|f7|g7|h7|e11|f11|g11|h11|e15|f15|g15|h15| +*/ +#define FP32_INTERLEAVE_8x16(regArray) \ + regArray##_8 = _mm512_unpacklo_ps(regArray##_0, regArray##_1); \ + regArray##_9 = _mm512_unpacklo_ps(regArray##_2, regArray##_3); \ + regArray##_10 = _mm512_unpacklo_ps(regArray##_4, regArray##_5); \ + regArray##_11 = _mm512_unpacklo_ps(regArray##_6, regArray##_7); \ + regArray##_12 = _mm512_unpackhi_ps(regArray##_0, regArray##_1); \ + regArray##_13 = _mm512_unpackhi_ps(regArray##_2, regArray##_3); \ + regArray##_14 = _mm512_unpackhi_ps(regArray##_4, regArray##_5); \ + regArray##_15 = _mm512_unpackhi_ps(regArray##_6, regArray##_7); \ + \ + regArray##_0 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_8, (__m512d) regArray##_9); \ + regArray##_1 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_8, (__m512d) regArray##_9); \ + regArray##_4 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_10, (__m512d) regArray##_11); \ + regArray##_5 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_10, (__m512d) regArray##_11); \ + regArray##_2 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_12, (__m512d) regArray##_13); \ + regArray##_3 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_12, (__m512d) regArray##_13); \ + regArray##_6 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_14, (__m512d) regArray##_15); \ + regArray##_7 = 
(__m512) _mm512_unpackhi_pd((__m512d) regArray##_14, (__m512d) regArray##_15); + +#define FP32_INTERLEAVE_8x16_ARRAY(regArray) \ + regArray[8] = _mm512_unpacklo_ps(regArray[0], regArray[1]); \ + regArray[9] = _mm512_unpacklo_ps(regArray[2], regArray[3]); \ + regArray[10] = _mm512_unpacklo_ps(regArray[4], regArray[5]); \ + regArray[11] = _mm512_unpacklo_ps(regArray[6], regArray[7]); \ + regArray[12] = _mm512_unpackhi_ps(regArray[0], regArray[1]); \ + regArray[13] = _mm512_unpackhi_ps(regArray[2], regArray[3]); \ + regArray[14] = _mm512_unpackhi_ps(regArray[4], regArray[5]); \ + regArray[15] = _mm512_unpackhi_ps(regArray[6], regArray[7]); \ + \ + regArray[0] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[8], (__m512d) regArray[9]); \ + regArray[1] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[8], (__m512d) regArray[9]); \ + regArray[4] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[10], (__m512d) regArray[11]); \ + regArray[5] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[10], (__m512d) regArray[11]); \ + regArray[2] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[12], (__m512d) regArray[13]); \ + regArray[3] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[12], (__m512d) regArray[13]); \ + regArray[6] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[14], (__m512d) regArray[15]); \ + regArray[7] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[14], (__m512d) regArray[15]); + +/* 2-step interleave for matrix against 8 rows with 8 fp32 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|b0|a1|b1|a4|b4|a5|b5| + |c0|d0|c1|d1|c4|d4|c5|d5| + |e0|f0|e1|f1|e4|f4|e5|f5| + |g0|h0|g1|h1|g4|h4|g5|h5| + |a2|b2|a3|b3|a6|b6|a7|b7| + |c2|d2|c3|d3|c6|d6|c7|d7| + |e2|f2|e3|f3|e6|f6|e7|f7| + |g2|h2|g3|h3|g6|h6|g7|h7| + + Step 2: 4-element interleave for matrix + |a0|b0|c0|d0|a4|b4|c4|d4| + |a1|b1|c1|d1|a5|b5|c5|d5| + |e0|f0|g0|h0|e4|f4|g4|h4| + |e1|f1|g1|h1|e5|f5|g5|h5| + 
|a2|b2|c2|d2|a6|b6|c6|d6| + |a3|b3|c3|d3|a7|b7|c7|d7| + |e2|f2|g2|h2|e6|f6|g6|h6| + |e3|f3|g3|h3|e7|f7|g7|h7| +*/ +#define FP32_INTERLEAVE_8x8(regArray) \ + regArray##_8 = _mm256_unpacklo_ps(regArray##_0, regArray##_1); \ + regArray##_9 = _mm256_unpacklo_ps(regArray##_2, regArray##_3); \ + regArray##_10 = _mm256_unpacklo_ps(regArray##_4, regArray##_5); \ + regArray##_11 = _mm256_unpacklo_ps(regArray##_6, regArray##_7); \ + regArray##_12 = _mm256_unpackhi_ps(regArray##_0, regArray##_1); \ + regArray##_13 = _mm256_unpackhi_ps(regArray##_2, regArray##_3); \ + regArray##_14 = _mm256_unpackhi_ps(regArray##_4, regArray##_5); \ + regArray##_15 = _mm256_unpackhi_ps(regArray##_6, regArray##_7); \ + \ + regArray##_0 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_8, (__m256d) regArray##_9); \ + regArray##_1 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_8, (__m256d) regArray##_9); \ + regArray##_4 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_10, (__m256d) regArray##_11); \ + regArray##_5 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_10, (__m256d) regArray##_11); \ + regArray##_2 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_12, (__m256d) regArray##_13); \ + regArray##_3 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_12, (__m256d) regArray##_13); \ + regArray##_6 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_14, (__m256d) regArray##_15); \ + regArray##_7 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_14, (__m256d) regArray##_15); + + +/* Accumulate the result for 2 batch of 4-registers +*/ +#define FP32_ACCUM2_8x16(regArray) \ + regArray##_0 = _mm512_add_ps(regArray##_0, regArray##_1); \ + regArray##_2 = _mm512_add_ps(regArray##_2, regArray##_3); \ + regArray##_4 = _mm512_add_ps(regArray##_4, regArray##_5); \ + regArray##_6 = _mm512_add_ps(regArray##_6, regArray##_7); \ + regArray##_0 = _mm512_add_ps(regArray##_0, regArray##_2); \ + regArray##_4 = _mm512_add_ps(regArray##_4, regArray##_6); + +#define FP32_ACCUM2_8x16_ARRAY(regArray) \ + 
regArray[0] = _mm512_add_ps(regArray[0], regArray[1]); \ + regArray[2] = _mm512_add_ps(regArray[2], regArray[3]); \ + regArray[4] = _mm512_add_ps(regArray[4], regArray[5]); \ + regArray[6] = _mm512_add_ps(regArray[6], regArray[7]); \ + regArray[0] = _mm512_add_ps(regArray[0], regArray[2]); \ + regArray[4] = _mm512_add_ps(regArray[4], regArray[6]); + +/* Accumulate the result for 2 batch of 4-registers +*/ +#define FP32_ACCUM2_8x8(regArray) \ + regArray##_0 = _mm256_add_ps(regArray##_0, regArray##_1); \ + regArray##_2 = _mm256_add_ps(regArray##_2, regArray##_3); \ + regArray##_4 = _mm256_add_ps(regArray##_4, regArray##_5); \ + regArray##_6 = _mm256_add_ps(regArray##_6, regArray##_7); \ + regArray##_0 = _mm256_add_ps(regArray##_0, regArray##_2); \ + regArray##_4 = _mm256_add_ps(regArray##_4, regArray##_6); + + +/* Store 16 (alpha * result + beta * y) to y +*/ +#define STORE16_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_mul_ps(BETAVECTOR, _mm512_loadu_ps(targetAddr))); \ + _mm512_storeu_ps(targetAddr, regResult); + + +/* Masked store 16 (alpha * result + beta * y) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_mul_ps(BETAVECTOR, _mm512_maskz_loadu_ps(mask, targetAddr))); \ + _mm512_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 8 (alpha * result + beta * y) to y +*/ +#define STORE8_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_mul_ps(_mm512_castps512_ps256(BETAVECTOR), _mm256_loadu_ps(targetAddr))); \ + _mm256_storeu_ps(targetAddr, regResult); + + +/* Masked store 8 (alpha * result + beta * y) to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, 
_mm256_mul_ps(_mm512_castps512_ps256(BETAVECTOR), _mm256_maskz_loadu_ps(mask, targetAddr))); \ + _mm256_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 4 (alpha * result + beta * y) to y +*/ +#define STORE4_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_mul_ps(_mm512_castps512_ps128(BETAVECTOR), _mm_loadu_ps(targetAddr))); \ + _mm_storeu_ps(targetAddr, regResult); + + +/* Masked store 4 (alpha * result + beta * y) to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_mul_ps(_mm512_castps512_ps128(BETAVECTOR), _mm_maskz_loadu_ps(mask, targetAddr))); \ + _mm_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 16 (alpha * result + y) to y +*/ +#define STORE16_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_loadu_ps(targetAddr)); \ + _mm512_storeu_ps(targetAddr, regResult); + + +/* Masked store 16 (alpha * result + y) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_maskz_loadu_ps(mask, targetAddr)); \ + _mm512_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 8 (alpha * result + y) to y +*/ +#define STORE8_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_loadu_ps(targetAddr)); \ + _mm256_storeu_ps(targetAddr, regResult); + + +/* Masked store 8 (alpha * result + y) to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_maskz_loadu_ps(mask, targetAddr)); \ + _mm256_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 4 (alpha * result + y) to y +*/ +#define 
STORE4_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_loadu_ps(targetAddr)); \ + _mm_storeu_ps(targetAddr, regResult); + + +/* Masked store 4 (alpha * result + y) to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_maskz_loadu_ps(mask, targetAddr)); \ + _mm_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 16 (alpha * result) to y +*/ +#define STORE16_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \ + _mm512_storeu_ps(targetAddr, _mm512_mul_ps(ALPHAVECTOR, regResult)); + + +/* Masked store 16 (alpha * result) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \ + _mm512_mask_storeu_ps(targetAddr, mask, _mm512_mul_ps(ALPHAVECTOR, regResult)); + + +/* Store 8 (alpha * result) to y +*/ +#define STORE8_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \ + _mm256_storeu_ps(targetAddr, _mm256_mul_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult)); + + +/* Masked store 8 (alpha * result) to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \ + _mm256_mask_storeu_ps(targetAddr, mask, _mm256_mul_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult)); + + +/* Store 4 (alpha * result) to y +*/ +#define STORE4_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \ + _mm_storeu_ps(targetAddr, _mm_mul_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult)); + + +/* Masked store 4 (alpha * result) to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \ + _mm_mask_storeu_ps(targetAddr, mask, _mm_mul_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult)); + + +/* Store 16 result to y +*/ +#define STORE16_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \ + _mm512_storeu_ps(targetAddr, regResult); + + +/* Masked store 16 result to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \ + 
_mm512_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 8 result to y +*/ +#define STORE8_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \ + _mm256_storeu_ps(targetAddr, regResult); + + +/* Masked store 8 result to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \ + _mm256_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 4 result to y +*/ +#define STORE4_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \ + _mm_storeu_ps(targetAddr, regResult); + + +/* Masked store 4 result to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \ + _mm_mask_storeu_ps(targetAddr, mask, regResult); + +#endif diff --git a/kernel/x86_64/sbgemv_n.c b/kernel/x86_64/sbgemv_n.c new file mode 100644 index 000000000..18e64dc3f --- /dev/null +++ b/kernel/x86_64/sbgemv_n.c @@ -0,0 +1,137 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined (COOPERLAKE) +#include "sbgemv_n_microk_cooperlake.c" +#endif + +#define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \ + ptr = (TYPE *) malloc(sizeof(TYPE)*alloc_size + 63); \ + ptr_align = ((int)(((uintptr_t)ptr & (uintptr_t)0x3F))!=0) ? (TYPE *)((char *)ptr + (64 - (int)((uintptr_t)ptr & (uintptr_t)0x3F))) : ptr + +#define ALIGN64_FREE(ptr) \ + free(ptr) + +#ifndef HAVE_SBGEMV_N_ACCL_KERNEL +static void sbgemv_kernel_n(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +{ + BLASLONG offset_lda, offset_m; + float accum = 0.0; + float tmp_x = 0.0; + + bfloat16 * a_bf16 = malloc(sizeof(bfloat16)*m*n); + float * a_fp32 = malloc(sizeof(float)*m*n); + float * x_fp32 = malloc(sizeof(float)*n); + + for (BLASLONG j=0; j= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_SBGEMV_N_ACCL_KERNEL 1 +#include "common.h" +#include + +// Define micro kernels for ALPHA not ONE && BETA effective && BETA not ONE scenarios +#undef ZERO_BETA +#undef ONE_BETA +#undef ONE_ALPHA +#include "sbgemv_n_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA not ONE && BETA as ONE scenarios +#undef ZERO_BETA +#define ONE_BETA 1 +#undef ONE_ALPHA +#include "sbgemv_n_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA not ONE && BETA 
in-effective (BETA == 0) scenarios +#define ZERO_BETA 1 +#undef ONE_ALPHA +#include "sbgemv_n_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA as ONE && BETA in-effective (BETA == 0) scenarios +#define ZERO_BETA 1 +#define ONE_ALPHA 1 +#include "sbgemv_n_microk_cooperlake_template.c" + +static int sbgemv_kernel_n(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +{ + if (beta == ZERO) { // BETA == 0.0, no need to accumulate the original Y data + if (alpha == ONE) { // ALPHA == 1.0, no need to multipy ALPHA + sbgemv_kernel_32xN_lda_direct(m, n, alpha, a, lda, x, y); + } else { // ALPHA != 1.0, need to multipy ALPHA + sbgemv_kernel_32xN_lda_direct_alpha(m, n, alpha, a, lda, x, y); + } + } else { // BETA != 0.0, need to accumulate the original Y data no matter what ALPHA is + if (beta == ONE) { + sbgemv_kernel_32xN_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y); + } else { + sbgemv_kernel_32xN_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } + } + + return 0; +} + +#endif diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c new file mode 100644 index 000000000..46e6d0ff9 --- /dev/null +++ b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c @@ -0,0 +1,234 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#include +#include "common.h" + +// Include common macros for BF16 based operations with IA intrinsics +#include "bf16_common_macros.h" + +#ifndef ZERO_BETA // Beta is non-zero + +#ifndef ONE_BETA // BETA is not ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_BETA +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_BETA +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_BETA +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA + +#else // BETA is ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_ONE +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE 
+#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_ONE +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE + +#endif + +#else // BETA is zero + +#ifndef ONE_ALPHA // ALPHA is not ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA + +#else // ALPHA is ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_DIRECT +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_DIRECT +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_DIRECT +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_DIRECT +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_DIRECT +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_DIRECT + +#endif + +#endif + + + +// 8 rows parallel processing BF16 GEMV kernel for big N && lda effective scenario (process before interleave) +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_32xN_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_32xN_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_32xN_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_128x = m & (~127); + + __m512 accum512_0, accum512_1, accum512_2, accum512_3, 
accum512_4, accum512_5, accum512_6, accum512_7, \ + accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3; + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + __m512i xArray_0; + + __m512i ZERO512 = _mm512_setzero_si512(); + + unsigned int blend_hi_mask_value = ((unsigned int)0xaaaaaaaa); + __mmask32 blend_hi_mask = *((__mmask32*) &blend_hi_mask_value); + unsigned int blend_lo_mask_value = ((unsigned int)0x55555555); + __mmask32 blend_lo_mask = *((__mmask32*) &blend_lo_mask_value); + + __m512i M512_EPI32_8 = _mm512_set1_epi32(8); + __m512i idx_base_0 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_8); + + for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + accum512_4 = _mm512_setzero_ps(); + accum512_5 = _mm512_setzero_ps(); + accum512_6 = _mm512_setzero_ps(); + accum512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_epi16(x[idx_n]); + + BF16_MATRIX_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, idx_m + 0) + BF16_MATRIX_LOAD_1x32(matrixArray_seed_1, a, lda, idx_n, idx_m + 32) + BF16_MATRIX_LOAD_1x32(matrixArray_seed_2, a, lda, idx_n, idx_m + 64) + BF16_MATRIX_LOAD_1x32(matrixArray_seed_3, a, lda, idx_n, idx_m + 96) + + matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0); + matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0); + matrixArray_2 = 
_mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_1); + matrixArray_3 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_1); + matrixArray_4 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_2); + matrixArray_5 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_2); + matrixArray_6 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_3); + matrixArray_7 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_3); + + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0) + BF16_DOT_1x32(accum512_2, matrixArray_2, xArray_0) + BF16_DOT_1x32(accum512_3, matrixArray_3, xArray_0) + BF16_DOT_1x32(accum512_4, matrixArray_4, xArray_0) + BF16_DOT_1x32(accum512_5, matrixArray_5, xArray_0) + BF16_DOT_1x32(accum512_6, matrixArray_6, xArray_0) + BF16_DOT_1x32(accum512_7, matrixArray_7, xArray_0) + } + accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + accum512_10 = _mm512_permutex2var_ps(accum512_2, idx_base_0, accum512_3); + accum512_11 = _mm512_permutex2var_ps(accum512_2, idx_base_1, accum512_3); + accum512_12 = _mm512_permutex2var_ps(accum512_4, idx_base_0, accum512_5); + accum512_13 = _mm512_permutex2var_ps(accum512_4, idx_base_1, accum512_5); + accum512_14 = _mm512_permutex2var_ps(accum512_6, idx_base_0, accum512_7); + accum512_15 = _mm512_permutex2var_ps(accum512_6, idx_base_1, accum512_7); + + STORE16_COMPLETE_RESULT(accum512_8, y+idx_m+0) + STORE16_COMPLETE_RESULT(accum512_9, y+idx_m+16) + STORE16_COMPLETE_RESULT(accum512_10, y+idx_m+32) + STORE16_COMPLETE_RESULT(accum512_11, y+idx_m+48) + STORE16_COMPLETE_RESULT(accum512_12, y+idx_m+64) + STORE16_COMPLETE_RESULT(accum512_13, y+idx_m+80) + STORE16_COMPLETE_RESULT(accum512_14, y+idx_m+96) + STORE16_COMPLETE_RESULT(accum512_15, y+idx_m+112) + } + + for (BLASLONG idx_m = tag_m_128x; idx_m < 
tag_m_32x; idx_m+=32) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_epi16(x[idx_n]); + + BF16_MATRIX_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, idx_m) + + matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0); + matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0); + + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0) + } + accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + + STORE16_COMPLETE_RESULT(accum512_8, y+idx_m+0) + STORE16_COMPLETE_RESULT(accum512_9, y+idx_m+16) + } + + if (tag_m_32x != m) { + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + + unsigned short store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + __mmask32 store_tail_mask = *((__mmask32*) &store_tail_mask_value); + + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_epi16(x[idx_n]); + + BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, tag_m_32x, tail_mask) + + matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0); + matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0); + + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0) + } + accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + + if ((m-tag_m_32x) > 16) { + STORE16_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0) + STORE16_MASK_COMPLETE_RESULT(accum512_9, y+tag_m_32x+16, store_tail_mask) + } else { + 
STORE16_MASK_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0, store_tail_mask) + } + } + + return 0; +} diff --git a/kernel/x86_64/sbgemv_t.c b/kernel/x86_64/sbgemv_t.c new file mode 100644 index 000000000..22b099116 --- /dev/null +++ b/kernel/x86_64/sbgemv_t.c @@ -0,0 +1,142 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +#if defined (COOPERLAKE) +#include "sbgemv_t_microk_cooperlake.c" +#endif + +#define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \ + ptr = (TYPE *) malloc(sizeof(TYPE)*alloc_size + 63); \ + ptr_align = ((int)(((uintptr_t)ptr & (uintptr_t)0x3F))!=0) ? (TYPE *)((char *)ptr + (64 - (int)((uintptr_t)ptr & (uintptr_t)0x3F))) : ptr + +#define ALIGN64_FREE(ptr) \ + free(ptr) + +#ifndef HAVE_SBGEMV_T_ACCL_KERNEL +static void sbgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +{ + BLASLONG offset_lda, offset_n; + float accum = 0.0; + + bfloat16 * a_bf16 = malloc(sizeof(bfloat16)*m*n); + float * a_fp32 = malloc(sizeof(float)*m*n); + float * x_fp32 = malloc(sizeof(float)*n); + + for (BLASLONG i=0; i= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_SBGEMV_T_ACCL_KERNEL 1 + +// Define micro kernels for ALPHA not ONE && BETA effective && BETA not ONE scenarios +#undef ZERO_BETA +#undef ONE_BETA +#undef ONE_ALPHA +#include "sbgemv_t_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA not ONE && BETA as ONE scenarios +#undef ZERO_BETA +#define ONE_BETA 1 +#undef ONE_ALPHA +#include "sbgemv_t_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA not ONE && BETA in-effective (BETA == 0) scenarios +#define ZERO_BETA 1 +#undef ONE_ALPHA +#include "sbgemv_t_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA as ONE && BETA in-effective (BETA == 0) scenarios +#define ZERO_BETA 1 +#define ONE_ALPHA 1 +#include "sbgemv_t_microk_cooperlake_template.c" + +static int sbgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +{ + if (beta == ZERO) { // BETA == 0.0, no need to accumulate the original Y data + if (alpha == ONE) { // ALPHA == 1.0, no need to multipy ALPHA + if 
(n > 127) { + sbgemv_kernel_1x128_lda_direct(m, n, alpha, a, lda, x, y); + } else if (n > 32) { + sbgemv_kernel_8x32_lda_direct(m, n, alpha, a, lda, x, y); + } else { + if (n > 16) { + sbgemv_kernel_8x16p_lda(m, n, alpha, a, lda, x, y); + } else { + if (lda == n) { + switch(n) { + case 1: sbgemv_kernel_32x1 (m, alpha, a, x, y); break; + case 2: sbgemv_kernel_32x2 (m, alpha, a, x, y); break; + case 3: sbgemv_kernel_32x3 (m, alpha, a, x, y); break; + case 4: sbgemv_kernel_16x4 (m, alpha, a, x, y); break; + case 5: sbgemv_kernel_30x5 (m, alpha, a, x, y); break; + case 6: sbgemv_kernel_16x6 (m, alpha, a, x, y); break; + case 7: sbgemv_kernel_16x7 (m, alpha, a, x, y); break; + case 8: sbgemv_kernel_16x8 (m, alpha, a, x, y); break; + case 9: sbgemv_kernel_14x9 (m, alpha, a, x, y); break; + case 10: sbgemv_kernel_12x10(m, alpha, a, x, y); break; + case 11: sbgemv_kernel_15x11(m, alpha, a, x, y); break; + case 12: sbgemv_kernel_15x12(m, alpha, a, x, y); break; + case 13: sbgemv_kernel_16x13(m, alpha, a, x, y); break; + case 14: sbgemv_kernel_16x14(m, alpha, a, x, y); break; + case 15: sbgemv_kernel_16x15(m, alpha, a, x, y); break; + case 16: sbgemv_kernel_16x16(m, alpha, a, x, y); break; + default: break; + } + } else { + sbgemv_kernel_8x16m_lda(m, n, alpha, a, lda, x, y); + } + } + } + } else { // ALPHA != 1.0, need to multipy ALPHA + if (n > 127) { + sbgemv_kernel_1x128_lda_direct_alpha(m, n, alpha, a, lda, x, y); + } else if (n > 32) { + sbgemv_kernel_8x32_lda_direct_alpha(m, n, alpha, a, lda, x, y); + } else { + if (n > 16) { + sbgemv_kernel_8x16p_lda_alpha(m, n, alpha, a, lda, x, y); + } else { + if (lda == n) { + switch(n) { + case 1: sbgemv_kernel_32x1_alpha (m, alpha, a, x, y); break; + case 2: sbgemv_kernel_32x2_alpha (m, alpha, a, x, y); break; + case 3: sbgemv_kernel_32x3_alpha (m, alpha, a, x, y); break; + case 4: sbgemv_kernel_16x4_alpha (m, alpha, a, x, y); break; + case 5: sbgemv_kernel_30x5_alpha (m, alpha, a, x, y); break; + case 6: 
sbgemv_kernel_16x6_alpha (m, alpha, a, x, y); break; + case 7: sbgemv_kernel_16x7_alpha (m, alpha, a, x, y); break; + case 8: sbgemv_kernel_16x8_alpha (m, alpha, a, x, y); break; + case 9: sbgemv_kernel_14x9_alpha (m, alpha, a, x, y); break; + case 10: sbgemv_kernel_12x10_alpha(m, alpha, a, x, y); break; + case 11: sbgemv_kernel_15x11_alpha(m, alpha, a, x, y); break; + case 12: sbgemv_kernel_15x12_alpha(m, alpha, a, x, y); break; + case 13: sbgemv_kernel_16x13_alpha(m, alpha, a, x, y); break; + case 14: sbgemv_kernel_16x14_alpha(m, alpha, a, x, y); break; + case 15: sbgemv_kernel_16x15_alpha(m, alpha, a, x, y); break; + case 16: sbgemv_kernel_16x16_alpha(m, alpha, a, x, y); break; + default: break; + } + } else { + sbgemv_kernel_8x16m_lda_alpha(m, n, alpha, a, lda, x, y); + } + } + } + } + } else { // BETA != 0.0, need to accumulate the original Y data no matter what ALPHA is + if (beta == ONE) { + if (n > 127) { + sbgemv_kernel_1x128_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y); + } else if (n > 32) { + sbgemv_kernel_8x32_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y); + } else { + if (n > 16) { + sbgemv_kernel_8x16p_lda_alpha_one(m, n, alpha, a, lda, x, beta, y); + } else { + if (lda == n) { + switch(n) { + case 1: sbgemv_kernel_32x1_alpha_one (m, alpha, a, x, beta, y); break; + case 2: sbgemv_kernel_32x2_alpha_one (m, alpha, a, x, beta, y); break; + case 3: sbgemv_kernel_32x3_alpha_one (m, alpha, a, x, beta, y); break; + case 4: sbgemv_kernel_16x4_alpha_one (m, alpha, a, x, beta, y); break; + case 5: sbgemv_kernel_30x5_alpha_one (m, alpha, a, x, beta, y); break; + case 6: sbgemv_kernel_16x6_alpha_one (m, alpha, a, x, beta, y); break; + case 7: sbgemv_kernel_16x7_alpha_one (m, alpha, a, x, beta, y); break; + case 8: sbgemv_kernel_16x8_alpha_one (m, alpha, a, x, beta, y); break; + case 9: sbgemv_kernel_14x9_alpha_one (m, alpha, a, x, beta, y); break; + case 10: sbgemv_kernel_12x10_alpha_one(m, alpha, a, x, beta, y); break; + case 11: 
sbgemv_kernel_15x11_alpha_one(m, alpha, a, x, beta, y); break; + case 12: sbgemv_kernel_15x12_alpha_one(m, alpha, a, x, beta, y); break; + case 13: sbgemv_kernel_16x13_alpha_one(m, alpha, a, x, beta, y); break; + case 14: sbgemv_kernel_16x14_alpha_one(m, alpha, a, x, beta, y); break; + case 15: sbgemv_kernel_16x15_alpha_one(m, alpha, a, x, beta, y); break; + case 16: sbgemv_kernel_16x16_alpha_one(m, alpha, a, x, beta, y); break; + default: break; + } + } else { + sbgemv_kernel_8x16m_lda_alpha_one(m, n, alpha, a, lda, x, beta, y); + } + } + } + } else { + if (n > 127) { + sbgemv_kernel_1x128_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } else if (n > 32) { + sbgemv_kernel_8x32_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } else { + if (n > 16) { + sbgemv_kernel_8x16p_lda_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } else { + if (lda == n) { + switch(n) { + case 1: sbgemv_kernel_32x1_alpha_beta (m, alpha, a, x, beta, y); break; + case 2: sbgemv_kernel_32x2_alpha_beta (m, alpha, a, x, beta, y); break; + case 3: sbgemv_kernel_32x3_alpha_beta (m, alpha, a, x, beta, y); break; + case 4: sbgemv_kernel_16x4_alpha_beta (m, alpha, a, x, beta, y); break; + case 5: sbgemv_kernel_30x5_alpha_beta (m, alpha, a, x, beta, y); break; + case 6: sbgemv_kernel_16x6_alpha_beta (m, alpha, a, x, beta, y); break; + case 7: sbgemv_kernel_16x7_alpha_beta (m, alpha, a, x, beta, y); break; + case 8: sbgemv_kernel_16x8_alpha_beta (m, alpha, a, x, beta, y); break; + case 9: sbgemv_kernel_14x9_alpha_beta (m, alpha, a, x, beta, y); break; + case 10: sbgemv_kernel_12x10_alpha_beta(m, alpha, a, x, beta, y); break; + case 11: sbgemv_kernel_15x11_alpha_beta(m, alpha, a, x, beta, y); break; + case 12: sbgemv_kernel_15x12_alpha_beta(m, alpha, a, x, beta, y); break; + case 13: sbgemv_kernel_16x13_alpha_beta(m, alpha, a, x, beta, y); break; + case 14: sbgemv_kernel_16x14_alpha_beta(m, alpha, a, x, beta, y); break; + case 15: sbgemv_kernel_16x15_alpha_beta(m, alpha, a, x, beta, 
y); break; + case 16: sbgemv_kernel_16x16_alpha_beta(m, alpha, a, x, beta, y); break; + default: break; + } + } else { + sbgemv_kernel_8x16m_lda_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } + } + } + } + } + + return 0; +} + +#endif diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c new file mode 100644 index 000000000..51e681add --- /dev/null +++ b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c @@ -0,0 +1,3082 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#include +#include "common.h" +// Include common macros for BF16 based operations with IA intrinsics +#include "bf16_common_macros.h" + +#ifndef ZERO_BETA // Beta is non-zero + +#ifndef ONE_BETA // BETA is not ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_BETA +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_BETA +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_BETA +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA + +#else // BETA is ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_ONE +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_ONE +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE + +#endif + +#else // BETA is zero + +#ifndef ONE_ALPHA // ALPHA is not ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA +#define 
STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA + +#else // ALPHA is ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_DIRECT +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_DIRECT +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_DIRECT +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_DIRECT +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_DIRECT +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_DIRECT + +#endif + +#endif + + +// 32 rows parallel processing BF16 GEMV kernel for n=1 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_32x1_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_32x1_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_32x1_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_32x1(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_32x = m & (~31); + + __m512i matrixArray_0, matrixArray_1, matrixArray_2; + __m512i xArray; + __m512 result_0, result_1; +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA +#ifndef ONE_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif +#endif + + __m512i load_idx_lo = _mm512_set_epi16(0, 15, 0, 14, 0, 13, 0, 12, 0, 11, 0, 10, 0, 9, 0, 8,\ + 0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0); + __m512i M512_EPI16_16 = _mm512_set1_epi16(16); + __m512i load_idx_hi = _mm512_add_epi16(load_idx_lo, M512_EPI16_16); + + unsigned int interleve_mask_value = ((unsigned int) 0x55555555); + __mmask32 interleave_mask = *((__mmask32*) &interleve_mask_value); + + xArray = 
_mm512_set1_epi16((short) x[0]); + xArray = _mm512_mask_blend_epi16(interleave_mask, _mm512_setzero_si512(), xArray); + + if (tag_m_32x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)]); // Load 32 rows with n=1 + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_lo, matrixArray_0); // Expand the low 16 elements + matrixArray_2 = _mm512_permutexvar_epi16(load_idx_hi, matrixArray_0); // Expand the high 16 elements + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_1, (__m512bh) xArray); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_2, (__m512bh) xArray); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_COMPLETE_RESULT(result_1, y+idx_m+16) + } + } + + BLASLONG tail_num = m - tag_m_32x; + if (tail_num > 16) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-tail_num)); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + matrixArray_0 = _mm512_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]); // Load 32 rows with n=1 + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_lo, matrixArray_0); // Expand the low 16 elements + matrixArray_2 = _mm512_permutexvar_epi16(load_idx_hi, matrixArray_0); // Expand the high 16 elements + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_1, (__m512bh) xArray); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_2, (__m512bh) xArray); + + unsigned short store_mask_value = (((unsigned short)0xffff) >> (32-tail_num)); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + STORE16_COMPLETE_RESULT(result_0, y+tag_m_32x) + STORE16_MASK_COMPLETE_RESULT(result_1, y+tag_m_32x+16, store_mask) + } else if (tail_num > 8) { + __m256 result256_0 = _mm256_setzero_ps(); + __m256 result256_1 = _mm256_setzero_ps(); + + __m256i load_idx_lo256 = 
_mm512_castsi512_si256(load_idx_lo); + __m256i load_idx_hi256 = _mm512_extracti32x8_epi32(load_idx_lo, 0x1); + __m256i xArray256 = _mm512_castsi512_si256(xArray); + + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-tail_num)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + __m256i matrixArray256_0 = _mm256_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]); // Load 16 rows with n=1 + __m256i matrixArray256_1 = _mm256_permutexvar_epi16(load_idx_lo256, matrixArray256_0); // Expand the low 8 elements + __m256i matrixArray256_2 = _mm256_permutexvar_epi16(load_idx_hi256, matrixArray256_0); // Expand the high 8 elements + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_1, (__m256bh) xArray256); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_2, (__m256bh) xArray256); + + unsigned char store_mask_value = (((unsigned char)0xff) >> (16-tail_num)); + __mmask8 store_mask = *((__mmask8*) &store_mask_value); + STORE8_COMPLETE_RESULT(result256_0, y+tag_m_32x) + STORE8_MASK_COMPLETE_RESULT(result256_1, y+tag_m_32x+8, store_mask) + } else { + __m128 result128_0 = _mm_setzero_ps(); + __m128 result128_1 = _mm_setzero_ps(); + + __m128i load_idx_lo128 = _mm_set_epi16(0, 3, 0, 2, 0, 1, 0, 0); + __m128i M128_EPI16_4 = _mm_set1_epi16(4); + __m128i load_idx_hi128 = _mm_add_epi16(load_idx_lo128, M128_EPI16_4); + + __m128i xArray128 = _mm512_castsi512_si128(xArray); + + unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num)); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + __m128i matrixArray128_0 = _mm_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]); // Load 8 rows with n=1 + __m128i matrixArray128_1 = _mm_permutexvar_epi16(load_idx_lo128, matrixArray128_0); // Expand the low 4 elements + __m128i matrixArray128_2 = _mm_permutexvar_epi16(load_idx_hi128, matrixArray128_0); // Expand the high 4 elements + + result128_0 = _mm_dpbf16_ps(result128_0, (__m128bh) matrixArray128_1, (__m128bh) 
xArray128); + result128_1 = _mm_dpbf16_ps(result128_1, (__m128bh) matrixArray128_2, (__m128bh) xArray128); + + if (tail_num > 4) { + unsigned char store_mask_value = (((unsigned char)0xf) >> (8-tail_num)); + __mmask8 store_mask = *((__mmask8*) &store_mask_value); + STORE4_COMPLETE_RESULT(result128_0, y+tag_m_32x) + STORE4_MASK_COMPLETE_RESULT(result128_1, y+tag_m_32x+4, store_mask) + } else { + unsigned char store_mask_value = (((unsigned char)0xf) >> (4-tail_num)); + __mmask8 store_mask = *((__mmask8*) &store_mask_value); + STORE4_MASK_COMPLETE_RESULT(result128_0, y+tag_m_32x, store_mask) + } + } + + return 0; +} + +// 32 rows parallel processing BF16 GEMV kernel for n=2 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_32x2_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_32x2_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_32x2_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_32x = m & (~31); + + __m512i matrixArray_0, matrixArray_1; + __m512i xArray; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + unsigned char load_mask_value = (((unsigned char)0xff) >> 6); + __mmask8 load_mask = *((__mmask8*) &load_mask_value); + xArray = _mm512_broadcastd_epi32(_mm_maskz_loadu_epi16(load_mask, x)); + + if (tag_m_32x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*2]); // Load 16 rows as n=2 + matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+16)*2]); // Load 16 rows 
as n=2 + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_COMPLETE_RESULT(result_1, y+idx_m+16) + } + } + + if (m - tag_m_32x >= 16) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_32x)*2]); // Load 16 rows with n=2 + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray); + + STORE16_COMPLETE_RESULT(result_0, y+tag_m_32x) + + tag_m_32x += 16; + } + + BLASLONG tail_num = m - tag_m_32x; + if (tail_num > 8) { + result_0 = _mm512_setzero_ps(); + + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-(m&15))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray_0 = _mm512_maskz_loadu_epi32(tail_mask, &a[(tag_m_32x)*2]); // Load 16 rows with n=2 + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+tag_m_32x, tail_mask) + } else if (tail_num == 8) { + __m256 result256 = _mm256_setzero_ps(); + + __m256i matrixArray256 = _mm256_loadu_si256(&a[(tag_m_32x)*2]); // Load 8 rows with n=2 + __m256i xArray256 = _mm512_castsi512_si256(xArray); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256); + + STORE8_COMPLETE_RESULT(result256, y+tag_m_32x) + } else { + __m256 result256 = _mm256_setzero_ps(); + + unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-(m&7))); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + __m256i matrixArray256 = _mm256_maskz_loadu_epi32(tail_mask, &a[(tag_m_32x)*2]); // Load 8 rows with n=2 + __m256i xArray256 = _mm512_castsi512_si256(xArray); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256); + + STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_32x, tail_mask) + } + + return 0; +} + +// 32 rows parallel 
processing BF16 GEMV kernel for n=3 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_32x3_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_32x3_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_32x3_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_32x3(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_32x = m & (~31); + + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i xTmp = _mm_maskz_loadu_epi16(x_load_mask, x); // x0|x1|x2|0|0|0|0|0| + __m512i xArray_0 = _mm512_broadcastd_epi32(xTmp); // x0|x1|x0|x1|...|x0|x1| + __m512i xArray_1 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(xTmp, 0x1)); // x2| 0|x2| 0|...|x2| 0| + + __m512i load_idx_base; + __m512i M512_EPI16_2, M512_EPI16_8, M512_EPI16_16; + M512_EPI16_2 = _mm512_set1_epi16(2); + M512_EPI16_8 = _mm512_add_epi16(M512_EPI16_2, M512_EPI16_2); + M512_EPI16_8 = _mm512_add_epi16(M512_EPI16_8, M512_EPI16_8); + M512_EPI16_16 = _mm512_add_epi16(M512_EPI16_8, M512_EPI16_8); + load_idx_base = _mm512_set_epi16(46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, + 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 0); + + if (tag_m_32x > 0) { + __m512i load_idx01_1st, load_idx01_2nd, load_idx2_1st, load_idx2_2nd; + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6; + + unsigned int idx_blend_mask_value = ((unsigned int)0x80000000); + __mmask32 idx_blend_mask = *((__mmask32*) 
&idx_blend_mask_value); + + load_idx01_1st = load_idx_base; + load_idx01_2nd = _mm512_add_epi16(load_idx01_1st, M512_EPI16_16); + load_idx2_1st = _mm512_add_epi16(load_idx01_1st, M512_EPI16_2); + load_idx2_2nd = _mm512_add_epi16(load_idx01_2nd, M512_EPI16_2); + load_idx2_2nd = _mm512_mask_blend_epi16(idx_blend_mask, load_idx2_2nd, _mm512_setzero_si512()); + + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*3]); // Load 10 rows with n=3 plus 2 element + matrixArray_1 = _mm512_loadu_si512(&a[((idx_m+10)*3 + 2)]); // Load 10 rows with n=3 plus 2 element + matrixArray_2 = _mm512_loadu_si512(&a[((idx_m+21)*3 + 1)]); // Load 10 rows with n=3 plus 2 element + + matrixArray_3 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_1st, matrixArray_1); // Select the first 2 elements for each row + matrixArray_4 = _mm512_permutex2var_epi16(matrixArray_1, load_idx01_2nd, matrixArray_2); // Select the first 2 elements for each row + matrixArray_5 = _mm512_permutex2var_epi16(matrixArray_0, load_idx2_1st, matrixArray_1); // Select the third element for each row + matrixArray_6 = _mm512_permutex2var_epi16(matrixArray_1, load_idx2_2nd, matrixArray_2); // Select the third element for each row + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_3, (__m512bh) xArray_0); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_5, (__m512bh) xArray_1); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_4, (__m512bh) xArray_0); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_6, (__m512bh) xArray_1); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_COMPLETE_RESULT(result_1, y+idx_m+16) + } + } + + if (tag_m_32x != m) { + __m256i load256_idx01_1st, load256_idx01_2nd, load256_idx2_1st, load256_idx2_2nd; + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, 
matrixArray256_5, matrixArray256_6; + __m256 result256_0, result256_1; + + unsigned short idx256_blend_mask_value = ((unsigned short)0x8000); + __mmask16 idx256_blend_mask = *((__mmask16*) &idx256_blend_mask_value); + + load256_idx01_1st = _mm512_castsi512_si256(load_idx_base); + load256_idx01_2nd = _mm256_add_epi16(load256_idx01_1st, _mm512_castsi512_si256(M512_EPI16_8)); + load256_idx2_1st = _mm256_add_epi16(load256_idx01_1st, _mm512_castsi512_si256(M512_EPI16_2)); + load256_idx2_2nd = _mm256_add_epi16(load256_idx01_2nd, _mm512_castsi512_si256(M512_EPI16_2)); + load256_idx2_2nd = _mm256_mask_blend_epi16(idx256_blend_mask, load256_idx2_2nd, _mm256_setzero_si256()); + + if (m - tag_m_32x > 15) { + result256_0 = _mm256_setzero_ps(); + result256_1 = _mm256_setzero_ps(); + + matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_2 = _mm256_loadu_si256(&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + matrixArray256_6 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx2_2nd, matrixArray256_2); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, 
(__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1)); + + STORE8_COMPLETE_RESULT(result256_0, y+tag_m_32x) + STORE8_COMPLETE_RESULT(result256_1, y+tag_m_32x+8) + + tag_m_32x += 16; + } + + if (tag_m_32x != m) { + result256_0 = _mm256_setzero_ps(); + result256_1 = _mm256_setzero_ps(); + BLASLONG tail_num = m-tag_m_32x; + + if (tail_num > 10) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-10-1)*3+1))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_2 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+10)*3 + 2)]); // Load m-tag_m_32x-10 rows + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + matrixArray256_6 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx2_2nd, matrixArray256_2); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1)); + 
} else if (tail_num > 5) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-5-1)*3+2))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+5)*3+1)]); // Load m-tag_m_32x-5 rows + matrixArray256_2 = _mm256_setzero_si256(); + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + matrixArray256_6 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx2_2nd, matrixArray256_2); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1)); + } else { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-(tail_num*3))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray256_0 = _mm256_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)*3]); // Load m-tag_m_32x rows + matrixArray256_1 = _mm256_setzero_si256(); + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_5 = 
_mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + } + + unsigned short store_tail_mask_value = (((unsigned short)0xffff) >> (16-(tail_num))); + __mmask16 store_tail_mask = *((__mmask16*) &store_tail_mask_value); + __m512 result512 = _mm512_insertf32x8(_mm512_castps256_ps512(result256_0), result256_1, 0x1); + STORE16_MASK_COMPLETE_RESULT(result512, y+tag_m_32x, store_tail_mask) + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=4 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x4_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x4_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x4_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x4(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; + __m512i xArray_01, xArray_23, xArray_remix; + __m512 result; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i idx_base_0 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_1); + __m512i idx_base_remix = _mm512_inserti32x8(idx_base_0, _mm512_castsi512_si256(idx_base_1), 0x1); + + unsigned char x_load_mask_value = 
(((unsigned char)0xf) >> 2); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i xTmp = _mm_maskz_loadu_epi32(x_load_mask, x); // |x0|x1|x2|x3|0|0|0|0| + xArray_01 = _mm512_broadcastd_epi32(xTmp); // |x0|x1|x0|x1|...|x0|x1| + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(xTmp, 0x1)); // |x2|x3|x2|x3|...|x2|x3| + unsigned short blend_mask_value = ((unsigned short)0xff00); + __mmask16 blend_mask = *((__mmask16*) &blend_mask_value); + xArray_remix = _mm512_mask_blend_epi32(blend_mask, xArray_01, xArray_23); // |x0|x1|x0|x1|x0|x1|x0|x1|...|x2|x3|x2|x3|x2|x3|x2|x3| + + if (tag_m_16x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + result = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*4]); // Load 8 rows with n=4 + matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+8)*4]); // Load 8 rows with n=4 + + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_0, idx_base_0, matrixArray_1); // |a0|a1|...|h0|h1|i0|i1|...|p0|p1| + matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_0, idx_base_1, matrixArray_1); // |a2|a3|...|h2|h3|i2|i3|...|p2|p3| + + result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_01); + result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_3, (__m512bh) xArray_23); + + STORE16_COMPLETE_RESULT(result, y+idx_m) + } + } + + if (m - tag_m_16x > 7) { + result = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*4]); // Load 8 rows with n=4 + matrixArray_2 = _mm512_permutexvar_epi32(idx_base_remix, matrixArray_0); // a0|a1|...|h0|h1|a2|a3|...|h2|h3| + + result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_remix); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result), _mm512_extractf32x8_ps(result, 1)); + + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + BLASLONG tail_num = m-tag_m_16x; + if (tail_num != 0) { + result = _mm512_setzero_ps(); + + unsigned short tail_mask_value = 
(((unsigned short)0xffff) >> (16-tail_num*2)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray_0 = _mm512_maskz_loadu_epi32(tail_mask, &a[(tag_m_16x)*4]); // Load 8 rows with n=4 + matrixArray_2 = _mm512_permutexvar_epi32(idx_base_remix, matrixArray_0); // a0|a1|...|h0|h1|a2|a3|...|h2|h3| + + result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_remix); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result), _mm512_extractf32x8_ps(result, 1)); + + unsigned char store_tail_mask_value = (((unsigned char)0xff) >> (8-tail_num)); + __mmask8 store_tail_mask = *((__mmask8*) &store_tail_mask_value); + STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, store_tail_mask) + } + + return 0; +} + +// 30 rows parallel processing BF16 GEMV kernel for n=5 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_30x5_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_30x5_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_30x5_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_30x5(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_30x = m - (m%30); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 3); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x); // x0|x1|x2|x3|x4|0|0|0| + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512 result_0, result_1; + __m512i xArray_01 = _mm512_broadcastd_epi32(x128); // x0|x1|x0|x1|...|x0|x1| + __m512i xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1)); // x2|x3|x2|x3|...|x2|x3| + __m512i xArray_4 = 
_mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)); // x4| 0|x4| 0|...|x4| 0| + + __m512i M512_EPI16_2 = _mm512_set1_epi16(2); + __m512i load_idx01_stage1_1st = _mm512_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0, 58, 57, 53, 52, 48, 47, 43, 42, + 38, 37, 33, 32, 26, 25, 21, 20, 16, 15, 11, 10, 6, 5, 1, 0); + __m512i load_idx01_stage1_2nd = _mm512_shuffle_i32x4(load_idx01_stage1_1st, load_idx01_stage1_1st, 0x39); + __m512i load_idx01_stage1_3rd = _mm512_shuffle_i32x4(load_idx01_stage1_1st, load_idx01_stage1_1st, 0x4f); + + __m512i load_idx23_stage1_1st = _mm512_add_epi16(load_idx01_stage1_1st, M512_EPI16_2); + __m512i load_idx23_stage1_2nd = _mm512_add_epi16(load_idx01_stage1_2nd, M512_EPI16_2); + __m512i load_idx23_stage1_3rd = _mm512_add_epi16(load_idx01_stage1_3rd, M512_EPI16_2); + + __m512i load_idx4_stage1_1st = _mm512_add_epi16(load_idx23_stage1_1st, M512_EPI16_2); + __m512i load_idx4_stage1_2nd = _mm512_add_epi16(load_idx23_stage1_2nd, M512_EPI16_2); + __m512i load_idx4_stage1_3rd = _mm512_add_epi16(load_idx23_stage1_3rd, M512_EPI16_2); + + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4; + __m512i matrixArray_stage1_0, matrixArray_stage1_1, matrixArray_stage1_2; + __m512i matrixArray_stage2_0, matrixArray_stage2_1; + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + unsigned short store_mask_value = (((unsigned short)0xffff) >> 2); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + + if (tag_m_30x > 0) { + unsigned short blend_mask_value_0 = ((unsigned short)0xf000); + __mmask16 blend_mask_0 = *((__mmask16*) &blend_mask_value_0); + unsigned short blend_mask_value_1 = ((unsigned short)0x3f00); + __mmask16 blend_mask_1 = *((__mmask16*) &blend_mask_value_1); + for (BLASLONG idx_m = 0; idx_m < tag_m_30x; idx_m+=30) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, 
&a[(idx_m)*5]); // Load 6 rows with n=5 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+6)*5)]); // Load 6 rows with n=5 + matrixArray_2 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+12)*5)]); // Load 6 rows with n=5 + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+18)*5)]); // Load 6 rows with n=5 + matrixArray_4 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+24)*5)]); // Load 6 rows with n=5 + + // Process the 0|1 elements + // Stage 1: Select the 0|1 elements for each row + matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1); + matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx01_stage1_2nd, matrixArray_3); + matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx01_stage1_3rd, matrixArray_4); + // Stage 2: Reorder and compress all the 0|1 elements + matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1); + matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2); + // Calculate the result of the 0|1 elements + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_01); + + // Process the 2|3 elements + // Stage 1: Select the 2|3 elements for each row + matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1); + matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx23_stage1_2nd, matrixArray_3); + matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx23_stage1_3rd, matrixArray_4); + // Stage 2: Reorder and compress all the 2|3 elements + matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1); + matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2); + // Calculate the 
result of the 2|3 elements and accumulate the result of 0|1 elements + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_23); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_23); + + // Process the for 4 elements + // Stage 1: Select the 4 elements for each row + matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1); + matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx4_stage1_2nd, matrixArray_3); + matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx4_stage1_3rd, matrixArray_4); + // Stage 2: Reorder and compress all the 4 elements + matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1); + matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2); + // Calculate the result of the 4 element and accumulate the result of 0|1 and 2|3 elements + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_4); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_4); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_MASK_COMPLETE_RESULT(result_1, y+idx_m+16, store_mask) + } + } + + if (m - tag_m_30x > 11) { + BLASLONG tag_m_12x = m - ((m-tag_m_30x)%12); + for (BLASLONG idx_m = tag_m_30x; idx_m < tag_m_12x; idx_m+=12) { + unsigned short store_less_mask_value = (((unsigned short)0xffff) >> 4); + __mmask16 store_less_mask = *((__mmask16*) &store_less_mask_value); + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*5]); // Load 6 rows with n=5 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+6)*5)]); // Load 6 rows with n=5 + + // Interleave the elements + matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1); + matrixArray_stage1_1 = 
_mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1); + matrixArray_stage1_2 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1); + // Calculate and accumulate the result + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_0, (__m512bh) xArray_01); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_2, (__m512bh) xArray_4); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_less_mask) + tag_m_30x += 12; + } + } + + BLASLONG tail_num = m - tag_m_30x; + if (tail_num > 6) { + unsigned short store_less_mask_value = (((unsigned short)0xffff) >> (4+(12-tail_num))); + __mmask16 store_less_mask = *((__mmask16*) &store_less_mask_value); + unsigned int load_less_mask_value = (((unsigned int)0xffffffff) >> (2+(12-tail_num)*5)); + __mmask32 load_less_mask = *((__mmask32*) &load_less_mask_value); + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_30x)*5]); // Load 6 rows with n=5 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_less_mask, &a[((tag_m_30x+6)*5)]); // Load x rows with n=5 + + // Interleave the elements + matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1); + matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1); + matrixArray_stage1_2 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1); + // Calculate and accumulate the result + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_0, (__m512bh) xArray_01); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_2, (__m512bh) xArray_4); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+tag_m_30x, store_less_mask) + } else { + __m128i 
matrixArray128; + __m128 result128, tmp128; + for (BLASLONG i = tag_m_30x; i < m; i++) { + result128 = _mm_setzero_ps(); + matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*5]); // Load 1 rows with n=5 + result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=6 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x6_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x6_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x6_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 2); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x); // x0|x1|x2|x3|x4|x5|0|0| + + if (tag_m_16x > 0) { + __m512 result_0; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i load_idx01_1st = _mm512_set_epi32( 0, 0, 0, 0, 0, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i load_idx01_2nd = _mm512_set_epi32(13, 
10, 7, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + + __m512i load_idx23_1st = _mm512_add_epi32(load_idx01_1st, M512_EPI32_1); + __m512i load_idx23_2nd = _mm512_add_epi32(load_idx01_2nd, M512_EPI32_1); + + __m512i load_idx45_1st = _mm512_add_epi32(load_idx23_1st, M512_EPI32_1); + __m512i load_idx45_2nd = _mm512_add_epi32(load_idx23_2nd, M512_EPI32_1); + + unsigned short blend_mask_value = ((unsigned short)0x0400); + __mmask16 blend_mask = *((__mmask16*) &blend_mask_value); + // Set the 11th element to be 0 as invalid index for a 512 bit epi32 register + load_idx45_1st = _mm512_mask_blend_epi32(blend_mask, load_idx45_1st, load_idx01_2nd); + // Set the 11th element to be 0 as 0 is the correct index + load_idx45_2nd = _mm512_mask_blend_epi32(blend_mask, load_idx45_2nd, load_idx01_2nd); + + __m512i xArray_01 = _mm512_broadcastd_epi32(x128); // x0|x1|x0|x1|...|x0|x1| + __m512i xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1)); // x2|x3|x2|x3|...|x2|x3| + __m512i xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)); // x4|x5|x4|x5|...|x4|x5| + + unsigned short permute_mask01_uint = (((unsigned short)0xf800)); + __mmask16 permute_mask01 = *((__mmask16*) &permute_mask01_uint); + unsigned short permute_mask45_uint = (((unsigned short)0xfc00)); + __mmask16 permute_mask45 = *((__mmask16*) &permute_mask45_uint); + + __m512i matrixArray_0, matrixArray_1, matrixArray_2; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2; + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*6]); // Load 5 rows with n=6 plus 2 element + matrixArray_1 = _mm512_loadu_si512(&a[((idx_m+5)*6 + 2)]); // Load 5 rows with n=6 plus 2 element + matrixArray_2 = _mm512_loadu_si512(&a[((idx_m+10)*6 + 4)]); // Load 5 rows with n=6 plus 2 element + + // Stage 1: interleave for the a..k elements + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx01_1st, 
matrixArray_1); + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx23_1st, matrixArray_1); + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_0, load_idx45_1st, matrixArray_1); + + // Stage 2: interleave for the l..p elements and remix together + matrixArray_stage_0 = _mm512_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask01, load_idx01_2nd, matrixArray_2); + matrixArray_stage_1 = _mm512_mask_permutexvar_epi32(matrixArray_stage_1, permute_mask01, load_idx23_2nd, matrixArray_2); + matrixArray_stage_2 = _mm512_mask_permutexvar_epi32(matrixArray_stage_2, permute_mask45, load_idx45_2nd, matrixArray_2); + + // Calculate the result of the 0|1 elements + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_01); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_45); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m256i M256_EPI32_1 = _mm512_castsi512_si256(M512_EPI32_1); + __m256i load_idx01_1st = _mm256_set_epi32( 0, 0, 15, 12, 9, 6, 3, 0); + __m256i load_idx01_2nd = _mm256_set_epi32( 5, 2, 0, 0, 0, 0, 0, 0); + + __m256i load_idx23_1st = _mm256_add_epi32(load_idx01_1st, M256_EPI32_1); + __m256i load_idx23_2nd = _mm256_add_epi32(load_idx01_2nd, M256_EPI32_1); + unsigned char blend_mask_value = ((unsigned char)0x20); + __mmask8 blend_mask = *((__mmask8*) &blend_mask_value); + // Set the 6th element to be 0 as invalid index for a 512 bit epi32 register + load_idx23_1st = _mm256_mask_blend_epi32(blend_mask, load_idx23_1st, load_idx01_2nd); + // Set the 6th element to be 0 as 0 is the correct index + load_idx23_2nd = _mm256_mask_blend_epi32(blend_mask, load_idx23_2nd, load_idx01_2nd); + + __m256i load_idx45_1st = _mm256_add_epi32(load_idx23_1st, M256_EPI32_1); + __m256i load_idx45_2nd = _mm256_add_epi32(load_idx23_2nd, M256_EPI32_1); 
+ + unsigned char permute_mask01_uint = (((unsigned char)0xc0)); + __mmask8 permute_mask01 = *((__mmask8*) &permute_mask01_uint); + unsigned char permute_mask45_uint = (((unsigned char)0xe0)); + __mmask8 permute_mask45 = *((__mmask8*) &permute_mask45_uint); + + __m256i matrixArray_0, matrixArray_1, matrixArray_2; + __m256i matrixArray_stage_0; + __m256 result256_0; + + result256_0 = _mm256_setzero_ps(); + + matrixArray_0 = _mm256_loadu_si256(&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element + matrixArray_1 = _mm256_loadu_si256(&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element + matrixArray_2 = _mm256_loadu_si256(&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element + + // Process the 0|1 elements + // Select the 0|1 elements for each row + matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx01_1st, matrixArray_1); + matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask01, load_idx01_2nd, matrixArray_2); + // Calculate the result of the 0|1 elements + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_01)); + + // Process the 2|3 elements + // Select the 2|3 elements for each row + matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx23_1st, matrixArray_1); + matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask45, load_idx23_2nd, matrixArray_2); + // Calculate the result of the 0|1 elements + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_23)); + + // Process the for 4 elements + // Select the 4|5 elements for each row + matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx45_1st, matrixArray_1); + matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask45, load_idx45_2nd, matrixArray_2); + // Calculate the result of the 0|1 elements + result256_0 
= _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_45)); + + STORE8_COMPLETE_RESULT(result256_0, y+tag_m_16x) + tag_m_16x += 8; + } + } + + if (tag_m_16x != m) { + __m128i matrixArray128; + __m128 result128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + result128 = _mm_setzero_ps(); + matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*6]); // Load 1 rows with n=6 + result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=7 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x7_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x7_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x7_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x7(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 1); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|0| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; + __m512i matrixArray_stage_0, matrixArray_stage_1, 
matrixArray_stage_2, matrixArray_stage_3; + __m512i xArray_0123, xArray_4567; + __m512 result_0, result_1, result_2, result_3; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_2 = _mm512_set1_epi32(2); + __m512i load_idx_stage1_0 = _mm512_set_epi16(31, 27, 26, 25, 24, 23, 22, 21, 31, 20, 19, 18, 17, 16, 15, 14, + 31, 13, 12, 11, 10, 9, 8, 7, 31, 6, 5, 4, 3, 2, 1, 0); + __m512i load_idx_stage2_0 = _mm512_set_epi32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0); + __m512i load_idx_stage2_1 = _mm512_add_epi32(load_idx_stage2_0, M512_EPI32_2); + + unsigned short x_blend_mask_value = ((unsigned short)0xff00); + __mmask16 x_blend_mask = *((__mmask16*) &x_blend_mask_value); + xArray_0123 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(x128), \ + _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1))); + xArray_4567 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)), \ + _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x3))); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*7]); // Load 4 rows with n=7 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+4)*7]); // Load 4 rows with n=7 + matrixArray_2 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+8)*7]); // Load 4 rows with n=7 + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+12)*7]); // Load 4 rows with n=7 + + // Stage 1: padding + matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0); // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7| + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1); // 
|e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7| + matrixArray_2 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_2); // |i0|i1|i2|i3|...|j6|j7|k0|k1|k2|k3|...|l6|l7| + matrixArray_3 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_3); // |m0|m1|m2|m3|...|n6|n7|o0|o1|o2|o3|...|p6|p7| + + // Stage 2: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|...|h0|h1|a2|a3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|...|h4|h5|a6|a7|...|h6|h7| + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_0, matrixArray_3); // |i0|i1|...|p0|p1|i2|i3|...|p2|p3| + matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_1, matrixArray_3); // |i4|i5|...|p4|p5|i6|i7|...|p6|p7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_0123); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_3, (__m512bh) xArray_4567); + + // Stage 3: interleave per 256 bits + result_2 = _mm512_shuffle_f32x4(result_0, result_1, 0x44); + result_3 = _mm512_shuffle_f32x4(result_0, result_1, 0xee); + + result_2 = _mm512_add_ps(result_2, result_3); + + STORE16_COMPLETE_RESULT(result_2, y+idx_m) + } + + if (m - tag_m_16x > 7) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x)*7]); // Load 4 rows with n=7 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x+4)*7]); // Load 4 rows with n=7 + + // Stage 1: padding + matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0); // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7| + matrixArray_1 = 
_mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1); // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7| + + // Stage 2: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1)); + + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + + tag_m_16x += 8; + } + + BLASLONG tail_num = m - tag_m_16x; + if (tail_num > 3) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x)*7]); // Load 4 rows with n=7 + unsigned int tail_load_mask_value = (((unsigned int)0xffffffff) >> (4+(8-tail_num)*7)); + __mmask32 tail_load_mask = *((__mmask32*) &tail_load_mask_value); + matrixArray_1 = _mm512_maskz_loadu_epi16(tail_load_mask, &a[(tag_m_16x+4)*7]); // Load 4 rows with n=7 + + // Stage 1: padding + matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0); // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7| + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1); // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7| + + // Stage 2: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); 
+ result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1)); + + unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num)); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, tail_mask) + tag_m_16x = m; + } + } + + if (tag_m_16x != m) { + __m128i matrixArray128; + __m128 result128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + result128 = _mm_setzero_ps(); + matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*7]); // Load 1 rows with n=7 + result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=8 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x8_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x8_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x8_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + __m128i x128 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, 
matrixArray_2, matrixArray_3; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3; + __m512i xArray_0123, xArray_4567; + __m512 result_0, result_1, result_2, result_3; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_2 = _mm512_set1_epi32(2); + __m512i load_idx_stage2_0 = _mm512_set_epi32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0); + __m512i load_idx_stage2_1 = _mm512_add_epi32(load_idx_stage2_0, M512_EPI32_2); + + unsigned short x_blend_mask_value = ((unsigned short)0xff00); + __mmask16 x_blend_mask = *((__mmask16*) &x_blend_mask_value); + xArray_0123 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(x128), \ + _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1))); + xArray_4567 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)), \ + _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x3))); + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*8]); // Load 4 rows with n=8 + matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+4)*8]); // Load 4 rows with n=8 + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+8)*8]); // Load 4 rows with n=8 + matrixArray_3 = _mm512_loadu_si512(&a[(idx_m+12)*8]); // Load 4 rows with n=8 + + // Stage 1: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|...|h0|h1|a2|a3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|...|h4|h5|a6|a7|...|h6|h7| + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_0, matrixArray_3); // |i0|i1|...|p0|p1|i2|i3|...|p2|p3| + matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, 
load_idx_stage2_1, matrixArray_3); // |i4|i5|...|p4|p5|i6|i7|...|p6|p7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_0123); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_3, (__m512bh) xArray_4567); + + // Stage 2: interleave per 256 bits + result_2 = _mm512_shuffle_f32x4(result_0, result_1, 0x44); + result_3 = _mm512_shuffle_f32x4(result_0, result_1, 0xee); + + result_2 = _mm512_add_ps(result_2, result_3); + + STORE16_COMPLETE_RESULT(result_2, y+idx_m) + } + + if (m - tag_m_16x > 7) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*8]); // Load 4 rows with n=8 + matrixArray_1 = _mm512_loadu_si512(&a[(tag_m_16x+4)*8]); // Load 4 rows with n=8 + + // Stage 1: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1)); + + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + BLASLONG tail_num = m - tag_m_16x; + if (tail_num > 3) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*8]); // Load 4 rows with n=8 + unsigned short tail_load_mask_value = (((unsigned int)0xffff) >> ((8-tail_num)*4)); + __mmask16 tail_load_mask = *((__mmask16*) 
&tail_load_mask_value); + matrixArray_1 = _mm512_maskz_loadu_epi32(tail_load_mask, &a[(tag_m_16x+4)*8]); // Load 4 rows with n=8 + + // Stage 1: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1)); + + unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num)); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, tail_mask) + tag_m_16x = m; + } + } + + if (tag_m_16x != m) { + __m128i matrixArray128; + __m128 result128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + result128 = _mm_setzero_ps(); + matrixArray128 = _mm_loadu_si128(&a[(i)*8]); // Load 1 rows with n=8 + result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 14 rows parallel processing BF16 GEMV kernel for n=9 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_14x9_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, 
float *y) +#else +static int sbgemv_kernel_14x9_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_14x9_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_14x = m - (m%14); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 7); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|0 |0 | 0| 0| 0| 0| 0| + + if (tag_m_14x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3; + __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m256i M256_EPI16_2 = _mm256_set1_epi16(2); + __m256i idx_base_0 = _mm256_set_epi16( 0, 0, 55, 54, 46, 45, 37, 36, 28, 27, 19, 18, 10, 9, 1, 0); + __m256i idx_base_1 = _mm256_add_epi16(idx_base_0, M256_EPI16_2); + __m256i idx_base_2 = _mm256_add_epi16(idx_base_1, M256_EPI16_2); + __m256i idx_base_3 = _mm256_add_epi16(idx_base_2, M256_EPI16_2); + __m256i idx_base_4 = _mm256_add_epi16(idx_base_3, M256_EPI16_2); + __m512i idx_idx = _mm512_set_epi32( 0, 0, 22, 21, 20, 19, 18, 17, 16, 6, 5, 4, 3, 2, 1, 0); + + __m512i load_idx_stage1_0 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_0), idx_idx, _mm512_castsi256_si512(idx_base_1)); + __m512i load_idx_stage1_1 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_2), idx_idx, _mm512_castsi256_si512(idx_base_3)); + __m512i load_idx_stage1_2 = 
_mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_1), idx_idx, _mm512_castsi256_si512(idx_base_0)); + __m512i load_idx_stage1_3 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_3), idx_idx, _mm512_castsi256_si512(idx_base_2)); + __m512i load_idx_stage1_4 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_4), idx_idx, _mm512_castsi256_si512(idx_base_4)); + __m512i load_idx_stage2_0 = _mm512_set_epi32( 0, 0, 22, 21, 20, 19, 18, 17, 16, 13, 12, 11, 10, 9, 8, 7); + + xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0|x1|x0|x1| ... |x0|x1| + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2|x3|x2|x3| ... |x2|x3| + xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4|x5|x4|x5| ... |x4|x5| + xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6|x7|x6|x7| ... |x6|x7| + xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8|0 |x8| 0| ... |x8| 0| + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 1); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + unsigned short blend_mask_value = ((unsigned short)0x3f80); + __mmask16 blend_mask = *((__mmask16*) &blend_mask_value); + unsigned short store_mask_value = (((unsigned short)0xffff) >> 2); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + for (BLASLONG idx_m = 0; idx_m < tag_m_14x; idx_m+=14) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*9]); // Load 3 rows with n=9 plus 5 elements + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+3)*9 + 5]); // Load 3 rows with n=9 plus 4 elements + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+7)*9]); // Load 3 rows with n=9 plus 5 elements + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*9 + 5]); // Load 3 rows with n=9 plus 4 elements + + // Stage 1: interleave per 16 bits + matrixArray_stage_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_0, 
matrixArray_1); // |a0|a1|...|g0|g1|a2|a3|...|g2|g3|x|x|x|x| + matrixArray_stage_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_1, matrixArray_1); // |a4|a5|...|g4|g5|a6|a7|...|g6|g7|x|x|x|x| + matrixArray_stage_2 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_2, matrixArray_3); // |h2|h3|...|n2|n3|h0|h1|...|n0|n1|x|x|x|x| + matrixArray_stage_3 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_3, matrixArray_3); // |h6|h7|...|n6|n7|h4|h5|...|n4|n5|x|x|x|x| + matrixArray_4 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_4, matrixArray_1); // |a8| x|...|g8| x| x| x|...| x| x|x|x|x|x| + matrixArray_5 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_4, matrixArray_3); // | x| x|...| x| x|h8| x|...|n8| x|x|x|x|x| + + // Stage 2: interleave per 32 bits + matrixArray_0 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_0, matrixArray_stage_2); // |a0|a1|b0|b1|...|h0|h1|i0|i1|j0|j1|...|n0|n1|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, load_idx_stage2_0, matrixArray_stage_2); // |a2|a3|b2|b3|...|h2|h3|i2|i3|j2|j3|...|n2|n3|x|x|x|x| + matrixArray_2 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_1, matrixArray_stage_3); // |a4|a5|b4|b5|...|h4|h5|i4|i5|j4|j5|...|n4|n5|x|x|x|x| + matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_stage_1, load_idx_stage2_0, matrixArray_stage_3); // |a6|a7|b6|b7|...|h6|h7|i6|i7|j6|j7|...|n6|n7|x|x|x|x| + matrixArray_4 = _mm512_mask_blend_epi32(blend_mask, matrixArray_4, matrixArray_5); // |a8| x|b8| x|...|h8| x|i8| x|j8| x|...|n8| x|x|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) 
matrixArray_4, (__m512bh) xArray_89); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_14x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned short load256_mask_value = (((unsigned short)0xffff) >> 7); + __mmask16 load256_mask = *((__mmask16*) &load256_mask_value); + for (BLASLONG i = tag_m_14x; i < m; i++) { + result256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*9]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 12 rows parallel processing BF16 GEMV kernel for n=10 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_12x10_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_12x10_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_12x10_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_12x = m - (m%12); + + unsigned char x_load_mask_value = (((unsigned char)0xf) >> 3); + 
__mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_1 = _mm_maskz_loadu_epi32(x_load_mask, (x+8)); // |x8|x9|0 | 0| 0| 0| 0| 0| + + if (tag_m_12x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5; + __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m256i M256_EPI32_1 = _mm256_set1_epi32(1); + __m256i idx_base_0 = _mm256_set_epi32( 0, 0, 26, 21, 16, 10, 5, 0); + __m256i idx_base_1 = _mm256_add_epi32(idx_base_0, M256_EPI32_1); + __m256i idx_base_2 = _mm256_add_epi32(idx_base_1, M256_EPI32_1); + __m256i idx_base_3 = _mm256_add_epi32(idx_base_2, M256_EPI32_1); + __m256i idx_base_4 = _mm256_add_epi32(idx_base_3, M256_EPI32_1); + __m512i idx_idx = _mm512_set_epi32( 0, 0, 0, 0, 21, 20, 19, 18, 17, 16, 5, 4, 3, 2, 1, 0); + + __m512i load_idx_stage1_0 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_0), idx_idx, _mm512_castsi256_si512(idx_base_1)); + __m512i load_idx_stage1_1 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_2), idx_idx, _mm512_castsi256_si512(idx_base_3)); + __m512i load_idx_stage1_2 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_1), idx_idx, _mm512_castsi256_si512(idx_base_0)); + __m512i load_idx_stage1_3 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_3), idx_idx, _mm512_castsi256_si512(idx_base_2)); + __m512i load_idx_stage1_4 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_4), idx_idx, _mm512_castsi256_si512(idx_base_4)); + __m512i load_idx_stage2_0 = _mm512_set_epi32( 0, 0, 0, 0, 21, 20, 19, 18, 17, 16, 11, 10, 9, 8, 7, 6); + + xArray_01 = 
_mm512_broadcastd_epi32(x128_0); // |x0|x1|x0|x1| ... |x0|x1| + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2|x3|x2|x3| ... |x2|x3| + xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4|x5|x4|x5| ... |x4|x5| + xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6|x7|x6|x7| ... |x6|x7| + xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8|x9|x8|x9| ... |x8|x9| + + unsigned short blend_mask_value = ((unsigned short)0x0fc0); + __mmask16 blend_mask = *((__mmask16*) &blend_mask_value); + unsigned short load_mask_value = (((unsigned short)0xffff) >> 1); + __mmask16 load_mask = *((__mmask16*) &load_mask_value); + unsigned short store_mask_value = (((unsigned short)0xffff) >> 4); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + for (BLASLONG idx_m = 0; idx_m < tag_m_12x; idx_m+=12) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m)*10]); // Load 3 rows with n=10 + matrixArray_1 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+3)*10]); // Load 3 rows with n=10 + matrixArray_2 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+6)*10]); // Load 3 rows with n=10 + matrixArray_3 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+9)*10]); // Load 3 rows with n=10 + + // Stage 1: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_0, matrixArray_1); // |a0|a1|...|f0|f1|a2|a3|...|f2|f3|x|x|x|x|x|x|x|x| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_1, matrixArray_1); // |a4|a5|...|f4|f5|a6|a7|...|f6|f7|x|x|x|x|x|x|x|x| + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_2, matrixArray_3); // |g2|g3|...|l2|l3|g0|g1|...|l0|l1|x|x|x|x|x|x|x|x| + matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_3, matrixArray_3); // |g6|g7|...|l6|l7|g4|g5|...|l4|l5|x|x|x|x|x|x|x|x| + 
matrixArray_stage_4 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_4, matrixArray_1); // |a8|a9|...|f8|f9| x| x|...| x| x|x|x|x|x|x|x|x|x| + matrixArray_stage_5 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_4, matrixArray_3); // | x| x|...| x| x|g8|g9|...|l8|l9|x|x|x|x|x|x|x|x| + + // Stage 3: interleave per 256 bits + matrixArray_0 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_0, matrixArray_stage_2); // |a0|a1|...|l0|l1|x|x|x|x|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, load_idx_stage2_0, matrixArray_stage_2); // |a2|a3|...|l2|l3|x|x|x|x|x|x|x|x| + matrixArray_2 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_1, matrixArray_stage_3); // |a4|a5|...|l4|l5|x|x|x|x|x|x|x|x| + matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_stage_1, load_idx_stage2_0, matrixArray_stage_3); // |a6|a7|...|l6|l7|x|x|x|x|x|x|x|x| + matrixArray_4 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_4, matrixArray_stage_5); // |a8|a9|...|l8|l9|x|x|x|x|x|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_12x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned char load256_mask_value = (((unsigned char)0xff) >> 3); + __mmask8 load256_mask = *((__mmask8*) &load256_mask_value); + for (BLASLONG i = tag_m_12x; i < m; i++) { + result256 = _mm256_setzero_ps(); + 
matrixArray256 = _mm256_maskz_loadu_epi32(load256_mask, &a[(i)*10]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 15 rows parallel processing BF16 GEMV kernel for n=11 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_15x11_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_15x11_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_15x11_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_15x = m - (m%15); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2|x3|x4|x5|x6|x7| + __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10| 0| 0| 0| 0| 0| + + if (tag_m_15x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5; + __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89, xArray_10; + __m512 
result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; + __m512i idx_stage2_base_0, idx_stage2_base_1, idx_stage2_base_2, idx_stage2_base_3; + + __m512i M512_EPI16_2, M512_EPI16_4, M512_EPI16_6, M512_EPI32_5; + M512_EPI16_2 = _mm512_set1_epi16(2); + M512_EPI16_4 = _mm512_add_epi16(M512_EPI16_2, M512_EPI16_2); + M512_EPI16_6 = _mm512_add_epi16(M512_EPI16_4, M512_EPI16_2); + M512_EPI32_5 = _mm512_set1_epi32(5); + + unsigned int BASE_MASK_10_value = ((unsigned int)0x000003ff); + __mmask32 BASE_MASK_10 = *((__mmask32*) &BASE_MASK_10_value); + unsigned int BASE_MASK_20_value = ((unsigned int)0x000ffc00); + __mmask32 BASE_MASK_20 = *((__mmask32*) &BASE_MASK_20_value); + unsigned int BASE_MASK_30_value = ((unsigned int)0x3ff00000); + __mmask32 BASE_MASK_30 = *((__mmask32*) &BASE_MASK_30_value); + + idx_stage1_base_0 = _mm512_set_epi16( 0, 0, 49, 48, 38, 37, 27, 26, 16, 15, 5, 4, 47, 46, 36, 35, + 25, 24, 14, 13, 3, 2, 45, 44, 34, 33, 23, 22, 12, 11, 1, 0); + idx_stage1_base_1 = _mm512_add_epi16(idx_stage1_base_0, M512_EPI16_6); + + idx_stage1_base_2 = _mm512_mask_add_epi16(idx_stage1_base_0, BASE_MASK_10, idx_stage1_base_0, M512_EPI16_2); + idx_stage1_base_2 = _mm512_mask_sub_epi16(idx_stage1_base_2, BASE_MASK_20, idx_stage1_base_0, M512_EPI16_2); + idx_stage1_base_3 = _mm512_add_epi16(idx_stage1_base_2, M512_EPI16_6); + + idx_stage1_base_4 = _mm512_mask_add_epi16(idx_stage1_base_2, BASE_MASK_10, idx_stage1_base_2, M512_EPI16_2); + idx_stage1_base_4 = _mm512_mask_add_epi16(idx_stage1_base_4, BASE_MASK_20, idx_stage1_base_2, M512_EPI16_2); + idx_stage1_base_4 = _mm512_mask_sub_epi16(idx_stage1_base_4, BASE_MASK_30, idx_stage1_base_2, M512_EPI16_4); + idx_stage1_base_5 = _mm512_add_epi16(idx_stage1_base_4, M512_EPI16_6); + + unsigned short 
idx_stage2_mask_1_value = ((unsigned short)0x03e0); + __mmask16 idx_stage2_mask_1 = *((__mmask16*) &idx_stage2_mask_1_value); + unsigned short idx_stage2_mask_2_value = ((unsigned short)0x7c00); + __mmask16 idx_stage2_mask_2 = *((__mmask16*) &idx_stage2_mask_2_value); + idx_stage2_base_0 = _mm512_set_epi32( 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 9, 8, 7, 6, 5); + idx_stage2_base_1 = _mm512_set_epi32( 0, 25, 24, 23, 22, 21, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + idx_stage2_base_2 = _mm512_add_epi32(idx_stage2_base_0, M512_EPI32_5); + idx_stage2_base_2 = _mm512_mask_add_epi32(idx_stage2_base_2, idx_stage2_mask_1, idx_stage2_base_2, M512_EPI32_5); + idx_stage2_base_3 = _mm512_mask_sub_epi32(idx_stage2_base_1, idx_stage2_mask_2, idx_stage2_base_1, M512_EPI32_5); + + xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0 |x1 |x0 |x1 | ... |x0 |x1 | + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2 |x3 |x2 |x3 | ... |x2 |x3 | + xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4 |x5 |x4 |x5 | ... |x4 |x5 | + xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6 |x7 |x6 |x7 | ... |x6 |x7 | + xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8 |x9 |x8 |x9 | ... |x8 |x9 | + xArray_10 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_1, 0x1)); // |x10|0 |x10|0 | ... 
|x10|0 | + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 9); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + unsigned short store_mask_value = (((unsigned short)0xffff) >> 1); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + + for (BLASLONG idx_m = 0; idx_m < tag_m_15x; idx_m+=15) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[idx_m*11]); // Load 2 rows with n=11 plus 10 elements + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[idx_m*11 + 32]); // Load 2 rows with n=11 plus 1 element + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+5)*11]); // Load 2 rows with n=11 plus 10 elements + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+5)*11 + 32]); // Load 2 rows with n=11 plus 1 element + matrixArray_4 = _mm512_loadu_si512(&a[(idx_m+10)*11]); // Load 2 rows with n=11 plus 10 elements + matrixArray_5 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*11 + 32]); // Load 2 rows with n=11 plus 1 element + + // Stage 1: interleave per 16 bits + matrixArray_stage_0 = _mm512_permutex2var_epi16(matrixArray_0, idx_stage1_base_0, matrixArray_1); // |a0|a1|...|e0|e1|a2|a3|...|e2|e3|a4 |a5|...|e4 |e5| + matrixArray_stage_1 = _mm512_permutex2var_epi16(matrixArray_0, idx_stage1_base_1, matrixArray_1); // |a6|a7|...|e6|e7|a8|a9|...|e8|e9|a10|x |...|e10|x | + matrixArray_stage_2 = _mm512_permutex2var_epi16(matrixArray_2, idx_stage1_base_2, matrixArray_3); // |f2|f3|...|j2|j3|f0|f1|...|j0|j1|f4 |f5|...|j4 |j5| + matrixArray_stage_3 = _mm512_permutex2var_epi16(matrixArray_2, idx_stage1_base_3, matrixArray_3); // |f8|f9|...|j8|j9|f6|f7|...|j6|j7|f10|x |...|j10|x | + matrixArray_stage_4 = _mm512_permutex2var_epi16(matrixArray_4, idx_stage1_base_4, matrixArray_5); // |k4|k5|...|o4|o5|k2|k3|...|o2|o3|k0 |k1|...|o0 |o1| + matrixArray_stage_5 = _mm512_permutex2var_epi16(matrixArray_4, idx_stage1_base_5, matrixArray_5); // |k10|x|...|o10|x|k8|k9|...|o8|o9|k6 
|k7|...|o6 |o7| + + // Stage 2: interleave per 32 bits + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_0, matrixArray_stage_2); // |a0|a1|...|j0|j1|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_1, matrixArray_stage_3); // |a6|a7|...|j6|j7|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_0, matrixArray_stage_2); // |a2|a3|...|j2|j3|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_2, matrixArray_stage_2); // |a4|a5|...|j4|j5|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_0, matrixArray_stage_3); // |a8|a9|...|j8|j9|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_2, matrixArray_stage_3); // |a10|x|...|j10|x|x|x|x|x|x|x|x|x|x|x|x|x| + + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_0, matrixArray_stage_4); // |a0|a1|.......................|o0|o1|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_3, matrixArray_stage_5); // |a6|a7|.......................|o6|o7|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_1 , idx_stage2_base_1, matrixArray_stage_4); // |a2|a3|.......................|o2|o3|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_2 , idx_stage2_base_3, matrixArray_stage_4); // |a4|a5|.......................|o4|o5|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_4 , idx_stage2_base_1, matrixArray_stage_5); // |a8|a9|.......................|o8|o9|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_5 , idx_stage2_base_3, matrixArray_stage_5); // |a10|x|.......................|o10|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) 
xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_5, (__m512bh) xArray_10); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_15x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned short load256_mask_value = (((unsigned short)0xffff) >> 5); + __mmask16 load256_mask = *((__mmask16*) &load256_mask_value); + for (BLASLONG i = tag_m_15x; i < m; i++) { + result256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*11]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 15 rows parallel processing BF16 GEMV kernel for n=12 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_15x12_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_15x12_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef 
ONE_ALPHA +static int sbgemv_kernel_15x12_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_15x = m - (m%15); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 4); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2| x3|x4|x5|x6|x7| + __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10|x11| 0| 0| 0| 0| + + if (tag_m_15x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5; + __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89, xArray_10; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; + __m512i idx_stage2_base_0, idx_stage2_base_1, idx_stage2_base_2, idx_stage2_base_3; + + __m512i M512_EPI32_1, M512_EPI32_2, M512_EPI32_3, M512_EPI32_5; + M512_EPI32_1 = _mm512_set1_epi32(1); + M512_EPI32_2 = _mm512_add_epi32(M512_EPI32_1, M512_EPI32_1); + M512_EPI32_3 = _mm512_add_epi32(M512_EPI32_2, M512_EPI32_1); + M512_EPI32_5 = _mm512_add_epi32(M512_EPI32_3, M512_EPI32_2); + + unsigned short BASE_MASK_10_value = ((unsigned short)0x001f); + __mmask16 BASE_MASK_10 = *((__mmask16*) &BASE_MASK_10_value); + unsigned short BASE_MASK_20_value = ((unsigned short)0x03e0); + __mmask16 BASE_MASK_20 = *((__mmask16*) &BASE_MASK_20_value); + unsigned short BASE_MASK_30_value = ((unsigned short)0xfc00); + __mmask16 BASE_MASK_30 = *((__mmask16*) &BASE_MASK_30_value); + + idx_stage1_base_0 = 
_mm512_set_epi32( 0, 26, 20, 14, 8, 2, 25, 19, 13, 7, 1, 24, 18, 12, 6, 0); + idx_stage1_base_1 = _mm512_add_epi32(idx_stage1_base_0, M512_EPI32_3); + + idx_stage1_base_2 = _mm512_mask_add_epi32(idx_stage1_base_0, BASE_MASK_10, idx_stage1_base_0, M512_EPI32_1); + idx_stage1_base_2 = _mm512_mask_sub_epi32(idx_stage1_base_2, BASE_MASK_20, idx_stage1_base_0, M512_EPI32_1); + idx_stage1_base_3 = _mm512_add_epi32(idx_stage1_base_2, M512_EPI32_3); + + idx_stage1_base_4 = _mm512_mask_add_epi32(idx_stage1_base_2, BASE_MASK_10, idx_stage1_base_2, M512_EPI32_1); + idx_stage1_base_4 = _mm512_mask_add_epi32(idx_stage1_base_4, BASE_MASK_20, idx_stage1_base_2, M512_EPI32_1); + idx_stage1_base_4 = _mm512_mask_sub_epi32(idx_stage1_base_4, BASE_MASK_30, idx_stage1_base_2, M512_EPI32_2); + idx_stage1_base_5 = _mm512_add_epi32(idx_stage1_base_4, M512_EPI32_3); + + unsigned short idx_stage2_mask_1_value = ((unsigned short)0x03e0); + __mmask16 idx_stage2_mask_1 = *((__mmask16*) &idx_stage2_mask_1_value); + unsigned short idx_stage2_mask_2_value = ((unsigned short)0x7c00); + __mmask16 idx_stage2_mask_2 = *((__mmask16*) &idx_stage2_mask_2_value); + idx_stage2_base_0 = _mm512_set_epi32( 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 9, 8, 7, 6, 5); + idx_stage2_base_1 = _mm512_set_epi32( 0, 25, 24, 23, 22, 21, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + idx_stage2_base_2 = _mm512_add_epi32(idx_stage2_base_0, M512_EPI32_5); + idx_stage2_base_2 = _mm512_mask_add_epi32(idx_stage2_base_2, idx_stage2_mask_1, idx_stage2_base_2, M512_EPI32_5); + idx_stage2_base_3 = _mm512_mask_sub_epi32(idx_stage2_base_1, idx_stage2_mask_2, idx_stage2_base_1, M512_EPI32_5); + + xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0 |x1 |x0 |x1 | ... |x0 |x1 | + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2 |x3 |x2 |x3 | ... |x2 |x3 | + xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4 |x5 |x4 |x5 | ... 
|x4 |x5 | + xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6 |x7 |x6 |x7 | ... |x6 |x7 | + xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8 |x9 |x8 |x9 | ... |x8 |x9 | + xArray_10 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_1, 0x1)); // |x10|x11|x10|x11| ... |x10|x11| + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + unsigned short store_mask_value = (((unsigned short)0xffff) >> 1); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + + for (BLASLONG idx_m = 0; idx_m < tag_m_15x; idx_m+=15) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[idx_m*12]); // Load 2 rows with n=12 plus 8 elements + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[idx_m*12 + 32]); // Load 2 rows with n=12 plus 4 element + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+5)*12]); // Load 2 rows with n=12 plus 8 elements + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+5)*12 + 32]); // Load 2 rows with n=12 plus 4 element + matrixArray_4 = _mm512_loadu_si512(&a[(idx_m+10)*12]); // Load 2 rows with n=12 plus 8 elements + matrixArray_5 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*12 + 32]); // Load 2 rows with n=12 plus 4 element + + // Stage 1: interleave per 16 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, idx_stage1_base_0, matrixArray_1); // |a0 |a1 |...|e0 |e1 |a2|a3|...|e2|e3|a4 |a5 |...|e4 |e5 | + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, idx_stage1_base_1, matrixArray_1); // |a6 |a7 |...|e6 |e7 |a8|a9|...|e8|e9|a10|a11|...|e10|e11| + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, idx_stage1_base_2, matrixArray_3); // |f2 |f3 |...|j2 |j3 |f0|f1|...|j0|j1|f4 |f5 |...|j4 |j5 | + matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, idx_stage1_base_3, matrixArray_3); // |f8 |f9 |...|j8 |j9 
|f6|f7|...|j6|j7|f10|f11|...|j10|j11| + matrixArray_stage_4 = _mm512_permutex2var_epi32(matrixArray_4, idx_stage1_base_4, matrixArray_5); // |k4 |k5 |...|o4 |o5 |k2|k3|...|o2|o3|k0 |k1 |...|o0 |o1 | + matrixArray_stage_5 = _mm512_permutex2var_epi32(matrixArray_4, idx_stage1_base_5, matrixArray_5); // |k10|k11|...|o10|o11|k8|k9|...|o8|o9|k6 |k7 |...|o6 |o7 | + + // Stage 2: interleave per 32 bits + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_0, matrixArray_stage_2); // |a0 |a1 |...|j0 |j1 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_1, matrixArray_stage_3); // |a6 |a7 |...|j6 |j7 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_0, matrixArray_stage_2); // |a2 |a3 |...|j2 |j3 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_2, matrixArray_stage_2); // |a4 |a5 |...|j4 |j5 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_0, matrixArray_stage_3); // |a8 |a9 |...|j8 |j9 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_2, matrixArray_stage_3); // |a10|a11|...|j10|j11|x|x|x|x|x|x|x|x|x|x|x|x| + + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_0, matrixArray_stage_4); // |a0|a1|.......................|o0|o1|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_3, matrixArray_stage_5); // |a6|a7|.......................|o6|o7|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_1 , idx_stage2_base_1, matrixArray_stage_4); // |a2|a3|.......................|o2|o3|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_2 , idx_stage2_base_3, matrixArray_stage_4); // |a4|a5|.......................|o4|o5|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_4 , idx_stage2_base_1, 
matrixArray_stage_5); // |a8|a9|.......................|o8|o9|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_5 , idx_stage2_base_3, matrixArray_stage_5); // |a10|x|.......................|o10|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_5, (__m512bh) xArray_10); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_15x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned short load256_mask_value = (((unsigned short)0xffff) >> 4); + __mmask16 load256_mask = *((__mmask16*) &load256_mask_value); + for (BLASLONG i = tag_m_15x; i < m; i++) { + result256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*12]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + + +// 16 
rows parallel processing BF16 GEMV kernel for n=13 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x13_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x13_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x13_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x13(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 3); + __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value); + __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|0|0|0| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 6); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + // Prepare X with 2-step interleave way + xArray_0 = 
_mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, idx_m, 0, x_load_mask) + + matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, idx_m+8, 0, x_load_mask) + + matrixArray_12 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_13 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_14 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_15 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = _mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + 
BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, tag_m_16x, 0, x_load_mask) + + matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + matrixArray_0 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0x44); + matrixArray_1 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0xee); + matrixArray_2 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0x44); + matrixArray_3 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0xee); + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..h[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 13, tag_m_16x, 0, x_load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + 
+ accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + STORE4_COMPLETE_RESULT(result128, y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*13]); // Load 1 rows with n=13 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=14 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x14_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x14_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x14_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x14(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 2); + __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value); + __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x); // 
|x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|0|0| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + __m512i shift_idx = _mm512_set_epi32(0, 13, 12, 11, 10, 9, 8, 7, 0, 6, 5, 4, 3, 2, 1, 0); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + // Prepare X with 2-step interleave way + xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x32_2(matrixArray, a, 14, idx_m, 0, load_mask) + + // Pre-stage: shift the 2nd vector 1 position right for each register + BF16_PERMUTE_8x32_2(shift_idx, matrixArray) + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = 
_mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_4x32_2(matrixArray, a, 14, tag_m_16x, 0, load_mask) + + // Pre-stage: shift the 2nd vector 1 position right for each register + BF16_PERMUTE_4x32_2(shift_idx, matrixArray) + + // interleave per 256 bits + BF16_INTERLEAVE256_4x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..h[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7; + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 14, tag_m_16x, 0, x_load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + + accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = 
_mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + STORE4_COMPLETE_RESULT(result128, y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*14]); // Load 1 rows with n=14 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=15 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x15_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x15_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x15_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x15(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 1); + __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value); + __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|0| + + if (tag_m_16x > 0) { + __m512i 
matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + // Prepare X with 2-step interleave way + xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, idx_m, 0, x_load_mask) + + matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, idx_m+8, 0, x_load_mask) + + matrixArray_12 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + 
matrixArray_13 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_14 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_15 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = _mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, tag_m_16x, 0, x_load_mask) + + matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + matrixArray_0 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0x44); + matrixArray_1 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0xee); + matrixArray_2 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0x44); + matrixArray_3 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0xee); + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the 
temp result for a..h[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 15, tag_m_16x, 0, x_load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + + accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + STORE4_COMPLETE_RESULT(result128, y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*15]); // Load 1 rows with n=15 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] 
= alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=16 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x16_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x16_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x16_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + __m256i x256 = _mm256_loadu_si256(x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + // Prepare X with 2-step interleave way + xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = 
_mm512_setzero_ps(); + + matrixArray_8 = _mm512_loadu_si512(&a[(idx_m )*16]); // Load 2 rows with n=16 + matrixArray_9 = _mm512_loadu_si512(&a[(idx_m+2 )*16]); // Load 2 rows with n=16 + matrixArray_10 = _mm512_loadu_si512(&a[(idx_m+4 )*16]); // Load 2 rows with n=16 + matrixArray_11 = _mm512_loadu_si512(&a[(idx_m+6 )*16]); // Load 2 rows with n=16 + matrixArray_12 = _mm512_loadu_si512(&a[(idx_m+8 )*16]); // Load 2 rows with n=16 + matrixArray_13 = _mm512_loadu_si512(&a[(idx_m+10)*16]); // Load 2 rows with n=16 + matrixArray_14 = _mm512_loadu_si512(&a[(idx_m+12)*16]); // Load 2 rows with n=16 + matrixArray_15 = _mm512_loadu_si512(&a[(idx_m+14)*16]); // Load 2 rows with n=16 + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = _mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + matrixArray_4 = _mm512_loadu_si512(&a[(tag_m_16x )*16]); // Load 2 rows with n=16 + matrixArray_5 = _mm512_loadu_si512(&a[(tag_m_16x+2 )*16]); // Load 2 rows with n=16 + matrixArray_6 = _mm512_loadu_si512(&a[(tag_m_16x+4 )*16]); // Load 2 rows with n=16 + matrixArray_7 = _mm512_loadu_si512(&a[(tag_m_16x+6 )*16]); // Load 2 rows with n=16 + + // interleave per 256 bits + BF16_INTERLEAVE256_4x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..h[0:15] + 
BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, \ + matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7; + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x )*16]); // Load 2 rows with n=16 + matrixArray_1 = _mm512_loadu_si512(&a[(tag_m_16x+2 )*16]); // Load 2 rows with n=16 + + matrixArray256_0 = _mm512_castsi512_si256(matrixArray_0); + matrixArray256_1 = _mm512_extracti32x8_epi32(matrixArray_0, 0x1); + matrixArray256_2 = _mm512_castsi512_si256(matrixArray_1); + matrixArray256_3 = _mm512_extracti32x8_epi32(matrixArray_1, 0x1); + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + + accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + STORE4_COMPLETE_RESULT(result128, y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = 
_mm256_loadu_si256(&a[(i)*16]); // Load 1 rows with n=16 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 8 rows parallel processing BF16 GEMV kernel for n>16 && lda effective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_8x16p_lda_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_8x16p_lda_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_8x16p_lda_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_8x16p_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> (32-n)); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + __m512i x512 = _mm512_maskz_loadu_epi16(load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15|... 
+ +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512 accum512_0, accum512_1, accum512_2, accum512_3; + __m256 accum256; + __m128 accum128; + + if (tag_m_8x > 0) { + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + // Prepare X with 2-step interleave way + xArray_0 = x512; + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load 8 rows from matrix + BF16_MATRIX_MASKZ_LOAD_8x32(matrixArray, a, lda, idx_m, 0, load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..h[0:31] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + accum512_2 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + accum512_3 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + accum512_2 = _mm512_add_ps(accum512_2, accum512_3); + accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_2), _mm512_extractf32x8_ps(accum512_2, 1)); + STORE8_COMPLETE_RESULT(accum256, y+idx_m) + } + + if (m - tag_m_8x > 3) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load 4 rows from matrix + BF16_MATRIX_MASKZ_LOAD_4x32(matrixArray, a, lda, tag_m_8x, 0, load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate 
the temp result for a..d[0:31] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + STORE4_COMPLETE_RESULT(accum128, y+tag_m_8x) + tag_m_8x += 4; + } + } + + if (tag_m_8x != m) { + __m128 tmp128; + for (BLASLONG i = tag_m_8x; i < m; i++) { + accum512_0 = _mm512_setzero_ps(); + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(i)*lda]); // Load 1 rows with n=16 + accum512_0 = _mm512_dpbf16_ps(accum512_0, (__m512bh) matrixArray_0, (__m512bh) x512); + accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 8 rows parallel processing BF16 GEMV kernel for big N && lda effective scenario (process before interleave) +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_1x128_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_1x128_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_1x128_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, 
float *y) +#else +static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_n_32x = n & (~31); + BLASLONG tag_n_128x = n & (~127); + + __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ + accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; + __m512 accum512_bridge[8]; + __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3; + __m256 accum256_0; + __m128 accum128; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(n&31))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + if (tag_m_8x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + for (int j = idx_m; j < idx_m + 8; j++) { + accum512_t_0 = _mm512_setzero_ps(); + accum512_t_1 = _mm512_setzero_ps(); + accum512_t_2 = _mm512_setzero_ps(); + accum512_t_3 = _mm512_setzero_ps(); + /* Processing the main chunk with 128-elements per round */ + for (long idx_n = 0; idx_n < tag_n_128x; idx_n += 128) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n + 0) + BF16_MATRIX_LOAD_1x32(matrixArray_1, a, lda, j, idx_n + 32) + BF16_MATRIX_LOAD_1x32(matrixArray_2, a, lda, j, idx_n + 64) + BF16_MATRIX_LOAD_1x32(matrixArray_3, a, lda, j, idx_n + 96) + + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n + 0) + BF16_VECTOR_LOAD_1x32(xArray_1, x, idx_n + 32) + 
BF16_VECTOR_LOAD_1x32(xArray_2, x, idx_n + 64) + BF16_VECTOR_LOAD_1x32(xArray_3, x, idx_n + 96) + + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_t_1, matrixArray_1, xArray_1) + BF16_DOT_1x32(accum512_t_2, matrixArray_2, xArray_2) + BF16_DOT_1x32(accum512_t_3, matrixArray_3, xArray_3) + } + + /* Processing the remaining <128 chunk with 32-elements per round */ + for (long idx_n = tag_n_128x; idx_n < tag_n_32x; idx_n += 32) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n) + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n) + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + } + + /* Processing the remaining <32 chunk with masked 32-elements processing */ + if ((n&31) != 0) { + BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, j, tag_n_32x, tail_mask) + BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + BF16_DOT_1x32(accum512_t_2, matrixArray_0, xArray_0) + } + + /* Accumulate the 4 registers into 1 register */ + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_1); + accum512_t_2 = _mm512_add_ps(accum512_t_2, accum512_t_3); + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_2); + + // Temply save the result into a ZMM + accum512_bridge[j-idx_m] = accum512_t_0; + } + + FP32_INTERLEAVE_8x16_ARRAY(accum512_bridge) + FP32_ACCUM2_8x16_ARRAY(accum512_bridge) + accum512_bridge[1] = _mm512_permutex2var_ps(accum512_bridge[0], idx_base_0, accum512_bridge[4]); + accum512_bridge[2] = _mm512_permutex2var_ps(accum512_bridge[0], idx_base_1, accum512_bridge[4]); + accum512_bridge[1] = _mm512_add_ps(accum512_bridge[1], accum512_bridge[2]); + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_bridge[1]), _mm512_extractf32x8_ps(accum512_bridge[1], 1)); + STORE8_COMPLETE_RESULT(accum256_0, y+idx_m) + } + } + + if (tag_m_8x != m) { + __m128 tmp128; + for (BLASLONG j = tag_m_8x; j < m; j++) { + accum512_t_0 = _mm512_setzero_ps(); + accum512_t_1 = _mm512_setzero_ps(); + accum512_t_2 = _mm512_setzero_ps(); + 
accum512_t_3 = _mm512_setzero_ps(); + /* Processing the main chunk with 128-elements per round */ + for (long idx_n = 0; idx_n < tag_n_128x; idx_n += 128) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n + 0) + BF16_MATRIX_LOAD_1x32(matrixArray_1, a, lda, j, idx_n + 32) + BF16_MATRIX_LOAD_1x32(matrixArray_2, a, lda, j, idx_n + 64) + BF16_MATRIX_LOAD_1x32(matrixArray_3, a, lda, j, idx_n + 96) + + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n + 0) + BF16_VECTOR_LOAD_1x32(xArray_1, x, idx_n + 32) + BF16_VECTOR_LOAD_1x32(xArray_2, x, idx_n + 64) + BF16_VECTOR_LOAD_1x32(xArray_3, x, idx_n + 96) + + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_t_1, matrixArray_1, xArray_1) + BF16_DOT_1x32(accum512_t_2, matrixArray_2, xArray_2) + BF16_DOT_1x32(accum512_t_3, matrixArray_3, xArray_3) + } + + /* Processing the remaining <128 chunk with 32-elements per round */ + for (long idx_n = tag_n_128x; idx_n < tag_n_32x; idx_n += 32) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n) + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n) + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + } + + /* Processing the remaining <32 chunk with masked 32-elements processing */ + if ((n&31) != 0) { + BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, j, tag_n_32x, tail_mask) + BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + BF16_DOT_1x32(accum512_t_2, matrixArray_0, xArray_0) + } + + /* Accumulate the 4 registers into 1 register */ + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_1); + accum512_t_2 = _mm512_add_ps(accum512_t_2, accum512_t_3); + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_2); + + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_t_0), _mm512_extractf32x8_ps(accum512_t_0, 1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = 
_mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[j] = alpha * accum128[0] + beta * y[j]; +#else + y[j] = alpha * accum128[0] + y[j]; +#endif +#else +#ifndef ONE_ALPHA + y[j] = accum128[0] * alpha; +#else + y[j] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 8 rows parallel processing BF16 GEMV kernel for n=32 && lda effective scenario (process before interleave) +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_8x32_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_8x32_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_8x32_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_8x32_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_n_32x = n & (~31); + + __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ + accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; + __m256 accum256_0; + __m128 accum128; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + __m512i xArray_0; + + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(n&31))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + + if (tag_m_8x > 0) { + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 
26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + accum512_4 = _mm512_setzero_ps(); + accum512_5 = _mm512_setzero_ps(); + accum512_6 = _mm512_setzero_ps(); + accum512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < tag_n_32x; idx_n+=32) { + // Load 8 rows from matrix + BF16_MATRIX_LOAD_8x32(matrixArray, a, lda, idx_m, idx_n) + + // Load x + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n) + + // Calculate the temp result for a..h[0:31] + BF16_DOT_8x32(accum512, matrixArray, xArray_0) + } + + if (tag_n_32x != n) { // Go with masked 512 + // Load 8 rows from matrix + BF16_MATRIX_MASKZ_LOAD_8x32(matrixArray, a, lda, idx_m, tag_n_32x, tail_mask) + + // Load x + BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + + // Calculate the temp result for a..h[0:31] + BF16_DOT_8x32(accum512, matrixArray, xArray_0) + } + + // 2-step interleave for FP32 regsiter array + FP32_INTERLEAVE_8x16(accum512) + + // Accumulate the 2 batch of registers into 2 register (0 and 4) + FP32_ACCUM2_8x16(accum512) + + accum512_1 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_4); + accum512_2 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_4); + accum512_1 = _mm512_add_ps(accum512_1, accum512_2); + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_1), _mm512_extractf32x8_ps(accum512_1, 1)); + STORE8_COMPLETE_RESULT(accum256_0, y+idx_m) + } + } + + if (tag_m_8x != m) { + __m128 tmp128; + for (BLASLONG i = tag_m_8x; i < m; i++) { + accum512_0 = _mm512_setzero_ps(); + for (BLASLONG idx_n = 0; idx_n < tag_n_32x; idx_n+=32) { + // Load 32 elements from matrix + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, i, idx_n) + + // Load 32 elements from x + BF16_VECTOR_LOAD_1x32(xArray_0, x, 
idx_n) + + // Calculate and accumulate the temp result + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + } + + if (tag_n_32x != n) { + // Load tail elements from matrix + BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, i, tag_n_32x, tail_mask) + + // Load 32 elements from x + BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + + // Calculate and accumulate the temp result + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + } + + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 8 rows parallel processing BF16 GEMV kernel for n<16 && lda effective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_8x16m_lda_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_8x16m_lda_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_8x16m_lda_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + + __m256i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, 
matrixArray_7; + __m256i xArray256; + + // Keep align with other kernels and macro definition, the high 256bit is never used +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(alpha)); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(beta)); +#endif + + __m256 accum256_0, accum256_1, accum256_2, accum256_3, accum256_4, accum256_5, accum256_6, accum256_7, \ + accum256_8, accum256_9, accum256_10, accum256_11, accum256_12, accum256_13, accum256_14, accum256_15; + + __m256i M256_EPI32_4 = _mm256_set1_epi32(4); + __m256i idx_base_0 = _mm256_set_epi32(11, 10, 9, 8, 3, 2, 1, 0); + __m256i idx_base_1 = _mm256_add_epi32(idx_base_0, M256_EPI32_4); + + unsigned short load_mask_value = (((unsigned short)0xffff) >> (16-n)); + __mmask16 load_mask = *((__mmask16*) &load_mask_value); + + if (n == 16) { + BF16_VECTOR_LOAD_1x16(xArray256, x, 0) + } else { + BF16_VECTOR_MASKZ_LOAD_1x16(xArray256, x, 0, load_mask) + } + + if (n == 16) { + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + accum256_2 = _mm256_setzero_ps(); + accum256_3 = _mm256_setzero_ps(); + accum256_4 = _mm256_setzero_ps(); + accum256_5 = _mm256_setzero_ps(); + accum256_6 = _mm256_setzero_ps(); + accum256_7 = _mm256_setzero_ps(); + + BF16_MATRIX_LOAD_8x16(matrixArray, a, lda, idx_m, 0) + + BF16_DOT_8x16(accum256, matrixArray, xArray256) + + // 2-step interleave for FP32 regsiter array + FP32_INTERLEAVE_8x8(accum256) + + // Accumulate the 2 batch of registers into 2 register (0 and 4) + FP32_ACCUM2_8x8(accum256) + + accum256_1 = _mm256_permutex2var_ps(accum256_0, idx_base_0, accum256_4); + accum256_2 = _mm256_permutex2var_ps(accum256_0, idx_base_1, accum256_4); + accum256_1 = _mm256_add_ps(accum256_1, accum256_2); + + STORE8_COMPLETE_RESULT(accum256_1, y+idx_m) + } + + if (tag_m_8x != m) { + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_8x; i < m; i++) { + accum256_0 = 
_mm256_setzero_ps(); + matrixArray_0 = _mm256_loadu_si256(&a[(i)*lda]); // Load 1 rows with n=16 + accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); + y[i] += accum128[0] * alpha; + } + } + } else { + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + accum256_2 = _mm256_setzero_ps(); + accum256_3 = _mm256_setzero_ps(); + accum256_4 = _mm256_setzero_ps(); + accum256_5 = _mm256_setzero_ps(); + accum256_6 = _mm256_setzero_ps(); + accum256_7 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray, a, lda, idx_m, 0, load_mask) + + BF16_DOT_8x16(accum256, matrixArray, xArray256) + + // 2-step interleave for FP32 regsiter array + FP32_INTERLEAVE_8x8(accum256) + + // Accumulate the 2 batch of registers into 2 register (0 and 4) + FP32_ACCUM2_8x8(accum256) + + accum256_1 = _mm256_permutex2var_ps(accum256_0, idx_base_0, accum256_4); + accum256_2 = _mm256_permutex2var_ps(accum256_0, idx_base_1, accum256_4); + accum256_1 = _mm256_add_ps(accum256_1, accum256_2); + + STORE8_COMPLETE_RESULT(accum256_1, y+idx_m) + } + + if (tag_m_8x != m) { + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_8x; i < m; i++) { + accum256_0 = _mm256_setzero_ps(); + matrixArray_0 = _mm256_maskz_loadu_epi16(load_mask, &a[(i)*lda]); // Load 1 rows with n=16 + accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 
0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + } + + return 0; +} From c5e62dad69ca13d48c2e9ce29a6398668e687dc9 Mon Sep 17 00:00:00 2001 From: "Chen, Guobing" Date: Thu, 29 Oct 2020 03:37:51 +0800 Subject: [PATCH 022/121] Fix cooperlake compile issue Add a missing macro which is required in Makefile.x86_64 due to recent clearnup, which causes cooperlake platform build failure. --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index 6d985786d..52d3e2cdc 100644 --- a/Makefile.system +++ b/Makefile.system @@ -319,6 +319,7 @@ ifeq ($(GCCVERSIONGTEQ7),1) else GCCDUMPVERSION_PARAM := -dumpversion endif +GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1) GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) endif From b43549188525741f311d6e5574c0fd960f964204 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 29 Oct 2020 14:57:51 -0500 Subject: [PATCH 023/121] Optimize caxpy for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. 
--- kernel/power/KERNEL.POWER10 | 6 +- kernel/power/caxpy_microk_power10.c | 188 ++++++++++++++++++++++++++++ kernel/power/caxpy_power10.c | 126 +++++++++++++++++++ 3 files changed, 315 insertions(+), 5 deletions(-) create mode 100644 kernel/power/caxpy_microk_power10.c create mode 100644 kernel/power/caxpy_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 1e514fcc9..b4c7a5e41 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -143,11 +143,7 @@ ZASUMKERNEL = zasum.c # SAXPYKERNEL = saxpy_power10.c DAXPYKERNEL = daxpy_power10.c -ifneq ($(GCCVERSIONGTEQ9),1) -CAXPYKERNEL = caxpy_power9.S -else -CAXPYKERNEL = caxpy.c -endif +CAXPYKERNEL = caxpy_power10.c ZAXPYKERNEL = zaxpy_power10.c # SCOPYKERNEL = scopy_power10.c diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c new file mode 100644 index 000000000..0d13416b3 --- /dev/null +++ b/kernel/power/caxpy_microk_power10.c @@ -0,0 +1,188 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void caxpy_kernel_8 (long n, float *x, float *y, + float alpha_r, float alpha_i) +{ +#if !defined(CONJ) + static const float mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; +#else + static const float mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; +#endif + const float *mvecp = mvec; + /* We have to load reverse mask for big endian. 
*/ + /* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */ + + __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; + long ytmp; + + __asm__ + ( + "xscvdpspn 32, %7 \n\t" + "xscvdpspn 33, %8 \n\t" + "xxspltw 32, 32, 0 \n\t" + "xxspltw 33, 33, 0 \n\t" + "lxvd2x 36, 0, %9 \n\t" // mvec + +#if !defined(CONJ) + "xvmulsp 33, 33, 36 \n\t" // alpha_i * mvec +#else + "xvmulsp 32, 32, 36 \n\t" // alpha_r * mvec +#endif + "mr %4, %3 \n\t" + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "lxvp 40, 0(%2) \n\t" // x0 + "lxvp 42, 32(%2) \n\t" // x2 + "lxvp 48, 0(%3) \n\t" // y0 + "lxvp 50, 32(%3) \n\t" // y2 + + "xxperm 52, 40, %x10 \n\t" // exchange real and imag part + "xxperm 53, 41, %x10 \n\t" // exchange real and imag part + "xxperm 54, 42, %x10 \n\t" // exchange real and imag part + "xxperm 55, 43, %x10 \n\t" // exchange real and imag part + + "lxvp 44, 64(%2) \n\t" // x4 + "lxvp 46, 96(%2) \n\t" // x6 + "lxvp 34, 64(%3) \n\t" // y4 + "lxvp 38, 96(%3) \n\t" // y6 + + "xxperm 56, 44, %x10 \n\t" // exchange real and imag part + "xxperm 57, 45, %x10 \n\t" // exchange real and imag part + "xxperm 58, 46, %x10 \n\t" // exchange real and imag part + "xxperm 59, 47, %x10 \n\t" // exchange real and imag part + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddasp 49, 41, 32 \n\t" + "lxvp 40, 0(%2) \n\t" // x0 + "xvmaddasp 50, 42, 32 \n\t" + "xvmaddasp 51, 43, 32 \n\t" + "lxvp 42, 32(%2) \n\t" // x2 + + "xvmaddasp 34, 44, 32 \n\t" + "xvmaddasp 35, 45, 32 \n\t" + "lxvp 44, 64(%2) \n\t" // x4 + "xvmaddasp 38, 46, 32 \n\t" + "xvmaddasp 39, 47, 32 \n\t" + "lxvp 46, 96(%2) \n\t" // x6 + + "xvmaddasp 48, 52, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "addi %2, %2, 128 \n\t" + "xvmaddasp 49, 53, 33 \n\t" + "xvmaddasp 50, 54, 33 \n\t" + "xvmaddasp 51, 55, 33 \n\t" + + "xvmaddasp 34, 56, 33 \n\t" + "xvmaddasp 35, 57, 33 \n\t" + "xvmaddasp 38, 58, 33 \n\t" + "xvmaddasp 39, 59, 33 \n\t" + + "stxvp 48, 0(%4) \n\t" + "stxvp 50, 32(%4) \n\t" + "stxvp 34, 64(%4) \n\t" + "stxvp 38, 96(%4) \n\t" + + "addi %4, %4, 128 \n\t" + "xxperm 52, 40, %x10 \n\t" // exchange real and imag part + "xxperm 53, 41, %x10 \n\t" // exchange real and imag part + + "lxvp 48, 0(%3) \n\t" // y0 + "xxperm 54, 42, %x10 \n\t" // exchange real and imag part + "xxperm 55, 43, %x10 \n\t" // exchange real and imag part + "lxvp 50, 32(%3) \n\t" // y2 + + "xxperm 56, 44, %x10 \n\t" // exchange real and imag part + "xxperm 57, 45, %x10 \n\t" // exchange real and imag part + "lxvp 34, 64(%3) \n\t" // y4 + "xxperm 58, 46, %x10 \n\t" // exchange real and imag part + "xxperm 59, 47, %x10 \n\t" // exchange real and imag part + "lxvp 38, 96(%3) \n\t" // y6 + + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + "xvmaddasp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddasp 49, 41, 32 \n\t" + "xvmaddasp 50, 42, 32 \n\t" + "xvmaddasp 51, 43, 32 \n\t" + + "xvmaddasp 34, 44, 32 \n\t" + "xvmaddasp 35, 45, 32 \n\t" + "xvmaddasp 38, 46, 32 \n\t" + "xvmaddasp 39, 47, 32 \n\t" + + "xvmaddasp 48, 52, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "xvmaddasp 49, 53, 33 \n\t" + "xvmaddasp 50, 54, 33 \n\t" + "xvmaddasp 51, 55, 33 \n\t" + + "xvmaddasp 34, 56, 33 \n\t" + "xvmaddasp 35, 57, 33 \n\t" + "xvmaddasp 38, 58, 33 \n\t" + "xvmaddasp 39, 59, 33 \n\t" + + "stxvp 48, 0(%4) \n\t" + "stxvp 50, 32(%4) \n\t" + "stxvp 34, 64(%4) \n\t" + "stxvp 38, 96(%4) \n\t" + + "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y), // 3 + "=b" (ytmp) // 4 + : + "m" (*x), + "m" (*mvecp), + "d" (alpha_r), // 7 + "d" (alpha_i), // 8 + "4" (mvecp), // 9 + "wa" (mask) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59" + ); +} diff --git a/kernel/power/caxpy_power10.c b/kernel/power/caxpy_power10.c new file mode 100644 index 000000000..14b8cda67 --- /dev/null +++ b/kernel/power/caxpy_power10.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "caxpy_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static void caxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) +{ + BLASLONG register i = 0; + BLASLONG register ix = 0; + + + + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; + y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; + y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; +#endif + + ix+=4 ; + i+=2 ; + + } + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + { + caxpy_kernel_8 (n1, x, y, da_r, da_i); + ix = 2 * n1; + } + i = n1; + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + i++ ; + ix += 2; + + } + return(0); + + + } + + inc_x *=2; + inc_y *=2; + + while(i < n) + { + +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + From 1f564d729b147fb79831008af820a018f500a73a Mon Sep 17 00:00:00 2001 From: Guillaume Horel Date: 
Sat, 31 Oct 2020 10:00:48 -0400 Subject: [PATCH 024/121] fix avx2 detection reword commits to make it clearer --- cpuid_x86.c | 16 ++++++++-------- driver/others/dynamic.c | 12 ++++++------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 728d459d1..84c12ff43 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -202,7 +202,7 @@ int support_avx(){ if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){ xgetbv(0, &eax, &edx); if((eax & 6) == 6){ - ret=1; //OS support AVX + ret=1; //OS supports saving xmm and ymm registers (6 = (1<<1) | (1<<2)) } } return ret; @@ -219,8 +219,8 @@ int support_avx2(){ if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & (1<<7)) != 0) - ret=1; //OS supports AVX2 + if((ebx & (1<<5)) != 0) + ret=1; //CPU supports AVX2 return ret; #else return 0; @@ -235,14 +235,14 @@ int support_avx512(){ if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & 32) != 32){ - ret=0; //OS does not even support AVX2 + if((ebx & (1<<5)) == 0){ + ret=0; //cpu does not have avx2 flag } - if((ebx & (1<<31)) != 0){ + if((ebx & (1<<31)) != 0){ //AVX512VL flag xgetbv(0, &eax, &edx); if((eax & 0xe0) == 0xe0) - ret=1; //OS supports AVX512VL - } + ret=1; //OS supports saving zmm registers + } return ret; #else return 0; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 21d2c7948..58f4d8b59 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -330,8 +330,8 @@ int support_avx2(){ if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & (1<<7)) != 0) - ret=1; //OS supports AVX2 + if((ebx & (1<<5)) != 0) + ret=1; //AVX2 flag is set return ret; #else return 0; @@ -346,13 +346,13 @@ int support_avx512(){ if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & (1<<7)) == 0){ - ret=0; //OS does not even support AVX2 + if((ebx & (1<<5)) == 0){ + ret=0; //cpu does not have avx2 flag } - if((ebx & 
(1u<<31)) != 0){ + if((ebx & (1<<31)) != 0){ //AVX512VL flag is set xgetbv(0, &eax, &edx); if((eax & 0xe0) == 0xe0) - ret=1; //OS supports AVX512VL + ret=1; //OS supports saving zmm register } return ret; #else From 9fab65e90ad35253014cd9620be0caaabf5f130b Mon Sep 17 00:00:00 2001 From: User User-User Date: Sun, 1 Nov 2020 00:38:08 +0200 Subject: [PATCH 025/121] add openbsd gfortran --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index f894aa9ac..c12b0f2ef 100644 --- a/f_check +++ b/f_check @@ -33,7 +33,7 @@ if ($compiler eq "") { "ppuf77", "ppuf95", "ppuf90", "ppuxlf", "pathf90", "pathf95", "pgf95", "pgf90", "pgf77", - "flang", + "flang", "egfortran", "ifort"); OUTER: From 7f26be4802042d7c54bd1645c54adc3e2ff72d50 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 1 Nov 2020 00:00:43 +0100 Subject: [PATCH 026/121] Reunify BUFFERSIZE across arm64 platforms to avoid segfaults in DYNAMIC_ARCH --- common_arm64.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/common_arm64.h b/common_arm64.h index 314946282..9cdded305 100644 --- a/common_arm64.h +++ b/common_arm64.h @@ -142,14 +142,8 @@ REALNAME: #define HUGE_PAGESIZE ( 4 << 20) #ifndef BUFFERSIZE -#if defined(CORTEXA57) -#define BUFFER_SIZE (20 << 20) -#elif defined(TSV110) || defined(EMAG8180) #define BUFFER_SIZE (32 << 20) #else -#define BUFFER_SIZE (16 << 20) -#endif -#else #define BUFFER_SIZE (32 << BUFFERSIZE) #endif From dd7a9cc5bf6b926a44b38d13366743691fd6e604 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Sat, 31 Oct 2020 18:28:57 -0500 Subject: [PATCH 027/121] POWER10: Change dgemm unroll factors Changing the unroll factors for dgemm to 8 shows improved performance with POWER10 MMA feature. Also made some minor changes in sgemm for edge cases. 
--- kernel/power/KERNEL.POWER10 | 14 +- kernel/power/dgemm_kernel_power10.c | 431 +++++++++++++-------------- kernel/power/dgemm_ncopy_8_power10.c | 326 ++++++++++++++++++++ kernel/power/sgemm_kernel_power10.c | 70 ++--- param.h | 4 + 5 files changed, 568 insertions(+), 277 deletions(-) create mode 100644 kernel/power/dgemm_ncopy_8_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index b4c7a5e41..28c39051f 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -34,12 +34,12 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_power10.c -DGEMMINCOPY = ../generic/gemm_ncopy_16.c -DGEMMITCOPY = dgemm_tcopy_16_power8.S -DGEMMONCOPY = dgemm_ncopy_4_power8.S -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = dgemm_ncopy_8_power10.c +DGEMMOTCOPY = ../generic/gemm_tcopy_8.c +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) @@ -69,7 +69,7 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index b2a29140e..b531799a6 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -149,7 +149,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, #endif ) { - BLASLONG N = n; BLASLONG i1; #if defined(TRMMKERNEL) BLASLONG off; @@ -158,85 +157,232 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * 
B, off = -offset; #endif v4sf_t valpha = { alpha, alpha }; - N = n >> 2; - for (i1 = 0; i1 < N; i1++) + for (i1 = 0; i1 < (n >> 3); i1++) { - BLASLONG i, j, temp; + BLASLONG j, temp; FLOAT *CO; FLOAT *AO; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif CO = C; - C += ldc << 2; + C += ldc << 3; AO = A; PREFETCH1 (A, 128); PREFETCH1 (A, 256); - i = m >> 4; - for (j = 0; j < i; j++) + for (j = 0; j < (m >> 3); j++) { - FLOAT *BO; + FLOAT *BO; #if defined(TRMMKERNEL) - REFRESH_POINTERS (16, 4); + REFRESH_POINTERS (8, 8); #else BO = B; temp = k; #endif v4sf_t *rowC; v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7; BLASLONG l = 0; - PREFETCH1 (CO, 0); - PREFETCH1 (CO + ldc, 0); - PREFETCH1 (CO + ldc + ldc, 0); - PREFETCH1 (CO + ldc + ldc + ldc, 0); - PREFETCH1 (CO, 128); - PREFETCH1 (CO + ldc, 128); - PREFETCH1 (CO + ldc + ldc, 128); - PREFETCH1 (CO + ldc + ldc + ldc, 128); - __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; vec_t *rowA = (vec_t *) & AO[0]; - __vector_pair rowB; vec_t *rb = (vec_t *) & BO[0]; + __vector_pair rowB, rowB1; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); - __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); - __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); - __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); - __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); + __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); + __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]); + __builtin_mma_xvf64ger (&acc4, rowB, rowA[2]); + __builtin_mma_xvf64ger (&acc5, rowB1, rowA[2]); + __builtin_mma_xvf64ger (&acc6, rowB, rowA[3]); + __builtin_mma_xvf64ger (&acc7, rowB1, rowA[3]); for (l = 1; l < temp; l++) { - rowA = (vec_t *) & 
AO[l << 4]; - rb = (vec_t *) & BO[l << 2]; + rowA = (vec_t *) & AO[l << 3]; + rb = (vec_t *) & BO[l << 3]; __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); - __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); - __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); - __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); - __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); + __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); + __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[2]); + __builtin_mma_xvf64gerpp (&acc5, rowB1, rowA[2]); + __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[3]); + __builtin_mma_xvf64gerpp (&acc7, rowB1, rowA[3]); } SAVE_ACC (&acc0, 0); - SAVE_ACC (&acc2, 4); - SAVE_ACC (&acc1, 2); - SAVE_ACC (&acc3, 6); - SAVE_ACC (&acc4, 8); - SAVE_ACC (&acc6, 12); - SAVE_ACC (&acc5, 10); - SAVE_ACC (&acc7, 14); - AO += temp << 4; - BO += temp << 2; + SAVE_ACC1 (&acc1, 0); + SAVE_ACC (&acc2, 2); + SAVE_ACC1 (&acc3, 2); + SAVE_ACC (&acc4, 4); + SAVE_ACC1 (&acc5, 4); + SAVE_ACC (&acc6, 6); + SAVE_ACC1 (&acc7, 6); + CO += 8; + AO += temp << 3; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (8, 8) +#endif + } + if (m & 4) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (4, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1, acc2, acc3; + BLASLONG l = 0; + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB, rowB1; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); 
+ __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64ger (&acc3, rowB1, rowA[1]); + for (l = 1; l < temp; l++) + { + rowA = (vec_t *) & AO[l << 2]; + rb = (vec_t *) & BO[l << 3]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); + __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); + __builtin_mma_xvf64gerpp (&acc3, rowB1, rowA[1]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + SAVE_ACC (&acc2, 2); + SAVE_ACC1 (&acc3, 2); + CO += 4; + AO += temp << 2; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (4, 8) +#endif + } + if (m & 2) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (2, 8); +#else + BO = B; + temp = k; +#endif + v4sf_t *rowC; + v4sf_t result[4]; + __vector_quad acc0, acc1; + BLASLONG l = 0; + vec_t *rowA = (vec_t *) & AO[0]; + __vector_pair rowB, rowB1; + vec_t *rb = (vec_t *) & BO[0]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); + for (l = 1; l < temp; l++) + { + rowA = (vec_t *) & AO[l << 1]; + rb = (vec_t *) & BO[l << 3]; + __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); + __builtin_mma_assemble_pair (&rowB1, rb[3], rb[2]); + __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); + __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); + } + SAVE_ACC (&acc0, 0); + SAVE_ACC1 (&acc1, 0); + CO += 2; + AO += temp << 1; + BO += temp << 3; #if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (16, 4) + REFRESH_AFTER_SAVE (2, 8) #endif - CO += 16; } - i = (m & 15) >> 3; - for (j = 0; j < i; j++) + if (m & 1) + { + FLOAT *BO; +#if defined(TRMMKERNEL) + REFRESH_POINTERS (1, 8); +#else + BO = B; + temp = k; +#endif + BLASLONG l = 0; + v4sf_t t = { 0, 0 }; + v4sf_t t1 = { 0, 0 }; + v4sf_t t2 = { 0, 0 }; + v4sf_t 
t3 = { 0, 0 }; + for (l = 0; l < temp; l++) + { + v4sf_t rowA = { AO[l], AO[l] }; + v4sf_t rowB = { BO[l << 3], BO[(l << 3) + 1] }; + v4sf_t rowB1 = { BO[(l << 3) + 2], BO[(l << 3) + 3] }; + v4sf_t rowB2 = { BO[(l << 3) + 4], BO[(l << 3) + 5] }; + v4sf_t rowB3 = { BO[(l << 3) + 6], BO[(l << 3) + 7] }; + t += rowA * rowB; + t1 += rowA * rowB1; + t2 += rowA * rowB2; + t3 += rowA * rowB3; + } + t = t * valpha; + t1 = t1 * valpha; + t2 = t2 * valpha; + t3 = t3 * valpha; +#if defined(TRMMKERNEL) + CO[0 * ldc] = t[0]; + CO[1 * ldc] = t[1]; + CO[2 * ldc] = t1[0]; + CO[3 * ldc] = t1[1]; + CO[4 * ldc] = t2[0]; + CO[5 * ldc] = t2[1]; + CO[6 * ldc] = t3[0]; + CO[7 * ldc] = t3[1]; +#else + CO[0 * ldc] += t[0]; + CO[1 * ldc] += t[1]; + CO[2 * ldc] += t1[0]; + CO[3 * ldc] += t1[1]; + CO[4 * ldc] += t2[0]; + CO[5 * ldc] += t2[1]; + CO[6 * ldc] += t3[0]; + CO[7 * ldc] += t3[1]; +#endif + CO += 1; + AO += temp; + BO += temp << 3; +#if defined(TRMMKERNEL) + REFRESH_AFTER_SAVE (1, 8) +#endif + } +#if defined(TRMMKERNEL) && !defined(LEFT) + off += 8; // number of values in A +#endif + B += k << 3; + } + if (n & 4) + { + BLASLONG j, temp; + FLOAT *CO; + FLOAT *AO; +#if defined(TRMMKERNEL) && defined(LEFT) + off = offset; +#endif + CO = C; + C += ldc << 2; + AO = A; + PREFETCH1 (A, 128); + PREFETCH1 (A, 256); + for (j = 0; j < (m >> 3); j++) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -278,8 +424,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 4) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -315,8 +460,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 4) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -349,8 +493,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 4) #endif } - i = (m & 1) >> 0; - for (j 
= 0; j < i; j++) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -395,10 +538,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, #endif B += k << 2; } - N = (n & 3) >> 1; - for (i1 = 0; i1 < N; i1++) + if (n & 2) { - BLASLONG i, j, temp; + BLASLONG j, temp; #if defined(TRMMKERNEL) && defined(LEFT) off = offset; #endif @@ -407,66 +549,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO = C; C += ldc << 1; AO = A; - i = m >> 4; - for (j = 0; j < i; j++) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (16, 2); -#else - BO = B; - temp = k; -#endif - v4sf_t *rowC; - v4sf_t result[4]; - __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - BLASLONG l = 0; - FLOAT t[4] = { 0, 0, 0, 0 }; - t[0] = BO[0], t[1] = BO[1]; - __vector_pair rowB; - vec_t *rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - vec_t *rowA = (vec_t *) & AO[0]; - __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64ger (&acc3, rowB, rowA[3]); - __builtin_mma_xvf64ger (&acc4, rowB, rowA[4]); - __builtin_mma_xvf64ger (&acc5, rowB, rowA[5]); - __builtin_mma_xvf64ger (&acc6, rowB, rowA[6]); - __builtin_mma_xvf64ger (&acc7, rowB, rowA[7]); - for (l = 1; l < temp; l++) - { - t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; - rb = (vec_t *) & t[0]; - __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); - rowA = (vec_t *) & AO[l << 4]; - __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); - __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); - __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); - __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); - __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); - __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); - __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); - __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); - } - SAVE2x4_ACC (&acc0, 0); - SAVE2x4_ACC (&acc1, 
2); - SAVE2x4_ACC (&acc2, 4); - SAVE2x4_ACC (&acc3, 6); - SAVE2x4_ACC (&acc4, 8); - SAVE2x4_ACC (&acc5, 10); - SAVE2x4_ACC (&acc6, 12); - SAVE2x4_ACC (&acc7, 14); - CO += 16; - AO += temp << 4; - BO += temp << 1; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (16, 2) -#endif - } - i = (m & 15) >> 3; - for (j = 0; j < i; j++) + for (j = 0; j < (m >> 3); j++) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -511,8 +594,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 2) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -551,8 +633,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 2) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -588,8 +669,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 2) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -626,8 +706,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, #endif B += k << 1; } - N = (n & 1) >> 0; - for (i1 = 0; i1 < N; i1++) + if (n & 1) { BLASLONG i, temp; #if defined(TRMMKERNEL) && defined(LEFT) @@ -638,97 +717,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO = C; C += ldc; AO = A; - i = m; - while (i >= 16) - { - FLOAT *BO; -#if defined(TRMMKERNEL) - REFRESH_POINTERS (16, 1) -#else - BO = B; - temp = k; -#endif - BLASLONG l = 0; - v4sf_t t = { 0, 0 }; - v4sf_t t1 = { 0, 0 }; - v4sf_t t2 = { 0, 0 }; - v4sf_t t3 = { 0, 0 }; - v4sf_t t4 = { 0, 0 }; - v4sf_t t5 = { 0, 0 }; - v4sf_t t6 = { 0, 0 }; - v4sf_t t7 = { 0, 0 }; - for (l = 0; l < temp; l++) - { - v4sf_t rowB = { BO[l], BO[l] }; - v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] }; - v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] }; - v4sf_t rowA2 = { AO[(l << 4) + 
4], AO[(l << 4) + 5] }; - v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] }; - v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] }; - v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] }; - v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] }; - v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] }; - t += rowA * rowB; - t1 += rowA1 * rowB; - t2 += rowA2 * rowB; - t3 += rowA3 * rowB; - t4 += rowA4 * rowB; - t5 += rowA5 * rowB; - t6 += rowA6 * rowB; - t7 += rowA7 * rowB; - } - t = t * valpha; - t1 = t1 * valpha; - t2 = t2 * valpha; - t3 = t3 * valpha; - t4 = t4 * valpha; - t5 = t5 * valpha; - t6 = t6 * valpha; - t7 = t7 * valpha; -#if defined(TRMMKERNEL) - CO[0] = t[0]; - CO[1] = t[1]; - CO[2] = t1[0]; - CO[3] = t1[1]; - CO[4] = t2[0]; - CO[5] = t2[1]; - CO[6] = t3[0]; - CO[7] = t3[1]; - CO[8] = t4[0]; - CO[9] = t4[1]; - CO[10] = t5[0]; - CO[11] = t5[1]; - CO[12] = t6[0]; - CO[13] = t6[1]; - CO[14] = t7[0]; - CO[15] = t7[1]; -#else - CO[0] += t[0]; - CO[1] += t[1]; - CO[2] += t1[0]; - CO[3] += t1[1]; - CO[4] += t2[0]; - CO[5] += t2[1]; - CO[6] += t3[0]; - CO[7] += t3[1]; - CO[8] += t4[0]; - CO[9] += t4[1]; - CO[10] += t5[0]; - CO[11] += t5[1]; - CO[12] += t6[0]; - CO[13] += t6[1]; - CO[14] += t7[0]; - CO[15] += t7[1]; -#endif - AO += temp << 4; - BO += temp; - CO += 16; - i -= 16; -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE (16, 1) -#endif - } - while (i >= 8) + for (i = 0; i < (m >> 3); i++) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -780,12 +769,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 3; BO += temp; CO += 8; - i -= 8; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (8, 1) #endif } - while (i >= 4) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -821,12 +809,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 2; BO += temp; CO += 4; - i -= 4; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (4, 1) #endif } - while (i >= 2) + if (m & 2) { FLOAT *BO; #if 
defined(TRMMKERNEL) @@ -854,12 +841,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 1; BO += temp; CO += 2; - i -= 2; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (2, 1) #endif } - while (i >= 1) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -882,7 +868,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO[0] += t * alpha; #endif CO += 1; - i -= 1; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (1, 1) #endif diff --git a/kernel/power/dgemm_ncopy_8_power10.c b/kernel/power/dgemm_ncopy_8_power10.c new file mode 100644 index 000000000..9836c2e7f --- /dev/null +++ b/kernel/power/dgemm_ncopy_8_power10.c @@ -0,0 +1,326 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include +#include "common.h" +#include +#define PREFETCHA(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + BLASLONG i, j; + + IFLOAT *aoffset; + IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; + IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; + + IFLOAT *boffset; + IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; + IFLOAT ctemp09, ctemp17, ctemp33; + IFLOAT ctemp25, ctemp41; + IFLOAT ctemp49, ctemp57; + + aoffset = a; + boffset = b; + + j = (n >> 3); + if (j > 0){ + do{ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset5 = aoffset4 + lda; + aoffset6 = aoffset5 + lda; + aoffset7 = aoffset6 + lda; + aoffset8 = aoffset7 + lda; + aoffset += 8 * lda; + + i = (m >> 3); + if (i > 0){ + do{ + PREFETCHA (aoffset1, 384); + PREFETCHA (aoffset2, 384); + PREFETCHA (aoffset3, 384); + PREFETCHA (aoffset4, 384); + PREFETCHA (aoffset5, 384); + PREFETCHA (aoffset6, 384); + PREFETCHA (aoffset7, 384); + 
PREFETCHA (aoffset8, 384); + __vector double va0 = *(__vector double*)(aoffset1 + 0); + __vector double va1 = *(__vector double*)(aoffset1 + 2); + __vector double va2 = *(__vector double*)(aoffset1 + 4); + __vector double va3 = *(__vector double*)(aoffset1 + 6); + + __vector double va4 = *(__vector double*)(aoffset2 + 0); + __vector double va5 = *(__vector double*)(aoffset2 + 2); + __vector double va6 = *(__vector double*)(aoffset2 + 4); + __vector double va7 = *(__vector double*)(aoffset2 + 6); + + __vector double va8 = *(__vector double*)(aoffset3 + 0); + __vector double va9 = *(__vector double*)(aoffset3 + 2); + __vector double va10 = *(__vector double*)(aoffset3 + 4); + __vector double va11 = *(__vector double*)(aoffset3 + 6); + + __vector double va12 = *(__vector double*)(aoffset4 + 0); + __vector double va13 = *(__vector double*)(aoffset4 + 2); + __vector double va14 = *(__vector double*)(aoffset4 + 4); + __vector double va15 = *(__vector double*)(aoffset4 + 6); + + __vector double va16 = *(__vector double*)(aoffset5 + 0); + __vector double va17 = *(__vector double*)(aoffset5 + 2); + __vector double va18 = *(__vector double*)(aoffset5 + 4); + __vector double va19 = *(__vector double*)(aoffset5 + 6); + + __vector double va20 = *(__vector double*)(aoffset6 + 0); + __vector double va21 = *(__vector double*)(aoffset6 + 2); + __vector double va22 = *(__vector double*)(aoffset6 + 4); + __vector double va23 = *(__vector double*)(aoffset6 + 6); + + __vector double va24 = *(__vector double*)(aoffset7 + 0); + __vector double va25 = *(__vector double*)(aoffset7 + 2); + __vector double va26 = *(__vector double*)(aoffset7 + 4); + __vector double va27 = *(__vector double*)(aoffset7 + 6); + + __vector double va28 = *(__vector double*)(aoffset8 + 0); + __vector double va29 = *(__vector double*)(aoffset8 + 2); + __vector double va30 = *(__vector double*)(aoffset8 + 4); + __vector double va31 = *(__vector double*)(aoffset8 + 6); + + *(__vector double*)(boffset + 0) = 
vec_xxpermdi(va0, va4, 0); + *(__vector double*)(boffset + 2) = vec_xxpermdi(va8, va12, 0); + *(__vector double*)(boffset + 4) = vec_xxpermdi(va16, va20, 0); + *(__vector double*)(boffset + 6) = vec_xxpermdi(va24, va28, 0); + *(__vector double*)(boffset + 8) = vec_xxpermdi(va0, va4, 3); + *(__vector double*)(boffset + 10) = vec_xxpermdi(va8, va12, 3); + *(__vector double*)(boffset + 12) = vec_xxpermdi(va16, va20, 3); + *(__vector double*)(boffset + 14) = vec_xxpermdi(va24, va28, 3); + + *(__vector double*)(boffset + 16) = vec_xxpermdi(va1, va5, 0); + *(__vector double*)(boffset + 18) = vec_xxpermdi(va9, va13, 0); + *(__vector double*)(boffset + 20) = vec_xxpermdi(va17, va21, 0); + *(__vector double*)(boffset + 22) = vec_xxpermdi(va25, va29, 0); + *(__vector double*)(boffset + 24) = vec_xxpermdi(va1, va5, 3); + *(__vector double*)(boffset + 26) = vec_xxpermdi(va9, va13, 3); + *(__vector double*)(boffset + 28) = vec_xxpermdi(va17, va21, 3); + *(__vector double*)(boffset + 30) = vec_xxpermdi(va25, va29, 3); + + *(__vector double*)(boffset + 32) = vec_xxpermdi(va2, va6, 0); + *(__vector double*)(boffset + 34) = vec_xxpermdi(va10, va14, 0); + *(__vector double*)(boffset + 36) = vec_xxpermdi(va18, va22, 0); + *(__vector double*)(boffset + 38) = vec_xxpermdi(va26, va30, 0); + *(__vector double*)(boffset + 40) = vec_xxpermdi(va2, va6, 3); + *(__vector double*)(boffset + 42) = vec_xxpermdi(va10, va14, 3); + *(__vector double*)(boffset + 44) = vec_xxpermdi(va18, va22, 3); + *(__vector double*)(boffset + 46) = vec_xxpermdi(va26, va30, 3); + + *(__vector double*)(boffset + 48) = vec_xxpermdi(va3, va7, 0); + *(__vector double*)(boffset + 50) = vec_xxpermdi(va11, va15, 0); + *(__vector double*)(boffset + 52) = vec_xxpermdi(va19, va23, 0); + *(__vector double*)(boffset + 54) = vec_xxpermdi(va27, va31, 0); + *(__vector double*)(boffset + 56) = vec_xxpermdi(va3, va7, 3); + *(__vector double*)(boffset + 58) = vec_xxpermdi(va11, va15, 3); + *(__vector double*)(boffset + 60) = 
vec_xxpermdi(va19, va23, 3); + *(__vector double*)(boffset + 62) = vec_xxpermdi(va27, va31, 3); + aoffset1 += 8; + aoffset2 += 8; + aoffset3 += 8; + aoffset4 += 8; + aoffset5 += 8; + aoffset6 += 8; + aoffset7 += 8; + aoffset8 += 8; + boffset += 64; + i --; + }while(i > 0); + } + + i = (m & 7); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp09 = *(aoffset2 + 0); + ctemp17 = *(aoffset3 + 0); + ctemp25 = *(aoffset4 + 0); + ctemp33 = *(aoffset5 + 0); + ctemp41 = *(aoffset6 + 0); + ctemp49 = *(aoffset7 + 0); + ctemp57 = *(aoffset8 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp09; + *(boffset + 2) = ctemp17; + *(boffset + 3) = ctemp25; + *(boffset + 4) = ctemp33; + *(boffset + 5) = ctemp41; + *(boffset + 6) = ctemp49; + *(boffset + 7) = ctemp57; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + aoffset5 ++; + aoffset6 ++; + aoffset7 ++; + aoffset8 ++; + + boffset += 8; + i --; + }while(i > 0); + } + j--; + }while(j > 0); + } /* end of if(j > 0) */ + + if (n & 4){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset3 = aoffset2 + lda; + aoffset4 = aoffset3 + lda; + aoffset += 4 * lda; + + i = (m >> 2); + if (i > 0){ + do{ + PREFETCHA (aoffset1, 384); + PREFETCHA (aoffset2, 384); + PREFETCHA (aoffset3, 384); + PREFETCHA (aoffset4, 384); + __vector double va0 = *(__vector double*)(aoffset1 + 0); + __vector double va1 = *(__vector double*)(aoffset1 + 2); + __vector double va2 = *(__vector double*)(aoffset2 + 0); + __vector double va3 = *(__vector double*)(aoffset2 + 2); + __vector double va4 = *(__vector double*)(aoffset3 + 0); + __vector double va5 = *(__vector double*)(aoffset3 + 2); + __vector double va6 = *(__vector double*)(aoffset4 + 0); + __vector double va7 = *(__vector double*)(aoffset4 + 2); + *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va2, 0); + *(__vector double*)(boffset + 2) = vec_xxpermdi(va4, va6, 0); + *(__vector double*)(boffset + 4) = vec_xxpermdi(va0, va2, 3); + *(__vector double*)(boffset + 6) = 
vec_xxpermdi(va4, va6, 3); + *(__vector double*)(boffset + 8) = vec_xxpermdi(va1, va3, 0); + *(__vector double*)(boffset + 10) = vec_xxpermdi(va5, va7, 0); + *(__vector double*)(boffset + 12) = vec_xxpermdi(va1, va3, 3); + *(__vector double*)(boffset + 14) = vec_xxpermdi(va5, va7, 3); + + aoffset1 += 4; + aoffset2 += 4; + aoffset3 += 4; + aoffset4 += 4; + boffset += 16; + i --; + }while(i > 0); + } + + i = (m & 3); + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + ctemp03 = *(aoffset3 + 0); + ctemp04 = *(aoffset4 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + *(boffset + 2) = ctemp03; + *(boffset + 3) = ctemp04; + + aoffset1 ++; + aoffset2 ++; + aoffset3 ++; + aoffset4 ++; + + boffset += 4; + i --; + }while(i > 0); + } + } /* end of if(j > 0) */ + + if (n & 2){ + aoffset1 = aoffset; + aoffset2 = aoffset1 + lda; + aoffset += 2 * lda; + + i = (m >> 1); + if (i > 0){ + do{ + __vector double va0 = *(__vector double*)(aoffset1 + 0); + __vector double va1 = *(__vector double*)(aoffset2 + 0); + *(__vector double*)(boffset + 0) = vec_xxpermdi(va0, va1, 0); + *(__vector double*)(boffset + 2) = vec_xxpermdi(va0, va1, 3); + + aoffset1 += 2; + aoffset2 += 2; + boffset += 4; + i --; + }while(i > 0); + } + + if (m & 1){ + ctemp01 = *(aoffset1 + 0); + ctemp02 = *(aoffset2 + 0); + + *(boffset + 0) = ctemp01; + *(boffset + 1) = ctemp02; + + aoffset1 ++; + aoffset2 ++; + boffset += 2; + } + } /* end of if(j > 0) */ + + if (n & 1){ + aoffset1 = aoffset; + + i = m; + if (i > 0){ + do{ + ctemp01 = *(aoffset1 + 0); + + *(boffset + 0) = ctemp01; + + aoffset1 ++; + boffset ++; + i --; + }while(i > 0); + } + + } /* end of if(j > 0) */ + + return 0; +} diff --git a/kernel/power/sgemm_kernel_power10.c b/kernel/power/sgemm_kernel_power10.c index 9fbf84695..80f495f70 100644 --- a/kernel/power/sgemm_kernel_power10.c +++ b/kernel/power/sgemm_kernel_power10.c @@ -197,7 +197,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, 
FLOAT * B, #endif ) { - BLASLONG N = n; BLASLONG i1; #if defined(TRMMKERNEL) BLASLONG off; @@ -207,10 +206,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, #endif v4sf_t valpha = { alpha, alpha, alpha, alpha }; - N = n >> 3; - for (i1 = 0; i1 < N; i1++) + for (i1 = 0; i1 < (n >> 3); i1++) { - BLASLONG i, j, temp; + BLASLONG j, temp; FLOAT *CO; FLOAT *AO; #if defined(TRMMKERNEL) && defined(LEFT) @@ -221,8 +219,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO = A; PREFETCH1 (A, 128); PREFETCH1 (A, 256); - i = m >> 4; - for (j = 0; j < i; j++) + for (j = 0; j < (m >> 4); j++) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -438,8 +435,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, #endif CO += 16; } - i = (m & 15) >> 3; - for (j = 0; j < i; j++) + if (m & 8) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -478,8 +474,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 8) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -512,8 +507,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 8) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -550,8 +544,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 8) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -610,8 +603,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, B += k << 3; } - N = (n & 7) >> 2; - for (i1 = 0; i1 < N; i1++) + if (n & 4) { BLASLONG i, j, temp; #if defined(TRMMKERNEL) && defined(LEFT) @@ -719,8 +711,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (16, 4) #endif } - i = (m & 15) >> 3; - for (j = 0; j 
< i; j++) + if (m & 8) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -753,8 +744,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 4) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -784,8 +774,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 4) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -818,8 +807,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 4) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -863,8 +851,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, B += k << 2; } - N = (n & 3) >> 1; - for (i1 = 0; i1 < N; i1++) + if (n & 2) { BLASLONG i, j, temp; #if defined(TRMMKERNEL) && defined(LEFT) @@ -973,8 +960,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (16, 2) #endif } - i = (m & 15) >> 3; - for (j = 0; j < i; j++) + if (m & 8) { FLOAT *BO; v4sf_t *rowC; @@ -1010,8 +996,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (8, 2) #endif } - i = (m & 7) >> 2; - for (j = 0; j < i; j++) + if (m & 4) { FLOAT *BO; v4sf_t *rowC; @@ -1044,8 +1029,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (4, 2) #endif } - i = (m & 3) >> 1; - for (j = 0; j < i; j++) + if (m & 2) { FLOAT *BO; BLASLONG l = 0; @@ -1081,8 +1065,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, REFRESH_AFTER_SAVE (2, 2) #endif } - i = (m & 1) >> 0; - for (j = 0; j < i; j++) + if (m & 1) { FLOAT *BO; BLASLONG l = 0; @@ -1120,8 +1103,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, B += k << 1; } - N = (n & 
1) >> 0; - for (i1 = 0; i1 < N; i1++) + if (n & 1) { BLASLONG i, temp; #if defined(TRMMKERNEL) && defined(LEFT) @@ -1132,8 +1114,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO = C; C += ldc; AO = A; - i = m; - while (i >= 16) + for (i = 0; i < (m >> 4); i++) { FLOAT *BO; BLASLONG l = 0; @@ -1213,12 +1194,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 4; BO += temp; CO += 16; - i -= 16; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (16, 1) #endif } - while (i >= 8) + if (m & 8) { FLOAT *BO; BLASLONG l = 0; @@ -1268,12 +1248,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 3; BO += temp; CO += 8; - i -= 8; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (8, 1) #endif } - while (i >= 4) + if (m & 4) { FLOAT *BO; BLASLONG l = 0; @@ -1308,12 +1287,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 2; BO += temp; CO += 4; - i -= 4; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (4, 1) #endif } - while (i >= 2) + if (m & 2) { FLOAT *BO; BLASLONG l = 0; @@ -1342,12 +1320,11 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, AO += temp << 1; BO += temp; CO += 2; - i -= 2; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (2, 1) #endif } - while (i >= 1) + if (m & 1) { FLOAT *BO; #if defined(TRMMKERNEL) @@ -1371,7 +1348,6 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, CO[0] += t * alpha; #endif CO += 1; - i -= 1; #if defined(TRMMKERNEL) REFRESH_AFTER_SAVE (1, 1) #endif diff --git a/param.h b/param.h index f3ddde6a1..2047e4776 100644 --- a/param.h +++ b/param.h @@ -2436,6 +2436,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SBGEMM_DEFAULT_P 832 #define SBGEMM_DEFAULT_Q 1026 #define SBGEMM_DEFAULT_R 4096 +#undef DGEMM_DEFAULT_UNROLL_M +#undef DGEMM_DEFAULT_UNROLL_N +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 8 #endif #if defined(SPARC) && defined(V7) From 40a93c232b6a9a09fb0cf10a8de5ba6ca94070a8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 1 Nov 2020 21:58:26 +0100 Subject: [PATCH 028/121] Disable EXPRECISION for DYNAMIC_ARCH in combination with TARGET=GENERIC NO_EXPRECISION is disabled for the GENERIC_TARGET already, so prevent mixing with code parts that use a different float size by default --- Makefile.system | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile.system b/Makefile.system index 52d3e2cdc..b62eab379 100644 --- a/Makefile.system +++ b/Makefile.system @@ -93,6 +93,11 @@ endif ifdef TARGET GETARCH_FLAGS := -DFORCE_$(TARGET) GETARCH_FLAGS += -DUSER_TARGET +ifeq ($(TARGET), GENERIC) +ifeq ($(DYNAMIC_ARCH), 1) +override NO_EXPRECISION=1 +endif +endif endif # Force fallbacks for 32bit From 6baf8af6588725ee720bcfad12e235a61df5deb2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 1 Nov 2020 22:11:48 +0100 Subject: [PATCH 029/121] Disable EXPRECISION for the combination of DYNAMIC_CORE and GENERIC target --- cmake/os.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/os.cmake b/cmake/os.cmake index c644bc3f7..98428c624 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -84,6 +84,10 @@ if (X86) set(NO_EXPRECISION 1) endif () +if ((DYNAMIC_ARCH) AND (${TARGET} STREQUAL "GENERIC)) + set(NO_EXPRECISION 1) +endif () + if (UTEST_CHECK) set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK") set(SANITY_CHECK 1) From e5f8c2bf8ae438ec6b626f9fe6711101ad004d3d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 1 Nov 2020 22:25:43 +0100 Subject: [PATCH 030/121] typo fix --- cmake/os.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/os.cmake b/cmake/os.cmake index 98428c624..1eb2b7472 
100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -84,7 +84,7 @@ if (X86) set(NO_EXPRECISION 1) endif () -if ((DYNAMIC_ARCH) AND (${TARGET} STREQUAL "GENERIC)) +if ((DYNAMIC_ARCH) AND (${TARGET} STREQUAL "GENERIC")) set(NO_EXPRECISION 1) endif () From 60997ddd73b00dcdd86086e166483fcc70aa2a3d Mon Sep 17 00:00:00 2001 From: Aisha Tammy Date: Mon, 2 Nov 2020 13:04:53 +0000 Subject: [PATCH 031/121] allow setting soname without suffix or prefix Allows to create a library with a different SONAME without the need to add suffixes to symbols Backwards compatible and should have no effect on the workflow and previous users. Useful for allowing INTERFACE64 library alongside the standard library without file conflicts --- Makefile.install | 16 ++++++++-------- Makefile.system | 8 ++++++-- 2 files changed, 14 insertions(+), 10 deletions(-) diff --git a/Makefile.install b/Makefile.install index 7c1a3ca43..e8b64465f 100644 --- a/Makefile.install +++ b/Makefile.install @@ -9,7 +9,7 @@ OPENBLAS_INCLUDE_DIR := $(PREFIX)/include OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib OPENBLAS_BINARY_DIR := $(PREFIX)/bin OPENBLAS_BUILD_DIR := $(CURDIR) -OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas +OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE) OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig @@ -150,13 +150,13 @@ endif endif #Generating openblas.pc - @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" - @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 
'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo Generating $(LIBSONAMEBASE).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" + @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" #Generating OpenBLASConfig.cmake diff --git a/Makefile.system b/Makefile.system index 52d3e2cdc..afbdb6bab 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1263,10 +1263,14 @@ ifndef SYMBOLSUFFIX SYMBOLSUFFIX = endif +ifndef LIBSONAMEBASE +LIBSONAMEBASE = openblas +endif + ifndef LIBNAMESUFFIX -LIBNAMEBASE = $(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX) +LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX) else -LIBNAMEBASE = $(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) +LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) endif ifeq ($(OSNAME), CYGWIN_NT) From b9bc76aec4c869fed0b5cfbbe11336206a6ff5ec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Nov 2020 22:43:50 +0100 
Subject: [PATCH 032/121] Add files via upload --- cmake/os.cmake | 4 +++- cmake/prebuild.cmake | 30 ++++++++++++++++++++++++++++++ cmake/system.cmake | 31 +++++++++++++++++++++++++++++-- 3 files changed, 62 insertions(+), 3 deletions(-) diff --git a/cmake/os.cmake b/cmake/os.cmake index 1eb2b7472..feb4c05d1 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -84,9 +84,11 @@ if (X86) set(NO_EXPRECISION 1) endif () -if ((DYNAMIC_ARCH) AND (${TARGET} STREQUAL "GENERIC")) +if (DYNAMIC_ARCH) +if (${TARGET} STREQUAL "GENERIC") set(NO_EXPRECISION 1) endif () +endif () if (UTEST_CHECK) set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK") diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 3e38abbf5..b1b4c501a 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -139,6 +139,36 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM3M_UNROLL_N 4) set(ZGEMM3M_UNROLL_M 4) set(ZGEMM3M_UNROLL_N 4) + elseif ("${TCORE}" STREQUAL "BARCELONA") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_SSE3\n") + elseif ("${TCORE}" STREQUAL "STEAMROLLER") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_SSE3\n") + elseif ("${TCORE}" STREQUAL "EXCAVATOR") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_SSE3\n") + elseif ("${TCORE}" STREQUAL "NEHALEM") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_SSE3\n") + elseif ("${TCORE}" STREQUAL "PRESCOTT") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_SSE3\n") + elseif ("${TCORE}" STREQUAL "SANDYBRIDGE") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_AVX\n") + elseif ("${TCORE}" STREQUAL "HASWELL") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_AVX2\n") + elseif ("${TCORE}" STREQUAL "ZEN") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_AVX2\n") + elseif ("${TCORE}" STREQUAL "SKYLAKEX") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_AVX512\n") + elseif ("${TCORE}" STREQUAL "COOPERLAKE") + file(APPEND ${TARGET_CONF_TEMP} + "#define HAVE_AVX512\n") elseif ("${TCORE}" STREQUAL 
"ARMV7") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t65536\n" diff --git a/cmake/system.cmake b/cmake/system.cmake index 4cc46236d..83b79bab2 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -64,12 +64,39 @@ if (DEFINED TARGET) if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") endif() elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") endif() endif() + if (${TARGET} STREQUAL "ZEN" AND NOT NO_AVX2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") + endif() + if (${TARGET} STREQUAL "SANDYBRIDGE" AND NOT NO_AVX) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx") + endif() + if (${TARGET} STREQUAL "BARCELONA") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() + if (${TARGET} STREQUAL "STEAMROLLER") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() + if (${TARGET} STREQUAL "EXCAVATOR") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() + if (${TARGET} STREQUAL "PILEDRIVER") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() + if (${TARGET} STREQUAL "PRESCOTT") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() + if (${TARGET} STREQUAL "NEHALEM") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() + if (${TARGET} STREQUAL "CORE2") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() if (DEFINED HAVE_SSE) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") endif() From a9f9354296d448ffc087fc618d4fc9c39b56f72c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 2 Nov 2020 23:17:46 +0100 Subject: [PATCH 
033/121] Fix target test --- cmake/os.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/os.cmake b/cmake/os.cmake index feb4c05d1..e24059dd5 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -85,10 +85,12 @@ if (X86) endif () if (DYNAMIC_ARCH) +if (TARGET) if (${TARGET} STREQUAL "GENERIC") set(NO_EXPRECISION 1) endif () endif () +endif () if (UTEST_CHECK) set(CCOMMON_OPT "${CCOMMON_OPT} -DUTEST_CHECK") From 0155cd53a3c29e8a57cdef504a4a685bc7ea098a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Nov 2020 23:45:49 +0100 Subject: [PATCH 034/121] Add -msse3 where needed for DYNAMIC_ARCH builds --- cmake/system.cmake | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 83b79bab2..48d206b12 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -67,34 +67,31 @@ if (DEFINED TARGET) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") endif() elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse -msse3 -mavx2") endif() endif() + if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") + endif() if (${TARGET} STREQUAL "ZEN" AND NOT NO_AVX2) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") endif() if (${TARGET} STREQUAL "SANDYBRIDGE" AND NOT NO_AVX) set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx") endif() - if (${TARGET} STREQUAL "BARCELONA") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (${TARGET} STREQUAL "STEAMROLLER") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (${TARGET} STREQUAL "EXCAVATOR") + if (${TARGET} STREQUAL "BARCELONA" OR ${TARGET} STREQUAL "STEAMROLLER" OR ${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "EXCAVATOR") set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") endif() - 
if (${TARGET} STREQUAL "PILEDRIVER") + if (${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "BOBCAT" OR ${TARGET} STREQUAL "OPTERON_SSE3") set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") endif() - if (${TARGET} STREQUAL "PRESCOTT") + if (${TARGET} STREQUAL "PRESCOTT" OR ${TARGET} STREQUAL "NANO") set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") endif() - if (${TARGET} STREQUAL "NEHALEM") + if (${TARGET} STREQUAL "NEHALEM" OR ${TARGET} STREQUAL "ATOM") set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") endif() - if (${TARGET} STREQUAL "CORE2") + if (${TARGET} STREQUAL "CORE2" OR ${TARGET} STREQUAL "PENRYN" OR ${TARGET} STREQUAL "DUNNINGTON") set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") endif() if (DEFINED HAVE_SSE) From 8cc73fee98684b49fdd1869e44b3d6a816cdb407 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 3 Nov 2020 23:47:04 +0100 Subject: [PATCH 035/121] Export NO_EXPRECISION after overriding for DYNAMIC_ARCH with GENERIC target --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index b62eab379..ca302a98a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -96,6 +96,7 @@ GETARCH_FLAGS += -DUSER_TARGET ifeq ($(TARGET), GENERIC) ifeq ($(DYNAMIC_ARCH), 1) override NO_EXPRECISION=1 +export NO_EXPRECiSION endif endif endif From d9ba49165af15d535d9b9955bd248eab4d259f06 Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Sun, 27 Sep 2020 10:38:19 +0800 Subject: [PATCH 036/121] Improve the performance of rot by using AVX512 and AVX2 intrinsic --- driver/others/blas_l1_thread.c | 2 +- driver/others/blas_server_win32.c | 11 +- kernel/x86_64/KERNEL.HASWELL | 3 + kernel/x86_64/drot.c | 139 +++++++++++++++++++++++++ kernel/x86_64/drot_microk_haswell-2.c | 87 ++++++++++++++++ kernel/x86_64/drot_microk_skylakex-2.c | 94 +++++++++++++++++ kernel/x86_64/srot.c | 139 +++++++++++++++++++++++++ kernel/x86_64/srot_microk_haswell-2.c | 87 ++++++++++++++++ 
kernel/x86_64/srot_microk_skylakex-2.c | 91 ++++++++++++++++ 9 files changed, 648 insertions(+), 5 deletions(-) create mode 100644 kernel/x86_64/drot.c create mode 100644 kernel/x86_64/drot_microk_haswell-2.c create mode 100644 kernel/x86_64/drot_microk_skylakex-2.c create mode 100644 kernel/x86_64/srot.c create mode 100644 kernel/x86_64/srot_microk_haswell-2.c create mode 100644 kernel/x86_64/srot_microk_skylakex-2.c diff --git a/driver/others/blas_l1_thread.c b/driver/others/blas_l1_thread.c index 04acbcc5f..06039c952 100644 --- a/driver/others/blas_l1_thread.c +++ b/driver/others/blas_l1_thread.c @@ -80,7 +80,7 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha break; } - mode |= BLAS_LEGACY; + if(!(mode & BLAS_PTHREAD)) mode |= BLAS_LEGACY; for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]); diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index d2cc91757..f47908c70 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -476,12 +476,15 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ routine = queue -> routine; - if (!(queue -> mode & BLAS_LEGACY)) { + if (queue -> mode & BLAS_LEGACY) { + legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); + } else + if (queue -> mode & BLAS_PTHREAD) { + void (*pthreadcompat)(void *) = queue -> routine; + (pthreadcompat)(queue -> args); + } else (routine)(queue -> args, queue -> range_m, queue -> range_n, queue -> sa, queue -> sb, 0); - } else { - legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); - } if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next); diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index b979fc0ae..81eaf96ac 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -102,3 +102,6 @@ ZGEMM3MKERNEL = zgemm3m_kernel_4x4_haswell.c SASUMKERNEL = sasum.c DASUMKERNEL = dasum.c + +SROTKERNEL = srot.c 
+DROTKERNEL = drot.c diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c new file mode 100644 index 000000000..a312b7ff9 --- /dev/null +++ b/kernel/x86_64/drot.c @@ -0,0 +1,139 @@ +#include "common.h" + +#if defined(SKYLAKEX) +#include "drot_microk_skylakex-2.c" +#elif defined(HASWELL) +#include "drot_microk_haswell-2.c" +#endif + +#ifndef HAVE_DROT_KERNEL + +static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + FLOAT f0, f1, f2, f3; + FLOAT x0, x1, x2, x3; + FLOAT g0, g1, g2, g3; + FLOAT y0, y1, y2, y3; + + FLOAT* xp = x; + FLOAT* yp = y; + + BLASLONG n1 = n & (~7); + + while (i < n1) { + x0 = xp[0]; + y0 = yp[0]; + x1 = xp[1]; + y1 = yp[1]; + x2 = xp[2]; + y2 = yp[2]; + x3 = xp[3]; + y3 = yp[3]; + + f0 = c*x0 + s*y0; + g0 = c*y0 - s*x0; + f1 = c*x1 + s*y1; + g1 = c*y1 - s*x1; + f2 = c*x2 + s*y2; + g2 = c*y2 - s*x2; + f3 = c*x3 + s*y3; + g3 = c*y3 - s*x3; + + xp[0] = f0; + yp[0] = g0; + xp[1] = f1; + yp[1] = g1; + xp[2] = f2; + yp[2] = g2; + xp[3] = f3; + yp[3] = g3; + + xp += 4; + yp += 4; + i += 4; + } + + while (i < n) { + FLOAT temp = c*x[i] + s*y[i]; + y[i] = c*y[i] - s*x[i]; + x[i] = temp; + + i++; + } +} + +#endif +static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + + FLOAT temp; + + if (n <= 0) + return; + if ((inc_x == 1) && (inc_y == 1)) { + drot_kernel(n, x, y, c, s); + } + else { + while (i < n) { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + + ix += inc_x; + iy += inc_y; + i++; + } + } + return; +} + + +#if defined(SMP) +static int rot_thread_function(blas_arg_t *args) +{ + + rot_compute(args->m, + args->a, args->lda, + args->b, args->ldb, + ((FLOAT *)args->alpha)[0], + ((FLOAT *)args->alpha)[1]); + return 0; +} + +extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG 
ldc, int (*function)(), int nthreads); +#endif +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ +#if defined(SMP) + int nthreads; + FLOAT alpha[2]={c, s}; + FLOAT dummy_c; +#endif + +#if defined(SMP) + if (inc_x == 0 || inc_y == 0 || n <= 100000) { + nthreads = 1; + } + else { + nthreads = num_cpu_avail(1); + } + + if (nthreads == 1) { + rot_compute(n, x, inc_x, y, inc_y, c, s); + } + else { +#if defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD; +#else + int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; +#endif + blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); + } +#else + rot_compute(n, x, inc_x, y, inc_y, c, s); +#endif + return 0; +} diff --git a/kernel/x86_64/drot_microk_haswell-2.c b/kernel/x86_64/drot_microk_haswell-2.c new file mode 100644 index 000000000..72a87696e --- /dev/null +++ b/kernel/x86_64/drot_microk_haswell-2.c @@ -0,0 +1,87 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_DROT_KERNEL 1 + +#include +#include + +static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + + BLASLONG tail_index_4 = n&(~3); + BLASLONG tail_index_16 = n&(~15); + + __m256d c_256, s_256; + if (n >= 4) { + c_256 = _mm256_set1_pd(c); + s_256 = _mm256_set1_pd(s); + } + + __m256d x0, x1, x2, x3; + __m256d y0, y1, y2, y3; + __m256d t0, t1, t2, t3; + + for (i = 0; i < tail_index_16; i += 16) { + x0 = _mm256_loadu_pd(&x[i + 0]); + x1 = _mm256_loadu_pd(&x[i + 4]); + x2 = _mm256_loadu_pd(&x[i + 8]); + x3 = _mm256_loadu_pd(&x[i +12]); + y0 = _mm256_loadu_pd(&y[i + 0]); + y1 = _mm256_loadu_pd(&y[i + 4]); + y2 = _mm256_loadu_pd(&y[i + 8]); + y3 = _mm256_loadu_pd(&y[i +12]); + + t0 = _mm256_mul_pd(s_256, y0); + t1 = _mm256_mul_pd(s_256, y1); + t2 = _mm256_mul_pd(s_256, y2); + t3 = 
_mm256_mul_pd(s_256, y3); + + t0 = _mm256_fmadd_pd(c_256, x0, t0); + t1 = _mm256_fmadd_pd(c_256, x1, t1); + t2 = _mm256_fmadd_pd(c_256, x2, t2); + t3 = _mm256_fmadd_pd(c_256, x3, t3); + + _mm256_storeu_pd(&x[i + 0], t0); + _mm256_storeu_pd(&x[i + 4], t1); + _mm256_storeu_pd(&x[i + 8], t2); + _mm256_storeu_pd(&x[i +12], t3); + + t0 = _mm256_mul_pd(s_256, x0); + t1 = _mm256_mul_pd(s_256, x1); + t2 = _mm256_mul_pd(s_256, x2); + t3 = _mm256_mul_pd(s_256, x3); + + t0 = _mm256_fmsub_pd(c_256, y0, t0); + t1 = _mm256_fmsub_pd(c_256, y1, t1); + t2 = _mm256_fmsub_pd(c_256, y2, t2); + t3 = _mm256_fmsub_pd(c_256, y3, t3); + + _mm256_storeu_pd(&y[i + 0], t0); + _mm256_storeu_pd(&y[i + 4], t1); + _mm256_storeu_pd(&y[i + 8], t2); + _mm256_storeu_pd(&y[i +12], t3); + + } + + for (i = tail_index_16; i < tail_index_4; i += 4) { + x0 = _mm256_loadu_pd(&x[i]); + y0 = _mm256_loadu_pd(&y[i]); + + t0 = _mm256_mul_pd(s_256, y0); + t0 = _mm256_fmadd_pd(c_256, x0, t0); + _mm256_storeu_pd(&x[i], t0); + + t0 = _mm256_mul_pd(s_256, x0); + t0 = _mm256_fmsub_pd(c_256, y0, t0); + _mm256_storeu_pd(&y[i], t0); + } + + for (i = tail_index_4; i < n; ++i) { + FLOAT temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + } +} +#endif diff --git a/kernel/x86_64/drot_microk_skylakex-2.c b/kernel/x86_64/drot_microk_skylakex-2.c new file mode 100644 index 000000000..4e862e663 --- /dev/null +++ b/kernel/x86_64/drot_microk_skylakex-2.c @@ -0,0 +1,94 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_DROT_KERNEL 1 + +#include +#include + +static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + BLASLONG n1 = n; + + BLASLONG tail_index_8 = 0; + BLASLONG tail_index_32 = 0; + + __m512d c_512 = _mm512_set1_pd(c); + __m512d s_512 = _mm512_set1_pd(s); + + tail_index_8 = n1 & (~7); + tail_index_32 = n1 & (~31); + + + __m512d 
x0, x1, x2, x3; + __m512d y0, y1, y2, y3; + __m512d t0, t1, t2, t3; + + for (i = 0; i < tail_index_32; i += 32) { + x0 = _mm512_loadu_pd(&x[i + 0]); + x1 = _mm512_loadu_pd(&x[i + 8]); + x2 = _mm512_loadu_pd(&x[i +16]); + x3 = _mm512_loadu_pd(&x[i +24]); + y0 = _mm512_loadu_pd(&y[i + 0]); + y1 = _mm512_loadu_pd(&y[i + 8]); + y2 = _mm512_loadu_pd(&y[i +16]); + y3 = _mm512_loadu_pd(&y[i +24]); + + t0 = _mm512_mul_pd(s_512, y0); + t1 = _mm512_mul_pd(s_512, y1); + t2 = _mm512_mul_pd(s_512, y2); + t3 = _mm512_mul_pd(s_512, y3); + + t0 = _mm512_fmadd_pd(c_512, x0, t0); + t1 = _mm512_fmadd_pd(c_512, x1, t1); + t2 = _mm512_fmadd_pd(c_512, x2, t2); + t3 = _mm512_fmadd_pd(c_512, x3, t3); + + _mm512_storeu_pd(&x[i + 0], t0); + _mm512_storeu_pd(&x[i + 8], t1); + _mm512_storeu_pd(&x[i +16], t2); + _mm512_storeu_pd(&x[i +24], t3); + + t0 = _mm512_mul_pd(s_512, x0); + t1 = _mm512_mul_pd(s_512, x1); + t2 = _mm512_mul_pd(s_512, x2); + t3 = _mm512_mul_pd(s_512, x3); + + t0 = _mm512_fmsub_pd(c_512, y0, t0); + t1 = _mm512_fmsub_pd(c_512, y1, t1); + t2 = _mm512_fmsub_pd(c_512, y2, t2); + t3 = _mm512_fmsub_pd(c_512, y3, t3); + + _mm512_storeu_pd(&y[i + 0], t0); + _mm512_storeu_pd(&y[i + 8], t1); + _mm512_storeu_pd(&y[i +16], t2); + _mm512_storeu_pd(&y[i +24], t3); + } + + for (i = tail_index_32; i < tail_index_8; i += 8) { + x0 = _mm512_loadu_pd(&x[i]); + y0 = _mm512_loadu_pd(&y[i]); + + t0 = _mm512_mul_pd(s_512, y0); + t0 = _mm512_fmadd_pd(c_512, x0, t0); + _mm512_storeu_pd(&x[i], t0); + + t0 = _mm512_mul_pd(s_512, x0); + t0 = _mm512_fmsub_pd(c_512, y0, t0); + _mm512_storeu_pd(&y[i], t0); + } + + if ((n1&7) > 0) { + unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 -(n1&7))); + __m512d tail_x = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x[tail_index_8]); + __m512d tail_y = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &y[tail_index_8]); + __m512d temp = _mm512_mul_pd(s_512, tail_y); + temp = _mm512_fmadd_pd(c_512, tail_x, temp); + 
_mm512_mask_storeu_pd(&x[tail_index_8],*((__mmask8*)&tail_mask8), temp); + temp = _mm512_mul_pd(s_512, tail_x); + temp = _mm512_fmsub_pd(c_512, tail_y, temp); + _mm512_mask_storeu_pd(&y[tail_index_8], *((__mmask8*)&tail_mask8), temp); + } +} +#endif diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c new file mode 100644 index 000000000..021c20d82 --- /dev/null +++ b/kernel/x86_64/srot.c @@ -0,0 +1,139 @@ +#include "common.h" + +#if defined(SKYLAKEX) +#include "srot_microk_skylakex-2.c" +#elif defined(HASWELL) +#include "srot_microk_haswell-2.c" +#endif + +#ifndef HAVE_SROT_KERNEL + +static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + FLOAT f0, f1, f2, f3; + FLOAT x0, x1, x2, x3; + FLOAT g0, g1, g2, g3; + FLOAT y0, y1, y2, y3; + + FLOAT* xp = x; + FLOAT* yp = y; + + BLASLONG n1 = n & (~7); + + while (i < n1) { + x0 = xp[0]; + y0 = yp[0]; + x1 = xp[1]; + y1 = yp[1]; + x2 = xp[2]; + y2 = yp[2]; + x3 = xp[3]; + y3 = yp[3]; + + f0 = c*x0 + s*y0; + g0 = c*y0 - s*x0; + f1 = c*x1 + s*y1; + g1 = c*y1 - s*x1; + f2 = c*x2 + s*y2; + g2 = c*y2 - s*x2; + f3 = c*x3 + s*y3; + g3 = c*y3 - s*x3; + + xp[0] = f0; + yp[0] = g0; + xp[1] = f1; + yp[1] = g1; + xp[2] = f2; + yp[2] = g2; + xp[3] = f3; + yp[3] = g3; + + xp += 4; + yp += 4; + i += 4; + } + + while (i < n) { + FLOAT temp = c*x[i] + s*y[i]; + y[i] = c*y[i] - s*x[i]; + x[i] = temp; + + i++; + } +} + +#endif +static void rot_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + + FLOAT temp; + + if (n <= 0) + return; + if ((inc_x == 1) && (inc_y == 1)) { + srot_kernel(n, x, y, c, s); + } + else { + while (i < n) { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; + + ix += inc_x; + iy += inc_y; + i++; + } + } + return; +} + + +#if defined(SMP) +static int rot_thread_function(blas_arg_t *args) +{ + + rot_compute(args->m, + args->a, args->lda, + args->b, args->ldb, + 
((float *)args->alpha)[0], + ((float *)args->alpha)[1]); + return 0; +} + +extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ +#if defined(SMP) + int nthreads; + FLOAT alpha[2]={c, s}; + FLOAT dummy_c; +#endif + +#if defined(SMP) + if (inc_x == 0 || inc_y == 0 || n <= 100000) { + nthreads = 1; + } + else { + nthreads = num_cpu_avail(1); + } + + if (nthreads == 1) { + rot_compute(n, x, inc_x, y, inc_y, c, s); + } + else { +#if defined(DOUBLE) + int mode = BLAS_DOUBLE | BLAS_REAL | BLAS_PTHREAD; +#else + int mode = BLAS_SINGLE | BLAS_REAL | BLAS_PTHREAD; +#endif + blas_level1_thread(mode, n, 0, 0, alpha, x, inc_x, y, inc_y, &dummy_c, 0, (void *)rot_thread_function, nthreads); + } +#else + rot_compute(n, x, inc_x, y, inc_y, c, s); +#endif + return 0; +} diff --git a/kernel/x86_64/srot_microk_haswell-2.c b/kernel/x86_64/srot_microk_haswell-2.c new file mode 100644 index 000000000..cba962042 --- /dev/null +++ b/kernel/x86_64/srot_microk_haswell-2.c @@ -0,0 +1,87 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_SROT_KERNEL 1 + +#include +#include + +static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + BLASLONG i = 0; + + BLASLONG tail_index_8 = n&(~7); + BLASLONG tail_index_32 = n&(~31); + + __m256 c_256, s_256; + if (n >= 8) { + c_256 = _mm256_set1_ps(c); + s_256 = _mm256_set1_ps(s); + } + + __m256 x0, x1, x2, x3; + __m256 y0, y1, y2, y3; + __m256 t0, t1, t2, t3; + + for (i = 0; i < tail_index_32; i += 32) { + x0 = _mm256_loadu_ps(&x[i + 0]); + x1 = _mm256_loadu_ps(&x[i + 8]); + x2 = _mm256_loadu_ps(&x[i +16]); + x3 = _mm256_loadu_ps(&x[i +24]); + y0 = 
_mm256_loadu_ps(&y[i + 0]); + y1 = _mm256_loadu_ps(&y[i + 8]); + y2 = _mm256_loadu_ps(&y[i +16]); + y3 = _mm256_loadu_ps(&y[i +24]); + + t0 = _mm256_mul_ps(s_256, y0); + t1 = _mm256_mul_ps(s_256, y1); + t2 = _mm256_mul_ps(s_256, y2); + t3 = _mm256_mul_ps(s_256, y3); + + t0 = _mm256_fmadd_ps(c_256, x0, t0); + t1 = _mm256_fmadd_ps(c_256, x1, t1); + t2 = _mm256_fmadd_ps(c_256, x2, t2); + t3 = _mm256_fmadd_ps(c_256, x3, t3); + + _mm256_storeu_ps(&x[i + 0], t0); + _mm256_storeu_ps(&x[i + 8], t1); + _mm256_storeu_ps(&x[i +16], t2); + _mm256_storeu_ps(&x[i +24], t3); + + t0 = _mm256_mul_ps(s_256, x0); + t1 = _mm256_mul_ps(s_256, x1); + t2 = _mm256_mul_ps(s_256, x2); + t3 = _mm256_mul_ps(s_256, x3); + + t0 = _mm256_fmsub_ps(c_256, y0, t0); + t1 = _mm256_fmsub_ps(c_256, y1, t1); + t2 = _mm256_fmsub_ps(c_256, y2, t2); + t3 = _mm256_fmsub_ps(c_256, y3, t3); + + _mm256_storeu_ps(&y[i + 0], t0); + _mm256_storeu_ps(&y[i + 8], t1); + _mm256_storeu_ps(&y[i +16], t2); + _mm256_storeu_ps(&y[i +24], t3); + + } + + for (i = tail_index_32; i < tail_index_8; i += 8) { + x0 = _mm256_loadu_ps(&x[i]); + y0 = _mm256_loadu_ps(&y[i]); + + t0 = _mm256_mul_ps(s_256, y0); + t0 = _mm256_fmadd_ps(c_256, s0, t0); + _mm256_storeu_ps(&x[i], t0); + + t0 = _mm256_mul_ps(s_256, x0); + t0 = _mm256_fmsub_ps(c_256, y0, t0); + _mm256_storeu_ps(&y[i], t0); + } + + for (i = tail_index_8; i < n; ++i) { + FLOAT temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + } +} +#endif diff --git a/kernel/x86_64/srot_microk_skylakex-2.c b/kernel/x86_64/srot_microk_skylakex-2.c new file mode 100644 index 000000000..a21d1cf64 --- /dev/null +++ b/kernel/x86_64/srot_microk_skylakex-2.c @@ -0,0 +1,91 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_SROT_KERNEL 1 + +#include +#include + +static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) +{ + 
BLASLONG i = 0; + __m512 c_512, s_512; + c_512 = _mm512_set1_ps(c); + s_512 = _mm512_set1_ps(s); + + BLASLONG tail_index_16 = n&(~15); + BLASLONG tail_index_64 = n&(~63); + + + __m512 x0, x1, x2, x3; + __m512 y0, y1, y2, y3; + __m512 t0, t1, t2, t3; + + for (i = 0; i < tail_index_64; i += 64) { + x0 = _mm512_loadu_ps(&x[i + 0]); + x1 = _mm512_loadu_ps(&x[i +16]); + x2 = _mm512_loadu_ps(&x[i +32]); + x3 = _mm512_loadu_ps(&x[i +48]); + y0 = _mm512_loadu_ps(&y[i + 0]); + y1 = _mm512_loadu_ps(&y[i +16]); + y2 = _mm512_loadu_ps(&y[i +32]); + y3 = _mm512_loadu_ps(&y[i +48]); + + t0 = _mm512_mul_ps(s_512, y0); + t1 = _mm512_mul_ps(s_512, y1); + t2 = _mm512_mul_ps(s_512, y2); + t3 = _mm512_mul_ps(s_512, y3); + + t0 = _mm512_fmadd_ps(c_512, x0, t0); + t1 = _mm512_fmadd_ps(c_512, x1, t1); + t2 = _mm512_fmadd_ps(c_512, x2, t2); + t3 = _mm512_fmadd_ps(c_512, x3, t3); + + _mm512_storeu_ps(&x[i + 0], t0); + _mm512_storeu_ps(&x[i +16], t1); + _mm512_storeu_ps(&x[i +32], t2); + _mm512_storeu_ps(&x[i +48], t3); + + t0 = _mm512_mul_ps(s_512, x0); + t1 = _mm512_mul_ps(s_512, x1); + t2 = _mm512_mul_ps(s_512, x2); + t3 = _mm512_mul_ps(s_512, x3); + + t0 = _mm512_fmsub_ps(c_512, y0, t0); + t1 = _mm512_fmsub_ps(c_512, y1, t1); + t2 = _mm512_fmsub_ps(c_512, y2, t2); + t3 = _mm512_fmsub_ps(c_512, y3, t3); + + _mm512_storeu_ps(&y[i + 0], t0); + _mm512_storeu_ps(&y[i +16], t1); + _mm512_storeu_ps(&y[i +32], t2); + _mm512_storeu_ps(&y[i +48], t3); + } + + for (i = tail_index_64; i < tail_index_16; i += 16) { + x0 = _mm512_loadu_ps(&x[i]); + y0 = _mm512_loadu_ps(&y[i]); + + t0 = _mm512_mul_ps(s_512, y0); + t0 = _mm512_fmadd_ps(c_512, x0, t0); + _mm512_storeu_ps(&x[i], t0); + + t0 = _mm512_mul_ps(s_512, x0); + t0 = _mm512_fmsub_ps(c_512, y0, t0); + _mm512_storeu_ps(&y[i], t0); + } + + + if ((n & 15) > 0) { + uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16-(n&15))); + __m512 tail_x = _mm512_maskz_loadu_ps(*((__mmask16*)&tail_mask16), &x[tail_index_16]); + __m512 tail_y = 
_mm512_maskz_loadu_ps(*((__mmask16*)&tail_mask16), &y[tail_index_16]); + __m512 temp = _mm512_mul_ps(s_512, tail_y); + temp = _mm512_fmadd_ps(c_512, tail_x, temp); + _mm512_mask_storeu_ps(&x[tail_index_16], *((__mmask16*)&tail_mask16), temp); + temp = _mm512_mul_ps(s_512, tail_x); + temp = _mm512_fmsub_ps(c_512, tail_y, temp); + _mm512_mask_storeu_ps(&y[tail_index_16], *((__mmask16*)&tail_mask16), temp); + } +} +#endif From 725ffbf041b021d2f3602b2313e4027aab19ee89 Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Thu, 5 Nov 2020 16:25:17 +0800 Subject: [PATCH 037/121] fix typo --- kernel/x86_64/srot_microk_haswell-2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/srot_microk_haswell-2.c b/kernel/x86_64/srot_microk_haswell-2.c index cba962042..8e245cc8f 100644 --- a/kernel/x86_64/srot_microk_haswell-2.c +++ b/kernel/x86_64/srot_microk_haswell-2.c @@ -70,7 +70,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) y0 = _mm256_loadu_ps(&y[i]); t0 = _mm256_mul_ps(s_256, y0); - t0 = _mm256_fmadd_ps(c_256, s0, t0); + t0 = _mm256_fmadd_ps(c_256, x0, t0); _mm256_storeu_ps(&x[i], t0); t0 = _mm256_mul_ps(s_256, x0); From 28d2dfe2b3bd6c779137fcb53451f97f47b78b37 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Nov 2020 12:17:49 +0100 Subject: [PATCH 038/121] Fix macro name used in ifdef --- kernel/arm/zdot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index ba0e57eb5..73ae3acd7 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } -#if !defined(__POWER__) +#if !defined(__PPC__) CREAL(result) = dot[0]; CIMAG(result) = dot[1]; #else From 438a8e5624ef1adfe98f989655ca398866143458 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Nov 2020 20:26:12 +0100 Subject: [PATCH 039/121] Fix placement of getarch call and spurious cpu property 
accumulation in DYNAMIC_ARCH builds --- cmake/prebuild.cmake | 45 ++++++---------- cmake/system.cmake | 124 ++++++++++++++++++++----------------------- 2 files changed, 73 insertions(+), 96 deletions(-) diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index b1b4c501a..da7686c33 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -139,36 +139,6 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS set(CGEMM3M_UNROLL_N 4) set(ZGEMM3M_UNROLL_M 4) set(ZGEMM3M_UNROLL_N 4) - elseif ("${TCORE}" STREQUAL "BARCELONA") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_SSE3\n") - elseif ("${TCORE}" STREQUAL "STEAMROLLER") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_SSE3\n") - elseif ("${TCORE}" STREQUAL "EXCAVATOR") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_SSE3\n") - elseif ("${TCORE}" STREQUAL "NEHALEM") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_SSE3\n") - elseif ("${TCORE}" STREQUAL "PRESCOTT") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_SSE3\n") - elseif ("${TCORE}" STREQUAL "SANDYBRIDGE") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_AVX\n") - elseif ("${TCORE}" STREQUAL "HASWELL") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_AVX2\n") - elseif ("${TCORE}" STREQUAL "ZEN") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_AVX2\n") - elseif ("${TCORE}" STREQUAL "SKYLAKEX") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_AVX512\n") - elseif ("${TCORE}" STREQUAL "COOPERLAKE") - file(APPEND ${TARGET_CONF_TEMP} - "#define HAVE_AVX512\n") elseif ("${TCORE}" STREQUAL "ARMV7") file(APPEND ${TARGET_CONF_TEMP} "#define L1_DATA_SIZE\t65536\n" @@ -586,6 +556,21 @@ else(NOT CMAKE_CROSSCOMPILING) MESSAGE(FATAL_ERROR "Compiling getarch failed ${GETARCH_LOG}") endif () endif () + unset (HAVE_AVX2) + unset (HAVE_AVX) + unset (HAVE_FMA3) + unset (HAVE_MMX) + unset (HAVE_SSE) + unset (HAVE_SSE2) + unset (HAVE_SSE3) + unset (HAVE_SSSE3) + unset (HAVE_SSE4A) + unset (HAVE_SSE4_1) + unset (HAVE_SSE4_2) + 
unset (HAVE_NEON) + unset (HAVE_VFP) + unset (HAVE_VFPV3) + unset (HAVE_VFPV4) message(STATUS "Running getarch") # use the cmake binary w/ the -E param to run a shell command in a cross-platform way diff --git a/cmake/system.cmake b/cmake/system.cmake index 48d206b12..66e95c6d3 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -44,74 +44,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) endif () endif () -if (DEFINED TARGET) - if (${TARGET} STREQUAL "COOPERLAKE" AND NOT NO_AVX512) -# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") - else() - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") - endif() -# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") -# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") -# endif() - endif() - if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") - endif() - if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) - if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") - execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) - if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") - endif() - elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse -msse3 -mavx2") - endif() - endif() - if (${TARGET} STREQUAL "HASWELL" AND NOT NO_AVX2) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") - endif() - if (${TARGET} STREQUAL "ZEN" AND NOT NO_AVX2) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 -mavx2") - endif() - if (${TARGET} STREQUAL "SANDYBRIDGE" AND NOT NO_AVX) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3 
-mavx") - endif() - if (${TARGET} STREQUAL "BARCELONA" OR ${TARGET} STREQUAL "STEAMROLLER" OR ${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "EXCAVATOR") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "BOBCAT" OR ${TARGET} STREQUAL "OPTERON_SSE3") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (${TARGET} STREQUAL "PRESCOTT" OR ${TARGET} STREQUAL "NANO") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (${TARGET} STREQUAL "NEHALEM" OR ${TARGET} STREQUAL "ATOM") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (${TARGET} STREQUAL "CORE2" OR ${TARGET} STREQUAL "PENRYN" OR ${TARGET} STREQUAL "DUNNINGTON") - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (DEFINED HAVE_SSE) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") - endif() - if (DEFINED HAVE_SSE2) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2") - endif() - if (DEFINED HAVE_SSE3) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") - endif() - if (DEFINED HAVE_SSSE3) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3") - endif() - if (DEFINED HAVE_SSE4_1) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") - endif() -endif() if (DEFINED TARGET) + message(STATUS "-- -- -- -- -- -- -- -- -- -- -- -- --") message(STATUS "Targeting the ${TARGET} architecture.") set(GETARCH_FLAGS "-DFORCE_${TARGET}") endif () @@ -211,6 +146,63 @@ else() endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") +if (DEFINED TARGET) + if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) +# if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") + else() + set (KERNEL_DEFINITIONS 
"${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() +# elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") +# set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") +# endif() + endif() + if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") + endif() + if (${TARGET} STREQUAL HASWELL AND NOT NO_AVX2) + if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") + execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) + if (${GCC_VERSION} VERSION_GREATER 4.7 OR ${GCC_VERSION} VERSION_EQUAL 4.7) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + endif() + if (DEFINED HAVE_AVX) + if (NOT NO_AVX) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx") + endif() + endif() + if (DEFINED HAVE_AVX2) + if (NOT NO_AVX2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") + endif() + endif() + if (DEFINED HAVE_FMA3) + if (NOT NO_AVX2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") + endif() + endif() + if (DEFINED HAVE_SSE) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") + endif() + if (DEFINED HAVE_SSE2) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse2") + endif() + if (DEFINED HAVE_SSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse3") + endif() + if (DEFINED HAVE_SSSE3) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mssse3") + endif() + if (DEFINED HAVE_SSE4_1) + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") + endif() +endif() if (DEFINED BINARY) message(STATUS "Compiling a ${BINARY}-bit binary.") endif () From a29338aaa6b364ce99ea30785d1227bd327ce3c7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Nov 2020 20:27:42 +0100 Subject: [PATCH 040/121] Remove extraneous quotes that caused a cmake policy warning --- cmake/cc.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff 
--git a/cmake/cc.cmake b/cmake/cc.cmake index 2f4d1c6d7..b963940d6 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -96,7 +96,7 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "SUN") endif () endif () -if (${CORE} STREQUAL "SKYLAKEX") +if (${CORE} STREQUAL SKYLAKEX) if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") @@ -104,7 +104,7 @@ if (${CORE} STREQUAL "SKYLAKEX") endif () endif () -if (${CORE} STREQUAL "COOPERLAKE") +if (${CORE} STREQUAL COOPERLAKE) if (NOT DYNAMIC_ARCH) if (NOT NO_AVX512) execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) From ccb9731c7b41b601412b00b73f6da98613d66b7f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Nov 2020 20:30:15 +0100 Subject: [PATCH 041/121] Fix propagation of cpu properties to compiler options --- Makefile.x86_64 | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 49a9a0a23..43bfc9ecd 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -9,9 +9,9 @@ endif endif ifdef HAVE_SSE3 -ifndef DYNAMIC_ARCH CCOMMON_OPT += -msse3 FCOMMON_OPT += -msse3 +endif ifdef HAVE_SSSE3 CCOMMON_OPT += -mssse3 FCOMMON_OPT += -mssse3 @@ -20,7 +20,17 @@ ifdef HAVE_SSE4_1 CCOMMON_OPT += -msse4.1 FCOMMON_OPT += -msse4.1 endif +ifdef HAVE_AVX +CCOMMON_OPT += -mavx +FCOMMON_OPT += -mavx endif +ifdef HAVE_AVX2 +CCOMMON_OPT += -mavx2 +FCOMMON_OPT += -mavx2 +endif +ifdef HAVE_FMA3 +CCOMMON_OPT += -mfma +FCOMMON_OPT += -mfma endif ifeq ($(CORE), SKYLAKEX) @@ -66,8 +76,7 @@ endif endif endif -ifeq ($(CORE), $(filter $(CORE), HASWELL ZEN SKYLAKEX COOPERLAKE)) -ifndef DYNAMIC_ARCH +ifdef HAVE_AVX2 ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 @@ -96,7 +105,6 @@ endif endif endif endif -endif From a04f532edfe65a7e4cf4dfb2dc34d363e2eba065 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Nov 2020 20:37:03 +0100 Subject: [PATCH 042/121] Reset cpu property flags 
between build cycles in DYNAMIC_ARCH mode --- Makefile.system | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/Makefile.system b/Makefile.system index ca302a98a..dc7ed3f3a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -252,6 +252,22 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf else +undefine HAVE_NEON +undefine HAVE_VFP +undefine HAVE_VFPV3 +undefine HAVE_VFPV4 +undefine HAVE_MMX +undefine HAVE_SSE +undefine HAVE_SSE2 +undefine HAVE_SSE3 +undefine HAVE_SSSE3 +undefine HAVE_SSE4_1 +undefine HAVE_SSE4_2 +undefine HAVE_SSE4A +undefine HAVE_SSE5 +undefine HAVE_AVX +undefine HAVE_AVX2 +undefine HAVE_FMA3 include $(TOPDIR)/Makefile_kernel.conf endif @@ -1522,6 +1538,8 @@ export HAVE_SSE4_2 export HAVE_SSE4A export HAVE_SSE5 export HAVE_AVX +export HAVE_AVX2 +export HAVE_FMA3 export HAVE_VFP export HAVE_VFPV3 export HAVE_VFPV4 From b976a0bf4095fd8b9e80ae3cf0e0f6eab200219e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 7 Nov 2020 20:39:56 +0100 Subject: [PATCH 043/121] Remove previous workaround for compiler flags related to cpu capabilities in x86_64 DYNAMIC_ARCH builds --- kernel/Makefile | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/kernel/Makefile b/kernel/Makefile index e811ed43d..fb1d5d39a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,13 +5,6 @@ endif TOPDIR = .. 
include $(TOPDIR)/Makefile.system -ifdef HAVE_SSE3 -CFLAGS += -msse3 -endif -ifdef HAVE_SSSE3 -CFLAGS += -mssse3 -endif - ifeq ($(ARCH), power) ifeq ($(C_COMPILER), CLANG) override CFLAGS += -fno-integrated-as @@ -38,12 +31,6 @@ ifdef NO_AVX2 endif ifdef TARGET_CORE - ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),PRESCOTT CORE2 PENRYN DUNNINGTON ATOM NANO SANDYBRIDGE HASWELL NEHALEM ZEN BARCELONA BOBCAT BULLDOZER PILEDRIVER EXCAVATOR STEAMROLLER OPTERON_SSE3)) - override CFLAGS += -msse -msse2 -msse3 -mssse3 -msse4.1 -endif - ifeq ($(TARGET_CORE), $(filter $(TARGET_CORE),KATMAI COPPERMINE BANIAS NORTHWOOD ATHLON OPTERON)) - override CFLAGS += -msse -msse2 -endif ifeq ($(TARGET_CORE), COOPERLAKE) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) ifeq ($(GCCVERSIONGTEQ10), 1) From 6e364981a8af0f72ad9e62a69fe62fdedc18255b Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Sat, 7 Nov 2020 15:21:58 -0600 Subject: [PATCH 044/121] Optimize sdot/ddot for POWER10 This patch makes use of new POWER10 vector pair instructions for loads and stores. 
--- kernel/power/KERNEL.POWER10 | 6 +- kernel/power/ddot_microk_power10.c | 131 ++++++++++++++++++++++++ kernel/power/ddot_power10.c | 130 ++++++++++++++++++++++++ kernel/power/sdot_microk_power10.c | 135 +++++++++++++++++++++++++ kernel/power/sdot_power10.c | 154 +++++++++++++++++++++++++++++ 5 files changed, 553 insertions(+), 3 deletions(-) create mode 100644 kernel/power/ddot_microk_power10.c create mode 100644 kernel/power/ddot_power10.c create mode 100644 kernel/power/sdot_microk_power10.c create mode 100644 kernel/power/sdot_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 28c39051f..c25cd9f04 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -151,9 +151,9 @@ DCOPYKERNEL = dcopy_power10.c CCOPYKERNEL = ccopy_power10.c ZCOPYKERNEL = zcopy_power10.c # -SDOTKERNEL = sdot.c -DDOTKERNEL = ddot.c -DSDOTKERNEL = sdot.c +SDOTKERNEL = sdot_power10.c +DDOTKERNEL = ddot_power10.c +DSDOTKERNEL = sdot_power10.c ifneq ($(GCCVERSIONGTEQ9),1) CDOTKERNEL = cdot_power9.S else diff --git a/kernel/power/ddot_microk_power10.c b/kernel/power/ddot_microk_power10.c new file mode 100644 index 000000000..3a9865cc0 --- /dev/null +++ b/kernel/power/ddot_microk_power10.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static double ddot_kernel_8 (long n, double *x, double *y) +{ + double dot; + + __asm__ + ( + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddadp 32, 40, 48 \n\t" + "xvmaddadp 33, 41, 49 \n\t" + "lxvp 40, 0(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "xvmaddadp 34, 42, 50 \n\t" + "xvmaddadp 35, 43, 51 \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 50, 32(%3) \n\t" + "xvmaddadp 36, 44, 52 \n\t" + "xvmaddadp 37, 45, 53 \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 52, 64(%3) \n\t" + "xvmaddadp 38, 46, 54 \n\t" + "xvmaddadp 39, 47, 55 \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddadp 32, 40, 48 \n\t" + "xvmaddadp 33, 41, 49 \n\t" + "xvmaddadp 34, 42, 50 \n\t" + "xvmaddadp 35, 43, 51 \n\t" + "xvmaddadp 36, 44, 52 \n\t" + "xvmaddadp 37, 45, 53 \n\t" + "xvmaddadp 38, 46, 54 \n\t" + "xvmaddadp 39, 47, 55 \n\t" + + "xvadddp 32, 32, 33 \n\t" + "xvadddp 34, 34, 35 \n\t" + "xvadddp 36, 36, 37 \n\t" + "xvadddp 38, 38, 39 \n\t" + + "xvadddp 32, 32, 34 \n\t" + "xvadddp 36, 36, 38 \n\t" + + "xvadddp 32, 32, 36 \n\t" + + XXSWAPD_S(33,32) + + "xsadddp %x0, 32, 33 \n" + + "#dot=%0 n=%1 x=%4=%2 y=%5=%3\n" + : + "=d" (dot), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x), + "m" (*y) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55" + ); + + return dot; +} diff --git a/kernel/power/ddot_power10.c b/kernel/power/ddot_power10.c new file mode 100644 index 000000000..302dceb68 --- /dev/null +++ b/kernel/power/ddot_power10.c @@ -0,0 +1,130 @@ +/*************************************************************************** +Copyright (c) 2013-2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "ddot_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_8 + +static FLOAT ddot_kernel_8 (BLASLONG n, FLOAT *x, FLOAT *y) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + + i+=8 ; + + } + return dot; +} + +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -16; + + if ( n1 ) + dot = ddot_kernel_8(n1, x, y); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = y[iy] * x[ix] ; + FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; + + FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; + FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + + temp1 += m1+m3; + temp2 += m2+m4; + + i+=4 ; + + } + + while(i < n) + { + + temp1 += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + dot = temp1 + temp2; + return(dot); + +} + + diff --git a/kernel/power/sdot_microk_power10.c b/kernel/power/sdot_microk_power10.c new file mode 100644 index 000000000..2f028c5a0 --- /dev/null +++ b/kernel/power/sdot_microk_power10.c @@ -0,0 +1,135 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_16 1 + +static float sdot_kernel_16 (long n, float *x, float *y) +{ + float dot; + + __asm__ + ( + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t" + "xxlxor 36, 36, 36 \n\t" + "xxlxor 37, 37, 37 \n\t" + "xxlxor 38, 38, 38 \n\t" + "xxlxor 39, 39, 39 \n\t" + + "lxvp 40, 0(%2) \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "lxvp 50, 32(%3) \n\t" + "lxvp 52, 64(%3) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -32 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" + "xvmaddasp 33, 41, 49 \n\t" + "lxvp 40, 0(%2) \n\t" + "lxvp 48, 0(%3) \n\t" + "xvmaddasp 34, 42, 50 \n\t" + "xvmaddasp 35, 43, 51 \n\t" + "lxvp 42, 32(%2) \n\t" + "lxvp 50, 32(%3) \n\t" + "xvmaddasp 36, 44, 52 \n\t" + "xvmaddasp 37, 45, 53 \n\t" + "lxvp 44, 64(%2) \n\t" + "lxvp 52, 64(%3) \n\t" + "xvmaddasp 38, 46, 54 \n\t" + "xvmaddasp 39, 47, 55 \n\t" + "lxvp 46, 96(%2) \n\t" + "lxvp 54, 96(%3) \n\t" + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -32 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddasp 32, 40, 48 \n\t" + "xvmaddasp 33, 41, 49 \n\t" + "xvmaddasp 34, 42, 50 \n\t" + "xvmaddasp 35, 43, 51 \n\t" + "xvmaddasp 36, 44, 52 \n\t" + "xvmaddasp 37, 45, 53 \n\t" + "xvmaddasp 38, 46, 54 \n\t" + "xvmaddasp 39, 47, 55 \n\t" + + "xvaddsp 32, 32, 33 \n\t" + "xvaddsp 34, 34, 35 \n\t" + "xvaddsp 36, 36, 37 \n\t" + "xvaddsp 38, 38, 39 \n\t" + + "xvaddsp 32, 32, 34 \n\t" + "xvaddsp 36, 36, 38 \n\t" + + "xvaddsp 32, 32, 36 \n\t" + + "xxsldwi 33, 32, 32, 2 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xxsldwi 33, 32, 32, 1 \n\t" + "xvaddsp 32, 32, 33 \n\t" + + "xscvspdp %x0, 32 \n" + + "#dot=%0 n=%1 x=%4=%2 y=%5=%3\n" + : + "=f" (dot), // 0 + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "m" (*x), + "m" (*y) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55" + ); + + return dot; +} diff --git a/kernel/power/sdot_power10.c b/kernel/power/sdot_power10.c new file mode 100644 index 000000000..b61f0a90d --- /dev/null +++ b/kernel/power/sdot_power10.c @@ -0,0 +1,154 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "sdot_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_16 + +static FLOAT sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + + i+=8 ; + + } + return dot; +} + +#endif + +#if defined (DSDOT) +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#else +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +#endif +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + double dot = 0.0 ; + +#if defined (DSDOT) + double mydot = 0.0; + FLOAT asmdot = 0.0; +#else + FLOAT mydot=0.0; +#endif + BLASLONG n1; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + n1 = n & (BLASLONG)(-32); + + 
if ( n1 ) +#if defined(DSDOT) + { + FLOAT *x1=x; + FLOAT *y1=y; + BLASLONG n2 = 32; + while (i Date: Sat, 7 Nov 2020 23:37:21 +0100 Subject: [PATCH 045/121] Update Makefile.system --- Makefile.system | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.system b/Makefile.system index dc7ed3f3a..258a84262 100644 --- a/Makefile.system +++ b/Makefile.system @@ -252,7 +252,9 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf else +ifdef HAVE_NEON undefine HAVE_NEON +endif undefine HAVE_VFP undefine HAVE_VFPV3 undefine HAVE_VFPV4 From f6a57d8f63ed0f1fa4823d27daafc2cb3a6dc96b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 8 Nov 2020 00:01:36 +0100 Subject: [PATCH 046/121] Update Makefile.system --- Makefile.system | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Makefile.system b/Makefile.system index 258a84262..da2d452b2 100644 --- a/Makefile.system +++ b/Makefile.system @@ -255,9 +255,15 @@ else ifdef HAVE_NEON undefine HAVE_NEON endif +ifdef HAVE_VFP undefine HAVE_VFP +endif +ifdef HAVE_VFPV3 undefine HAVE_VFPV3 +endif +ifdef HAVE_VFPV4 undefine HAVE_VFPV4 +endif undefine HAVE_MMX undefine HAVE_SSE undefine HAVE_SSE2 From 1c4cfdc13937765dd9bd0ef8b846ba027ec086b3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 8 Nov 2020 00:12:55 +0100 Subject: [PATCH 047/121] Stay compatible with old gmake that did not support undefine --- Makefile.system | 42 +++++++++++++++++------------------------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/Makefile.system b/Makefile.system index da2d452b2..aae7ba503 100644 --- a/Makefile.system +++ b/Makefile.system @@ -6,7 +6,7 @@ INCLUDED = 1 ifndef TOPDIR -TOPDIR = . +TOPDIR = . endif # If ARCH is not set, we use the host system's architecture for getarch compile options. 
@@ -252,30 +252,22 @@ DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf else -ifdef HAVE_NEON -undefine HAVE_NEON -endif -ifdef HAVE_VFP -undefine HAVE_VFP -endif -ifdef HAVE_VFPV3 -undefine HAVE_VFPV3 -endif -ifdef HAVE_VFPV4 -undefine HAVE_VFPV4 -endif -undefine HAVE_MMX -undefine HAVE_SSE -undefine HAVE_SSE2 -undefine HAVE_SSE3 -undefine HAVE_SSSE3 -undefine HAVE_SSE4_1 -undefine HAVE_SSE4_2 -undefine HAVE_SSE4A -undefine HAVE_SSE5 -undefine HAVE_AVX -undefine HAVE_AVX2 -undefine HAVE_FMA3 +HAVE_NEON= +HAVE_VFP= +HAVE_VFPV3= +HAVE_VFPV4= +HAVE_MMX= +HAVE_SSE= +HAVE_SSE2= +HAVE_SSE3= +HAVE_SSSE3= +HAVE_SSE4_1= +HAVE_SSE4_2= +HAVE_SSE4A= +HAVE_SSE5= +HAVE_AVX= +HAVE_AVX2= +HAVE_FMA3= include $(TOPDIR)/Makefile_kernel.conf endif From ec088bf33aa3034a82b713ea304fe30e36c278ec Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 8 Nov 2020 13:15:40 +0100 Subject: [PATCH 048/121] Fix missing AVX2 and FMA3 capabilities in FORCE_target mode --- getarch.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/getarch.c b/getarch.c index ab90f36d9..daf669e56 100644 --- a/getarch.c +++ b/getarch.c @@ -330,7 +330,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" #endif @@ -346,7 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" #else @@ -359,7 +359,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -march=skylake-avx512" #define LIBNAME "skylakex" #define CORENAME "SKYLAKEX" #endif @@ -376,7 +376,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "haswell" #define CORENAME "HASWELL" #else @@ -389,7 +389,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ - "-DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake" + "-DHAVE_AVX2 -DHAVE_FMA3 -DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake" #define LIBNAME "cooperlake" #define CORENAME "COOPERLAKE" #endif @@ -559,7 +559,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ - "-DHAVE_AVX -DHAVE_FMA3 -DFMA3" + "-DHAVE_AVX -DHAVE_AVX2 -DHAVE_FMA3 -DFMA3" #define LIBNAME "zen" #define CORENAME "ZEN" #endif From c4c591ac5afc10b5619d1c58b10d5095dc82a2ff Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Tue, 10 Nov 2020 16:16:38 +0800 Subject: [PATCH 049/121] fix sum optimize issues --- kernel/arm/sum.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/kernel/arm/sum.c b/kernel/arm/sum.c index 63584b95c..a486a1868 100644 --- a/kernel/arm/sum.c +++ b/kernel/arm/sum.c @@ -42,24 +42,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) n *= inc_x; if (inc_x == 1) { -#if V_SIMD +#if V_SIMD && (!defined(DOUBLE) || (defined(DOUBLE) && V_SIMD_F64 && V_SIMD > 128)) #ifdef DOUBLE const int vstep = v_nlanes_f64; - const int unrollx2 = n & (-vstep * 2); + const int unrollx4 = n & (-vstep * 4); const int unrollx = n & -vstep; v_f64 vsum0 = v_zero_f64(); v_f64 vsum1 = v_zero_f64(); - while (i < unrollx2) + v_f64 vsum2 = v_zero_f64(); + v_f64 vsum3 = v_zero_f64(); + for (; i < unrollx4; i += vstep * 4) { - vsum0 = v_add_f64(vsum0, v_loadu_f64(x)); - vsum1 = v_add_f64(vsum1, v_loadu_f64(x + vstep)); - i += vstep * 2; + vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i)); + vsum1 = v_add_f64(vsum1, v_loadu_f64(x + i + vstep)); + vsum2 = v_add_f64(vsum2, v_loadu_f64(x + i + vstep * 2)); + vsum3 = v_add_f64(vsum3, v_loadu_f64(x + i + vstep * 3)); } - vsum0 = v_add_f64(vsum0, vsum1); - while (i < unrollx) + vsum0 = v_add_f64( + v_add_f64(vsum0, vsum1), v_add_f64(vsum2, vsum3)); + for (; i < unrollx; i += vstep) { vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i)); - i += vstep; } sumf = v_sum_f64(vsum0); #else @@ -70,20 +73,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_f32 vsum1 = v_zero_f32(); v_f32 vsum2 
= v_zero_f32(); v_f32 vsum3 = v_zero_f32(); - while (i < unrollx4) + for (; i < unrollx4; i += vstep * 4) { - vsum0 = v_add_f32(vsum0, v_loadu_f32(x)); - vsum1 = v_add_f32(vsum1, v_loadu_f32(x + vstep)); - vsum2 = v_add_f32(vsum2, v_loadu_f32(x + vstep * 2)); - vsum3 = v_add_f32(vsum3, v_loadu_f32(x + vstep * 3)); - i += vstep * 4; + vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i)); + vsum1 = v_add_f32(vsum1, v_loadu_f32(x + i + vstep)); + vsum2 = v_add_f32(vsum2, v_loadu_f32(x + i + vstep * 2)); + vsum3 = v_add_f32(vsum3, v_loadu_f32(x + i + vstep * 3)); } vsum0 = v_add_f32( v_add_f32(vsum0, vsum1), v_add_f32(vsum2, vsum3)); - while (i < unrollx) + for (; i < unrollx; i += vstep) { vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i)); - i += vstep; } sumf = v_sum_f32(vsum0); #endif From 8c0b206d4cf9909017a52919a41406ee303f472e Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Wed, 11 Nov 2020 14:33:12 +0800 Subject: [PATCH 050/121] Optimize the performance of rot by using universal intrinsics --- kernel/simd/intrin_avx.h | 10 ++++++ kernel/simd/intrin_avx512.h | 5 +++ kernel/simd/intrin_neon.h | 10 ++++++ kernel/simd/intrin_sse.h | 13 +++++++ kernel/x86_64/drot.c | 68 ++++++++++++++++++++++++++++++++++- kernel/x86_64/srot.c | 70 ++++++++++++++++++++++++++++++++++++- 6 files changed, 174 insertions(+), 2 deletions(-) diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h index 3f79646e0..fbe531417 100644 --- a/kernel/simd/intrin_avx.h +++ b/kernel/simd/intrin_avx.h @@ -12,6 +12,8 @@ typedef __m256d v_f64; ***************************/ #define v_add_f32 _mm256_add_ps #define v_add_f64 _mm256_add_pd +#define v_sub_f32 _mm256_sub_ps +#define v_sub_f64 _mm256_sub_pd #define v_mul_f32 _mm256_mul_ps #define v_mul_f64 _mm256_mul_pd @@ -19,12 +21,20 @@ typedef __m256d v_f64; // multiply and add, a*b + c #define v_muladd_f32 _mm256_fmadd_ps #define v_muladd_f64 _mm256_fmadd_pd + // multiply and subtract, a*b - c + #define v_mulsub_f32 _mm256_fmsub_ps + #define v_mulsub_f64 
_mm256_fmsub_pd #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) { return v_add_f64(v_mul_f64(a, b), c); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return v_sub_f32(v_mul_f32(a, b), c); } + BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c) + { return v_sub_f64(v_mul_f64(a, b), c); } #endif // !HAVE_FMA3 // Horizontal add: Calculates the sum of all vector elements. diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h index f00af53e9..8f38eedd9 100644 --- a/kernel/simd/intrin_avx512.h +++ b/kernel/simd/intrin_avx512.h @@ -12,11 +12,16 @@ typedef __m512d v_f64; ***************************/ #define v_add_f32 _mm512_add_ps #define v_add_f64 _mm512_add_pd +#define v_sub_f32 _mm512_sub_ps +#define v_sub_f64 _mm512_sub_pd #define v_mul_f32 _mm512_mul_ps #define v_mul_f64 _mm512_mul_pd // multiply and add, a*b + c #define v_muladd_f32 _mm512_fmadd_ps #define v_muladd_f64 _mm512_fmadd_pd +// multiply and subtract, a*b - c +#define v_mulsub_f32 _mm512_fmsub_ps +#define v_mulsub_f64 _mm512_fmsub_pd BLAS_FINLINE float v_sum_f32(v_f32 a) { __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); diff --git a/kernel/simd/intrin_neon.h b/kernel/simd/intrin_neon.h index 22cef10ca..cd44599fe 100644 --- a/kernel/simd/intrin_neon.h +++ b/kernel/simd/intrin_neon.h @@ -18,6 +18,8 @@ typedef float32x4_t v_f32; ***************************/ #define v_add_f32 vaddq_f32 #define v_add_f64 vaddq_f64 +#define v_sub_f32 vsubq_f32 +#define v_sub_f64 vsubq_f64 #define v_mul_f32 vmulq_f32 #define v_mul_f64 vmulq_f64 @@ -26,16 +28,24 @@ typedef float32x4_t v_f32; // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return vfmaq_f32(c, a, b); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, 
v_f32 b, v_f32 c) + { return vfmaq_f32(vnegq_f32(c), a, b); } #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return vmlaq_f32(c, a, b); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return vmlaq_f32(vnegq_f32(c), a, b); } #endif // FUSED F64 #if V_SIMD_F64 BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) { return vfmaq_f64(c, a, b); } + BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c) + { return vfmaq_f64(vnegq_f64(c), a, b); } #endif // Horizontal add: Calculates the sum of all vector elements. diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 06a3fe78b..6a542072e 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -12,22 +12,35 @@ typedef __m128d v_f64; ***************************/ #define v_add_f32 _mm_add_ps #define v_add_f64 _mm_add_pd +#define v_sub_f32 _mm_sub_ps +#define v_sub_f64 _mm_sub_pd #define v_mul_f32 _mm_mul_ps #define v_mul_f64 _mm_mul_pd #ifdef HAVE_FMA3 // multiply and add, a*b + c #define v_muladd_f32 _mm_fmadd_ps #define v_muladd_f64 _mm_fmadd_pd + // multiply and subtract, a*b - c + #define v_mulsub_f32 _mm_fmsub_ps + #define v_mulsub_f64 _mm_fmsub_pd #elif defined(HAVE_FMA4) // multiply and add, a*b + c #define v_muladd_f32 _mm_macc_ps #define v_muladd_f64 _mm_macc_pd + // multiply and subtract, a*b - c + #define v_mulsub_f32 _mm_msub_ps + #define v_mulsub_f64 _mm_msub_pd #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) { return v_add_f64(v_mul_f64(a, b), c); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return v_sub_f32(v_mul_f32(a, b), c); } + BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c) + { return v_sub_f64(v_mul_f64(a, b), c); } #endif // HAVE_FMA3 // 
Horizontal add: Calculates the sum of all vector elements. diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c index a312b7ff9..66e9ff907 100644 --- a/kernel/x86_64/drot.c +++ b/kernel/x86_64/drot.c @@ -7,10 +7,76 @@ #endif #ifndef HAVE_DROT_KERNEL +#include "../simd/intrin.h" static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) { BLASLONG i = 0; +#if V_SIMD_F64 && V_SIMD > 256 + const int vstep = v_nlanes_f64; + const int unrollx4 = n & (-vstep * 4); + const int unrollx = n & -vstep; + + v_f64 __c = v_setall_f64(c); + v_f64 __s = v_setall_f64(s); + v_f64 vx0, vx1, vx2, vx3; + v_f64 vy0, vy1, vy2, vy3; + v_f64 vt0, vt1, vt2, vt3; + + for (; i < unrollx4; i += vstep * 4) { + vx0 = v_loadu_f64(x + i); + vx1 = v_loadu_f64(x + i + vstep); + vx2 = v_loadu_f64(x + i + vstep * 2); + vx3 = v_loadu_f64(x + i + vstep * 3); + vy0 = v_loadu_f64(y + i); + vy1 = v_loadu_f64(y + i + vstep); + vy2 = v_loadu_f64(y + i + vstep * 2); + vy3 = v_loadu_f64(y + i + vstep * 3); + + vt0 = v_mul_f64(__s, vy0); + vt1 = v_mul_f64(__s, vy1); + vt2 = v_mul_f64(__s, vy2); + vt3 = v_mul_f64(__s, vy3); + + vt0 = v_muladd_f64(__c, vx0, vt0); + vt1 = v_muladd_f64(__c, vx1, vt1); + vt2 = v_muladd_f64(__c, vx2, vt2); + vt3 = v_muladd_f64(__c, vx3, vt3); + + v_storeu_f64(x + i, vt0); + v_storeu_f64(x + i + vstep, vt1); + v_storeu_f64(x + i + vstep * 2, vt2); + v_storeu_f64(x + i + vstep * 3, vt3); + + vt0 = v_mul_f64(__s, vx0); + vt1 = v_mul_f64(__s, vx1); + vt2 = v_mul_f64(__s, vx2); + vt3 = v_mul_f64(__s, vx3); + + vt0 = v_mulsub_f64(__c, vy0, vt0); + vt1 = v_mulsub_f64(__c, vy1, vt1); + vt2 = v_mulsub_f64(__c, vy2, vt2); + vt3 = v_mulsub_f64(__c, vy3, vt3); + + v_storeu_f64(y + i, vt0); + v_storeu_f64(y + i + vstep, vt1); + v_storeu_f64(y + i + vstep * 2, vt2); + v_storeu_f64(y + i + vstep * 3, vt3); + } + + for (; i < unrollx; i += vstep) { + vx0 = v_loadu_f64(x + i); + vy0 = v_loadu_f64(y + i); + + vt0 = v_mul_f64(__s, vy0); + vt0 = v_muladd_f64(__c, vx0, vt0); + 
v_storeu_f64(x + i, vt0); + + vt0 = v_mul_f64(__s, vx0); + vt0 = v_mulsub_f64(__c, vy0, vt0); + v_storeu_f64(y + i, vt0); + } +#else FLOAT f0, f1, f2, f3; FLOAT x0, x1, x2, x3; FLOAT g0, g1, g2, g3; @@ -53,7 +119,7 @@ static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) yp += 4; i += 4; } - +#endif while (i < n) { FLOAT temp = c*x[i] + s*y[i]; y[i] = c*y[i] - s*x[i]; diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c index 021c20d82..d9583cdfa 100644 --- a/kernel/x86_64/srot.c +++ b/kernel/x86_64/srot.c @@ -7,10 +7,78 @@ #endif #ifndef HAVE_SROT_KERNEL +#include"../simd/intrin.h" static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) { BLASLONG i = 0; + +#if V_SIMD + const int vstep = v_nlanes_f32; + const int unrollx4 = n & (-vstep * 4); + const int unrollx = n & -vstep; + + v_f32 __c = v_setall_f32(c); + v_f32 __s = v_setall_f32(s); + v_f32 vx0, vx1, vx2, vx3; + v_f32 vy0, vy1, vy2, vy3; + v_f32 vt0, vt1, vt2, vt3; + + for (; i < unrollx4; i += vstep * 4) { + vx0 = v_loadu_f32(x + i); + vx1 = v_loadu_f32(x + i + vstep); + vx2 = v_loadu_f32(x + i + vstep * 2); + vx3 = v_loadu_f32(x + i + vstep * 3); + vy0 = v_loadu_f32(y + i); + vy1 = v_loadu_f32(y + i + vstep); + vy2 = v_loadu_f32(y + i + vstep * 2); + vy3 = v_loadu_f32(y + i + vstep * 3); + + vt0 = v_mul_f32(__s, vy0); + vt1 = v_mul_f32(__s, vy1); + vt2 = v_mul_f32(__s, vy2); + vt3 = v_mul_f32(__s, vy3); + + vt0 = v_muladd_f32(__c, vx0, vt0); + vt1 = v_muladd_f32(__c, vx1, vt1); + vt2 = v_muladd_f32(__c, vx2, vt2); + vt3 = v_muladd_f32(__c, vx3, vt3); + + v_storeu_f32(x + i, vt0); + v_storeu_f32(x + i + vstep, vt1); + v_storeu_f32(x + i + vstep * 2, vt2); + v_storeu_f32(x + i + vstep * 3, vt3); + + vt0 = v_mul_f32(__s, vx0); + vt1 = v_mul_f32(__s, vx1); + vt2 = v_mul_f32(__s, vx2); + vt3 = v_mul_f32(__s, vx3); + + vt0 = v_mulsub_f32(__c, vy0, vt0); + vt1 = v_mulsub_f32(__c, vy1, vt1); + vt2 = v_mulsub_f32(__c, vy2, vt2); + vt3 = v_mulsub_f32(__c, vy3, vt3); + + 
v_storeu_f32(y + i, vt0); + v_storeu_f32(y + i + vstep, vt1); + v_storeu_f32(y + i + vstep * 2, vt2); + v_storeu_f32(y + i + vstep * 3, vt3); + + } + + for (; i < unrollx; i += vstep) { + vx0 = v_loadu_f32(x + i); + vy0 = v_loadu_f32(y + i); + + vt0 = v_mul_f32(__s, vy0); + vt0 = v_muladd_f32(__c, vx0, vt0); + v_storeu_f32(x + i, vt0); + + vt0 = v_mul_f32(__s, vx0); + vt0 = v_mulsub_f32(__c, vy0, vt0); + v_storeu_f32(y + i, vt0); + } +#else FLOAT f0, f1, f2, f3; FLOAT x0, x1, x2, x3; FLOAT g0, g1, g2, g3; @@ -20,7 +88,6 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) FLOAT* yp = y; BLASLONG n1 = n & (~7); - while (i < n1) { x0 = xp[0]; y0 = yp[0]; @@ -53,6 +120,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) yp += 4; i += 4; } +#endif while (i < n) { FLOAT temp = c*x[i] + s*y[i]; From 5bc0a7583fed3328f176b69419ae12a063f2f4e0 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Wed, 11 Nov 2020 15:18:01 +0800 Subject: [PATCH 051/121] only FMA3 and vector larger than 128 have positive effects. 
--- kernel/x86_64/srot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c index d9583cdfa..4273f7fe7 100644 --- a/kernel/x86_64/srot.c +++ b/kernel/x86_64/srot.c @@ -13,7 +13,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) { BLASLONG i = 0; -#if V_SIMD +#if V_SIMD && (HAVE_FMA3 || V_SIMD > 128) const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); const int unrollx = n & -vstep; From a87e537b8cd5844159dd5806204470a945be695d Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Wed, 11 Nov 2020 15:53:48 +0800 Subject: [PATCH 052/121] modify macro --- kernel/x86_64/srot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c index 4273f7fe7..3de586cb8 100644 --- a/kernel/x86_64/srot.c +++ b/kernel/x86_64/srot.c @@ -13,7 +13,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) { BLASLONG i = 0; -#if V_SIMD && (HAVE_FMA3 || V_SIMD > 128) +#if V_SIMD && (defined(HAVE_FMA3) || V_SIMD > 128) const int vstep = v_nlanes_f32; const int unrollx4 = n & (-vstep * 4); const int unrollx = n & -vstep; From e5c2ceb6750c4e649aef87e06bd87ed4fcbdc6a5 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 12 Nov 2020 17:35:17 +0800 Subject: [PATCH 053/121] fix the CI failure of lack the head --- kernel/simd/intrin.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/simd/intrin.h b/kernel/simd/intrin.h index ef8fcb865..3802a91e1 100644 --- a/kernel/simd/intrin.h +++ b/kernel/simd/intrin.h @@ -47,7 +47,7 @@ extern "C" { #endif /** AVX **/ -#ifdef HAVE_AVX +#if defined(HAVE_AVX) || defined(HAVE_FMA3) #include #endif From e0dac6b53b27b2d79404577d17fdee8b2303e123 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Thu, 12 Nov 2020 20:31:03 +0800 Subject: [PATCH 054/121] fix the CI failure of target specific option mismatch --- kernel/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/kernel/Makefile b/kernel/Makefile index fb1d5d39a..fd9105fee 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,6 +5,10 @@ endif TOPDIR = .. include $(TOPDIR)/Makefile.system +ifdef HAVE_FMA3 +CFLAGS += -mfma +endif + ifeq ($(ARCH), power) ifeq ($(C_COMPILER), CLANG) override CFLAGS += -fno-integrated-as From ae0b1dea19bf836fb0c8af3630ccfcbbf4b8e37f Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Fri, 13 Nov 2020 10:20:24 +0800 Subject: [PATCH 055/121] modify system.cmake to enable fma flag --- cmake/system.cmake | 2 +- kernel/Makefile | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 66e95c6d3..68df2d900 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -174,7 +174,7 @@ if (DEFINED TARGET) endif() if (DEFINED HAVE_AVX) if (NOT NO_AVX) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx -mfma") endif() endif() if (DEFINED HAVE_AVX2) diff --git a/kernel/Makefile b/kernel/Makefile index fd9105fee..fb1d5d39a 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -5,10 +5,6 @@ endif TOPDIR = .. 
include $(TOPDIR)/Makefile.system -ifdef HAVE_FMA3 -CFLAGS += -mfma -endif - ifeq ($(ARCH), power) ifeq ($(C_COMPILER), CLANG) override CFLAGS += -fno-integrated-as From d6e7e05bb36d77f26274abf7d8be03dd2bd78c1d Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Fri, 13 Nov 2020 14:20:52 +0800 Subject: [PATCH 056/121] Improve the performance of dasum and sasum when SMP is defined --- kernel/x86_64/dasum.c | 66 +++++++++++++++++++++++++++++++++++++------ kernel/x86_64/sasum.c | 59 ++++++++++++++++++++++++++++++++++---- 2 files changed, 110 insertions(+), 15 deletions(-) diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c index 8a40ea4b9..ddec21383 100644 --- a/kernel/x86_64/dasum.c +++ b/kernel/x86_64/dasum.c @@ -58,21 +58,19 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) } #endif - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0; + BLASLONG i = 0; FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return (sumf); - if (n <= 0 || inc_x <= 0) return(sumf); - - if ( inc_x == 1 ) { + if (inc_x == 1) { sumf = dasum_kernel(n, x); - } + } else { n *= inc_x; - - while(i < n) { + while (i < n) { sumf += ABS_K(x[i]); i += inc_x; } @@ -80,3 +78,53 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) return(sumf); } +#if defined(SMP) +static int asum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5) +{ + *(FLOAT *)result = asum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; + FLOAT * dummy_b; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = 
num_cpu_avail(1); + if (n <= 100000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/100000 ? num_cpu : n/100000; + + if (nthreads == 1) { + sumf = asum_compute(n, x, inc_x); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT *ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_REAL; +#else + mode = BLAS_DOUBLE | BLAS_REAL; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, dummy_b, 0, result, 0, (void *)asum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2); + } + } +#else + sumf = asum_compute(n, x, inc_x); +#endif + return(sumf); +} + diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c index 36ec4a737..d0cea9bee 100644 --- a/kernel/x86_64/sasum.c +++ b/kernel/x86_64/sasum.c @@ -67,24 +67,71 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) #endif -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +static FLOAT asum_compute(BLASLONG n, FLOAT * x, BLASLONG inc_x) { - BLASLONG i=0; + BLASLONG i = 0; FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return (sumf); - if (n <= 0 || inc_x <= 0) return(sumf); - - if ( inc_x == 1 ) { + if (inc_x == 1) { sumf = sasum_kernel(n, x); } else { - n *= inc_x; while(i < n) { sumf += ABS_K(x[i]); i += inc_x; } + } + return (sumf); +} +#if defined(SMP) +static int asum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5) +{ + *(FLOAT *)result = asum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int(*function)(), int nthreads); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + 
FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); + if (n <= 100000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/100000 ? num_cpu : n/100000; + if (nthreads == 1) { + sumf = asum_compute(n, x, inc_x); } + else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT * ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_REAL; +#else + mode = BLAS_DOUBLE | BLAS_REAL; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + sumf = asum_compute(n, x, inc_x); +#endif return(sumf); } From ec4d77c47c46358521c3b38e42eb8bfebcb94ec3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 13 Nov 2020 09:16:34 +0100 Subject: [PATCH 057/121] Add -mfma for HAVE_FMA3 in the non-DYNAMIC_ARCH case as well --- cmake/cc.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/cc.cmake b/cmake/cc.cmake index b963940d6..76952152b 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -124,6 +124,9 @@ if (NOT DYNAMIC_ARCH) if (HAVE_AVX) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx") endif () + if (HAVE_FMA3) + set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") + endif () if (HAVE_SSE) set (CCOMMON_OPT "${CCOMMON_OPT} -msse") endif () From b00a0de1323732a1b82c15bc4f0b0bac3e01c262 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 16 Nov 2020 09:14:56 +0800 Subject: [PATCH 058/121] remove the -mfma flag in when the host has AVX. 
--- cmake/system.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system.cmake b/cmake/system.cmake index 68df2d900..66e95c6d3 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -174,7 +174,7 @@ if (DEFINED TARGET) endif() if (DEFINED HAVE_AVX) if (NOT NO_AVX) - set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx -mfma") + set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx") endif() endif() if (DEFINED HAVE_AVX2) From fdf71d66b3799f730bae282edf84345ccdf7c21b Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 19 Nov 2020 20:50:42 +1100 Subject: [PATCH 059/121] POWER10: Fix ld version detection LDVERSIONGTEQ35 needs to escape the '>' character. LDVERSIONGTEQ35 is checking the system ld version which may be different to the toolchain being used to compile OpenBLAS. We don't have a path to the linker in our Makefiles, so (ab)use gcc -Wl,--version to get the version of ld in our toolchain. --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index aae7ba503..6ee8beff8 100644 --- a/Makefile.system +++ b/Makefile.system @@ -672,7 +672,7 @@ DYNAMIC_CORE += POWER9 else $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif -LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` >= 35) +LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35) ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11) DYNAMIC_CORE += POWER10 CCOMMON_OPT += -DHAVE_P10_SUPPORT From 043f3d6faa797e0fe79c165b0a31acf0cf8f2b38 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Thu, 19 Nov 2020 21:04:10 +1100 Subject: [PATCH 060/121] POWER10: Use POWER9 as a fallback If the toolchain is too old, or the mma features isn't set on a POWER10 fall back to the POWER9 loops. 
--- driver/others/dynamic_power.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 85fc5b3ba..d60ae68fc 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -52,6 +52,9 @@ static gotoblas_t *get_coretype(void) { if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) return &gotoblas_POWER10; #endif + /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ + if (__builtin_cpu_is("power10")) + return &gotoblas_POWER9; return NULL; } From 60005eb47b5d30dcf35edff8c824a9f9fd9f6e6c Mon Sep 17 00:00:00 2001 From: Alexander Grund Date: Thu, 19 Nov 2020 14:39:00 +0100 Subject: [PATCH 061/121] Don't overwrite blas_thread_buffer if already set After a fork it is possible that blas_thread_buffer has already allocated memory buffers: goto_set_num_threads does allocate those already and it may be called by num_cpu_avail in case the OpenBLAS NUM_THREADS differ from the OMP num threads. This leads to a memory leak which can cause subsequent execution of BLAS kernels to fail. 
Fixes #2993 --- driver/others/blas_server_omp.c | 48 +++++++++++++++------------------ 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index a8b3e9a4b..a576127aa 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -76,10 +76,28 @@ static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; #endif -void goto_set_num_threads(int num_threads) { +static void adjust_thread_buffers() { int i=0, j=0; + //adjust buffer for each thread + for(i=0; i < MAX_PARALLEL_NUMBER; i++) { + for(j=0; j < blas_cpu_number; j++){ + if(blas_thread_buffer[i][j] == NULL){ + blas_thread_buffer[i][j] = blas_memory_alloc(2); + } + } + for(; j < MAX_CPU_NUMBER; j++){ + if(blas_thread_buffer[i][j] != NULL){ + blas_memory_free(blas_thread_buffer[i][j]); + blas_thread_buffer[i][j] = NULL; + } + } + } +} + +void goto_set_num_threads(int num_threads) { + if (num_threads < 1) num_threads = blas_num_threads; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; @@ -92,20 +110,7 @@ void goto_set_num_threads(int num_threads) { omp_set_num_threads(blas_cpu_number); - //adjust buffer for each thread - for(i=0; i Date: Thu, 19 Nov 2020 15:24:57 +0100 Subject: [PATCH 062/121] Add reproducer test for crash after fork See #2993 for an analysis --- utest/CMakeLists.txt | 6 +- utest/Makefile | 3 +- utest/test_fork.c | 4 +- utest/test_post_fork.c | 131 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 140 insertions(+), 4 deletions(-) create mode 100644 utest/test_post_fork.c diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index dc5175fc5..357e61301 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -27,13 +27,17 @@ endif () # known to hang with the native Windows and Android threads # FIXME needs checking if this works on any of the other platforms -if (NOT USE_OPENMP) if (OS_CYGWIN_NT OR OS_LINUX) +if (NOT 
USE_OPENMP) set(OpenBLAS_utest_src ${OpenBLAS_utest_src} test_fork.c ) endif() +set(OpenBLAS_utest_src + ${OpenBLAS_utest_src} + test_post_fork.c + ) endif() if (NOT NO_LAPACK) diff --git a/utest/Makefile b/utest/Makefile index 31d4ccf00..ac8c6f72a 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -25,10 +25,11 @@ endif #this does not work with OpenMP nor with native Windows or Android threads # FIXME TBD if this works on OSX, SunOS, POWER and zarch -ifndef USE_OPENMP ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT)) +ifneq ($(USE_OPENMP), 1) OBJS += test_fork.o endif +OBJS += test_post_fork.o endif ifeq ($(C_COMPILER), PGI) diff --git a/utest/test_fork.c b/utest/test_fork.c index 5c976f920..bd531e7fb 100644 --- a/utest/test_fork.c +++ b/utest/test_fork.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "openblas_utest.h" -void* xmalloc(size_t n) +static void* xmalloc(size_t n) { void* tmp; tmp = malloc(n); @@ -49,7 +49,7 @@ void* xmalloc(size_t n) } #ifdef BUILD_DOUBLE -void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) +static void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) { char trans1 = 'T'; char trans2 = 'N'; diff --git a/utest/test_post_fork.c b/utest/test_post_fork.c new file mode 100644 index 000000000..9370a02ce --- /dev/null +++ b/utest/test_post_fork.c @@ -0,0 +1,131 @@ +/***************************************************************************** +Copyright (c) 2011-2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. 
Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +**********************************************************************************/ + +#include +#include +#include +#ifdef USE_OPENMP +#include +#endif +#include "openblas_utest.h" + +static void* xmalloc(size_t n) +{ + void* tmp; + tmp = malloc(n); + if (tmp == NULL) { + fprintf(stderr, "You are about to die\n"); + exit(1); + } else { + return tmp; + } +} + +#ifdef BUILD_DOUBLE +static void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) +{ + char trans1 = 'T'; + char trans2 = 'N'; + double zerod = 0, oned = 1; + int i; + BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, result, &n); + for(i = 0; i < n * n; ++i) { + ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS); + } +} +#endif + +CTEST(fork, safety_after_fork_in_parent) +{ +#ifndef BUILD_DOUBLE +exit(0); +#else + blasint n = 100; + int i, nthreads_omp; + + double *a, *b, *c, *d; + size_t n_bytes; + + pid_t fork_pid; + + n_bytes = sizeof(*a) * n * n; + + a = xmalloc(n_bytes); + b = xmalloc(n_bytes); + c = xmalloc(n_bytes); + d = xmalloc(n_bytes); + + // Put ones in a, b and n in c (result) + for(i = 0; i < n * n; ++i) { + a[i] = 1; + b[i] = 1; + c[i] = 1 * n; + } + + // Test that OpenBLAS works after a fork. + // This situation routinely happens with Pythons numpy where a + // `sys.platform` calls `uname` in a forked process. + // So we simulate this situation here. + + // There was an issue where a different number of OpenBLAS and OpenMP + // threads triggered a memory leak. So run this multiple times + // with different number of threads set. +#ifdef USE_OPENMP + nthreads_omp = omp_get_max_threads(); + // Run with half the max OMP threads, the max threads and twice that + for(i = (nthreads_omp + 1) / 2; i <= nthreads_omp * 2; i *= 2) { + omp_set_num_threads(i); +#endif + + fork_pid = fork(); + if (fork_pid == -1) { + CTEST_ERR("Failed to fork process."); + } else if (fork_pid == 0) { + // Just pretend to do something, e.g. 
call `uname`, then exit + exit(0); + } else { + // Wait for the child to finish and check the exit code. + int child_status = 0; + pid_t wait_pid = wait(&child_status); + ASSERT_EQUAL(wait_pid, fork_pid); + ASSERT_EQUAL(0, WEXITSTATUS (child_status)); + + // Now OpenBLAS has to work + check_dgemm(a, b, d, c, n); + } +#ifdef USE_OPENMP + } +#endif + +#endif +} From c6c9c24d1b64430033e733c7341a5d37c79e4668 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sun, 22 Nov 2020 16:02:19 +0800 Subject: [PATCH 063/121] Update doc for C910. --- README.md | 7 +++++++ TargetList.txt | 2 ++ 2 files changed, 9 insertions(+) diff --git a/README.md b/README.md index ca034e747..267df5358 100644 --- a/README.md +++ b/README.md @@ -172,6 +172,13 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **Z13**: Optimized Level-3 BLAS and Level-1,2 - **Z14**: Optimized Level-3 BLAS and (single precision) Level-1,2 +#### RISC-V + +- **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. + ```sh + make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran + ``` + ### Support for multiple targets in a single library OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. 
diff --git a/TargetList.txt b/TargetList.txt index 86177ebca..d19964916 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -107,3 +107,5 @@ Z14 10.RISC-V 64: RISCV64_GENERIC +C910V + From 8a6b17f97dae84fe935d049761399b4dac59652e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 16:19:31 +0100 Subject: [PATCH 064/121] Change ifndefs to ifneq --- ctest/Makefile | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ctest/Makefile b/ctest/Makefile index cba904f75..2a893cae8 100644 --- a/ctest/Makefile +++ b/ctest/Makefile @@ -61,7 +61,7 @@ endif all1: $(all1targets) -ifndef CROSS +ifneq ($(CROSS), 1) ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./xscblat1 @@ -106,7 +106,7 @@ endif all2: $(all2targets) -ifndef CROSS +ifneq ($(CROSS), 1) ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./xscblat2 < sin2 @@ -152,7 +152,7 @@ endif all3: $(all3targets) -ifndef CROSS +ifneq ($(CROSS), 1) ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_SINGLE),1) OMP_NUM_THREADS=2 ./xscblat3 < sin3 From 65eb7afaf42450f3073bfc89ed4029e2ee21d61f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 16:25:36 +0100 Subject: [PATCH 065/121] Change ifndef CROSS to ifneq --- test/Makefile | 360 +++++++++++++++++++------------------------------- 1 file changed, 133 insertions(+), 227 deletions(-) diff --git a/test/Makefile b/test/Makefile index 1ecce0be7..2a893cae8 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,269 +1,211 @@ -TOPDIR = .. -include ../Makefile.system +# +# The Makefile compiles c wrappers and testers for CBLAS. +# + +TOPDIR = .. 
+include $(TOPDIR)/Makefile.system + +override CFLAGS += -DADD$(BU) -DCBLAS +override TARGET_ARCH= +override TARGET_MACH= + +LIB = $(TOPDIR)/$(LIBNAME) + +stestl1o = c_sblas1.o + +stestl2o = c_sblas2.o c_s2chke.o auxiliary.o c_xerbla.o constant.o + +stestl3o = c_sblas3.o c_s3chke.o auxiliary.o c_xerbla.o constant.o + +dtestl1o = c_dblas1.o + +dtestl2o = c_dblas2.o c_d2chke.o auxiliary.o c_xerbla.o constant.o + +dtestl3o = c_dblas3.o c_d3chke.o auxiliary.o c_xerbla.o constant.o + +ctestl1o = c_cblas1.o + +ctestl2o = c_cblas2.o c_c2chke.o auxiliary.o c_xerbla.o constant.o + +ctestl3o = c_cblas3.o c_c3chke.o auxiliary.o c_xerbla.o constant.o + +ctestl3o_3m = c_cblas3_3m.o c_c3chke_3m.o auxiliary.o c_xerbla.o constant.o + +ztestl1o = c_zblas1.o + +ztestl2o = c_zblas2.o c_z2chke.o auxiliary.o c_xerbla.o constant.o + +ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o + +ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o ifeq ($(NOFORTRAN),1) all :: else -all :: level1 level2 level3 +all :: all1 all2 all3 endif ifeq ($(BUILD_SINGLE),1) -S1=sblat1 +all1targets += xscblat1 endif ifeq ($(BUILD_DOUBLE),1) -D1=dblat1 +all1targets += xdcblat1 endif ifeq ($(BUILD_COMPLEX),1) -C1=cblat1 +all1targets += xccblat1 endif ifeq ($(BUILD_COMPLEX16),1) -Z1=zblat1 +all1targets += xzcblat1 endif -level1: $(S1) $(D1) $(C1) $(Z1) +all1: $(all1targets) -ifndef CROSS -ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1 -endif -ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1 -endif -ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1 -endif -ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat1 -endif -ifdef SMP +ifneq ($(CROSS), 1) ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_SINGLE),1) - OMP_NUM_THREADS=2 ./sblat1 + OMP_NUM_THREADS=2 ./xscblat1 endif ifeq ($(BUILD_DOUBLE),1) - OMP_NUM_THREADS=2 ./dblat1 -endif + OMP_NUM_THREADS=2 ./xdcblat1 +endif ifeq 
($(BUILD_COMPLEX),1) - OMP_NUM_THREADS=2 ./cblat1 + OMP_NUM_THREADS=2 ./xccblat1 endif ifeq ($(BUILD_COMPLEX16),1) - OMP_NUM_THREADS=2 ./zblat1 + OMP_NUM_THREADS=2 ./xzcblat1 endif else ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=2 ./sblat1 + OPENBLAS_NUM_THREADS=2 ./xscblat1 endif ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=2 ./dblat1 + OPENBLAS_NUM_THREADS=2 ./xdcblat1 endif ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=2 ./cblat1 + OPENBLAS_NUM_THREADS=2 ./xccblat1 endif ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=2 ./zblat1 -endif + OPENBLAS_NUM_THREADS=2 ./xzcblat1 endif endif endif ifeq ($(BUILD_SINGLE),1) -S2=sblat2 +all2targets += xscblat2 endif ifeq ($(BUILD_DOUBLE),1) -D2=dblat2 +all2targets += xdcblat2 endif ifeq ($(BUILD_COMPLEX),1) -C2=cblat2 +all2targets += xccblat2 endif ifeq ($(BUILD_COMPLEX16),1) -Z2=zblat2 +all2targets += xzcblat2 endif -level2: $(S2) $(D2) $(C2) $(Z2) +all2: $(all2targets) - -ifndef CROSS - rm -f ?BLAT2.SUMM -ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat - @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 -endif -ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat2 < ./dblat2.dat - @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 -endif -ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat2 < ./cblat2.dat - @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 -endif -ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat2 < ./zblat2.dat - @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 -endif -ifdef SMP - rm -f ?BLAT2.SUMM +ifneq ($(CROSS), 1) ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_SINGLE),1) - OMP_NUM_THREADS=2 ./sblat2 < ./sblat2.dat - @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 + OMP_NUM_THREADS=2 ./xscblat2 < sin2 endif ifeq ($(BUILD_DOUBLE),1) - OMP_NUM_THREADS=2 ./dblat2 < ./dblat2.dat - @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || 
exit 0 + OMP_NUM_THREADS=2 ./xdcblat2 < din2 endif ifeq ($(BUILD_COMPLEX),1) - OMP_NUM_THREADS=2 ./cblat2 < ./cblat2.dat - @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 + OMP_NUM_THREADS=2 ./xccblat2 < cin2 endif ifeq ($(BUILD_COMPLEX16),1) - OMP_NUM_THREADS=2 ./zblat2 < ./zblat2.dat - @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 + OMP_NUM_THREADS=2 ./xzcblat2 < zin2 endif else ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=2 ./sblat2 < ./sblat2.dat - @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 + OPENBLAS_NUM_THREADS=2 ./xscblat2 < sin2 endif ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=2 ./dblat2 < ./dblat2.dat - @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 + OPENBLAS_NUM_THREADS=2 ./xdcblat2 < din2 endif ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=2 ./cblat2 < ./cblat2.dat - @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 + OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2 endif ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=2 ./zblat2 < ./zblat2.dat - @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 -endif + OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2 endif endif endif -ifeq ($(BUILD_BFLOAT16),1) -B3= test_sbgemm -endif + ifeq ($(BUILD_SINGLE),1) -S3=sblat3 +all3targets += xscblat3 endif ifeq ($(BUILD_DOUBLE),1) -D3=dblat3 +all3targets += xdcblat3 endif ifeq ($(BUILD_COMPLEX),1) -C3=cblat3 +all3targets += xccblat3 endif ifeq ($(BUILD_COMPLEX16),1) -Z3=zblat3 +all3targets += xzcblat3 endif -level3: $(B3) $(S3) $(D3) $(C3) $(Z3) - +all3: $(all3targets) -ifndef CROSS - rm -f ?BLAT3.SUMM -ifeq ($(BUILD_BFLOAT16),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SBBLAT3.SUMM - @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 -endif -ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat - @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 -endif -ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat3 
< ./dblat3.dat - @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 -endif -ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3 < ./cblat3.dat - @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 -endif -ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat3 < ./zblat3.dat - @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 -endif -ifdef SMP - rm -f ?BLAT3.SUMM +ifneq ($(CROSS), 1) ifeq ($(USE_OPENMP), 1) -ifeq ($(BUILD_BFLOAT16),1) - OMP_NUM_THREADS=2 ./test_sbgemm > SBBLAT3.SUMM - @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 -endif ifeq ($(BUILD_SINGLE),1) - OMP_NUM_THREADS=2 ./sblat3 < ./sblat3.dat - @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 + OMP_NUM_THREADS=2 ./xscblat3 < sin3 endif ifeq ($(BUILD_DOUBLE),1) - OMP_NUM_THREADS=2 ./dblat3 < ./dblat3.dat - @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 + OMP_NUM_THREADS=2 ./xdcblat3 < din3 endif ifeq ($(BUILD_COMPLEX),1) - OMP_NUM_THREADS=2 ./cblat3 < ./cblat3.dat - @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 + OMP_NUM_THREADS=2 ./xccblat3 < cin3 endif ifeq ($(BUILD_COMPLEX16),1) - OMP_NUM_THREADS=2 ./zblat3 < ./zblat3.dat - @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 + OMP_NUM_THREADS=2 ./xzcblat3 < zin3 endif else -ifeq ($(BUILD_BFLOAT16),1) - OPENBLAS_NUM_THREADS=2 ./test_sbgemm > SBBLAT3.SUMM - @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 -endif ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=2 ./sblat3 < ./sblat3.dat - @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 + OPENBLAS_NUM_THREADS=2 ./xscblat3 < sin3 endif ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=2 ./dblat3 < ./dblat3.dat - @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 + OPENBLAS_NUM_THREADS=2 ./xdcblat3 < din3 endif ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=2 ./cblat3 < ./cblat3.dat - @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 + 
OPENBLAS_NUM_THREADS=2 ./xccblat3 < cin3 endif ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=2 ./zblat3 < ./zblat3.dat - @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 + OPENBLAS_NUM_THREADS=2 ./xzcblat3 < zin3 endif endif endif -endif - -level3_3m : zblat3_3m cblat3_3m -ifndef CROSS - rm -f ?BLAT3_3M.SUMM - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat - @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat3_3m < ./zblat3_3m.dat - @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 -ifdef SMP - rm -f ?BLAT3_3M.SUMM +all3_3m: xzcblat3_3m xccblat3_3m ifeq ($(USE_OPENMP), 1) - OMP_NUM_THREADS=2 ./cblat3_3m < ./cblat3_3m.dat - @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 - OMP_NUM_THREADS=2 ./zblat3_3m < ./zblat3_3m.dat - @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 +ifeq ($(BUILD_SINGLE),1) + OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m +endif +ifeq ($(BUILD_COMPLEX16),1) + OMP_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m +endif else - OPENBLAS_NUM_THREADS=2 ./cblat3_3m < ./cblat3_3m.dat - @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 - OPENBLAS_NUM_THREADS=2 ./zblat3_3m < ./zblat3_3m.dat - @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 +ifeq ($(BUILD_COMPLEX),1) + OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m endif +ifeq ($(BUILD_COMPLEX16),1) + OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m endif endif -FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) - -ifeq ($(CORE), C910V) -EXTRALIB = -CEXTRALIB = -endif +clean :: + rm -f x* +FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(C_COMPILER), CLANG) @@ -273,90 +215,54 @@ endif endif ifeq ($(BUILD_SINGLE),1) -sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +# Single real +xscblat1: $(stestl1o) c_sblat1.o 
$(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xscblat1 c_sblat1.o $(stestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -sblat2 : sblat2.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o sblat2 sblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +xscblat2: $(stestl2o) c_sblat2.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xscblat2 c_sblat2.o $(stestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -sblat3 : sblat3.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o sblat3 sblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +xscblat3: $(stestl3o) c_sblat3.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xscblat3 c_sblat3.o $(stestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) endif ifeq ($(BUILD_DOUBLE),1) -dblat1 : dblat1.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o dblat1 dblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - -dblat2 : dblat2.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o dblat2 dblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - -dblat3 : dblat3.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o dblat3 dblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -else -dblat2: -dblat3: +# Double real +xdcblat1: $(dtestl1o) c_dblat1.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xdcblat1 c_dblat1.o $(dtestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xdcblat2: $(dtestl2o) c_dblat2.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xdcblat2 c_dblat2.o $(dtestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xdcblat3: $(dtestl3o) c_dblat3.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xdcblat3 c_dblat3.o $(dtestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) endif -qblat1 : qblat1.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o qblat1 qblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - ifeq ($(BUILD_COMPLEX),1) -cblat1 : cblat1.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o cblat1 cblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - -cblat2 : cblat2.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o cblat2 cblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - -cblat3 : cblat3.$(SUFFIX) ../$(LIBNAME) - $(FC) 
$(FLDFLAGS) -o cblat3 cblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -endif - -ifeq ($(BUILD_COMPLEX16),1) -zblat1 : zblat1.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o zblat1 zblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) - -zblat2 : zblat2.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o zblat2 zblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +# Single complex +xccblat1: $(ctestl1o) c_cblat1.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xccblat1 c_cblat1.o $(ctestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) endif -ifeq ($(BUILD_BFLOAT16),1) -test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -endif - -ifeq ($(BUILD_COMPLEX),1) -cblat3_3m : cblat3_3m.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o cblat3_3m cblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -endif ifeq ($(BUILD_COMPLEX16),1) -zblat3_3m : zblat3_3m.$(SUFFIX) ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o zblat3_3m zblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -endif +# Double complex +xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) 
+ $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) + $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +endif -clean: - @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ - sblat1 dblat1 cblat1 zblat1 \ - sblat2 dblat2 cblat2 zblat2 \ - test_sbgemm sblat3 dblat3 cblat3 zblat3 \ - sblat1p dblat1p cblat1p zblat1p \ - sblat2p dblat2p cblat2p zblat2p \ - sblat3p dblat3p cblat3p zblat3p \ - zblat3_3m zblat3_3mp \ - cblat3_3m cblat3_3mp \ - *.stackdump *.dll - -libs: - -prof: - -quick : - $(MAKE) -C $(TOPDIR) libs - -# include ../Makefile.tail +include $(TOPDIR)/Makefile.tail From d3ff1f889fad96bf20cc3536bfab1c9ac58f4056 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 16:27:17 +0100 Subject: [PATCH 066/121] Convert ifndefs to ifneq --- driver/level3/Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/driver/level3/Makefile b/driver/level3/Makefile index b528dfa2d..78f32b961 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -206,7 +206,7 @@ ifdef SMP COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$(SUFFIX) gemm_thread_variable.$(SUFFIX) COMMONOBJS += syrk_thread.$(SUFFIX) -ifndef USE_SIMPLE_THREADED_LEVEL3 +ifneq ($(USE_SIMPLE_THREADED_LEVEL3), 1) ifeq ($(BUILD_BFLOAT16),1) SBBLASOBJS += sbgemm_thread_nn.$(SUFFIX) sbgemm_thread_nt.$(SUFFIX) sbgemm_thread_tn.$(SUFFIX) sbgemm_thread_tt.$(SUFFIX) endif @@ -282,7 +282,7 @@ HPLOBJS = \ dtrsm_RNUU.$(SUFFIX) dtrsm_RNUN.$(SUFFIX) dtrsm_RNLU.$(SUFFIX) dtrsm_RNLN.$(SUFFIX) \ dtrsm_RTUU.$(SUFFIX) dtrsm_RTUN.$(SUFFIX) dtrsm_RTLU.$(SUFFIX) dtrsm_RTLN.$(SUFFIX) -ifndef USE_SIMPLE_THREADED_LEVEL3 +ifneq ($(USE_SIMPLE_THREADED_LEVEL3), 1) HPLOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) \ dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) endif @@ 
-297,13 +297,13 @@ ifeq ($(BUILD_DOUBLE),1) strsm_RTUU.$(SUFFIX) strsm_RTUN.$(SUFFIX) strsm_RTLU.$(SUFFIX) strsm_RTLN.$(SUFFIX) \ ssyrk_UN.$(SUFFIX) ssyrk_UT.$(SUFFIX) ssyrk_LN.$(SUFFIX) ssyrk_LT.$(SUFFIX) \ ssyrk_kernel_U.$(SUFFIX) ssyrk_kernel_L.$(SUFFIX) -ifndef USE_SIMPLE_THREADED_LEVEL3 +ifneq ($(USE_SIMPLE_THREADED_LEVEL3), 1) SBLASOBJS += ssyrk_thread_UN.$(SUFFIX) ssyrk_thread_UT.$(SUFFIX) ssyrk_thread_LN.$(SUFFIX) ssyrk_thread_LT.$(SUFFIX) endif endif ifeq ($(BUILD_COMPLEX),1) SBLASOBJS = sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) -ifndef USE_SIMPLE_THREADED_LEVEL3 +ifneq ($(USE_SIMPLE_THREADED_LEVEL3), 1) SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) endif endif @@ -312,7 +312,7 @@ ifneq ($(BUILD_DOUBLE),1) DBLASOBJS= ifeq ($(BUILD_COMPLEX16),1) DBLASOBJS = dgemm_nn.$(SUFFIX) dgemm_nt.$(SUFFIX) dgemm_tn.$(SUFFIX) dgemm_tt.$(SUFFIX) -ifndef USE_SIMPLE_THREADED_LEVEL3 +ifneq ($(USE_SIMPLE_THREADED_LEVEL3), 1) DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) endif endif @@ -332,7 +332,7 @@ ifeq ($(BUILD_COMPLEX16),1) ctrsm_RTUU.$(SUFFIX) ctrsm_RTUN.$(SUFFIX) ctrsm_RTLU.$(SUFFIX) ctrsm_RTLN.$(SUFFIX) \ ctrsm_RRUU.$(SUFFIX) ctrsm_RRUN.$(SUFFIX) ctrsm_RRLU.$(SUFFIX) ctrsm_RRLN.$(SUFFIX) \ ctrsm_RCUU.$(SUFFIX) ctrsm_RCUN.$(SUFFIX) ctrsm_RCLU.$(SUFFIX) ctrsm_RCLN.$(SUFFIX) -ifndef USE_SIMPLE_THREADED_LEVEL3 +ifneq ($(USE_SIMPLE_THREADED_LEVEL3), 1) CBLASOBJS += cherk_thread_UN.$(SUFFIX) cherk_thread_UC.$(SUFFIX) cherk_thread_LN.$(SUFFIX) cherk_thread_LC.$(SUFFIX) endif endif From 5fa305172a610264747cf6324bce639c67b3a7b9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 16:29:56 +0100 Subject: [PATCH 067/121] Use ifeq instead of ifdef for user-definable options --- driver/others/Makefile | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git 
a/driver/others/Makefile b/driver/others/Makefile index 7558ec058..d09444f56 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -7,7 +7,7 @@ COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) ifdef SMP COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) -ifndef NO_AFFINITY +ifneq ($(NO_AFFINITY), 1) COMMONOBJS += init.$(SUFFIX) endif endif @@ -32,11 +32,11 @@ else COMMONOBJS += parameter.$(SUFFIX) endif -ifdef EXPRECISION +ifeq ($(EXPRECISION), 1) COMMONOBJS += x_abs.$(SUFFIX) qlamch.$(SUFFIX) qlamc3.$(SUFFIX) endif -ifdef QUAD_PRECISION +ifeq ($(QUAD_PRECISION), 1) COMMONOBJS += addx.$(SUFFIX) mulx.$(SUFFIX) endif @@ -46,11 +46,9 @@ ifeq ($(C_COMPILER), PGI) endif endif -ifdef USE_CUDA ifeq ($(USE_CUDA), 1) COMMONOBJS += cuda_init.$(SUFFIX) endif -endif ifdef FUNCTION_PROFILE COMMONOBJS += profile.$(SUFFIX) From 857afcc41d695cf6ed0279d8476bad50e0e9fdf3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 16:31:44 +0100 Subject: [PATCH 068/121] Use ifeq instead of ifdef for user-definable build options --- interface/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/interface/Makefile b/interface/Makefile index 7b0bf1792..597956fdb 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -19,7 +19,7 @@ ifeq ($(ARCH), MIPS) SUPPORT_GEMM3M = 1 endif -ifndef NO_FBLAS +ifneq ($(NO_FBLAS), 1) SBLAS1OBJS = \ saxpy.$(SUFFIX) sswap.$(SUFFIX) \ @@ -146,7 +146,7 @@ ZBLAS3OBJS += zgemm3m.$(SUFFIX) endif -ifdef EXPRECISION +ifeq ($(EXPRECISION), 1) QBLAS1OBJS = \ qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ @@ -511,11 +511,11 @@ endif FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) -ifdef EXPRECISION +ifeq ($(EXPRECISION), 1) FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) endif -ifdef QUAD_PRECISION +ifeq ($(QUAD_PRECISION), 1) FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) endif From ebb8788696a61adba6819c08f323a68e8d2c43c8 Mon 
Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 16:33:34 +0100 Subject: [PATCH 069/121] Use ifneq instead of ifdef for CROSS option --- utest/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utest/Makefile b/utest/Makefile index ac8c6f72a..1fc30d088 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -45,7 +45,7 @@ $(UTESTBIN): $(OBJS) $(CC) $(CFLAGS) $(LDFLAGS) -o $@ $^ ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) run_test: $(UTESTBIN) -ifndef CROSS +ifneq ($(CROSS), 1) ./$(UTESTBIN) endif From 60e1fddca7634917a56bcc4cb43bbbee08eb136a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 16:48:22 +0100 Subject: [PATCH 070/121] Ensure that the same (large) BUFFERSIZE is used for all cpus in DYNAMIC_ARCH builds --- common_power.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/common_power.h b/common_power.h index a61e4e28a..6fb2af30a 100644 --- a/common_power.h +++ b/common_power.h @@ -849,6 +849,10 @@ Lmcount$lazy_ptr: #else #define BUFFER_SIZE ( 16 << 20) #endif +#ifeq ($(DYNAMIC_ARCH), 1) +#undefine BUFFER_SIZE +#define BUFFER_SIZE (64 << 22) +#endif #ifndef PAGESIZE #define PAGESIZE ( 4 << 10) From 2b114c3f30ff70c23fbbe3215e62f83fadb70f9e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 17:16:22 +0100 Subject: [PATCH 071/121] Restore proper Makefile --- test/Makefile | 354 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 224 insertions(+), 130 deletions(-) diff --git a/test/Makefile b/test/Makefile index 2a893cae8..5f653414a 100644 --- a/test/Makefile +++ b/test/Makefile @@ -1,211 +1,269 @@ -# -# The Makefile compiles c wrappers and testers for CBLAS. -# - -TOPDIR = .. 
-include $(TOPDIR)/Makefile.system - -override CFLAGS += -DADD$(BU) -DCBLAS -override TARGET_ARCH= -override TARGET_MACH= - -LIB = $(TOPDIR)/$(LIBNAME) - -stestl1o = c_sblas1.o - -stestl2o = c_sblas2.o c_s2chke.o auxiliary.o c_xerbla.o constant.o - -stestl3o = c_sblas3.o c_s3chke.o auxiliary.o c_xerbla.o constant.o - -dtestl1o = c_dblas1.o - -dtestl2o = c_dblas2.o c_d2chke.o auxiliary.o c_xerbla.o constant.o - -dtestl3o = c_dblas3.o c_d3chke.o auxiliary.o c_xerbla.o constant.o - -ctestl1o = c_cblas1.o - -ctestl2o = c_cblas2.o c_c2chke.o auxiliary.o c_xerbla.o constant.o - -ctestl3o = c_cblas3.o c_c3chke.o auxiliary.o c_xerbla.o constant.o - -ctestl3o_3m = c_cblas3_3m.o c_c3chke_3m.o auxiliary.o c_xerbla.o constant.o - -ztestl1o = c_zblas1.o - -ztestl2o = c_zblas2.o c_z2chke.o auxiliary.o c_xerbla.o constant.o - -ztestl3o = c_zblas3.o c_z3chke.o auxiliary.o c_xerbla.o constant.o - -ztestl3o_3m = c_zblas3_3m.o c_z3chke_3m.o auxiliary.o c_xerbla.o constant.o +TOPDIR = .. +include ../Makefile.system ifeq ($(NOFORTRAN),1) all :: else -all :: all1 all2 all3 +all :: level1 level2 level3 endif ifeq ($(BUILD_SINGLE),1) -all1targets += xscblat1 +S1=sblat1 endif ifeq ($(BUILD_DOUBLE),1) -all1targets += xdcblat1 +D1=dblat1 endif ifeq ($(BUILD_COMPLEX),1) -all1targets += xccblat1 +C1=cblat1 endif ifeq ($(BUILD_COMPLEX16),1) -all1targets += xzcblat1 +Z1=zblat1 endif -all1: $(all1targets) +level1: $(S1) $(D1) $(C1) $(Z1) ifneq ($(CROSS), 1) +ifeq ($(BUILD_SINGLE),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat1 +endif +ifeq ($(BUILD_DOUBLE),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat1 +endif +ifeq ($(BUILD_COMPLEX),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat1 +endif +ifeq ($(BUILD_COMPLEX16),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat1 +endif +ifdef SMP ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_SINGLE),1) - OMP_NUM_THREADS=2 ./xscblat1 + OMP_NUM_THREADS=2 ./sblat1 endif ifeq ($(BUILD_DOUBLE),1) - OMP_NUM_THREADS=2 ./xdcblat1 -endif + 
OMP_NUM_THREADS=2 ./dblat1 +endif ifeq ($(BUILD_COMPLEX),1) - OMP_NUM_THREADS=2 ./xccblat1 + OMP_NUM_THREADS=2 ./cblat1 endif ifeq ($(BUILD_COMPLEX16),1) - OMP_NUM_THREADS=2 ./xzcblat1 + OMP_NUM_THREADS=2 ./zblat1 endif else ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=2 ./xscblat1 + OPENBLAS_NUM_THREADS=2 ./sblat1 endif ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=2 ./xdcblat1 + OPENBLAS_NUM_THREADS=2 ./dblat1 endif ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=2 ./xccblat1 + OPENBLAS_NUM_THREADS=2 ./cblat1 endif ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=2 ./xzcblat1 + OPENBLAS_NUM_THREADS=2 ./zblat1 +endif endif endif endif ifeq ($(BUILD_SINGLE),1) -all2targets += xscblat2 +S2=sblat2 endif ifeq ($(BUILD_DOUBLE),1) -all2targets += xdcblat2 +D2=dblat2 endif ifeq ($(BUILD_COMPLEX),1) -all2targets += xccblat2 +C2=cblat2 endif ifeq ($(BUILD_COMPLEX16),1) -all2targets += xzcblat2 +Z2=zblat2 endif -all2: $(all2targets) +level2: $(S2) $(D2) $(C2) $(Z2) + ifneq ($(CROSS), 1) + rm -f ?BLAT2.SUMM +ifeq ($(BUILD_SINGLE),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat2 < ./sblat2.dat + @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_DOUBLE),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat2 < ./dblat2.dat + @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat2 < ./cblat2.dat + @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 +endif +ifeq ($(BUILD_COMPLEX16),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat2 < ./zblat2.dat + @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 +endif +ifdef SMP + rm -f ?BLAT2.SUMM ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_SINGLE),1) - OMP_NUM_THREADS=2 ./xscblat2 < sin2 + OMP_NUM_THREADS=2 ./sblat2 < ./sblat2.dat + @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 endif ifeq ($(BUILD_DOUBLE),1) - OMP_NUM_THREADS=2 ./xdcblat2 < din2 + OMP_NUM_THREADS=2 ./dblat2 < 
./dblat2.dat + @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX),1) - OMP_NUM_THREADS=2 ./xccblat2 < cin2 + OMP_NUM_THREADS=2 ./cblat2 < ./cblat2.dat + @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX16),1) - OMP_NUM_THREADS=2 ./xzcblat2 < zin2 + OMP_NUM_THREADS=2 ./zblat2 < ./zblat2.dat + @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 endif else ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=2 ./xscblat2 < sin2 + OPENBLAS_NUM_THREADS=2 ./sblat2 < ./sblat2.dat + @$(GREP) -q FATAL SBLAT2.SUMM && cat SBLAT2.SUMM || exit 0 endif ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=2 ./xdcblat2 < din2 + OPENBLAS_NUM_THREADS=2 ./dblat2 < ./dblat2.dat + @$(GREP) -q FATAL DBLAT2.SUMM && cat DBLAT2.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=2 ./xccblat2 < cin2 + OPENBLAS_NUM_THREADS=2 ./cblat2 < ./cblat2.dat + @$(GREP) -q FATAL CBLAT2.SUMM && cat CBLAT2.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=2 ./xzcblat2 < zin2 + OPENBLAS_NUM_THREADS=2 ./zblat2 < ./zblat2.dat + @$(GREP) -q FATAL ZBLAT2.SUMM && cat ZBLAT2.SUMM || exit 0 +endif endif endif endif - +ifeq ($(BUILD_BFLOAT16),1) +B3= test_sbgemm +endif ifeq ($(BUILD_SINGLE),1) -all3targets += xscblat3 +S3=sblat3 endif ifeq ($(BUILD_DOUBLE),1) -all3targets += xdcblat3 +D3=dblat3 endif ifeq ($(BUILD_COMPLEX),1) -all3targets += xccblat3 +C3=cblat3 endif ifeq ($(BUILD_COMPLEX16),1) -all3targets += xzcblat3 +Z3=zblat3 endif -all3: $(all3targets) +level3: $(B3) $(S3) $(D3) $(C3) $(Z3) + ifneq ($(CROSS), 1) -ifeq ($(USE_OPENMP), 1) + rm -f ?BLAT3.SUMM +ifeq ($(BUILD_BFLOAT16),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SBBLAT3.SUMM + @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 +endif ifeq ($(BUILD_SINGLE),1) - OMP_NUM_THREADS=2 ./xscblat3 < sin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat + @$(GREP) -q FATAL SBLAT3.SUMM && cat 
SBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_DOUBLE),1) - OMP_NUM_THREADS=2 ./xdcblat3 < din3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./dblat3 < ./dblat3.dat + @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX),1) - OMP_NUM_THREADS=2 ./xccblat3 < cin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3 < ./cblat3.dat + @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX16),1) - OMP_NUM_THREADS=2 ./xzcblat3 < zin3 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat3 < ./zblat3.dat + @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 +endif +ifdef SMP + rm -f ?BLAT3.SUMM +ifeq ($(USE_OPENMP), 1) +ifeq ($(BUILD_BFLOAT16),1) + OMP_NUM_THREADS=2 ./test_sbgemm > SBBLAT3.SUMM + @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 endif -else ifeq ($(BUILD_SINGLE),1) - OPENBLAS_NUM_THREADS=2 ./xscblat3 < sin3 + OMP_NUM_THREADS=2 ./sblat3 < ./sblat3.dat + @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_DOUBLE),1) - OPENBLAS_NUM_THREADS=2 ./xdcblat3 < din3 + OMP_NUM_THREADS=2 ./dblat3 < ./dblat3.dat + @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=2 ./xccblat3 < cin3 + OMP_NUM_THREADS=2 ./cblat3 < ./cblat3.dat + @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=2 ./xzcblat3 < zin3 -endif + OMP_NUM_THREADS=2 ./zblat3 < ./zblat3.dat + @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 endif +else +ifeq ($(BUILD_BFLOAT16),1) + OPENBLAS_NUM_THREADS=2 ./test_sbgemm > SBBLAT3.SUMM + @$(GREP) -q FATAL SBBLAT3.SUMM && cat SBBLAT3.SUMM || exit 0 endif - -all3_3m: xzcblat3_3m xccblat3_3m -ifeq ($(USE_OPENMP), 1) ifeq ($(BUILD_SINGLE),1) - OMP_NUM_THREADS=2 ./xccblat3_3m < cin3_3m + OPENBLAS_NUM_THREADS=2 ./sblat3 < ./sblat3.dat + @$(GREP) -q FATAL SBLAT3.SUMM && cat SBLAT3.SUMM || exit 0 endif -ifeq ($(BUILD_COMPLEX16),1) - 
OMP_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m +ifeq ($(BUILD_DOUBLE),1) + OPENBLAS_NUM_THREADS=2 ./dblat3 < ./dblat3.dat + @$(GREP) -q FATAL DBLAT3.SUMM && cat DBLAT3.SUMM || exit 0 endif -else ifeq ($(BUILD_COMPLEX),1) - OPENBLAS_NUM_THREADS=2 ./xccblat3_3m < cin3_3m + OPENBLAS_NUM_THREADS=2 ./cblat3 < ./cblat3.dat + @$(GREP) -q FATAL CBLAT3.SUMM && cat CBLAT3.SUMM || exit 0 endif ifeq ($(BUILD_COMPLEX16),1) - OPENBLAS_NUM_THREADS=2 ./xzcblat3_3m < zin3_3m + OPENBLAS_NUM_THREADS=2 ./zblat3 < ./zblat3.dat + @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 +endif endif endif +endif + +level3_3m : zblat3_3m cblat3_3m +ifneq ($(CROSS), 1) + rm -f ?BLAT3_3M.SUMM + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat + @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./zblat3_3m < ./zblat3_3m.dat + @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 +ifdef SMP + rm -f ?BLAT3_3M.SUMM +ifeq ($(USE_OPENMP), 1) + OMP_NUM_THREADS=2 ./cblat3_3m < ./cblat3_3m.dat + @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 + OMP_NUM_THREADS=2 ./zblat3_3m < ./zblat3_3m.dat + @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 +else + OPENBLAS_NUM_THREADS=2 ./cblat3_3m < ./cblat3_3m.dat + @$(GREP) -q FATAL CBLAT3_3M.SUMM && cat CBLAT3_3M.SUMM || exit 0 + OPENBLAS_NUM_THREADS=2 ./zblat3_3m < ./zblat3_3m.dat + @$(GREP) -q FATAL ZBLAT3_3M.SUMM && cat ZBLAT3_3M.SUMM || exit 0 +endif +endif +endif -clean :: - rm -f x* FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) + +ifeq ($(CORE), C910V) +EXTRALIB = +CEXTRALIB = +endif + ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(C_COMPILER), CLANG) @@ -215,54 +273,90 @@ endif endif ifeq ($(BUILD_SINGLE),1) -# Single real -xscblat1: $(stestl1o) c_sblat1.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xscblat1 c_sblat1.o $(stestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +sblat1 : sblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o 
sblat1 sblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -xscblat2: $(stestl2o) c_sblat2.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xscblat2 c_sblat2.o $(stestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +sblat2 : sblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o sblat2 sblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -xscblat3: $(stestl3o) c_sblat3.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xscblat3 c_sblat3.o $(stestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +sblat3 : sblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o sblat3 sblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif ifeq ($(BUILD_DOUBLE),1) -# Double real -xdcblat1: $(dtestl1o) c_dblat1.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xdcblat1 c_dblat1.o $(dtestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -xdcblat2: $(dtestl2o) c_dblat2.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xdcblat2 c_dblat2.o $(dtestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -xdcblat3: $(dtestl3o) c_dblat3.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xdcblat3 c_dblat3.o $(dtestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +dblat1 : dblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o dblat1 dblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +dblat2 : dblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o dblat2 dblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +dblat3 : dblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o dblat3 dblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +else +dblat2: +dblat3: endif +qblat1 : qblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o qblat1 qblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + ifeq ($(BUILD_COMPLEX),1) -# Single complex -xccblat1: $(ctestl1o) c_cblat1.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xccblat1 c_cblat1.o $(ctestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -xccblat2: $(ctestl2o) c_cblat2.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xccblat2 c_cblat2.o $(ctestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -xccblat3: $(ctestl3o) c_cblat3.o $(TOPDIR)/$(LIBNAME) 
- $(FC) $(FLDFLAGS) -o xccblat3 c_cblat3.o $(ctestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +cblat1 : cblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o cblat1 cblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -xccblat3_3m: $(ctestl3o_3m) c_cblat3_3m.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xccblat3_3m c_cblat3_3m.o $(ctestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) -endif +cblat2 : cblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o cblat2 cblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +cblat3 : cblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o cblat3 cblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif ifeq ($(BUILD_COMPLEX16),1) -# Double complex -xzcblat1: $(ztestl1o) c_zblat1.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xzcblat1 c_zblat1.o $(ztestl1o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -xzcblat2: $(ztestl2o) c_zblat2.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xzcblat2 c_zblat2.o $(ztestl2o) $(LIB) $(EXTRALIB) $(CEXTRALIB) -xzcblat3: $(ztestl3o) c_zblat3.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xzcblat3 c_zblat3.o $(ztestl3o) $(LIB) $(EXTRALIB) $(CEXTRALIB) +zblat1 : zblat1.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat1 zblat1.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) + +zblat2 : zblat2.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat2 zblat2.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif -xzcblat3_3m: $(ztestl3o_3m) c_zblat3_3m.o $(TOPDIR)/$(LIBNAME) - $(FC) $(FLDFLAGS) -o xzcblat3_3m c_zblat3_3m.o $(ztestl3o_3m) $(LIB) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(BUILD_BFLOAT16),1) +test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif -include $(TOPDIR)/Makefile.tail +ifeq ($(BUILD_COMPLEX),1) +cblat3_3m : cblat3_3m.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o cblat3_3m 
cblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif + +ifeq ($(BUILD_COMPLEX16),1) +zblat3_3m : zblat3_3m.$(SUFFIX) ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o zblat3_3m zblat3_3m.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +endif + + + +clean: + @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ + sblat1 dblat1 cblat1 zblat1 \ + sblat2 dblat2 cblat2 zblat2 \ + test_sbgemm sblat3 dblat3 cblat3 zblat3 \ + sblat1p dblat1p cblat1p zblat1p \ + sblat2p dblat2p cblat2p zblat2p \ + sblat3p dblat3p cblat3p zblat3p \ + zblat3_3m zblat3_3mp \ + cblat3_3m cblat3_3mp \ + *.stackdump *.dll + +libs: + +prof: + +quick : + $(MAKE) -C $(TOPDIR) libs + +# include ../Makefile.tail From 02562949218dded905b100cff21eae15364598ce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 17:41:44 +0100 Subject: [PATCH 072/121] Fix syntax mixup --- common_power.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common_power.h b/common_power.h index 6fb2af30a..a49197fd7 100644 --- a/common_power.h +++ b/common_power.h @@ -849,8 +849,8 @@ Lmcount$lazy_ptr: #else #define BUFFER_SIZE ( 16 << 20) #endif -#ifeq ($(DYNAMIC_ARCH), 1) -#undefine BUFFER_SIZE +#ifdef DYNAMIC_ARCH +#undef BUFFER_SIZE #define BUFFER_SIZE (64 << 22) #endif From e7bf8ced6ccdc9c579ff5f8b94c20f104d98f616 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 20:20:28 +0100 Subject: [PATCH 073/121] Build fix for systems that do not support getauxval --- driver/others/dynamic_arm64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index 007a221db..4f1b12f27 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -143,7 +143,7 @@ static gotoblas_t *get_coretype(void) { #if (!defined OS_LINUX && !defined OS_ANDROID) return NULL; -#endif +#else if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) { #ifdef __linux @@ -235,6 +235,7 @@ static 
gotoblas_t *get_coretype(void) { openblas_warning(1, coremsg); } return NULL; +#endif } void gotoblas_dynamic_init(void) { From 01f01dae98abd447f3c962ba5c08498831e58f00 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 21:15:08 +0100 Subject: [PATCH 074/121] Add -msse if supported --- Makefile.x86 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile.x86 b/Makefile.x86 index 330690935..f310f4973 100644 --- a/Makefile.x86 +++ b/Makefile.x86 @@ -1,5 +1,10 @@ # COMPILER_PREFIX = mingw32- +ifdef HAVE_SSE +CCOMMON_OPT += -msse +FCOMMON_OPT += -msse +endif + ifeq ($(OSNAME), Interix) ARFLAGS = -m x86 From 11ebe5fa255eae6544f1087a2b673042894afd02 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 22 Nov 2020 21:16:07 +0100 Subject: [PATCH 075/121] Avoid redefinition warning --- getarch.c | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) diff --git a/getarch.c b/getarch.c index 8b00aaee7..cf0be8d23 100644 --- a/getarch.c +++ b/getarch.c @@ -97,9 +97,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__x86_64__) || defined(_M_X64) #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) #else +#ifndef NO_AVX512 #define NO_AVX512 #endif #endif +#endif /* #define FORCE_P2 */ /* #define FORCE_KATMAI */ /* #define FORCE_COPPERMINE */ @@ -981,20 +983,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #endif -#ifdef FORCE_RISCV64_GENERIC -#define FORCE -#define ARCHITECTURE "RISCV64" -#define SUBARCHITECTURE "RISCV64_GENERIC" -#define SUBDIRNAME "riscv64" -#define ARCHCONFIG "-DRISCV64_GENERIC " \ - "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ - "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "riscv64_generic" -#define CORENAME "RISCV64_GENERIC" -#else -#endif - #ifdef FORCE_CORTEXA15 #define FORCE #define ARCHITECTURE "ARM" @@ -1280,21 +1268,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "Z14" #endif -#ifdef FORCE_C910V -#define FORCE -#define ARCHITECTURE "RISCV64" -#define SUBARCHITECTURE "C910V" -#define SUBDIRNAME "riscv64" -#define ARCHCONFIG "-DC910V " \ - "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ - "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ - "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "c910v" -#define CORENAME "C910V" -#else -#endif - - #ifndef FORCE #ifdef USER_TARGET @@ -1349,10 +1322,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define OPENBLAS_SUPPORTED #endif -#ifdef __riscv -#include "cpuid_riscv64.c" -#endif - #ifdef __arm__ #include "cpuid_arm.c" #define OPENBLAS_SUPPORTED From 358100ec15b3fff0b4ac560489c970385fb6f87b Mon Sep 17 00:00:00 2001 From: Gilles Gouaillardet Date: Sun, 29 Nov 2020 13:57:57 +0900 Subject: [PATCH 076/121] add Fujitsu compilers Co-authored-by: Tomoki Karatsu --- f_check | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/f_check b/f_check index c12b0f2ef..9ef7b8086 100644 --- a/f_check +++ b/f_check @@ -69,7 +69,12 @@ if ($compiler eq "") { $bu = "_"; } - if ($data =~ /GNU/ || $data =~ /GCC/ ) { + if ($data =~ /Fujitsu/) { + + $vendor = FUJITSU; + $openmp = "-Kopenmp"; + + } elsif ($data =~ /GNU/ || $data =~ /GCC/ ) { $data =~ /(\d+)\.(\d+).(\d+)/; $major = $1; @@ -337,8 +342,8 @@ if ($link ne "") { && ($flags !~ /kernel32/) && ($flags !~ /advapi32/) && ($flags !~ /shell32/) - && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $flags =~ /omp/)) - && ($flags !~ /[0-9]+/) + && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $vendor !~ /FUJITSU/ && $flags =~ /omp/)) + && ($flags !~ /[0-9]+/ || ($vendor == FUJITSU && $flags =~ /^-lfj90/)) && ($flags !~ /^\-l$/) ) { $linker_l .= $flags . 
" "; From 3b4c016110a7de5e52a76045aaa4be25965c8e6c Mon Sep 17 00:00:00 2001 From: cyy Date: Sun, 29 Nov 2020 17:17:07 +0800 Subject: [PATCH 077/121] link math lib on FreeBSD --- utest/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 357e61301..0c99e0d12 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -58,7 +58,7 @@ add_executable(${OpenBLAS_utest_bin} ${OpenBLAS_utest_src}) target_link_libraries(${OpenBLAS_utest_bin} ${OpenBLAS_LIBNAME}) -if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") +if(${CMAKE_SYSTEM_NAME} MATCHES "Linux" OR ${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD") target_link_libraries(${OpenBLAS_utest_bin} m) endif() From ca17d3dc3d51589c8048f23355b2ac1cdf32771c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Nov 2020 13:19:51 +0100 Subject: [PATCH 078/121] Restore RISCV entries accidentally trashed by my PR 3005 --- getarch.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/getarch.c b/getarch.c index cf0be8d23..f107da3e9 100644 --- a/getarch.c +++ b/getarch.c @@ -983,6 +983,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_RISCV64_GENERIC +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_GENERIC" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_GENERIC " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_generic" +#define CORENAME "RISCV64_GENERIC" +#else +#endif + #ifdef FORCE_CORTEXA15 #define FORCE #define ARCHITECTURE "ARM" @@ -1268,6 +1282,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "Z14" #endif +#ifdef FORCE_C910V +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "C910V" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DC910V " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "c910v" +#define CORENAME "C910V" +#else +#endif + + #ifndef FORCE #ifdef USER_TARGET @@ -1322,6 +1351,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __riscv +#include "cpuid_riscv64.c" +#endif + #ifdef __arm__ #include "cpuid_arm.c" #define OPENBLAS_SUPPORTED From 2e99e2699b6d381a7d5709ad2e0dbcd0269826ad Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Nov 2020 15:32:17 +0100 Subject: [PATCH 079/121] Add workaround for gcc 4.6 miscompiling assembly kernels with -mavx --- Makefile.system | 1 + Makefile.x86_64 | 4 ++++ c_check | 12 +++++++++++ getarch.c | 55 +++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 72 insertions(+) diff --git a/Makefile.system b/Makefile.system index afc8ee207..b5974f872 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1561,6 +1561,7 @@ export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE export NO_AVX512 +export NO_AVX2 export BUILD_BFLOAT16 export SBGEMM_UNROLL_M diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 43bfc9ecd..d806a4ed2 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -20,14 +20,18 @@ ifdef HAVE_SSE4_1 CCOMMON_OPT += -msse4.1 FCOMMON_OPT += -msse4.1 endif +ifndef OLDGCC ifdef HAVE_AVX CCOMMON_OPT += -mavx FCOMMON_OPT += -mavx endif +endif +ifndef NO_AVX2 ifdef HAVE_AVX2 CCOMMON_OPT += -mavx2 FCOMMON_OPT += -mavx2 endif +endif ifdef HAVE_FMA3 CCOMMON_OPT += -mfma FCOMMON_OPT += -mfma diff --git a/c_check b/c_check index 405963ae6..efea9b0fb 100644 --- a/c_check +++ b/c_check @@ -229,6 +229,16 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/); 
$binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); + +if ($compiler eq "GCC" &&( ($architecture eq "x86") || ($architecture eq "x86_64"))) { +$no_avx2 = 0; +$oldgcc = 0; +$data = `$compiler_name -dumpversion`; +if ($data <= 4.6) { +$no_avx2 = 1; +$oldgcc = 1; +} +} $no_avx512= 0; if (($architecture eq "x86") || ($architecture eq "x86_64")) { eval "use File::Temp qw(tempfile)"; @@ -368,6 +378,8 @@ print MAKEFILE "CEXTRALIB=$linker_L $linker_l $linker_a\n"; print MAKEFILE "HAVE_MSA=1\n" if $have_msa eq 1; print MAKEFILE "MSA_FLAGS=$msa_flags\n" if $have_msa eq 1; print MAKEFILE "NO_AVX512=1\n" if $no_avx512 eq 1; +print MAKEFILE "NO_AVX2=1\n" if $no_avx2 eq 1; +print MAKEFILE "OLDGCC=1\n" if $oldgcc eq 1; $os =~ tr/[a-z]/[A-Z]/; $architecture =~ tr/[a-z]/[A-Z]/; diff --git a/getarch.c b/getarch.c index cf0be8d23..9344defb5 100644 --- a/getarch.c +++ b/getarch.c @@ -326,6 +326,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX2 +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#else #define SUBARCHITECTURE "HASWELL" #define ARCHCONFIG "-DHASWELL " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ @@ -336,6 +346,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "haswell" #define CORENAME "HASWELL" #endif +#endif #ifdef FORCE_SKYLAKEX #ifdef NO_AVX512 @@ -551,6 +562,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" +#ifdef NO_AVX2 +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#else #define SUBARCHITECTURE "ZEN" #define ARCHCONFIG "-DZEN " \ "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ @@ -565,6 +586,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "zen" #define CORENAME "ZEN" #endif +#endif #ifdef FORCE_SSE_GENERIC @@ -983,6 +1005,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_RISCV64_GENERIC +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "RISCV64_GENERIC" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DRISCV64_GENERIC " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "riscv64_generic" +#define CORENAME "RISCV64_GENERIC" +#else +#endif + #ifdef FORCE_CORTEXA15 #define FORCE #define ARCHITECTURE "ARM" @@ -1268,6 +1304,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "Z14" #endif +#ifdef FORCE_C910V +#define FORCE +#define ARCHITECTURE "RISCV64" +#define SUBARCHITECTURE "C910V" +#define SUBDIRNAME "riscv64" +#define ARCHCONFIG "-DC910V " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " +#define LIBNAME "c910v" +#define CORENAME "C910V" +#else +#endif + + #ifndef FORCE #ifdef USER_TARGET @@ -1322,6 +1373,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define OPENBLAS_SUPPORTED #endif +#ifdef __riscv +#include "cpuid_riscv64.c" +#endif + #ifdef __arm__ #include "cpuid_arm.c" #define OPENBLAS_SUPPORTED From 62a2eb884f0d364716a94d12284e339d20ffcc29 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 29 Nov 2020 15:33:07 +0100 Subject: [PATCH 080/121] Add SSE flags for x86 --- Makefile.x86 | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/Makefile.x86 b/Makefile.x86 index f310f4973..0e27264d8 100644 --- a/Makefile.x86 +++ b/Makefile.x86 @@ -59,9 +59,11 @@ LIBATLAS = -L$(ATLASPATH)/32 -lcblas -lf77blas -latlas -lm else LIBATLAS = -L$(ATLASPATH)/32 -lptf77blas -lptatlas -lpthread -lm endif - +ifdef HAVE_SSE2 +CCOMMON_OPT += -msse2 +FCOMMON_OPT += -msse2 +endif ifdef HAVE_SSE3 -ifndef DYNAMIC_ARCH CCOMMON_OPT += -msse3 FCOMMON_OPT += -msse3 ifdef HAVE_SSSE3 @@ -73,5 +75,4 @@ CCOMMON_OPT += -msse4.1 FCOMMON_OPT += -msse4.1 endif endif -endif From 7d46e31de1a206ea55ae31e7a0a1ae4b704458e0 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Sun, 29 Nov 2020 15:28:28 -0600 Subject: [PATCH 081/121] POWER10: Optimize dgemv_n Handling as 4x8 with vector pairs gives better performance than existing code in POWER10. 
--- kernel/power/dgemv_n_microk_power10.c | 150 +++++++++++++++++++-- kernel/power/dgemv_n_power10.c | 185 ++------------------------ 2 files changed, 155 insertions(+), 180 deletions(-) diff --git a/kernel/power/dgemv_n_microk_power10.c b/kernel/power/dgemv_n_microk_power10.c index 4be8a5f9b..e47de2cb5 100644 --- a/kernel/power/dgemv_n_microk_power10.c +++ b/kernel/power/dgemv_n_microk_power10.c @@ -25,14 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -/************************************************************************************** -* 2016/03/30 Werner Saar (wernsaar@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - #define HAVE_KERNEL_4x4 1 static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha) @@ -266,3 +258,145 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47" ); } +static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y, double alpha) +{ + + double *a0; + double *a1; + double *a2; + double *a3; + double *a4; + double *a5; + double *a6; + double *a7; + long tmp; + __asm__ + ( + "lxvp 34, 0( %15) \n\t" // x0, x1 + "lxvp 38, 32( %15) \n\t" // x4, x5 + + XXSPLTD_S(58,%x14,0) // alpha, alpha + "sldi %10, %17, 3 \n\t" // lda * sizeof (double) + "xvmuldp 34, 34, 58 \n\t" // x0 * alpha, x1 * alpha + "xvmuldp 35, 35, 58 \n\t" // x2 * alpha, x3 * alpha + "xvmuldp 38, 38, 58 \n\t" // x4 * alpha, x5 * alpha + "xvmuldp 39, 39, 58 \n\t" // x6 * alpha, x7 * alpha + + "li %11, 32 \n\t" + + "add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda + "add %10, %10, %10 \n\t" // 2 * lda + XXSPLTD_S(32,34,1) // x0 * alpha, x0 
* alpha + XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha + XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha + XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha + XXSPLTD_S(48,39,1) // x6 * alpha, x6 * alpha + XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha + XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha + XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha + + "add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda + "add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda + "add %7, %5, %10 \n\t" // a4 = a2 + 2 * lda + "add %8, %6, %10 \n\t" // a5 = a3 + 2 * lda + "add %9, %7, %10 \n\t" // a6 = a4 + 2 * lda + "add %10, %8, %10 \n\t" // a7 = a5 + 2 * lda + + "lxvp 40, 0( %3) \n\t" // a0[0], a0[1] + "lxvp 42, 0( %4) \n\t" // a1[0], a1[1] + "lxvp 44, 0( %5) \n\t" // a2[0], a2[1] + "lxvp 46, 0( %6) \n\t" // a3[0], a3[1] + "lxvp 50, 0( %7) \n\t" // a4[0] + "lxvp 52, 0( %8) \n\t" // a5[0] + "lxvp 54, 0( %9) \n\t" // a6[0] + "lxvp 56, 0( %10) \n\t" // a7[0] + + + "addic. %1, %1, -4 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "lxvp 36, 0( %2) \n\t" // y0, y1 + + "xvmaddadp 36, 40, 34 \n\t" + "xvmaddadp 37, 41, 34 \n\t" + "lxvpx 40, %3, %11 \n\t" // a0[0], a0[1] + "xvmaddadp 36, 42, 35 \n\t" + "xvmaddadp 37, 43, 35 \n\t" + "lxvpx 42, %4, %11 \n\t" // a1[0], a1[1] + "xvmaddadp 36, 44, 32 \n\t" + "xvmaddadp 37, 45, 32 \n\t" + "lxvpx 44, %5, %11 \n\t" // a2[0], a2[1] + "xvmaddadp 36, 46, 33 \n\t" + "xvmaddadp 37, 47, 33 \n\t" + "lxvpx 46, %6, %11 \n\t" // a3[0], a3[1] + "xvmaddadp 36, 50, 48 \n\t" + "xvmaddadp 37, 51, 48 \n\t" + "lxvpx 50, %7, %11 \n\t" // a4[0] + "xvmaddadp 36, 52, 49 \n\t" + "xvmaddadp 37, 53, 49 \n\t" + "lxvpx 52, %8, %11 \n\t" // a5[0] + "xvmaddadp 36, 54, 38 \n\t" + "xvmaddadp 37, 55, 38 \n\t" + "lxvpx 54, %9, %11 \n\t" // a6[0] + "xvmaddadp 36, 56, 39 \n\t" + "xvmaddadp 37, 57, 39 \n\t" + "lxvpx 56, %10, %11 \n\t" // a7[0] + "addi %11, %11, 32 \n\t" + + "stxvp 36, 0( %2) \n\t" // y0, y1 + "addi %2, %2, 32 \n\t" + + "addic. 
%1, %1, -4 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "lxvp 36, 0( %2) \n\t" // y0, y1 + "xvmaddadp 36, 40, 34 \n\t" + "xvmaddadp 37, 41, 34 \n\t" + "xvmaddadp 36, 42, 35 \n\t" + "xvmaddadp 37, 43, 35 \n\t" + "xvmaddadp 36, 44, 32 \n\t" + "xvmaddadp 37, 45, 32 \n\t" + "xvmaddadp 36, 46, 33 \n\t" + "xvmaddadp 37, 47, 33 \n\t" + "xvmaddadp 36, 50, 48 \n\t" + "xvmaddadp 37, 51, 48 \n\t" + "xvmaddadp 36, 52, 49 \n\t" + "xvmaddadp 37, 53, 49 \n\t" + "xvmaddadp 36, 54, 38 \n\t" + "xvmaddadp 37, 55, 38 \n\t" + "xvmaddadp 36, 56, 39 \n\t" + "xvmaddadp 37, 57, 39 \n\t" + "stxvp 36, 0( %2) \n\t" // y0, y1 + + : + "+m" (*y), + "+r" (n), // 1 + "+b" (y), // 2 + "=b" (a0), // 3 + "=b" (a1), // 4 + "=&b" (a2), // 5 + "=&b" (a3), // 6 + "=&b" (a4), // 7 + "=&b" (a5), // 8 + "=&b" (a6), // 9 + "=&b" (a7), // 10 + "=b" (tmp) + : + "m" (*x), + "m" (*ap), + "d" (alpha), // 14 + "r" (x), // 15 + "3" (ap), // 16 + "4" (lda) // 17 + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48", + "vs49","vs50","vs51","vs52","vs53","vs54","vs55","vs56", "vs57", "vs58" + ); +} diff --git a/kernel/power/dgemv_n_power10.c b/kernel/power/dgemv_n_power10.c index ad5f1ba0d..aba15ab4e 100644 --- a/kernel/power/dgemv_n_power10.c +++ b/kernel/power/dgemv_n_power10.c @@ -26,165 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ #include "common.h" -#include - -typedef __vector unsigned char vec_t; -typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); -typedef __vector_pair __attribute__((aligned(8))) vecp_t; #include "dgemv_n_microk_power10.c" -#define MMA(X, APTR, ACC) \ - rX = (vec_t *) & X; \ - rowA = *((vecp_t*)((void*)&APTR)); \ - __builtin_mma_xvf64gerpp (ACC, rowA, rX[0]); - -#define SAVE(ACC, Z) \ - rowC = (v4sf_t *) &y[Z]; \ - __builtin_mma_disassemble_acc ((void *)result, ACC); \ - result[0][1] = result[1][0]; \ - result[2][1] = result[3][0]; \ - rowC[0] += valpha * result[0]; \ - rowC[1] += valpha * result[2]; - -void -dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo, - FLOAT * y, FLOAT alpha) -{ - BLASLONG i, j, tmp; - FLOAT *a0 = a_ptr; - FLOAT *x1 = xo; - vector double valpha = { alpha, alpha }; - v4sf_t *rowC; - __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; - v4sf_t result[4]; - vecp_t rowA; - vec_t *rX; - tmp = (n / 32) * 32; - for (i = 0; i < tmp; i += 32) - { - xo = x1; - a0 = a_ptr; - __builtin_mma_xxsetaccz (&acc0); - __builtin_mma_xxsetaccz (&acc1); - __builtin_mma_xxsetaccz (&acc2); - __builtin_mma_xxsetaccz (&acc3); - __builtin_mma_xxsetaccz (&acc4); - __builtin_mma_xxsetaccz (&acc5); - __builtin_mma_xxsetaccz (&acc6); - __builtin_mma_xxsetaccz (&acc7); - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + 0 + j * lda], &acc0); - MMA (xo[j], a0[i + 4 + j * lda], &acc1); - MMA (xo[j], a0[i + 8 + j * lda], &acc2); - MMA (xo[j], a0[i + 12 + j * lda], &acc3); - MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + 0 + j * 
lda], &acc0); - MMA (xo[j], a0[i + 4 + j * lda], &acc1); - MMA (xo[j], a0[i + 8 + j * lda], &acc2); - MMA (xo[j], a0[i + 12 + j * lda], &acc3); - MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + 0 + j * lda], &acc0); - MMA (xo[j], a0[i + 4 + j * lda], &acc1); - MMA (xo[j], a0[i + 8 + j * lda], &acc2); - MMA (xo[j], a0[i + 12 + j * lda], &acc3); - MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + 0 + j * lda], &acc0); - MMA (xo[j], a0[i + 4 + j * lda], &acc1); - MMA (xo[j], a0[i + 8 + j * lda], &acc2); - MMA (xo[j], a0[i + 12 + j * lda], &acc3); - MMA (xo[j], a0[i + 16 + j * lda], &acc4); - MMA (xo[j], a0[i + 20 + j * lda], &acc5); - MMA (xo[j], a0[i + 24 + j * lda], &acc6); - MMA (xo[j], a0[i + 28 + j * lda], &acc7); - } - xo += 32; - a0 += lda << 5; - SAVE (&acc0, i + 0); - SAVE (&acc1, i + 4); - SAVE (&acc2, i + 8); - SAVE (&acc3, i + 12); - SAVE (&acc4, i + 16); - SAVE (&acc5, i + 20); - SAVE (&acc6, i + 24); - SAVE (&acc7, i + 28); - - } - for (i = tmp; i < n; i += 4) - { - xo = x1; - a0 = a_ptr; - __builtin_mma_xxsetaccz (&acc0); - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - 
__builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - for (j = 0; j < 32; j++) - { - __builtin_prefetch (xo+j); - __builtin_prefetch (a0+i+j+lda); - MMA (xo[j], a0[i + j * lda], &acc0); - } - xo += 32; - a0 += lda << 5; - SAVE (&acc0, i); - } -} - - #define NBMAX 4096 #ifndef HAVE_KERNEL_4x4 @@ -281,13 +125,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO FLOAT *a_ptr; FLOAT *x_ptr; FLOAT *y_ptr; - BLASLONG n1; BLASLONG m1; BLASLONG m2; BLASLONG m3; BLASLONG n2; BLASLONG lda4 = lda << 2; - BLASLONG lda128 = lda << 7; + BLASLONG lda8 = lda << 3; FLOAT xbuffer[8] __attribute__ ((aligned (16))); FLOAT *ybuffer; @@ -296,9 +139,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n < 1 ) return(0); ybuffer = buffer; - BLASLONG n128 = n >> 7; - n1 = (n - (n128 * 128)) >> 2; - n2 = (n - (n128 * 128)) & 3; + BLASLONG n8 = n >> 3; + n2 = n & 3; m3 = m & 3 ; m1 = m & -4 ; @@ -329,14 +171,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( inc_x == 1 ) { - for( i = 0; i < n128 ; i++) + for( i = 0; i < n8 ; i++) { - dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha); - a_ptr += lda128; - x_ptr += 128; + dgemv_kernel_4x8(NB,a_ptr,lda,x_ptr,ybuffer,alpha); + a_ptr += lda8; + x_ptr += 8; } - for( i = 0; i < n1 ; i++) + if( n & 4 ) { dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha); a_ptr += lda4; @@ -363,20 +205,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO } else { - for( i = 0; i < n128 ; i++) + for( i = 0; i < n8 ; i++) { - FLOAT xbuffer[128] __attribute__ ((aligned (16))); BLASLONG j; - for ( j = 0; j < 128 ; j++) + for ( j = 0; j < 8 ; j++) { xbuffer[j] = x_ptr[0]; x_ptr += inc_x; } - dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha); - a_ptr += lda128; + dgemv_kernel_4x8(NB,a_ptr,lda,xbuffer,ybuffer,alpha); + a_ptr += lda8; } - for( i = 0; i < n1 ; i++) + 
if( n & 4 ) { xbuffer[0] = x_ptr[0]; x_ptr += inc_x; From f6620229942eb7b670d13a527e2b22bc5ac05441 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 30 Nov 2020 17:24:27 +0100 Subject: [PATCH 082/121] Move the version check to avoid overwriting unprocessed compiler data --- c_check | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/c_check b/c_check index efea9b0fb..a841df153 100644 --- a/c_check +++ b/c_check @@ -229,16 +229,6 @@ $architecture = zarch if ($data =~ /ARCH_ZARCH/); $binformat = bin32; $binformat = bin64 if ($data =~ /BINARY_64/); - -if ($compiler eq "GCC" &&( ($architecture eq "x86") || ($architecture eq "x86_64"))) { -$no_avx2 = 0; -$oldgcc = 0; -$data = `$compiler_name -dumpversion`; -if ($data <= 4.6) { -$no_avx2 = 1; -$oldgcc = 1; -} -} $no_avx512= 0; if (($architecture eq "x86") || ($architecture eq "x86_64")) { eval "use File::Temp qw(tempfile)"; @@ -286,6 +276,15 @@ if ($data =~ /HAVE_C11/) { } } +if ($compiler eq "GCC" &&( ($architecture eq "x86") || ($architecture eq "x86_64"))) { + $no_avx2 = 0; + $oldgcc = 0; + $data = `$compiler_name -dumpversion`; + if ($data <= 4.6) { + $no_avx2 = 1; + $oldgcc = 1; + } +} $data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; From 22574b474eec3220b4fe78257f66898281502bd5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 30 Nov 2020 21:41:51 +0100 Subject: [PATCH 083/121] Suppress -mfma as well for gcc 4.6 --- Makefile.x86_64 | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Makefile.x86_64 b/Makefile.x86_64 index d806a4ed2..00967bcb6 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -32,10 +32,12 @@ CCOMMON_OPT += -mavx2 FCOMMON_OPT += -mavx2 endif endif +ifndef OLDGCC ifdef HAVE_FMA3 CCOMMON_OPT += -mfma FCOMMON_OPT += -mfma endif +endif ifeq ($(CORE), SKYLAKEX) ifndef DYNAMIC_ARCH From b766c1e9bb592396b0c71ba47bf48e83534ca52c Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Tue, 1 Dec 2020 16:49:26 +0800 
Subject: [PATCH 084/121] Improve the performance of zasum and casum with AVX512 intrinsic --- kernel/x86_64/KERNEL.SKYLAKEX | 3 + kernel/x86_64/casum.c | 144 ++++++++++ kernel/x86_64/casum_microk_skylakex-2.c | 349 ++++++++++++++++++++++++ kernel/x86_64/zasum.c | 144 ++++++++++ kernel/x86_64/zasum_microk_skylakex-2.c | 340 +++++++++++++++++++++++ 5 files changed, 980 insertions(+) create mode 100644 kernel/x86_64/casum.c create mode 100644 kernel/x86_64/casum_microk_skylakex-2.c create mode 100644 kernel/x86_64/zasum.c create mode 100644 kernel/x86_64/zasum_microk_skylakex-2.c diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index 9b8b84c30..3d71584fe 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -27,3 +27,6 @@ ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c CSCALKERNEL = ../arm/zscal.c ZSCALKERNEL = ../arm/zscal.c + +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c new file mode 100644 index 000000000..dce30e9b0 --- /dev/null +++ b/kernel/x86_64/casum.c @@ -0,0 +1,144 @@ +#include "common.h" + +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? 
(a) : (-(a))) +#endif + +#if defined(SKYLAKEX) +#include "casum_microk_skylakex-2.c" +#endif + +#ifndef HAVE_CASUM_KERNEL +static FLOAT casum_kernel(BLASLONG n, FLOAT *x1) +{ + + BLASLONG i=0; + BLASLONG n_8 = n & -8; + FLOAT *x = x1; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + FLOAT sum4 = 0.0; + + while (i < n_8) { + temp0 = ABS_K(x[0]); + temp1 = ABS_K(x[1]); + temp2 = ABS_K(x[2]); + temp3 = ABS_K(x[3]); + temp4 = ABS_K(x[4]); + temp5 = ABS_K(x[5]); + temp6 = ABS_K(x[6]); + temp7 = ABS_K(x[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x+=8; + i+=4; + } + + while (i < n) { + sum4 += (ABS_K(x1[0]) + ABS_K(x1[1])); + x1 += 2; + i++; + } + + return sum0+sum1+sum2+sum3+sum4; +} + +#endif + +static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ip = 0; + BLASLONG inc_x2; + FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return(sumf); + if (inc_x == 1) { + sumf = casum_kernel(n, x); + } + else { + inc_x2 = 2 * inc_x; + + while (i < n) { + sumf += ABS_K(x[ip]) + ABS_K(x[ip + 1]); + ip += inc_x2; + i++; + } + } + + return(sumf); +} + +#if defined(SMP) +static int asum_thread_function(BLASLONG n, + BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, + FLOAT *x, BLASLONG inc_x, + FLOAT * dummy3, BLASLONG dummy4, + FLOAT * result, BLASLONG dummy5) +{ + *(FLOAT *) result = asum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_value(int mode, + BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, + void *a, BLASLONG lda, + void *b, BLASLONG ldb, + void *c, BLASLONG ldc, + int (*function)(), + int nthread); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha[2]; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); 
+ if (n <= 10000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/10000 ? num_cpu : n/10000; + + if (nthreads == 1) { + sumf = asum_compute(n, x, inc_x); + } + else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT *ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_COMPLEX; +#else + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, + NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2); + } + } +#else + sumf = asum_compute(n, x, inc_x); +#endif + return(sumf); +} diff --git a/kernel/x86_64/casum_microk_skylakex-2.c b/kernel/x86_64/casum_microk_skylakex-2.c new file mode 100644 index 000000000..d51929f9f --- /dev/null +++ b/kernel/x86_64/casum_microk_skylakex-2.c @@ -0,0 +1,349 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_CASUM_KERNEL 1 + +#include + +#include + +static FLOAT casum_kernel(BLASLONG n, FLOAT *x) +{ + FLOAT *x1 = x; + FLOAT sumf=0.0; + BLASLONG n2 = n + n; + + if (n2 < 64) { + __m128 accum_10, accum_11, accum_12, accum_13; + __m128 abs_mask1; + + accum_10 = _mm_setzero_ps(); + accum_11 = _mm_setzero_ps(); + accum_12 = _mm_setzero_ps(); + accum_13 = _mm_setzero_ps(); + + abs_mask1 = (__m128)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1); + abs_mask1 = (__m128)_mm_srli_epi32((__m128i) abs_mask1, 1); + + _mm_prefetch(&x1[0], _MM_HINT_T0); + + if (n2 >= 32){ + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + __m128 x01 = _mm_loadu_ps(&x1[ 4]); + __m128 x02 = _mm_loadu_ps(&x1[ 8]); + __m128 x03 = _mm_loadu_ps(&x1[12]); + + _mm_prefetch(&x1[16], _MM_HINT_T0); + __m128 x04 = _mm_loadu_ps(&x1[16]); + __m128 x05 = _mm_loadu_ps(&x1[20]); + __m128 x06 = _mm_loadu_ps(&x1[24]); + 
__m128 x07 = _mm_loadu_ps(&x1[28]); + + x00 = _mm_and_ps(x00, abs_mask1); + x01 = _mm_and_ps(x01, abs_mask1); + x02 = _mm_and_ps(x02, abs_mask1); + x03 = _mm_and_ps(x03, abs_mask1); + + accum_10 = _mm_add_ps(accum_10, x00); + accum_11 = _mm_add_ps(accum_11, x01); + accum_12 = _mm_add_ps(accum_12, x02); + accum_13 = _mm_add_ps(accum_13, x03); + + x04 = _mm_and_ps(x04, abs_mask1); + x05 = _mm_and_ps(x05, abs_mask1); + x06 = _mm_and_ps(x06, abs_mask1); + x07 = _mm_and_ps(x07, abs_mask1); + + accum_10 = _mm_add_ps(accum_10, x04); + accum_11 = _mm_add_ps(accum_11, x05); + accum_12 = _mm_add_ps(accum_12, x06); + accum_13 = _mm_add_ps(accum_13, x07); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + __m128 x01 = _mm_loadu_ps(&x1[ 4]); + __m128 x02 = _mm_loadu_ps(&x1[ 8]); + __m128 x03 = _mm_loadu_ps(&x1[12]); + + x00 = _mm_and_ps(x00, abs_mask1); + x01 = _mm_and_ps(x01, abs_mask1); + x02 = _mm_and_ps(x02, abs_mask1); + x03 = _mm_and_ps(x03, abs_mask1); + accum_10 = _mm_add_ps(accum_10, x00); + accum_11 = _mm_add_ps(accum_11, x01); + accum_12 = _mm_add_ps(accum_12, x02); + accum_13 = _mm_add_ps(accum_13, x03); + + n2 -= 16; + x1 += 16; + } + + if (n2 >= 8) { + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + __m128 x01 = _mm_loadu_ps(&x1[ 4]); + x00 = _mm_and_ps(x00, abs_mask1); + x01 = _mm_and_ps(x01, abs_mask1); + accum_10 = _mm_add_ps(accum_10, x00); + accum_11 = _mm_add_ps(accum_11, x01); + + n2 -= 8; + x1 += 8; + } + + if (n2 >= 4) { + __m128 x00 = _mm_loadu_ps(&x1[ 0]); + x00 = _mm_and_ps(x00, abs_mask1); + accum_10 = _mm_add_ps(accum_10, x00); + + n2 -= 4; + x1 += 4; + } + + if (n2) { + sumf += (ABS_K(x1[0]) + ABS_K(x1[1])); + } + + accum_10 = _mm_add_ps(accum_10, accum_11); + accum_12 = _mm_add_ps(accum_12, accum_13); + accum_10 = _mm_add_ps(accum_10, accum_12); + + accum_10 = _mm_hadd_ps(accum_10, accum_10); + accum_10 = _mm_hadd_ps(accum_10, accum_10); + + sumf += accum_10[0]; + } + else { + __m512 accum_0, accum_1, accum_2, accum_3; 
+ __m512 x00, x01, x02, x03, x04, x05, x06, x07; + __m512 abs_mask = (__m512)_mm512_set1_epi32(0x7fffffff); + + accum_0 = _mm512_setzero_ps(); + accum_1 = _mm512_setzero_ps(); + accum_2 = _mm512_setzero_ps(); + accum_3 = _mm512_setzero_ps(); + + // alignment has side-effect when the size of input array is not large enough + if (n2 < 256) { + if (n2 >= 128) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[ 16]); + x02 = _mm512_loadu_ps(&x1[ 32]); + x03 = _mm512_loadu_ps(&x1[ 48]); + x04 = _mm512_loadu_ps(&x1[ 64]); + x05 = _mm512_loadu_ps(&x1[ 80]); + x06 = _mm512_loadu_ps(&x1[ 96]); + x07 = _mm512_loadu_ps(&x1[112]); + + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + x04 = _mm512_and_ps(x04, abs_mask); + x05 = _mm512_and_ps(x05, abs_mask); + x06 = _mm512_and_ps(x06, abs_mask); + x07 = _mm512_and_ps(x07, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x04); + accum_1 = _mm512_add_ps(accum_1, x05); + accum_2 = _mm512_add_ps(accum_2, x06); + accum_3 = _mm512_add_ps(accum_3, x07); + + n2 -= 128; + x1 += 128; + } + + if (n2 >= 64) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[16]); + x02 = _mm512_loadu_ps(&x1[32]); + x03 = _mm512_loadu_ps(&x1[48]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x01 = _mm512_loadu_ps(&x1[16]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + accum_0 = 
_mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_loadu_ps(&x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= 16; + x1 += 16; + } + + if (n2) { + uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16 - n2)); + x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &tail_mask16), &x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + } + accum_0 = _mm512_add_ps(accum_0, accum_1); + accum_2 = _mm512_add_ps(accum_2, accum_3); + accum_0 = _mm512_add_ps(accum_0, accum_2); + + sumf = _mm512_reduce_add_ps(accum_0); + } + // n2 >= 256, doing alignment + else { + + int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 2) & 0xf; + + if (0 != align_header) { + uint16_t align_mask16 = (((uint16_t)0xffff) >> (16 - align_header)); + x00 = _mm512_maskz_loadu_ps(*((__mmask16*) &align_mask16), &x1[0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= align_header; + x1 += align_header; + } + + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[ 16]); + x02 = _mm512_load_ps(&x1[ 32]); + x03 = _mm512_load_ps(&x1[ 48]); + x04 = _mm512_load_ps(&x1[ 64]); + x05 = _mm512_load_ps(&x1[ 80]); + x06 = _mm512_load_ps(&x1[ 96]); + x07 = _mm512_load_ps(&x1[112]); + + n2 -= 128; + x1 += 128; + + while (n2 >= 128) { + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x00); + x00 = _mm512_load_ps(&x1[ 0]); + accum_1 = _mm512_add_ps(accum_1, x01); + x01 = _mm512_load_ps(&x1[ 16]); + accum_2 = _mm512_add_ps(accum_2, x02); + x02 = _mm512_load_ps(&x1[ 32]); + accum_3 = _mm512_add_ps(accum_3, x03); + x03 = _mm512_load_ps(&x1[ 48]); + + x04 = _mm512_and_ps(x04, abs_mask); + x05 = _mm512_and_ps(x05, abs_mask); + x06 = _mm512_and_ps(x06, abs_mask); + x07 = 
_mm512_and_ps(x07, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x04); + x04 = _mm512_load_ps(&x1[ 64]); + accum_1 = _mm512_add_ps(accum_1, x05); + x05 = _mm512_load_ps(&x1[ 80]); + accum_2 = _mm512_add_ps(accum_2, x06); + x06 = _mm512_load_ps(&x1[ 96]); + accum_3 = _mm512_add_ps(accum_3, x07); + x07 = _mm512_load_ps(&x1[112]); + + n2 -= 128; + x1 += 128; + } + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + x04 = _mm512_and_ps(x04, abs_mask); + x05 = _mm512_and_ps(x05, abs_mask); + x06 = _mm512_and_ps(x06, abs_mask); + x07 = _mm512_and_ps(x07, abs_mask); + + accum_0 = _mm512_add_ps(accum_0, x04); + accum_1 = _mm512_add_ps(accum_1, x05); + accum_2 = _mm512_add_ps(accum_2, x06); + accum_3 = _mm512_add_ps(accum_3, x07); + + if (n2 >= 64) { + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[16]); + x02 = _mm512_load_ps(&x1[32]); + x03 = _mm512_load_ps(&x1[48]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + x02 = _mm512_and_ps(x02, abs_mask); + x03 = _mm512_and_ps(x03, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + accum_2 = _mm512_add_ps(accum_2, x02); + accum_3 = _mm512_add_ps(accum_3, x03); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_load_ps(&x1[ 0]); + x01 = _mm512_load_ps(&x1[16]); + x00 = _mm512_and_ps(x00, abs_mask); + x01 = _mm512_and_ps(x01, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + accum_1 = _mm512_add_ps(accum_1, x01); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_load_ps(&x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + + n2 -= 16; + x1 += 16; + } + + if (n2) { + uint16_t tail_mask16 = (((uint16_t) 0xffff) >> 
(16 - n2)); + x00 = _mm512_maskz_load_ps(*((__mmask16*) &tail_mask16), &x1[ 0]); + x00 = _mm512_and_ps(x00, abs_mask); + accum_0 = _mm512_add_ps(accum_0, x00); + } + + accum_0 = _mm512_add_ps(accum_0, accum_1); + accum_2 = _mm512_add_ps(accum_2, accum_3); + accum_0 = _mm512_add_ps(accum_0, accum_2); + sumf = _mm512_reduce_add_ps(accum_0); + } + } + + return sumf; +} +#endif diff --git a/kernel/x86_64/zasum.c b/kernel/x86_64/zasum.c new file mode 100644 index 000000000..514ce2434 --- /dev/null +++ b/kernel/x86_64/zasum.c @@ -0,0 +1,144 @@ +#include "common.h" + +#ifndef ABS_K +#define ABS_K(a) ((a) > 0 ? (a) : (-(a))) +#endif + +#if defined(SKYLAKEX) +#include "zasum_microk_skylakex-2.c" +#endif + +#ifndef HAVE_ZASUM_KERNEL +static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) +{ + + BLASLONG i=0; + BLASLONG n_8 = n & -8; + FLOAT *x1 = x; + FLOAT temp0, temp1, temp2, temp3; + FLOAT temp4, temp5, temp6, temp7; + FLOAT sum0 = 0.0; + FLOAT sum1 = 0.0; + FLOAT sum2 = 0.0; + FLOAT sum3 = 0.0; + FLOAT sum4 = 0.0; + + while (i < n_8) { + temp0 = ABS_K(x1[0]); + temp1 = ABS_K(x1[1]); + temp2 = ABS_K(x1[2]); + temp3 = ABS_K(x1[3]); + temp4 = ABS_K(x1[4]); + temp5 = ABS_K(x1[5]); + temp6 = ABS_K(x1[6]); + temp7 = ABS_K(x1[7]); + + sum0 += temp0; + sum1 += temp1; + sum2 += temp2; + sum3 += temp3; + + sum0 += temp4; + sum1 += temp5; + sum2 += temp6; + sum3 += temp7; + + x1+=8; + i+=4; + } + + while (i < n) { + sum4 += ABS_K(x1[0]) + ABS_K(x1[1]); + x1 += 2; + i++; + } + + return sum0+sum1+sum2+sum3+sum4; +} + +#endif + +static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ip = 0; + BLASLONG inc_x2; + FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return(sumf); + if (inc_x == 1) { + sumf = zasum_kernel(n, x); + } + else { + inc_x2 = 2 * inc_x; + + while (i < n) { + sumf += ABS_K(x[ip]) + ABS_K(x[ip + 1]); + ip += inc_x2; + i++; + } + } + + return(sumf); +} + +#if defined(SMP) +static int asum_thread_function(BLASLONG n, + BLASLONG dummy0, 
BLASLONG dummy1, FLOAT dummy2, + FLOAT *x, BLASLONG inc_x, + FLOAT * dummy3, BLASLONG dummy4, + FLOAT * result, BLASLONG dummy5) +{ + *(FLOAT *) result = asum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_value(int mode, + BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, + void *a, BLASLONG lda, + void *b, BLASLONG ldb, + void *c, BLASLONG ldc, + int (*function)(), + int nthread); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha[2]; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); + if (n <= 10000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/10000 ? num_cpu : n/10000; + + if (nthreads == 1) { + sumf = asum_compute(n, x, inc_x); + } + else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT *ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_COMPLEX; +#else + mode = BLAS_DOUBLE | BLAS_COMPLEX; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, dummy_alpha, x, inc_x, + NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2); + } + } +#else + sumf = asum_compute(n, x, inc_x); +#endif + return(sumf); +} diff --git a/kernel/x86_64/zasum_microk_skylakex-2.c b/kernel/x86_64/zasum_microk_skylakex-2.c new file mode 100644 index 000000000..b44c53801 --- /dev/null +++ b/kernel/x86_64/zasum_microk_skylakex-2.c @@ -0,0 +1,340 @@ +/* need a new enough GCC for avx512 support */ +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_ZASUM_KERNEL 1 + +#include + +#include + +static FLOAT zasum_kernel(BLASLONG n, FLOAT *x) +{ + FLOAT *x1 = x; + FLOAT sumf=0.0; + BLASLONG n2 = n + n; + + + if (n2 < 32) { + __m128d accum_10, accum_11, accum_12, accum_13; + __m128d abs_mask1; + + accum_10 = 
_mm_setzero_pd(); + accum_11 = _mm_setzero_pd(); + accum_12 = _mm_setzero_pd(); + accum_13 = _mm_setzero_pd(); + + // abs_mask1 = (__m128d)_mm_set1_epi64x(0x7fffffffffffffff); + abs_mask1 = (__m128d)_mm_cmpeq_epi8((__m128i) abs_mask1, (__m128i) abs_mask1); + abs_mask1 = (__m128d)_mm_srli_epi64((__m128i) abs_mask1, 1); + + _mm_prefetch(&x1[0], _MM_HINT_T0); + if (n2 >= 16){ + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + __m128d x01 = _mm_loadu_pd(&x1[ 2]); + __m128d x02 = _mm_loadu_pd(&x1[ 4]); + __m128d x03 = _mm_loadu_pd(&x1[ 6]); + + _mm_prefetch(&x1[8], _MM_HINT_T0); + __m128d x04 = _mm_loadu_pd(&x1[ 8]); + __m128d x05 = _mm_loadu_pd(&x1[10]); + __m128d x06 = _mm_loadu_pd(&x1[12]); + __m128d x07 = _mm_loadu_pd(&x1[14]); + + x00 = _mm_and_pd(x00, abs_mask1); + x01 = _mm_and_pd(x01, abs_mask1); + x02 = _mm_and_pd(x02, abs_mask1); + x03 = _mm_and_pd(x03, abs_mask1); + + accum_10 = _mm_add_pd(accum_10, x00); + accum_11 = _mm_add_pd(accum_11, x01); + accum_12 = _mm_add_pd(accum_12, x02); + accum_13 = _mm_add_pd(accum_13, x03); + + x04 = _mm_and_pd(x04, abs_mask1); + x05 = _mm_and_pd(x05, abs_mask1); + x06 = _mm_and_pd(x06, abs_mask1); + x07 = _mm_and_pd(x07, abs_mask1); + + accum_10 = _mm_add_pd(accum_10, x04); + accum_11 = _mm_add_pd(accum_11, x05); + accum_12 = _mm_add_pd(accum_12, x06); + accum_13 = _mm_add_pd(accum_13, x07); + + x1 += 16; + n2 -= 16; + } + + if (n2 >= 8) { + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + __m128d x01 = _mm_loadu_pd(&x1[ 2]); + __m128d x02 = _mm_loadu_pd(&x1[ 4]); + __m128d x03 = _mm_loadu_pd(&x1[ 6]); + + x00 = _mm_and_pd(x00, abs_mask1); + x01 = _mm_and_pd(x01, abs_mask1); + x02 = _mm_and_pd(x02, abs_mask1); + x03 = _mm_and_pd(x03, abs_mask1); + accum_10 = _mm_add_pd(accum_10, x00); + accum_11 = _mm_add_pd(accum_11, x01); + accum_12 = _mm_add_pd(accum_12, x02); + accum_13 = _mm_add_pd(accum_13, x03); + + n2 -= 8; + x1 += 8; + } + + if (n2 >= 4) { + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + __m128d x01 = _mm_loadu_pd(&x1[ 2]); + x00 = 
_mm_and_pd(x00, abs_mask1); + x01 = _mm_and_pd(x01, abs_mask1); + accum_10 = _mm_add_pd(accum_10, x00); + accum_11 = _mm_add_pd(accum_11, x01); + + n2 -= 4; + x1 += 4; + } + + if (n2) { + __m128d x00 = _mm_loadu_pd(&x1[ 0]); + x00 = _mm_and_pd(x00, abs_mask1); + accum_10 = _mm_add_pd(accum_10, x00); + } + + accum_10 = _mm_add_pd(accum_10, accum_11); + accum_12 = _mm_add_pd(accum_12, accum_13); + accum_10 = _mm_add_pd(accum_10, accum_12); + + accum_10 = _mm_hadd_pd(accum_10, accum_10); + + sumf = accum_10[0]; + } + else { + __m512d accum_0, accum_1, accum_2, accum_3; + __m512d x00, x01, x02, x03, x04, x05, x06, x07; + __m512d abs_mask = (__m512d)_mm512_set1_epi64(0x7fffffffffffffff); + + accum_0 = _mm512_setzero_pd(); + accum_1 = _mm512_setzero_pd(); + accum_2 = _mm512_setzero_pd(); + accum_3 = _mm512_setzero_pd(); + + // alignment has side-effect when the size of input array is not large enough + if (n2 < 128) { + if (n2 >= 64) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x01 = _mm512_loadu_pd(&x1[ 8]); + x02 = _mm512_loadu_pd(&x1[16]); + x03 = _mm512_loadu_pd(&x1[24]); + x04 = _mm512_loadu_pd(&x1[32]); + x05 = _mm512_loadu_pd(&x1[40]); + x06 = _mm512_loadu_pd(&x1[48]); + x07 = _mm512_loadu_pd(&x1[56]); + + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + x02 = _mm512_and_pd(x02, abs_mask); + x03 = _mm512_and_pd(x03, abs_mask); + + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + x04 = _mm512_and_pd(x04, abs_mask); + x05 = _mm512_and_pd(x05, abs_mask); + x06 = _mm512_and_pd(x06, abs_mask); + x07 = _mm512_and_pd(x07, abs_mask); + + accum_0 = _mm512_add_pd(accum_0, x04); + accum_1 = _mm512_add_pd(accum_1, x05); + accum_2 = _mm512_add_pd(accum_2, x06); + accum_3 = _mm512_add_pd(accum_3, x07); + + n2 -= 64; + x1 += 64; + } + + if (n2 >= 32) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x01 = _mm512_loadu_pd(&x1[ 8]); + x02 = 
_mm512_loadu_pd(&x1[16]); + x03 = _mm512_loadu_pd(&x1[24]); + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + x02 = _mm512_and_pd(x02, abs_mask); + x03 = _mm512_and_pd(x03, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x01 = _mm512_loadu_pd(&x1[ 8]); + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + + n2 -= 16; + x1 += 16; + } + + if (n2 >= 8) { + x00 = _mm512_loadu_pd(&x1[ 0]); + x00 = _mm512_and_pd(x00, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + + n2 -= 8; + x1 += 8; + } + + if (n2) { + unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2)); + x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &tail_mask8), &x1[ 0]); + x00 = _mm512_and_pd(x00, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + } + accum_0 = _mm512_add_pd(accum_0, accum_1); + accum_2 = _mm512_add_pd(accum_2, accum_3); + accum_0 = _mm512_add_pd(accum_0, accum_2); + sumf = _mm512_reduce_add_pd(accum_0); + } + // n2 >= 128, doing alignment + else { + + int align_header = ((64 - ((uintptr_t)x1 & (uintptr_t)0x3f)) >> 3) & 0x7; + + if (0 != align_header) { + unsigned char align_mask8 = (((unsigned char)0xff) >> (8 - align_header)); + x00 = _mm512_maskz_loadu_pd(*((__mmask8*) &align_mask8), &x1[0]); + x00 = _mm512_and_pd(x00, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + + n2 -= align_header; + x1 += align_header; + } + + x00 = _mm512_load_pd(&x1[ 0]); + x01 = _mm512_load_pd(&x1[ 8]); + x02 = _mm512_load_pd(&x1[16]); + x03 = _mm512_load_pd(&x1[24]); + x04 = _mm512_load_pd(&x1[32]); + x05 = _mm512_load_pd(&x1[40]); + x06 = _mm512_load_pd(&x1[48]); + x07 = _mm512_load_pd(&x1[56]); + + n2 -= 64; + x1 += 64; + + while (n2 >= 64) { + x00 = 
_mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + x02 = _mm512_and_pd(x02, abs_mask); + x03 = _mm512_and_pd(x03, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + x00 = _mm512_load_pd(&x1[ 0]); + accum_1 = _mm512_add_pd(accum_1, x01); + x01 = _mm512_load_pd(&x1[ 8]); + accum_2 = _mm512_add_pd(accum_2, x02); + x02 = _mm512_load_pd(&x1[16]); + accum_3 = _mm512_add_pd(accum_3, x03); + x03 = _mm512_load_pd(&x1[24]); + + x04 = _mm512_and_pd(x04, abs_mask); + x05 = _mm512_and_pd(x05, abs_mask); + x06 = _mm512_and_pd(x06, abs_mask); + x07 = _mm512_and_pd(x07, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x04); + x04 = _mm512_load_pd(&x1[32]); + accum_1 = _mm512_add_pd(accum_1, x05); + x05 = _mm512_load_pd(&x1[40]); + accum_2 = _mm512_add_pd(accum_2, x06); + x06 = _mm512_load_pd(&x1[48]); + accum_3 = _mm512_add_pd(accum_3, x07); + x07 = _mm512_load_pd(&x1[56]); + + n2 -= 64; + x1 += 64; + } + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + x02 = _mm512_and_pd(x02, abs_mask); + x03 = _mm512_and_pd(x03, abs_mask); + + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = _mm512_add_pd(accum_3, x03); + + x04 = _mm512_and_pd(x04, abs_mask); + x05 = _mm512_and_pd(x05, abs_mask); + x06 = _mm512_and_pd(x06, abs_mask); + x07 = _mm512_and_pd(x07, abs_mask); + + accum_0 = _mm512_add_pd(accum_0, x04); + accum_1 = _mm512_add_pd(accum_1, x05); + accum_2 = _mm512_add_pd(accum_2, x06); + accum_3 = _mm512_add_pd(accum_3, x07); + + if (n2 >= 32) { + x00 = _mm512_load_pd(&x1[ 0]); + x01 = _mm512_load_pd(&x1[ 8]); + x02 = _mm512_load_pd(&x1[16]); + x03 = _mm512_load_pd(&x1[24]); + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + x02 = _mm512_and_pd(x02, abs_mask); + x03 = _mm512_and_pd(x03, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + accum_2 = _mm512_add_pd(accum_2, x02); + accum_3 = 
_mm512_add_pd(accum_3, x03); + + n2 -= 32; + x1 += 32; + } + + if (n2 >= 16) { + x00 = _mm512_load_pd(&x1[ 0]); + x01 = _mm512_load_pd(&x1[ 8]); + x00 = _mm512_and_pd(x00, abs_mask); + x01 = _mm512_and_pd(x01, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + accum_1 = _mm512_add_pd(accum_1, x01); + + n2 -= 16; + x1 += 16; + } + + if (n2 >= 8) { + x00 = _mm512_load_pd(&x1[ 0]); + x00 = _mm512_and_pd(x00, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + + n2 -= 8; + x1 += 8; + } + + if (n2) { + unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 - n2)); + x00 = _mm512_maskz_load_pd(*((__mmask8*) &tail_mask8), &x1[ 0]); + x00 = _mm512_and_pd(x00, abs_mask); + accum_0 = _mm512_add_pd(accum_0, x00); + } + + accum_0 = _mm512_add_pd(accum_0, accum_1); + accum_2 = _mm512_add_pd(accum_2, accum_3); + accum_0 = _mm512_add_pd(accum_0, accum_2); + sumf = _mm512_reduce_add_pd(accum_0); + } + } + + return sumf; +} +#endif From 9621062ebabcfb8f75a318fbcaf9558b26de9799 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Dec 2020 12:23:30 +0100 Subject: [PATCH 085/121] Update OSX xcode version to 11.5 --- .travis.yml | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/.travis.yml b/.travis.yml index 3f917ce72..909d1eddb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -211,7 +211,7 @@ matrix: - &test-macos os: osx - osx_image: xcode10.1 + osx_image: xcode11.5 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - brew update @@ -238,17 +238,23 @@ matrix: - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" - <<: *test-macos - osx_image: xcode10.1 + osx_image: xcode11.5 + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" + - brew update env: - - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 
-miphoneos-version-min=10.0" +# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" + - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" + - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" - - <<: *test-macos - osx_image: xcode10.1 + osx_image: xcode11.5 env: - - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" - - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" +# - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" +# - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" + - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" + - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" - &test-graviton2 From 77a538d4ba34b2736014346285006b43ece2d0a4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 1 Dec 2020 22:05:35 +0100 Subject: [PATCH 086/121] Update an overlooked instance of xcode 
10.0 as well --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 909d1eddb..7fe2ab388 100644 --- a/.travis.yml +++ b/.travis.yml @@ -233,7 +233,7 @@ matrix: - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - <<: *test-macos - osx_image: xcode10.0 + osx_image: xcode11.5 env: - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" From 0cb7a403b25ebd623f9de97123742c0274fb7147 Mon Sep 17 00:00:00 2001 From: Gengxin Xie Date: Wed, 2 Dec 2020 09:51:52 +0800 Subject: [PATCH 087/121] fix error declare function blas_level1_thread_with_return_value --- kernel/x86_64/casum.c | 2 +- kernel/x86_64/zasum.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/casum.c b/kernel/x86_64/casum.c index dce30e9b0..a1bd76f33 100644 --- a/kernel/x86_64/casum.c +++ b/kernel/x86_64/casum.c @@ -93,7 +93,7 @@ static int asum_thread_function(BLASLONG n, return 0; } -extern int blas_level1_thread_with_value(int mode, +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, diff --git a/kernel/x86_64/zasum.c b/kernel/x86_64/zasum.c index 514ce2434..6e758e2e3 100644 --- a/kernel/x86_64/zasum.c +++ b/kernel/x86_64/zasum.c @@ -93,7 +93,7 @@ static int asum_thread_function(BLASLONG n, return 0; } -extern int blas_level1_thread_with_value(int mode, +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, From c361313564b9909aea1587435d56a0f5ffe8fcf7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Dec 2020 07:49:43 +0100 Subject: [PATCH 088/121] Disable deprecated 32bit xcode --- .travis.yml | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 7fe2ab388..d532899fe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -214,8 +214,6 @@ matrix: osx_image: 
xcode11.5 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - - brew install gcc@8 # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: @@ -232,10 +230,10 @@ matrix: env: - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - - <<: *test-macos - osx_image: xcode11.5 - env: - - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" + # - <<: *test-macos + # osx_image: xcode10 + # env: + # - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" - <<: *test-macos osx_image: xcode11.5 From 57456c248b6b240d396cc628b4e361836afb1a10 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Dec 2020 15:56:21 +0100 Subject: [PATCH 089/121] fix gfortran requirement in osx interface64 test --- .travis.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index d532899fe..83237662f 100644 --- a/.travis.yml +++ b/.travis.yml @@ -216,8 +216,10 @@ matrix: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + - brew update + - brew install gcc-10 env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-8" + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - <<: *test-macos osx_image: xcode12 From dcbb3b5ef1e2aecad926526d21cf080d659eb6fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 2 Dec 2020 23:13:13 +0100 Subject: [PATCH 090/121] fix misplaced lines --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 83237662f..771e70d42 100644 --- a/.travis.yml +++ b/.travis.yml @@ -214,10 +214,10 @@ matrix: osx_image: xcode11.5 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" + - brew update + - brew install gcc@10 script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE - - brew update - - brew install gcc-10 env: - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" From 
72a553f5bc032a2c9fdb08729e6a5e8a0b722d07 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Dec 2020 09:17:27 +0100 Subject: [PATCH 091/121] Update .travis.yml --- .travis.yml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 771e70d42..6c5fb2f96 100644 --- a/.travis.yml +++ b/.travis.yml @@ -214,23 +214,19 @@ matrix: osx_image: xcode11.5 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - - brew install gcc@10 script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" - <<: *test-macos osx_image: xcode12 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - - brew update - - brew install gcc@10 # for gfortran script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" # - <<: *test-macos # osx_image: xcode10 From a6692dc129acdd317f011c6dab1ea0a7e5080931 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Dec 2020 14:32:21 +0100 Subject: [PATCH 092/121] use gfortran-10 with xcode 12 --- .travis.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 6c5fb2f96..bde0e202d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -223,10 +223,12 @@ matrix: osx_image: xcode12 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" + - brew update + - brew install gcc@10 script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" # - <<: *test-macos # osx_image: xcode10 From da0c94c76f1494b50274e9e41227a3f15e4765ba Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Dec 2020 21:25:57 +0100 Subject: 
[PATCH 093/121] Avoid linking both GNU libgomp and LLVM libomp in clang/gfortran builds --- f_check | 3 +++ 1 file changed, 3 insertions(+) diff --git a/f_check b/f_check index 9ef7b8086..cb869b3bb 100644 --- a/f_check +++ b/f_check @@ -330,6 +330,9 @@ if ($link ne "") { $flags =~ s/\@/\,/g; $linker_L .= "-Wl,". $flags . " " ; } + if ($flags =~ /-lgomp/ && $CC == /clang/) { + $flags = "-lomp"; + } if ( ($flags =~ /^\-l/) From 74b585058145ee362ab57fbcbbc5c0d19332b432 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Dec 2020 21:28:10 +0100 Subject: [PATCH 094/121] Add libomp to the LAPACK(-test) dependencies in clang/gfortran builds --- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index a9af62a22..54dd3be41 100644 --- a/Makefile +++ b/Makefile @@ -268,7 +268,11 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc +ifeq ($(C_COMPILER)$(F_COMPILER)$(USE_OPENMP), CLANGGFORTRAN1) + -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB) -lomp" >> $(NETLIB_LAPACK_DIR)/make.inc +else -@echo "LDFLAGS = $(FFLAGS) $(EXTRALIB)" >> $(NETLIB_LAPACK_DIR)/make.inc +endif -@echo "CC = $(CC)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "override CFLAGS = $(LAPACK_CFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc -@echo "AR = $(AR)" >> $(NETLIB_LAPACK_DIR)/make.inc From 41fe6e864ed70860cda1b1ccef09b55caf41fec9 Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 3 Dec 2020 14:40:11 -0600 Subject: [PATCH 095/121] POWER10: Update param.h Increasing the values of DGEMM_DEFAULT_P and DGEMM_DEFAULT_Q helps in improving performance ~10% for DGEMM. 
--- param.h | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/param.h b/param.h index 7789c83c7..ee5ad17fb 100644 --- a/param.h +++ b/param.h @@ -2388,7 +2388,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER9) || defined(POWER10) +#if defined(POWER9) #define SNUMOPT 16 #define DNUMOPT 8 @@ -2426,6 +2426,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER10) +#define SNUMOPT 16 +#define DNUMOPT 8 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 65536 +#define GEMM_DEFAULT_ALIGN 0x0ffffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 832 +#define DGEMM_DEFAULT_P 320 +#define CGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_P 256 + +#define SGEMM_DEFAULT_Q 1026 +#define DGEMM_DEFAULT_Q 960 +#define CGEMM_DEFAULT_Q 1026 +#define ZGEMM_DEFAULT_Q 1026 + +#define SGEMM_DEFAULT_R 4096 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 8 + #undef SBGEMM_DEFAULT_UNROLL_N #undef SBGEMM_DEFAULT_UNROLL_M #undef SBGEMM_DEFAULT_P @@ -2436,10 +2469,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SBGEMM_DEFAULT_P 832 #define SBGEMM_DEFAULT_Q 1026 #define SBGEMM_DEFAULT_R 4096 -#undef DGEMM_DEFAULT_UNROLL_M -#undef DGEMM_DEFAULT_UNROLL_N -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 8 #endif #if defined(SPARC) && defined(V7) From a1eecccda28cf7d00a5ffbbcd5afb4ca6ef6c6a1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 3 Dec 2020 23:43:17 +0100 Subject: [PATCH 096/121] Update f_check --- f_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/f_check b/f_check index cb869b3bb..42241ae10 100644 --- a/f_check +++ b/f_check @@ -330,7 +330,7 @@ if ($link ne "") { $flags =~ s/\@/\,/g; $linker_L .= "-Wl,". $flags . " " ; } - if ($flags =~ /-lgomp/ && $CC == /clang/) { + if ($flags =~ /-lgomp/ && $CC =~ /clang/) { $flags = "-lomp"; } From 213c0e7abb6ab909479e8e956b159c040a1782f8 Mon Sep 17 00:00:00 2001 From: Gordon Fossum Date: Fri, 4 Dec 2020 17:07:06 -0600 Subject: [PATCH 097/121] Added special unrolled vectorized versions of "Solve" for specific sizes, in DTRSM and STRSM, to improve performance in Power9 and Power10. 
--- kernel/power/KERNEL.POWER10 | 18 +- kernel/power/KERNEL.POWER9 | 14 +- kernel/power/trsm_kernel_LN_power10.c | 1280 +++++++++++++++++++++++++ kernel/power/trsm_kernel_LT_power10.c | 1265 ++++++++++++++++++++++++ kernel/power/trsm_kernel_RN_power10.c | 828 ++++++++++++++++ kernel/power/trsm_kernel_RT_power10.c | 855 +++++++++++++++++ 6 files changed, 4244 insertions(+), 16 deletions(-) create mode 100644 kernel/power/trsm_kernel_LN_power10.c create mode 100644 kernel/power/trsm_kernel_LT_power10.c create mode 100644 kernel/power/trsm_kernel_RN_power10.c create mode 100644 kernel/power/trsm_kernel_RT_power10.c diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index c25cd9f04..d61f5194a 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -63,15 +63,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_power10.c +STRSMKERNEL_LT = trsm_kernel_LT_power10.c +STRSMKERNEL_RN = trsm_kernel_RN_power10.c +STRSMKERNEL_RT = trsm_kernel_RT_power10.c + +DTRSMKERNEL_LN = trsm_kernel_LN_power10.c +DTRSMKERNEL_LT = trsm_kernel_LT_power10.c +DTRSMKERNEL_RN = trsm_kernel_RN_power10.c +DTRSMKERNEL_RT = trsm_kernel_RT_power10.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index ab8fbfcd9..2bd2516de 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -52,15 +52,15 @@ ZGEMMOTCOPYOBJ = 
zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_power10.c +STRSMKERNEL_LT = trsm_kernel_LT_power10.c +STRSMKERNEL_RN = trsm_kernel_RN_power10.c +STRSMKERNEL_RT = trsm_kernel_RT_power10.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LN = trsm_kernel_LN_power10.c DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_RN = trsm_kernel_RN_power10.c +DTRSMKERNEL_RT = trsm_kernel_RT_power10.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/power/trsm_kernel_LN_power10.c b/kernel/power/trsm_kernel_LN_power10.c new file mode 100644 index 000000000..5ca1603a6 --- /dev/null +++ b/kernel/power/trsm_kernel_LN_power10.c @@ -0,0 +1,1280 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. 
*/ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + b[56] = (c0[7] *= a[63]); + b[57] = (c1[7] *= a[63]); + b[58] = (c2[7] *= a[63]); + b[59] = (c3[7] *= a[63]); + b[60] = (c4[7] 
*= a[63]); + b[61] = (c5[7] *= a[63]); + b[62] = (c6[7] *= a[63]); + b[63] = (c7[7] *= a[63]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[29], 0); + VbS3 = vec_splat(Vb[29], 1); + VbS4 = vec_splat(Vb[30], 0); + VbS5 = vec_splat(Vb[30], 1); + VbS6 = vec_splat(Vb[31], 0); + VbS7 = vec_splat(Vb[31], 1); + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[28], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[29], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[30], Vc7[2]); + c0[6] -= c0[7] * a[62]; + c1[6] -= c1[7] * a[62]; + c2[6] -= c2[7] * a[62]; + c3[6] -= c3[7] * a[62]; + c4[6] -= c4[7] * a[62]; + c5[6] -= c5[7] * a[62]; + c6[6] -= c6[7] * a[62]; + c7[6] -= c7[7] * a[62]; + + b[48] = (c0[6] *= a[54]); + b[49] = (c1[6] *= a[54]); + b[50] = (c2[6] *= a[54]); + b[51] = (c3[6] *= a[54]); + b[52] = (c4[6] *= a[54]); + b[53] = (c5[6] *= a[54]); + b[54] = (c6[6] *= a[54]); + b[55] = (c7[6] *= a[54]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[25], 0); + VbS3 = vec_splat(Vb[25], 1); + VbS4 = vec_splat(Vb[26], 0); + VbS5 = 
vec_splat(Vb[26], 1); + VbS6 = vec_splat(Vb[27], 0); + VbS7 = vec_splat(Vb[27], 1); + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[24], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[25], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[26], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[24], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[25], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[26], Vc7[2]); + + b[40] = (c0[5] *= a[45]); + b[41] = (c1[5] *= a[45]); + b[42] = (c2[5] *= a[45]); + b[43] = (c3[5] *= a[45]); + b[44] = (c4[5] *= a[45]); + b[45] = (c5[5] *= a[45]); + b[46] = (c6[5] *= a[45]); + b[47] = (c7[5] *= a[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + VbS5 = vec_splat(Vb[22], 1); + VbS6 = vec_splat(Vb[23], 0); + VbS7 = vec_splat(Vb[23], 1); + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[21], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[21], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, 
Va[20], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[20], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[21], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[20], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[21], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[20], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[21], Vc7[1]); + c0[4] -= c0[5] * a[44]; + c1[4] -= c1[5] * a[44]; + c2[4] -= c2[5] * a[44]; + c3[4] -= c3[5] * a[44]; + c4[4] -= c4[5] * a[44]; + c5[4] -= c5[5] * a[44]; + c6[4] -= c6[5] * a[44]; + c7[4] -= c7[5] * a[44]; + + b[32] = (c0[4] *= a[36]); + b[33] = (c1[4] *= a[36]); + b[34] = (c2[4] *= a[36]); + b[35] = (c3[4] *= a[36]); + b[36] = (c4[4] *= a[36]); + b[37] = (c5[4] *= a[36]); + b[38] = (c6[4] *= a[36]); + b[39] = (c7[4] *= a[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + VbS4 = vec_splat(Vb[18], 0); + VbS5 = vec_splat(Vb[18], 1); + VbS6 = vec_splat(Vb[19], 0); + VbS7 = vec_splat(Vb[19], 1); + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[16], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[17], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[16], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[17], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[16], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[17], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[16], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[17], Vc7[1]); + + b[24] = (c0[3] *= a[27]); + b[25] = (c1[3] *= a[27]); + b[26] = (c2[3] *= a[27]); + b[27] = (c3[3] *= a[27]); + b[28] = (c4[3] *= a[27]); + b[29] = (c5[3] *= a[27]); + b[30] = (c6[3] *= a[27]); + b[31] = (c7[3] *= a[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); 
+ VbS2 = vec_splat(Vb[13], 0); + VbS3 = vec_splat(Vb[13], 1); + VbS4 = vec_splat(Vb[14], 0); + VbS5 = vec_splat(Vb[14], 1); + VbS6 = vec_splat(Vb[15], 0); + VbS7 = vec_splat(Vb[15], 1); + Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[12], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[12], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[12], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[12], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[12], Vc7[0]); + c0[2] -= c0[3] * a[26]; + c1[2] -= c1[3] * a[26]; + c2[2] -= c2[3] * a[26]; + c3[2] -= c3[3] * a[26]; + c4[2] -= c4[3] * a[26]; + c5[2] -= c5[3] * a[26]; + c6[2] -= c6[3] * a[26]; + c7[2] -= c7[3] * a[26]; + + b[16] = (c0[2] *= a[18]); + b[17] = (c1[2] *= a[18]); + b[18] = (c2[2] *= a[18]); + b[19] = (c3[2] *= a[18]); + b[20] = (c4[2] *= a[18]); + b[21] = (c5[2] *= a[18]); + b[22] = (c6[2] *= a[18]); + b[23] = (c7[2] *= a[18]); + VbS0 = vec_splat(Vb[ 8], 0); + VbS1 = vec_splat(Vb[ 8], 1); + VbS2 = vec_splat(Vb[ 9], 0); + VbS3 = vec_splat(Vb[ 9], 1); + VbS4 = vec_splat(Vb[10], 0); + VbS5 = vec_splat(Vb[10], 1); + VbS6 = vec_splat(Vb[11], 0); + VbS7 = vec_splat(Vb[11], 1); + Vc0[0] = vec_nmsub(VbS0, Va[8], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[8], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[8], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[8], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[8], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[8], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[8], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[8], Vc7[0]); + + b[ 8] = (c0[1] *= a[9]); + b[ 9] = (c1[1] *= a[9]); + b[10] = (c2[1] *= a[9]); + b[11] = (c3[1] *= a[9]); + b[12] = (c4[1] *= a[9]); + b[13] = (c5[1] *= a[9]); + b[14] = (c6[1] *= a[9]); + b[15] = (c7[1] *= a[9]); + c0[0] -= c0[1] * a[8]; + c1[0] -= c1[1] * a[8]; + c2[0] -= c2[1] * a[8]; + c3[0] -= c3[1] * a[8]; + c4[0] -= c4[1] * a[8]; + c5[0] -= c5[1] * a[8]; + c6[0] -= c6[1] * a[8]; + c7[0] -= c7[1] * a[8]; + + b[0] = (c0[0] 
*= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + int j; + + b[120] = (c0[15] *= a[255]); + b[121] = (c1[15] *= a[255]); + b[122] = (c2[15] *= a[255]); + b[123] = (c3[15] *= a[255]); + b[124] = (c4[15] *= a[255]); + b[125] = (c5[15] *= a[255]); + b[126] = (c6[15] *= a[255]); + b[127] = (c7[15] *= a[255]); + VbS0 = vec_splat(Vb[30], 0); + VbS1 = vec_splat(Vb[30], 1); + VbS2 = vec_splat(Vb[30], 2); + VbS3 = vec_splat(Vb[30], 3); + VbS4 = vec_splat(Vb[31], 0); + VbS5 = vec_splat(Vb[31], 1); + VbS6 = vec_splat(Vb[31], 2); + VbS7 = vec_splat(Vb[31], 3); + Vc0[0] = vec_nmsub(VbS0, Va[60], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[61], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[62], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[60], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[61], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[62], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[60], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[61], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[62], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[60], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[61], Vc3[1]); + Vc3[2] = 
vec_nmsub(VbS3, Va[62], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[60], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[61], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[62], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[60], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[61], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[62], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[60], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[61], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[62], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[60], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[61], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[62], Vc7[2]); + c0[12] -= b[120] * a[252]; + c0[13] -= b[120] * a[253]; + c0[14] -= b[120] * a[254]; + c1[12] -= b[121] * a[252]; + c1[13] -= b[121] * a[253]; + c1[14] -= b[121] * a[254]; + c2[12] -= b[122] * a[252]; + c2[13] -= b[122] * a[253]; + c2[14] -= b[122] * a[254]; + c3[12] -= b[123] * a[252]; + c3[13] -= b[123] * a[253]; + c3[14] -= b[123] * a[254]; + c4[12] -= b[124] * a[252]; + c4[13] -= b[124] * a[253]; + c4[14] -= b[124] * a[254]; + c5[12] -= b[125] * a[252]; + c5[13] -= b[125] * a[253]; + c5[14] -= b[125] * a[254]; + c6[12] -= b[126] * a[252]; + c6[13] -= b[126] * a[253]; + c6[14] -= b[126] * a[254]; + c7[12] -= b[127] * a[252]; + c7[13] -= b[127] * a[253]; + c7[14] -= b[127] * a[254]; + + b[112] = (c0[14] *= a[238]); + b[113] = (c1[14] *= a[238]); + b[114] = (c2[14] *= a[238]); + b[115] = (c3[14] *= a[238]); + b[116] = (c4[14] *= a[238]); + b[117] = (c5[14] *= a[238]); + b[118] = (c6[14] *= a[238]); + b[119] = (c7[14] *= a[238]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[28], 2); + VbS3 = vec_splat(Vb[28], 3); + VbS4 = vec_splat(Vb[29], 0); + VbS5 = vec_splat(Vb[29], 1); + VbS6 = vec_splat(Vb[29], 2); + VbS7 = vec_splat(Vb[29], 3); + Vc0[0] = vec_nmsub(VbS0, Va[56], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[57], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[58], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[56], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[57], Vc1[1]); + Vc1[2] = 
vec_nmsub(VbS1, Va[58], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[56], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[57], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[58], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[56], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[57], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[58], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[56], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[57], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[58], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[56], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[57], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[58], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[56], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[57], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[58], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[56], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[57], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[58], Vc7[2]); + c0[12] -= b[112] * a[236]; + c0[13] -= b[112] * a[237]; + c1[12] -= b[113] * a[236]; + c1[13] -= b[113] * a[237]; + c2[12] -= b[114] * a[236]; + c2[13] -= b[114] * a[237]; + c3[12] -= b[115] * a[236]; + c3[13] -= b[115] * a[237]; + c4[12] -= b[116] * a[236]; + c4[13] -= b[116] * a[237]; + c5[12] -= b[117] * a[236]; + c5[13] -= b[117] * a[237]; + c6[12] -= b[118] * a[236]; + c6[13] -= b[118] * a[237]; + c7[12] -= b[119] * a[236]; + c7[13] -= b[119] * a[237]; + + b[104] = (c0[13] *= a[221]); + b[105] = (c1[13] *= a[221]); + b[106] = (c2[13] *= a[221]); + b[107] = (c3[13] *= a[221]); + b[108] = (c4[13] *= a[221]); + b[109] = (c5[13] *= a[221]); + b[110] = (c6[13] *= a[221]); + b[111] = (c7[13] *= a[221]); + VbS0 = vec_splat(Vb[26], 0); + VbS1 = vec_splat(Vb[26], 1); + VbS2 = vec_splat(Vb[26], 2); + VbS3 = vec_splat(Vb[26], 3); + VbS4 = vec_splat(Vb[27], 0); + VbS5 = vec_splat(Vb[27], 1); + VbS6 = vec_splat(Vb[27], 2); + VbS7 = vec_splat(Vb[27], 3); + Vc0[0] = vec_nmsub(VbS0, Va[52], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[53], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[54], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[52], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[53], 
Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[54], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[52], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[53], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[54], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[52], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[53], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[54], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[52], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[53], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[54], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[52], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[53], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[54], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[52], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[53], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[54], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[52], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[53], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[54], Vc7[2]); + c0[12] -= b[104] * a[220]; + c1[12] -= b[105] * a[220]; + c2[12] -= b[106] * a[220]; + c3[12] -= b[107] * a[220]; + c4[12] -= b[108] * a[220]; + c5[12] -= b[109] * a[220]; + c6[12] -= b[110] * a[220]; + c7[12] -= b[111] * a[220]; + + b[ 96] = (c0[12] *= a[204]); + b[ 97] = (c1[12] *= a[204]); + b[ 98] = (c2[12] *= a[204]); + b[ 99] = (c3[12] *= a[204]); + b[100] = (c4[12] *= a[204]); + b[101] = (c5[12] *= a[204]); + b[102] = (c6[12] *= a[204]); + b[103] = (c7[12] *= a[204]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[24], 2); + VbS3 = vec_splat(Vb[24], 3); + VbS4 = vec_splat(Vb[25], 0); + VbS5 = vec_splat(Vb[25], 1); + VbS6 = vec_splat(Vb[25], 2); + VbS7 = vec_splat(Vb[25], 3); + Vc0[0] = vec_nmsub(VbS0, Va[48], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[49], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[50], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[48], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[49], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[50], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[48], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[49], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[50], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, 
Va[48], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[49], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[50], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[48], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[49], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[50], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[48], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[49], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[50], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[48], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[49], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[50], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[48], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[49], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[50], Vc7[2]); + + b[88] = (c0[11] *= a[187]); + b[89] = (c1[11] *= a[187]); + b[90] = (c2[11] *= a[187]); + b[91] = (c3[11] *= a[187]); + b[92] = (c4[11] *= a[187]); + b[93] = (c5[11] *= a[187]); + b[94] = (c6[11] *= a[187]); + b[95] = (c7[11] *= a[187]); + VbS0 = vec_splat(Vb[22], 0); + VbS1 = vec_splat(Vb[22], 1); + VbS2 = vec_splat(Vb[22], 2); + VbS3 = vec_splat(Vb[22], 3); + VbS4 = vec_splat(Vb[23], 0); + VbS5 = vec_splat(Vb[23], 1); + VbS6 = vec_splat(Vb[23], 2); + VbS7 = vec_splat(Vb[23], 3); + Vc0[0] = vec_nmsub(VbS0, Va[44], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[45], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[44], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[45], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[44], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[45], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[44], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[45], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[44], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[45], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[44], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[45], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[44], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[45], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[44], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[45], Vc7[1]); + c0[ 8] -= b[88] * a[184]; + c0[ 9] -= b[88] * a[185]; + c0[10] -= b[88] * a[186]; + c1[ 8] -= b[89] * a[184]; + c1[ 9] -= b[89] * a[185]; + c1[10] -= b[89] * a[186]; + 
c2[ 8] -= b[90] * a[184]; + c2[ 9] -= b[90] * a[185]; + c2[10] -= b[90] * a[186]; + c3[ 8] -= b[91] * a[184]; + c3[ 9] -= b[91] * a[185]; + c3[10] -= b[91] * a[186]; + c4[ 8] -= b[92] * a[184]; + c4[ 9] -= b[92] * a[185]; + c4[10] -= b[92] * a[186]; + c5[ 8] -= b[93] * a[184]; + c5[ 9] -= b[93] * a[185]; + c5[10] -= b[93] * a[186]; + c6[ 8] -= b[94] * a[184]; + c6[ 9] -= b[94] * a[185]; + c6[10] -= b[94] * a[186]; + c7[ 8] -= b[95] * a[184]; + c7[ 9] -= b[95] * a[185]; + c7[10] -= b[95] * a[186]; + + b[80] = (c0[10] *= a[170]); + b[81] = (c1[10] *= a[170]); + b[82] = (c2[10] *= a[170]); + b[83] = (c3[10] *= a[170]); + b[84] = (c4[10] *= a[170]); + b[85] = (c5[10] *= a[170]); + b[86] = (c6[10] *= a[170]); + b[87] = (c7[10] *= a[170]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[20], 2); + VbS3 = vec_splat(Vb[20], 3); + VbS4 = vec_splat(Vb[21], 0); + VbS5 = vec_splat(Vb[21], 1); + VbS6 = vec_splat(Vb[21], 2); + VbS7 = vec_splat(Vb[21], 3); + Vc0[0] = vec_nmsub(VbS0, Va[40], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[41], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[40], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[41], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[40], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[41], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[40], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[41], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[40], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[41], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[40], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[41], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[40], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[41], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[40], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[41], Vc7[1]); + c0[8] -= b[80] * a[168]; + c0[9] -= b[80] * a[169]; + c1[8] -= b[81] * a[168]; + c1[9] -= b[81] * a[169]; + c2[8] -= b[82] * a[168]; + c2[9] -= b[82] * a[169]; + c3[8] -= b[83] * a[168]; + c3[9] -= b[83] * a[169]; + c4[8] -= b[84] * a[168]; + c4[9] -= b[84] * a[169]; + c5[8] -= b[85] * a[168]; + 
c5[9] -= b[85] * a[169]; + c6[8] -= b[86] * a[168]; + c6[9] -= b[86] * a[169]; + c7[8] -= b[87] * a[168]; + c7[9] -= b[87] * a[169]; + + b[72] = (c0[9] *= a[153]); + b[73] = (c1[9] *= a[153]); + b[74] = (c2[9] *= a[153]); + b[75] = (c3[9] *= a[153]); + b[76] = (c4[9] *= a[153]); + b[77] = (c5[9] *= a[153]); + b[78] = (c6[9] *= a[153]); + b[79] = (c7[9] *= a[153]); + VbS0 = vec_splat(Vb[18], 0); + VbS1 = vec_splat(Vb[18], 1); + VbS2 = vec_splat(Vb[18], 2); + VbS3 = vec_splat(Vb[18], 3); + VbS4 = vec_splat(Vb[19], 0); + VbS5 = vec_splat(Vb[19], 1); + VbS6 = vec_splat(Vb[19], 2); + VbS7 = vec_splat(Vb[19], 3); + Vc0[0] = vec_nmsub(VbS0, Va[36], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[37], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[36], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[37], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[36], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[37], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[36], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[37], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[36], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[37], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[36], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[37], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[36], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[37], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[36], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[37], Vc7[1]); + c0[8] -= b[72] * a[152]; + c1[8] -= b[73] * a[152]; + c2[8] -= b[74] * a[152]; + c3[8] -= b[75] * a[152]; + c4[8] -= b[76] * a[152]; + c5[8] -= b[77] * a[152]; + c6[8] -= b[78] * a[152]; + c7[8] -= b[79] * a[152]; + + b[64] = (c0[8] *= a[136]); + b[65] = (c1[8] *= a[136]); + b[66] = (c2[8] *= a[136]); + b[67] = (c3[8] *= a[136]); + b[68] = (c4[8] *= a[136]); + b[69] = (c5[8] *= a[136]); + b[70] = (c6[8] *= a[136]); + b[71] = (c7[8] *= a[136]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[16], 2); + VbS3 = vec_splat(Vb[16], 3); + VbS4 = vec_splat(Vb[17], 0); + VbS5 = vec_splat(Vb[17], 1); + VbS6 = vec_splat(Vb[17], 2); + VbS7 = 
vec_splat(Vb[17], 3); + Vc0[0] = vec_nmsub(VbS0, Va[32], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[33], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[32], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[33], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[32], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[33], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[32], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[33], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[32], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[33], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[32], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[33], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[32], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[33], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[32], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[33], Vc7[1]); + + b[56] = (c0[7] *= a[119]); + b[57] = (c1[7] *= a[119]); + b[58] = (c2[7] *= a[119]); + b[59] = (c3[7] *= a[119]); + b[60] = (c4[7] *= a[119]); + b[61] = (c5[7] *= a[119]); + b[62] = (c6[7] *= a[119]); + b[63] = (c7[7] *= a[119]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[28], Vc7[0]); + c0[4] -= b[56] * a[116]; + c0[5] -= b[56] * a[117]; + c0[6] -= b[56] * a[118]; + c1[4] -= b[57] * a[116]; + c1[5] -= b[57] * a[117]; + c1[6] -= b[57] * a[118]; + c2[4] -= b[58] * a[116]; + c2[5] -= b[58] * a[117]; + c2[6] -= b[58] * a[118]; + c3[4] -= b[59] * a[116]; + c3[5] -= b[59] * a[117]; + c3[6] -= b[59] * a[118]; + c4[4] -= b[60] * a[116]; + c4[5] -= b[60] * a[117]; + c4[6] -= b[60] * a[118]; + c5[4] -= b[61] * a[116]; + c5[5] 
-= b[61] * a[117]; + c5[6] -= b[61] * a[118]; + c6[4] -= b[62] * a[116]; + c6[5] -= b[62] * a[117]; + c6[6] -= b[62] * a[118]; + c7[4] -= b[63] * a[116]; + c7[5] -= b[63] * a[117]; + c7[6] -= b[63] * a[118]; + + b[48] = (c0[6] *= a[102]); + b[49] = (c1[6] *= a[102]); + b[50] = (c2[6] *= a[102]); + b[51] = (c3[6] *= a[102]); + b[52] = (c4[6] *= a[102]); + b[53] = (c5[6] *= a[102]); + b[54] = (c6[6] *= a[102]); + b[55] = (c7[6] *= a[102]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + VbS7 = vec_splat(Vb[13], 3); + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[24], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[24], Vc7[0]); + c0[4] -= b[48] * a[100]; + c0[5] -= b[48] * a[101]; + c1[4] -= b[49] * a[100]; + c1[5] -= b[49] * a[101]; + c2[4] -= b[50] * a[100]; + c2[5] -= b[50] * a[101]; + c3[4] -= b[51] * a[100]; + c3[5] -= b[51] * a[101]; + c4[4] -= b[52] * a[100]; + c4[5] -= b[52] * a[101]; + c5[4] -= b[53] * a[100]; + c5[5] -= b[53] * a[101]; + c6[4] -= b[54] * a[100]; + c6[5] -= b[54] * a[101]; + c7[4] -= b[55] * a[100]; + c7[5] -= b[55] * a[101]; + + b[40] = (c0[5] *= a[85]); + b[41] = (c1[5] *= a[85]); + b[42] = (c2[5] *= a[85]); + b[43] = (c3[5] *= a[85]); + b[44] = (c4[5] *= a[85]); + b[45] = (c5[5] *= a[85]); + b[46] = (c6[5] *= a[85]); + b[47] = (c7[5] *= a[85]); + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + VbS6 = vec_splat(Vb[11], 2); + VbS7 = vec_splat(Vb[11], 3); + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + 
Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[20], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[20], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[20], Vc7[0]); + c0[4] -= b[40] * a[84]; + c1[4] -= b[41] * a[84]; + c2[4] -= b[42] * a[84]; + c3[4] -= b[43] * a[84]; + c4[4] -= b[44] * a[84]; + c5[4] -= b[45] * a[84]; + c6[4] -= b[46] * a[84]; + c7[4] -= b[47] * a[84]; + + b[32] = (c0[4] *= a[68]); + b[33] = (c1[4] *= a[68]); + b[34] = (c2[4] *= a[68]); + b[35] = (c3[4] *= a[68]); + b[36] = (c4[4] *= a[68]); + b[37] = (c5[4] *= a[68]); + b[38] = (c6[4] *= a[68]); + b[39] = (c7[4] *= a[68]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + VbS5 = vec_splat(Vb[9], 1); + VbS6 = vec_splat(Vb[9], 2); + VbS7 = vec_splat(Vb[9], 3); + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[16], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[16], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[16], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[16], Vc7[0]); + + b[24] = (c0[3] *= a[51]); + b[25] = (c1[3] *= a[51]); + b[26] = (c2[3] *= a[51]); + b[27] = (c3[3] *= a[51]); + b[28] = (c4[3] *= a[51]); + b[29] = (c5[3] *= a[51]); + b[30] = (c6[3] *= a[51]); + b[31] = (c7[3] *= a[51]); + c0[0] -= b[24] * a[48]; + c0[1] -= b[24] * a[49]; + c0[2] -= b[24] * a[50]; + c1[0] -= b[25] * a[48]; + c1[1] -= b[25] * a[49]; + c1[2] -= b[25] * a[50]; + c2[0] -= b[26] * a[48]; + c2[1] -= b[26] * a[49]; + c2[2] -= b[26] * a[50]; + c3[0] -= b[27] * a[48]; + c3[1] -= b[27] * a[49]; + c3[2] -= b[27] * a[50]; + c4[0] -= b[28] * a[48]; + c4[1] -= b[28] * a[49]; + c4[2] -= b[28] * a[50]; + c5[0] -= b[29] * a[48]; + c5[1] -= b[29] * a[49]; 
/*
 * NOTE(review): this region is a line-wrapped unified diff; the stray "+"
 * tokens are diff line markers, not C operators. It holds the tail of the
 * vectorized solve16x8 (single-precision LN variant, started above this
 * chunk), the two scalar fallback solvers, and the CNAME driver for the
 * TRSM LN kernel (so identified by the disabled fprintf below).
 */
+ c5[2] -= b[29] * a[50]; + c6[0] -= b[30] * a[48]; + c6[1] -= b[30] * a[49]; + c6[2] -= b[30] * a[50]; + c7[0] -= b[31] * a[48]; + c7[1] -= b[31] * a[49]; + c7[2] -= b[31] * a[50]; + + b[16] = (c0[2] *= a[34]); + b[17] = (c1[2] *= a[34]); + b[18] = (c2[2] *= a[34]); + b[19] = (c3[2] *= a[34]); + b[20] = (c4[2] *= a[34]); + b[21] = (c5[2] *= a[34]); + b[22] = (c6[2] *= a[34]); + b[23] = (c7[2] *= a[34]); + c0[0] -= b[16] * a[32]; + c0[1] -= b[16] * a[33]; + c1[0] -= b[17] * a[32]; + c1[1] -= b[17] * a[33]; + c2[0] -= b[18] * a[32]; + c2[1] -= b[18] * a[33]; + c3[0] -= b[19] * a[32]; + c3[1] -= b[19] * a[33]; + c4[0] -= b[20] * a[32]; + c4[1] -= b[20] * a[33]; + c5[0] -= b[21] * a[32]; + c5[1] -= b[21] * a[33]; + c6[0] -= b[22] * a[32]; + c6[1] -= b[22] * a[33]; + c7[0] -= b[23] * a[32]; + c7[1] -= b[23] * a[33]; + + b[ 8] = (c0[1] *= a[17]); + b[ 9] = (c1[1] *= a[17]); + b[10] = (c2[1] *= a[17]); + b[11] = (c3[1] *= a[17]); + b[12] = (c4[1] *= a[17]); + b[13] = (c5[1] *= a[17]); + b[14] = (c6[1] *= a[17]); + b[15] = (c7[1] *= a[17]); + c0[0] -= b[ 8] * a[16]; + c1[0] -= b[ 9] * a[16]; + c2[0] -= b[10] * a[16]; + c3[0] -= b[11] * a[16]; + c4[0] -= b[12] * a[16]; + c5[0] -= b[13] * a[16]; + c6[0] -= b[14] * a[16]; + c7[0] -= b[15] * a[16]; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); +} + +#endif +
/*
 * solve(): scalar real-valued fallback, backward substitution (LN).
 * Walks rows i = m-1 .. 0; for each of the n RHS columns it scales the
 * pivot row of C by the packed diagonal entry (the solve multiplies, so
 * the packed diagonal appears to be stored pre-inverted — standard
 * OpenBLAS trsm packing; TODO confirm), stores the result into the packed
 * B panel, and eliminates it from rows k < i. "b -= 2 * n" rewinds B by
 * the n entries just written plus n to step to the previous row's slot.
 */
+static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else +
/*
 * Complex variant of the scalar fallback: same backward-substitution
 * structure with interleaved (re,im) pairs, hence the *2 strides and the
 * doubled ldc. CONJ selects the conjugated multiply in both the pivot
 * scaling and the elimination updates.
 */
+static inline 
__attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + +
/*
 * CNAME: TRSM LN driver. Tiles n by GEMM_UNROLL_N and m by GEMM_UNROLL_M
 * (remainders handled by the power-of-two sweeps), calls GEMM_KERNEL with
 * dm1 = -1 to subtract the already-solved trailing panel, then runs the
 * triangular solve on each diagonal tile. When the A panel is 8-byte
 * aligned and the unroll factors match (well_aligned), the hand-vectorized
 * solve8x8/solve16x8 kernels are used instead of the scalar solve().
 */
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + 
if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + 
} + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_LT_power10.c b/kernel/power/trsm_kernel_LT_power10.c new file mode 100644 index 000000000..14ff12fe4 --- /dev/null +++ b/kernel/power/trsm_kernel_LT_power10.c @@ -0,0 +1,1265 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include
/* NOTE(review): the bare "#include" above lost its target during patch
   transport (an angle-bracketed header was stripped by the extraction).
   Given the vec_splat/vec_nmsub intrinsics used below, the missing header
   is presumably <altivec.h> — confirm against the original upstream patch. */
 + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE +
/*
 * solve8x8 (trsm_kernel_LT_power10.c, double precision): fully unrolled
 * 8x8 triangular solve against 8 RHS columns (c0..c7), forward
 * substitution over rows 0..7 using the packed diagonal entries
 * a[0], a[9], a[18], ..., a[63]. The solve multiplies by the diagonal,
 * so the packed diagonal appears to be stored pre-inverted (standard
 * OpenBLAS trsm packing — TODO confirm). Each solved row is written to
 * the packed B panel and eliminated from the remaining rows via VSX
 * vec_splat/vec_nmsub rank-1 updates (vector FLOAT = 2 doubles), with
 * scalar statements covering the elements that share a vector with the
 * pivot row.
 */
+static inline __attribute__ ((always_inline)) void solve8x8(FLOAT 
*a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; +
/* Row 0: scale by (inverted) diagonal a[0], store to B, then eliminate
   from rows 1..7 of all eight columns. Same pattern repeats per row. */
 + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[1], 0); + VbS3 = vec_splat(Vb[1], 1); + VbS4 = vec_splat(Vb[2], 0); + VbS5 = vec_splat(Vb[2], 1); + VbS6 = vec_splat(Vb[3], 0); + VbS7 = vec_splat(Vb[3], 1); + Vc0[1] = vec_nmsub(VbS0, Va[1], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[2], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[3], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = 
vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + c0[1] -= c0[0] * a[1]; + c1[1] -= c1[0] * a[1]; + c2[1] -= c2[0] * a[1]; + c3[1] -= c3[0] * a[1]; + c4[1] -= c4[0] * a[1]; + c5[1] -= c5[0] * a[1]; + c6[1] -= c6[0] * a[1]; + c7[1] -= c7[0] * a[1]; + + b[ 8] = (c0[1] *= a[9]); + b[ 9] = (c1[1] *= a[9]); + b[10] = (c2[1] *= a[9]); + b[11] = (c3[1] *= a[9]); + b[12] = (c4[1] *= a[9]); + b[13] = (c5[1] *= a[9]); + b[14] = (c6[1] *= a[9]); + b[15] = (c7[1] *= a[9]); + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[5], 0); + VbS3 = vec_splat(Vb[5], 1); + VbS4 = vec_splat(Vb[6], 0); + VbS5 = vec_splat(Vb[6], 1); + VbS6 = vec_splat(Vb[7], 0); + VbS7 = vec_splat(Vb[7], 1); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[5], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[6], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[7], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[7], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[7], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[7], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[7], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[7], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[7], Vc7[3]); + + b[16] = (c0[2] *= a[18]); + b[17] = (c1[2] *= a[18]); + b[18] = (c2[2] *= a[18]); + b[19] = (c3[2] *= 
a[18]); + b[20] = (c4[2] *= a[18]); + b[21] = (c5[2] *= a[18]); + b[22] = (c6[2] *= a[18]); + b[23] = (c7[2] *= a[18]); + VbS0 = vec_splat(Vb[ 8], 0); + VbS1 = vec_splat(Vb[ 8], 1); + VbS2 = vec_splat(Vb[ 9], 0); + VbS3 = vec_splat(Vb[ 9], 1); + VbS4 = vec_splat(Vb[10], 0); + VbS5 = vec_splat(Vb[10], 1); + VbS6 = vec_splat(Vb[11], 0); + VbS7 = vec_splat(Vb[11], 1); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[10], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[11], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[11], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[11], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[11], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[11], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[11], Vc7[3]); + c0[3] -= c0[2] * a[19]; + c1[3] -= c1[2] * a[19]; + c2[3] -= c2[2] * a[19]; + c3[3] -= c3[2] * a[19]; + c4[3] -= c4[2] * a[19]; + c5[3] -= c5[2] * a[19]; + c6[3] -= c6[2] * a[19]; + c7[3] -= c7[2] * a[19]; + + b[24] = (c0[3] *= a[27]); + b[25] = (c1[3] *= a[27]); + b[26] = (c2[3] *= a[27]); + b[27] = (c3[3] *= a[27]); + b[28] = (c4[3] *= a[27]); + b[29] = (c5[3] *= a[27]); + b[30] = (c6[3] *= a[27]); + b[31] = (c7[3] *= a[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + VbS3 = vec_splat(Vb[13], 1); + VbS4 = vec_splat(Vb[14], 0); + VbS5 = vec_splat(Vb[14], 1); + VbS6 = vec_splat(Vb[15], 0); + VbS7 = vec_splat(Vb[15], 1); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = 
vec_nmsub(VbS2, Va[15], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[14], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[15], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[15], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[15], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[15], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[15], Vc7[3]); + + b[32] = (c0[4] *= a[36]); + b[33] = (c1[4] *= a[36]); + b[34] = (c2[4] *= a[36]); + b[35] = (c3[4] *= a[36]); + b[36] = (c4[4] *= a[36]); + b[37] = (c5[4] *= a[36]); + b[38] = (c6[4] *= a[36]); + b[39] = (c7[4] *= a[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + VbS4 = vec_splat(Vb[18], 0); + VbS5 = vec_splat(Vb[18], 1); + VbS6 = vec_splat(Vb[19], 0); + VbS7 = vec_splat(Vb[19], 1); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[19], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[19], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[19], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[19], Vc7[3]); + c0[5] -= c0[4] * a[37]; + c1[5] -= c1[4] * a[37]; + c2[5] -= c2[4] * a[37]; + c3[5] -= c3[4] * a[37]; + c4[5] -= c4[4] * a[37]; + c5[5] -= c5[4] * a[37]; + c6[5] -= c6[4] * a[37]; + c7[5] -= c7[4] * a[37]; + + b[40] = (c0[5] *= a[45]); + b[41] = (c1[5] *= a[45]); + b[42] = (c2[5] *= a[45]); + b[43] = (c3[5] *= a[45]); + b[44] = (c4[5] *= a[45]); + b[45] = (c5[5] *= a[45]); + b[46] = (c6[5] *= a[45]); + b[47] = (c7[5] *= a[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + VbS5 = vec_splat(Vb[22], 1); + VbS6 = vec_splat(Vb[23], 0); + VbS7 = vec_splat(Vb[23], 1); + Vc0[3] = 
vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[23], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[23], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[23], Vc7[3]); + + b[48] = (c0[6] *= a[54]); + b[49] = (c1[6] *= a[54]); + b[50] = (c2[6] *= a[54]); + b[51] = (c3[6] *= a[54]); + b[52] = (c4[6] *= a[54]); + b[53] = (c5[6] *= a[54]); + b[54] = (c6[6] *= a[54]); + b[55] = (c7[6] *= a[54]); + c0[7] -= c0[6] * a[55]; + c1[7] -= c1[6] * a[55]; + c2[7] -= c2[6] * a[55]; + c3[7] -= c3[6] * a[55]; + c4[7] -= c4[6] * a[55]; + c5[7] -= c5[6] * a[55]; + c6[7] -= c6[6] * a[55]; + c7[7] -= c7[6] * a[55]; + + b[56] = (c0[7] *= a[63]); + b[57] = (c1[7] *= a[63]); + b[58] = (c2[7] *= a[63]); + b[59] = (c3[7] *= a[63]); + b[60] = (c4[7] *= a[63]); + b[61] = (c5[7] *= a[63]); + b[62] = (c6[7] *= a[63]); + b[63] = (c7[7] *= a[63]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + int j; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); 
+ b[7] = (c7[0] *= a[0]); + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[0], 2); + VbS3 = vec_splat(Vb[0], 3); + VbS4 = vec_splat(Vb[1], 0); + VbS5 = vec_splat(Vb[1], 1); + VbS6 = vec_splat(Vb[1], 2); + VbS7 = vec_splat(Vb[1], 3); + Vc0[1] = vec_nmsub(VbS0, Va[1], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[2], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[3], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + c0[1] -= b[0] * a[ 1]; + c0[2] -= b[0] * a[ 2]; + c0[3] -= b[0] * a[ 3]; + c1[1] -= b[1] * a[ 1]; + c1[2] -= b[1] * a[ 2]; + c1[3] -= b[1] * a[ 3]; + c2[1] -= b[2] * a[ 1]; + c2[2] -= b[2] * a[ 2]; + c2[3] -= b[2] * a[ 3]; + c3[1] -= b[3] * a[ 1]; + c3[2] -= b[3] * a[ 2]; + c3[3] -= b[3] * a[ 3]; + c4[1] -= b[4] * a[ 1]; + c4[2] -= b[4] * a[ 2]; + c4[3] -= b[4] * a[ 3]; + c5[1] -= b[5] * a[ 1]; + c5[2] -= b[5] * a[ 2]; + c5[3] -= b[5] * a[ 3]; + c6[1] -= b[6] * a[ 1]; + c6[2] -= b[6] * a[ 2]; + c6[3] -= b[6] * a[ 3]; + c7[1] -= b[7] * a[ 1]; + c7[2] -= b[7] * a[ 2]; + c7[3] -= b[7] * a[ 3]; + + b[ 8] = (c0[1] *= a[17]); + b[ 9] = (c1[1] *= a[17]); + b[10] = (c2[1] *= a[17]); + b[11] = 
(c3[1] *= a[17]); + b[12] = (c4[1] *= a[17]); + b[13] = (c5[1] *= a[17]); + b[14] = (c6[1] *= a[17]); + b[15] = (c7[1] *= a[17]); + VbS0 = vec_splat(Vb[2], 0); + VbS1 = vec_splat(Vb[2], 1); + VbS2 = vec_splat(Vb[2], 2); + VbS3 = vec_splat(Vb[2], 3); + VbS4 = vec_splat(Vb[3], 0); + VbS5 = vec_splat(Vb[3], 1); + VbS6 = vec_splat(Vb[3], 2); + VbS7 = vec_splat(Vb[3], 3); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[5], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[6], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[7], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[7], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[7], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[7], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[7], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[7], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[7], Vc7[3]); + c0[2] -= b[ 8] * a[18]; + c0[3] -= b[ 8] * a[19]; + c1[2] -= b[ 9] * a[18]; + c1[3] -= b[ 9] * a[19]; + c2[2] -= b[10] * a[18]; + c2[3] -= b[10] * a[19]; + c3[2] -= b[11] * a[18]; + c3[3] -= b[11] * a[19]; + c4[2] -= b[12] * a[18]; + c4[3] -= b[12] * a[19]; + c5[2] -= b[13] * a[18]; + c5[3] -= b[13] * a[19]; + c6[2] -= b[14] * a[18]; + c6[3] -= b[14] * a[19]; + c7[2] -= b[15] * a[18]; + c7[3] -= b[15] * a[19]; + + b[16] = (c0[2] *= a[34]); + b[17] = (c1[2] *= a[34]); + b[18] = (c2[2] *= a[34]); + b[19] = (c3[2] *= a[34]); + b[20] = (c4[2] *= a[34]); + b[21] = (c5[2] *= a[34]); + b[22] = 
(c6[2] *= a[34]); + b[23] = (c7[2] *= a[34]); + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[4], 2); + VbS3 = vec_splat(Vb[4], 3); + VbS4 = vec_splat(Vb[5], 0); + VbS5 = vec_splat(Vb[5], 1); + VbS6 = vec_splat(Vb[5], 2); + VbS7 = vec_splat(Vb[5], 3); + Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[ 9], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[10], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[11], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[ 9], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[11], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[ 9], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[11], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[ 9], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[11], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[ 9], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[11], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[ 9], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[11], Vc7[3]); + c0[3] -= b[16] * a[35]; + c1[3] -= b[17] * a[35]; + c2[3] -= b[18] * a[35]; + c3[3] -= b[19] * a[35]; + c4[3] -= b[20] * a[35]; + c5[3] -= b[21] * a[35]; + c6[3] -= b[22] * a[35]; + c7[3] -= b[23] * a[35]; + + b[24] = (c0[3] *= a[51]); + b[25] = (c1[3] *= a[51]); + b[26] = (c2[3] *= a[51]); + b[27] = (c3[3] *= a[51]); + b[28] = (c4[3] *= a[51]); + b[29] = (c5[3] *= a[51]); + b[30] = (c6[3] *= a[51]); + b[31] = (c7[3] *= a[51]); + VbS0 = vec_splat(Vb[6], 0); + VbS1 = vec_splat(Vb[6], 1); + VbS2 = vec_splat(Vb[6], 2); + VbS3 = vec_splat(Vb[6], 3); + VbS4 = vec_splat(Vb[7], 0); + VbS5 = vec_splat(Vb[7], 1); + VbS6 = vec_splat(Vb[7], 2); + VbS7 = 
vec_splat(Vb[7], 3); + Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[13], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[14], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[15], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[13], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[15], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[13], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[15], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[13], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[15], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[13], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[15], Vc7[3]); + + b[32] = (c0[4] *= a[68]); + b[33] = (c1[4] *= a[68]); + b[34] = (c2[4] *= a[68]); + b[35] = (c3[4] *= a[68]); + b[36] = (c4[4] *= a[68]); + b[37] = (c5[4] *= a[68]); + b[38] = (c6[4] *= a[68]); + b[39] = (c7[4] *= a[68]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + VbS5 = vec_splat(Vb[9], 1); + VbS6 = vec_splat(Vb[9], 2); + VbS7 = vec_splat(Vb[9], 3); + Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[18], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[19], Vc4[3]); + Vc5[2] = 
vec_nmsub(VbS5, Va[18], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[19], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[18], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[19], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[18], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[19], Vc7[3]); + c0[5] -= b[32] * a[69]; + c0[6] -= b[32] * a[70]; + c0[7] -= b[32] * a[71]; + c1[5] -= b[33] * a[69]; + c1[6] -= b[33] * a[70]; + c1[7] -= b[33] * a[71]; + c2[5] -= b[34] * a[69]; + c2[6] -= b[34] * a[70]; + c2[7] -= b[34] * a[71]; + c3[5] -= b[35] * a[69]; + c3[6] -= b[35] * a[70]; + c3[7] -= b[35] * a[71]; + c4[5] -= b[36] * a[69]; + c4[6] -= b[36] * a[70]; + c4[7] -= b[36] * a[71]; + c5[5] -= b[37] * a[69]; + c5[6] -= b[37] * a[70]; + c5[7] -= b[37] * a[71]; + c6[5] -= b[38] * a[69]; + c6[6] -= b[38] * a[70]; + c6[7] -= b[38] * a[71]; + c7[5] -= b[39] * a[69]; + c7[6] -= b[39] * a[70]; + c7[7] -= b[39] * a[71]; + + b[40] = (c0[5] *= a[85]); + b[41] = (c1[5] *= a[85]); + b[42] = (c2[5] *= a[85]); + b[43] = (c3[5] *= a[85]); + b[44] = (c4[5] *= a[85]); + b[45] = (c5[5] *= a[85]); + b[46] = (c6[5] *= a[85]); + b[47] = (c7[5] *= a[85]); + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + VbS6 = vec_splat(Vb[11], 2); + VbS7 = vec_splat(Vb[11], 3); + Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[22], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[23], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[22], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[23], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, 
Va[22], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[23], Vc7[3]); + c0[6] -= b[40] * a[86]; + c0[7] -= b[40] * a[87]; + c1[6] -= b[41] * a[86]; + c1[7] -= b[41] * a[87]; + c2[6] -= b[42] * a[86]; + c2[7] -= b[42] * a[87]; + c3[6] -= b[43] * a[86]; + c3[7] -= b[43] * a[87]; + c4[6] -= b[44] * a[86]; + c4[7] -= b[44] * a[87]; + c5[6] -= b[45] * a[86]; + c5[7] -= b[45] * a[87]; + c6[6] -= b[46] * a[86]; + c6[7] -= b[46] * a[87]; + c7[6] -= b[47] * a[86]; + c7[7] -= b[47] * a[87]; + + b[48] = (c0[6] *= a[102]); + b[49] = (c1[6] *= a[102]); + b[50] = (c2[6] *= a[102]); + b[51] = (c3[6] *= a[102]); + b[52] = (c4[6] *= a[102]); + b[53] = (c5[6] *= a[102]); + b[54] = (c6[6] *= a[102]); + b[55] = (c7[6] *= a[102]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + VbS7 = vec_splat(Vb[13], 3); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[26], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[27], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[26], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[27], Vc7[3]); + c0[7] -= b[48] * a[103]; + c1[7] -= b[49] * a[103]; + c2[7] -= b[50] * a[103]; + c3[7] -= b[51] * a[103]; + c4[7] -= b[52] * a[103]; + c5[7] -= b[53] * a[103]; + c6[7] -= b[54] * a[103]; + c7[7] -= b[55] * a[103]; + + b[56] = (c0[7] *= a[119]); + b[57] = (c1[7] *= a[119]); + b[58] = (c2[7] *= a[119]); + b[59] = (c3[7] *= a[119]); + 
b[60] = (c4[7] *= a[119]); + b[61] = (c5[7] *= a[119]); + b[62] = (c6[7] *= a[119]); + b[63] = (c7[7] *= a[119]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[30], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[31], Vc7[3]); + + b[64] = (c0[8] *= a[136]); + b[65] = (c1[8] *= a[136]); + b[66] = (c2[8] *= a[136]); + b[67] = (c3[8] *= a[136]); + b[68] = (c4[8] *= a[136]); + b[69] = (c5[8] *= a[136]); + b[70] = (c6[8] *= a[136]); + b[71] = (c7[8] *= a[136]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[16], 2); + VbS3 = vec_splat(Vb[16], 3); + VbS4 = vec_splat(Vb[17], 0); + VbS5 = vec_splat(Vb[17], 1); + VbS6 = vec_splat(Vb[17], 2); + VbS7 = vec_splat(Vb[17], 3); + Vc0[3] = vec_nmsub(VbS0, Va[35], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[35], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[35], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[35], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[35], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[35], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[35], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[35], Vc7[3]); + c0[ 9] -= b[64] * a[137]; + c0[10] -= b[64] * a[138]; + c0[11] -= b[64] * a[139]; + c1[ 9] -= 
b[65] * a[137]; + c1[10] -= b[65] * a[138]; + c1[11] -= b[65] * a[139]; + c2[ 9] -= b[66] * a[137]; + c2[10] -= b[66] * a[138]; + c2[11] -= b[66] * a[139]; + c3[ 9] -= b[67] * a[137]; + c3[10] -= b[67] * a[138]; + c3[11] -= b[67] * a[139]; + c4[ 9] -= b[68] * a[137]; + c4[10] -= b[68] * a[138]; + c4[11] -= b[68] * a[139]; + c5[ 9] -= b[69] * a[137]; + c5[10] -= b[69] * a[138]; + c5[11] -= b[69] * a[139]; + c6[ 9] -= b[70] * a[137]; + c6[10] -= b[70] * a[138]; + c6[11] -= b[70] * a[139]; + c7[ 9] -= b[71] * a[137]; + c7[10] -= b[71] * a[138]; + c7[11] -= b[71] * a[139]; + + b[72] = (c0[9] *= a[153]); + b[73] = (c1[9] *= a[153]); + b[74] = (c2[9] *= a[153]); + b[75] = (c3[9] *= a[153]); + b[76] = (c4[9] *= a[153]); + b[77] = (c5[9] *= a[153]); + b[78] = (c6[9] *= a[153]); + b[79] = (c7[9] *= a[153]); + VbS0 = vec_splat(Vb[18], 0); + VbS1 = vec_splat(Vb[18], 1); + VbS2 = vec_splat(Vb[18], 2); + VbS3 = vec_splat(Vb[18], 3); + VbS4 = vec_splat(Vb[19], 0); + VbS5 = vec_splat(Vb[19], 1); + VbS6 = vec_splat(Vb[19], 2); + VbS7 = vec_splat(Vb[19], 3); + Vc0[3] = vec_nmsub(VbS0, Va[39], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[39], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[39], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[39], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[39], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[39], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[39], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[39], Vc7[3]); + c0[10] -= b[72] * a[154]; + c0[11] -= b[72] * a[155]; + c1[10] -= b[73] * a[154]; + c1[11] -= b[73] * a[155]; + c2[10] -= b[74] * a[154]; + c2[11] -= b[74] * a[155]; + c3[10] -= b[75] * a[154]; + c3[11] -= b[75] * a[155]; + c4[10] -= b[76] * a[154]; + c4[11] -= b[76] * a[155]; + c5[10] -= b[77] * a[154]; + c5[11] -= b[77] * a[155]; + c6[10] -= b[78] * a[154]; + c6[11] -= b[78] * a[155]; + c7[10] -= b[79] * a[154]; + c7[11] -= b[79] * a[155]; + + b[80] = (c0[10] *= a[170]); + b[81] = (c1[10] *= a[170]); + b[82] = (c2[10] *= a[170]); + b[83] = (c3[10] *= a[170]); + b[84] = (c4[10] 
*= a[170]); + b[85] = (c5[10] *= a[170]); + b[86] = (c6[10] *= a[170]); + b[87] = (c7[10] *= a[170]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[20], 2); + VbS3 = vec_splat(Vb[20], 3); + VbS4 = vec_splat(Vb[21], 0); + VbS5 = vec_splat(Vb[21], 1); + VbS6 = vec_splat(Vb[21], 2); + VbS7 = vec_splat(Vb[21], 3); + Vc0[3] = vec_nmsub(VbS0, Va[43], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[43], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[43], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[43], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[43], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[43], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[43], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[43], Vc7[3]); + c0[11] -= b[80] * a[171]; + c1[11] -= b[81] * a[171]; + c2[11] -= b[82] * a[171]; + c3[11] -= b[83] * a[171]; + c4[11] -= b[84] * a[171]; + c5[11] -= b[85] * a[171]; + c6[11] -= b[86] * a[171]; + c7[11] -= b[87] * a[171]; + + b[88] = (c0[11] *= a[187]); + b[89] = (c1[11] *= a[187]); + b[90] = (c2[11] *= a[187]); + b[91] = (c3[11] *= a[187]); + b[92] = (c4[11] *= a[187]); + b[93] = (c5[11] *= a[187]); + b[94] = (c6[11] *= a[187]); + b[95] = (c7[11] *= a[187]); + VbS0 = vec_splat(Vb[22], 0); + VbS1 = vec_splat(Vb[22], 1); + VbS2 = vec_splat(Vb[22], 2); + VbS3 = vec_splat(Vb[22], 3); + VbS4 = vec_splat(Vb[23], 0); + VbS5 = vec_splat(Vb[23], 1); + VbS6 = vec_splat(Vb[23], 2); + VbS7 = vec_splat(Vb[23], 3); + Vc0[3] = vec_nmsub(VbS0, Va[47], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[47], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[47], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[47], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[47], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[47], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[47], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[47], Vc7[3]); + + b[ 96] = (c0[12] *= a[204]); + b[ 97] = (c1[12] *= a[204]); + b[ 98] = (c2[12] *= a[204]); + b[ 99] = (c3[12] *= a[204]); + b[100] = (c4[12] *= a[204]); + b[101] = (c5[12] *= a[204]); + b[102] = (c6[12] *= a[204]); + b[103] = 
(c7[12] *= a[204]); + c0[13] -= b[ 96] * a[205]; + c0[14] -= b[ 96] * a[206]; + c0[15] -= b[ 96] * a[207]; + c1[13] -= b[ 97] * a[205]; + c1[14] -= b[ 97] * a[206]; + c1[15] -= b[ 97] * a[207]; + c2[13] -= b[ 98] * a[205]; + c2[14] -= b[ 98] * a[206]; + c2[15] -= b[ 98] * a[207]; + c3[13] -= b[ 99] * a[205]; + c3[14] -= b[ 99] * a[206]; + c3[15] -= b[ 99] * a[207]; + c4[13] -= b[100] * a[205]; + c4[14] -= b[100] * a[206]; + c4[15] -= b[100] * a[207]; + c5[13] -= b[101] * a[205]; + c5[14] -= b[101] * a[206]; + c5[15] -= b[101] * a[207]; + c6[13] -= b[102] * a[205]; + c6[14] -= b[102] * a[206]; + c6[15] -= b[102] * a[207]; + c7[13] -= b[103] * a[205]; + c7[14] -= b[103] * a[206]; + c7[15] -= b[103] * a[207]; + + b[104] = (c0[13] *= a[221]); + b[105] = (c1[13] *= a[221]); + b[106] = (c2[13] *= a[221]); + b[107] = (c3[13] *= a[221]); + b[108] = (c4[13] *= a[221]); + b[109] = (c5[13] *= a[221]); + b[110] = (c6[13] *= a[221]); + b[111] = (c7[13] *= a[221]); + c0[14] -= b[104] * a[222]; + c0[15] -= b[104] * a[223]; + c1[14] -= b[105] * a[222]; + c1[15] -= b[105] * a[223]; + c2[14] -= b[106] * a[222]; + c2[15] -= b[106] * a[223]; + c3[14] -= b[107] * a[222]; + c3[15] -= b[107] * a[223]; + c4[14] -= b[108] * a[222]; + c4[15] -= b[108] * a[223]; + c5[14] -= b[109] * a[222]; + c5[15] -= b[109] * a[223]; + c6[14] -= b[110] * a[222]; + c6[15] -= b[110] * a[223]; + c7[14] -= b[111] * a[222]; + c7[15] -= b[111] * a[223]; + + b[112] = (c0[14] *= a[238]); + b[113] = (c1[14] *= a[238]); + b[114] = (c2[14] *= a[238]); + b[115] = (c3[14] *= a[238]); + b[116] = (c4[14] *= a[238]); + b[117] = (c5[14] *= a[238]); + b[118] = (c6[14] *= a[238]); + b[119] = (c7[14] *= a[238]); + c0[15] -= b[112] * a[239]; + c1[15] -= b[113] * a[239]; + c2[15] -= b[114] * a[239]; + c3[15] -= b[115] * a[239]; + c4[15] -= b[116] * a[239]; + c5[15] -= b[117] * a[239]; + c6[15] -= b[118] * a[239]; + c7[15] -= b[119] * a[239]; + + b[120] = (c0[15] *= a[255]); + b[121] = (c1[15] *= a[255]); + b[122] = (c2[15] *= 
a[255]); + b[123] = (c3[15] *= a[255]); + b[124] = (c4[15] *= a[255]); + b[125] = (c5[15] *= a[255]); + b[126] = (c6[15] *= a[255]); + b[127] = (c7[15] *= a[255]); +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = 
%3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, 
ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_RN_power10.c b/kernel/power/trsm_kernel_RN_power10.c new file mode 100644 index 000000000..92c26fcc3 --- /dev/null +++ b/kernel/power/trsm_kernel_RN_power10.c @@ -0,0 +1,828 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include "common.h" +#include + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT 
*a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6; + + a[0] = (c0[0] *= b[0]); + a[1] = (c0[1] *= b[0]); + a[2] = (c0[2] *= b[0]); + a[3] = (c0[3] *= b[0]); + a[4] = (c0[4] *= b[0]); + a[5] = (c0[5] *= b[0]); + a[6] = (c0[6] *= b[0]); + a[7] = (c0[7] *= b[0]); + VbS0 = vec_splat(Vb[0], 1); + VbS1 = vec_splat(Vb[1], 0); + VbS2 = vec_splat(Vb[1], 1); + VbS3 = vec_splat(Vb[2], 0); + VbS4 = vec_splat(Vb[2], 1); + VbS5 = vec_splat(Vb[3], 0); + VbS6 = vec_splat(Vb[3], 1); + Vc1[0] = vec_nmsub(Vc0[ 0], VbS0, Vc1[0]); + Vc1[1] = vec_nmsub(Vc0[ 1], VbS0, Vc1[1]); + Vc1[2] = vec_nmsub(Vc0[ 2], VbS0, Vc1[2]); + Vc1[3] = vec_nmsub(Vc0[ 3], VbS0, Vc1[3]); + Vc2[0] = vec_nmsub(Vc0[ 0], VbS1, Vc2[0]); + Vc2[1] = vec_nmsub(Vc0[ 1], VbS1, Vc2[1]); + Vc2[2] = vec_nmsub(Vc0[ 2], VbS1, Vc2[2]); + Vc2[3] = vec_nmsub(Vc0[ 3], VbS1, Vc2[3]); + Vc3[0] = vec_nmsub(Vc0[ 0], VbS2, Vc3[0]); + Vc3[1] = vec_nmsub(Vc0[ 1], VbS2, Vc3[1]); + Vc3[2] = vec_nmsub(Vc0[ 2], VbS2, Vc3[2]); + Vc3[3] = vec_nmsub(Vc0[ 3], VbS2, Vc3[3]); + Vc4[0] = vec_nmsub(Vc0[ 0], VbS3, Vc4[0]); + Vc4[1] = vec_nmsub(Vc0[ 1], VbS3, Vc4[1]); + Vc4[2] = vec_nmsub(Vc0[ 2], VbS3, Vc4[2]); + Vc4[3] = vec_nmsub(Vc0[ 3], VbS3, Vc4[3]); + Vc5[0] = vec_nmsub(Vc0[ 0], VbS4, Vc5[0]); + Vc5[1] = vec_nmsub(Vc0[ 1], VbS4, Vc5[1]); + Vc5[2] = vec_nmsub(Vc0[ 2], VbS4, Vc5[2]); + Vc5[3] = vec_nmsub(Vc0[ 3], VbS4, Vc5[3]); + 
Vc6[0] = vec_nmsub(Vc0[ 0], VbS5, Vc6[0]); + Vc6[1] = vec_nmsub(Vc0[ 1], VbS5, Vc6[1]); + Vc6[2] = vec_nmsub(Vc0[ 2], VbS5, Vc6[2]); + Vc6[3] = vec_nmsub(Vc0[ 3], VbS5, Vc6[3]); + Vc7[0] = vec_nmsub(Vc0[ 0], VbS6, Vc7[0]); + Vc7[1] = vec_nmsub(Vc0[ 1], VbS6, Vc7[1]); + Vc7[2] = vec_nmsub(Vc0[ 2], VbS6, Vc7[2]); + Vc7[3] = vec_nmsub(Vc0[ 3], VbS6, Vc7[3]); + + a[ 8] = (c1[0] *= b[9]); + a[ 9] = (c1[1] *= b[9]); + a[10] = (c1[2] *= b[9]); + a[11] = (c1[3] *= b[9]); + a[12] = (c1[4] *= b[9]); + a[13] = (c1[5] *= b[9]); + a[14] = (c1[6] *= b[9]); + a[15] = (c1[7] *= b[9]); + VbS0 = vec_splat(Vb[5], 0); + VbS1 = vec_splat(Vb[5], 1); + VbS2 = vec_splat(Vb[6], 0); + VbS3 = vec_splat(Vb[6], 1); + VbS4 = vec_splat(Vb[7], 0); + VbS5 = vec_splat(Vb[7], 1); + Vc2[0] = vec_nmsub(Vc1[0], VbS0, Vc2[0]); + Vc2[1] = vec_nmsub(Vc1[1], VbS0, Vc2[1]); + Vc2[2] = vec_nmsub(Vc1[2], VbS0, Vc2[2]); + Vc2[3] = vec_nmsub(Vc1[3], VbS0, Vc2[3]); + Vc3[0] = vec_nmsub(Vc1[0], VbS1, Vc3[0]); + Vc3[1] = vec_nmsub(Vc1[1], VbS1, Vc3[1]); + Vc3[2] = vec_nmsub(Vc1[2], VbS1, Vc3[2]); + Vc3[3] = vec_nmsub(Vc1[3], VbS1, Vc3[3]); + Vc4[0] = vec_nmsub(Vc1[0], VbS2, Vc4[0]); + Vc4[1] = vec_nmsub(Vc1[1], VbS2, Vc4[1]); + Vc4[2] = vec_nmsub(Vc1[2], VbS2, Vc4[2]); + Vc4[3] = vec_nmsub(Vc1[3], VbS2, Vc4[3]); + Vc5[0] = vec_nmsub(Vc1[0], VbS3, Vc5[0]); + Vc5[1] = vec_nmsub(Vc1[1], VbS3, Vc5[1]); + Vc5[2] = vec_nmsub(Vc1[2], VbS3, Vc5[2]); + Vc5[3] = vec_nmsub(Vc1[3], VbS3, Vc5[3]); + Vc6[0] = vec_nmsub(Vc1[0], VbS4, Vc6[0]); + Vc6[1] = vec_nmsub(Vc1[1], VbS4, Vc6[1]); + Vc6[2] = vec_nmsub(Vc1[2], VbS4, Vc6[2]); + Vc6[3] = vec_nmsub(Vc1[3], VbS4, Vc6[3]); + Vc7[0] = vec_nmsub(Vc1[0], VbS5, Vc7[0]); + Vc7[1] = vec_nmsub(Vc1[1], VbS5, Vc7[1]); + Vc7[2] = vec_nmsub(Vc1[2], VbS5, Vc7[2]); + Vc7[3] = vec_nmsub(Vc1[3], VbS5, Vc7[3]); + + a[16] = (c2[0] *= b[18]); + a[17] = (c2[1] *= b[18]); + a[18] = (c2[2] *= b[18]); + a[19] = (c2[3] *= b[18]); + a[20] = (c2[4] *= b[18]); + a[21] = (c2[5] *= b[18]); + a[22] = (c2[6] 
*= b[18]); + a[23] = (c2[7] *= b[18]); + VbS0 = vec_splat(Vb[ 9], 1); + VbS1 = vec_splat(Vb[10], 0); + VbS2 = vec_splat(Vb[10], 1); + VbS3 = vec_splat(Vb[11], 0); + VbS4 = vec_splat(Vb[11], 1); + Vc3[0] = vec_nmsub(Vc2[0], VbS0, Vc3[0]); + Vc3[1] = vec_nmsub(Vc2[1], VbS0, Vc3[1]); + Vc3[2] = vec_nmsub(Vc2[2], VbS0, Vc3[2]); + Vc3[3] = vec_nmsub(Vc2[3], VbS0, Vc3[3]); + Vc4[0] = vec_nmsub(Vc2[0], VbS1, Vc4[0]); + Vc4[1] = vec_nmsub(Vc2[1], VbS1, Vc4[1]); + Vc4[2] = vec_nmsub(Vc2[2], VbS1, Vc4[2]); + Vc4[3] = vec_nmsub(Vc2[3], VbS1, Vc4[3]); + Vc5[0] = vec_nmsub(Vc2[0], VbS2, Vc5[0]); + Vc5[1] = vec_nmsub(Vc2[1], VbS2, Vc5[1]); + Vc5[2] = vec_nmsub(Vc2[2], VbS2, Vc5[2]); + Vc5[3] = vec_nmsub(Vc2[3], VbS2, Vc5[3]); + Vc6[0] = vec_nmsub(Vc2[0], VbS3, Vc6[0]); + Vc6[1] = vec_nmsub(Vc2[1], VbS3, Vc6[1]); + Vc6[2] = vec_nmsub(Vc2[2], VbS3, Vc6[2]); + Vc6[3] = vec_nmsub(Vc2[3], VbS3, Vc6[3]); + Vc7[0] = vec_nmsub(Vc2[0], VbS4, Vc7[0]); + Vc7[1] = vec_nmsub(Vc2[1], VbS4, Vc7[1]); + Vc7[2] = vec_nmsub(Vc2[2], VbS4, Vc7[2]); + Vc7[3] = vec_nmsub(Vc2[3], VbS4, Vc7[3]); + + a[24] = (c3[0] *= b[27]); + a[25] = (c3[1] *= b[27]); + a[26] = (c3[2] *= b[27]); + a[27] = (c3[3] *= b[27]); + a[28] = (c3[4] *= b[27]); + a[29] = (c3[5] *= b[27]); + a[30] = (c3[6] *= b[27]); + a[31] = (c3[7] *= b[27]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[15], 0); + VbS3 = vec_splat(Vb[15], 1); + Vc4[0] = vec_nmsub(Vc3[0], VbS0, Vc4[0]); + Vc4[1] = vec_nmsub(Vc3[1], VbS0, Vc4[1]); + Vc4[2] = vec_nmsub(Vc3[2], VbS0, Vc4[2]); + Vc4[3] = vec_nmsub(Vc3[3], VbS0, Vc4[3]); + Vc5[0] = vec_nmsub(Vc3[0], VbS1, Vc5[0]); + Vc5[1] = vec_nmsub(Vc3[1], VbS1, Vc5[1]); + Vc5[2] = vec_nmsub(Vc3[2], VbS1, Vc5[2]); + Vc5[3] = vec_nmsub(Vc3[3], VbS1, Vc5[3]); + Vc6[0] = vec_nmsub(Vc3[0], VbS2, Vc6[0]); + Vc6[1] = vec_nmsub(Vc3[1], VbS2, Vc6[1]); + Vc6[2] = vec_nmsub(Vc3[2], VbS2, Vc6[2]); + Vc6[3] = vec_nmsub(Vc3[3], VbS2, Vc6[3]); + Vc7[0] = vec_nmsub(Vc3[0], VbS3, Vc7[0]); + 
Vc7[1] = vec_nmsub(Vc3[1], VbS3, Vc7[1]); + Vc7[2] = vec_nmsub(Vc3[2], VbS3, Vc7[2]); + Vc7[3] = vec_nmsub(Vc3[3], VbS3, Vc7[3]); + + a[32] = (c4[0] *= b[36]); + a[33] = (c4[1] *= b[36]); + a[34] = (c4[2] *= b[36]); + a[35] = (c4[3] *= b[36]); + a[36] = (c4[4] *= b[36]); + a[37] = (c4[5] *= b[36]); + a[38] = (c4[6] *= b[36]); + a[39] = (c4[7] *= b[36]); + VbS0 = vec_splat(Vb[18], 1); + VbS1 = vec_splat(Vb[19], 0); + VbS2 = vec_splat(Vb[19], 1); + Vc5[0] = vec_nmsub(Vc4[0], VbS0, Vc5[0]); + Vc5[1] = vec_nmsub(Vc4[1], VbS0, Vc5[1]); + Vc5[2] = vec_nmsub(Vc4[2], VbS0, Vc5[2]); + Vc5[3] = vec_nmsub(Vc4[3], VbS0, Vc5[3]); + Vc6[0] = vec_nmsub(Vc4[0], VbS1, Vc6[0]); + Vc6[1] = vec_nmsub(Vc4[1], VbS1, Vc6[1]); + Vc6[2] = vec_nmsub(Vc4[2], VbS1, Vc6[2]); + Vc6[3] = vec_nmsub(Vc4[3], VbS1, Vc6[3]); + Vc7[0] = vec_nmsub(Vc4[0], VbS2, Vc7[0]); + Vc7[1] = vec_nmsub(Vc4[1], VbS2, Vc7[1]); + Vc7[2] = vec_nmsub(Vc4[2], VbS2, Vc7[2]); + Vc7[3] = vec_nmsub(Vc4[3], VbS2, Vc7[3]); + + a[40] = (c5[0] *= b[45]); + a[41] = (c5[1] *= b[45]); + a[42] = (c5[2] *= b[45]); + a[43] = (c5[3] *= b[45]); + a[44] = (c5[4] *= b[45]); + a[45] = (c5[5] *= b[45]); + a[46] = (c5[6] *= b[45]); + a[47] = (c5[7] *= b[45]); + VbS0 = vec_splat(Vb[23], 0); + VbS1 = vec_splat(Vb[23], 1); + Vc6[0] = vec_nmsub(Vc5[0], VbS0, Vc6[0]); + Vc6[1] = vec_nmsub(Vc5[1], VbS0, Vc6[1]); + Vc6[2] = vec_nmsub(Vc5[2], VbS0, Vc6[2]); + Vc6[3] = vec_nmsub(Vc5[3], VbS0, Vc6[3]); + Vc7[0] = vec_nmsub(Vc5[0], VbS1, Vc7[0]); + Vc7[1] = vec_nmsub(Vc5[1], VbS1, Vc7[1]); + Vc7[2] = vec_nmsub(Vc5[2], VbS1, Vc7[2]); + Vc7[3] = vec_nmsub(Vc5[3], VbS1, Vc7[3]); + + a[48] = (c6[0] *= b[54]); + a[49] = (c6[1] *= b[54]); + a[50] = (c6[2] *= b[54]); + a[51] = (c6[3] *= b[54]); + a[52] = (c6[4] *= b[54]); + a[53] = (c6[5] *= b[54]); + a[54] = (c6[6] *= b[54]); + a[55] = (c6[7] *= b[54]); + VbS0 = vec_splat(Vb[27], 1); + Vc7[0] = vec_nmsub(Vc6[0], VbS0, Vc7[0]); + Vc7[1] = vec_nmsub(Vc6[1], VbS0, Vc7[1]); + Vc7[2] = vec_nmsub(Vc6[2], VbS0, 
Vc7[2]); + Vc7[3] = vec_nmsub(Vc6[3], VbS0, Vc7[3]); + + a[56] = (c7[0] *= b[63]); + a[57] = (c7[1] *= b[63]); + a[58] = (c7[2] *= b[63]); + a[59] = (c7[3] *= b[63]); + a[60] = (c7[4] *= b[63]); + a[61] = (c7[5] *= b[63]); + a[62] = (c7[6] *= b[63]); + a[63] = (c7[7] *= b[63]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[0], 2); + VbS3 = vec_splat(Vb[0], 3); + VbS4 = vec_splat(Vb[1], 0); + VbS5 = vec_splat(Vb[1], 1); + VbS6 = vec_splat(Vb[1], 2); + VbS7 = vec_splat(Vb[1], 3); + + Vc0[ 0] = vec_mul(VbS0, Vc0[ 0]); + Vc0[ 1] = vec_mul(VbS0, Vc0[ 1]); + Vc0[ 2] = vec_mul(VbS0, Vc0[ 2]); + Vc0[ 3] = vec_mul(VbS0, Vc0[ 3]); + Va[0] = Vc0[0]; + Va[1] = Vc0[1]; + Va[2] = Vc0[2]; + Va[3] = Vc0[3]; + Vc1[0] = vec_nmsub(VbS1, Va[0], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[0], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[0], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, 
Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[0], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[0], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[0] = vec_nmsub(VbS6, Va[0], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[0] = vec_nmsub(VbS7, Va[0], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + + VbS0 = vec_splat(Vb[2], 1); + VbS1 = vec_splat(Vb[2], 2); + VbS2 = vec_splat(Vb[2], 3); + VbS3 = vec_splat(Vb[3], 0); + VbS4 = vec_splat(Vb[3], 1); + VbS5 = vec_splat(Vb[3], 2); + VbS6 = vec_splat(Vb[3], 3); + + Vc1[0] = vec_mul(VbS0, Vc1[0]); + Vc1[1] = vec_mul(VbS0, Vc1[1]); + Vc1[2] = vec_mul(VbS0, Vc1[2]); + Vc1[3] = vec_mul(VbS0, Vc1[3]); + Va[4] = Vc1[0]; + Va[5] = Vc1[1]; + Va[6] = Vc1[2]; + Va[7] = Vc1[3]; + Vc2[0] = vec_nmsub(VbS1, Va[4], Vc2[0]); + Vc2[1] = vec_nmsub(VbS1, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS1, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS1, Va[7], Vc2[3]); + Vc3[0] = vec_nmsub(VbS2, Va[4], Vc3[0]); + Vc3[1] = vec_nmsub(VbS2, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS2, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS2, Va[7], Vc3[3]); + Vc4[0] = vec_nmsub(VbS3, Va[4], Vc4[0]); + Vc4[1] = vec_nmsub(VbS3, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS3, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS3, Va[7], Vc4[3]); + Vc5[0] = vec_nmsub(VbS4, Va[4], Vc5[0]); + Vc5[1] = vec_nmsub(VbS4, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS4, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS4, Va[7], Vc5[3]); + Vc6[0] = vec_nmsub(VbS5, Va[4], Vc6[0]); + Vc6[1] = vec_nmsub(VbS5, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS5, Va[6], Vc6[2]); + Vc6[3] = 
vec_nmsub(VbS5, Va[7], Vc6[3]); + Vc7[0] = vec_nmsub(VbS6, Va[4], Vc7[0]); + Vc7[1] = vec_nmsub(VbS6, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS6, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS6, Va[7], Vc7[3]); + + VbS0 = vec_splat(Vb[4], 2); + VbS1 = vec_splat(Vb[4], 3); + VbS2 = vec_splat(Vb[5], 0); + VbS3 = vec_splat(Vb[5], 1); + VbS4 = vec_splat(Vb[5], 2); + VbS5 = vec_splat(Vb[5], 3); + + Vc2[0] = vec_mul(VbS0, Vc2[0]); + Vc2[1] = vec_mul(VbS0, Vc2[1]); + Vc2[2] = vec_mul(VbS0, Vc2[2]); + Vc2[3] = vec_mul(VbS0, Vc2[3]); + Va[ 8] = Vc2[0]; + Va[ 9] = Vc2[1]; + Va[10] = Vc2[2]; + Va[11] = Vc2[3]; + Vc3[0] = vec_nmsub(VbS1, Va[ 8], Vc3[0]); + Vc3[1] = vec_nmsub(VbS1, Va[ 9], Vc3[1]); + Vc3[2] = vec_nmsub(VbS1, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS1, Va[11], Vc3[3]); + Vc4[0] = vec_nmsub(VbS2, Va[ 8], Vc4[0]); + Vc4[1] = vec_nmsub(VbS2, Va[ 9], Vc4[1]); + Vc4[2] = vec_nmsub(VbS2, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS2, Va[11], Vc4[3]); + Vc5[0] = vec_nmsub(VbS3, Va[ 8], Vc5[0]); + Vc5[1] = vec_nmsub(VbS3, Va[ 9], Vc5[1]); + Vc5[2] = vec_nmsub(VbS3, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS3, Va[11], Vc5[3]); + Vc6[0] = vec_nmsub(VbS4, Va[ 8], Vc6[0]); + Vc6[1] = vec_nmsub(VbS4, Va[ 9], Vc6[1]); + Vc6[2] = vec_nmsub(VbS4, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS4, Va[11], Vc6[3]); + Vc7[0] = vec_nmsub(VbS5, Va[ 8], Vc7[0]); + Vc7[1] = vec_nmsub(VbS5, Va[ 9], Vc7[1]); + Vc7[2] = vec_nmsub(VbS5, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS5, Va[11], Vc7[3]); + + VbS0 = vec_splat(Vb[6], 3); + VbS1 = vec_splat(Vb[7], 0); + VbS2 = vec_splat(Vb[7], 1); + VbS3 = vec_splat(Vb[7], 2); + VbS4 = vec_splat(Vb[7], 3); + + Vc3[0] = vec_mul(VbS0, Vc3[0]); + Vc3[1] = vec_mul(VbS0, Vc3[1]); + Vc3[2] = vec_mul(VbS0, Vc3[2]); + Vc3[3] = vec_mul(VbS0, Vc3[3]); + Va[12] = Vc3[0]; + Va[13] = Vc3[1]; + Va[14] = Vc3[2]; + Va[15] = Vc3[3]; + Vc4[0] = vec_nmsub(VbS1, Va[12], Vc4[0]); + Vc4[1] = vec_nmsub(VbS1, Va[13], Vc4[1]); + Vc4[2] = vec_nmsub(VbS1, Va[14], Vc4[2]); + Vc4[3] = 
vec_nmsub(VbS1, Va[15], Vc4[3]); + Vc5[0] = vec_nmsub(VbS2, Va[12], Vc5[0]); + Vc5[1] = vec_nmsub(VbS2, Va[13], Vc5[1]); + Vc5[2] = vec_nmsub(VbS2, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS2, Va[15], Vc5[3]); + Vc6[0] = vec_nmsub(VbS3, Va[12], Vc6[0]); + Vc6[1] = vec_nmsub(VbS3, Va[13], Vc6[1]); + Vc6[2] = vec_nmsub(VbS3, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS3, Va[15], Vc6[3]); + Vc7[0] = vec_nmsub(VbS4, Va[12], Vc7[0]); + Vc7[1] = vec_nmsub(VbS4, Va[13], Vc7[1]); + Vc7[2] = vec_nmsub(VbS4, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS4, Va[15], Vc7[3]); + + VbS0 = vec_splat(Vb[9], 0); + VbS1 = vec_splat(Vb[9], 1); + VbS2 = vec_splat(Vb[9], 2); + VbS3 = vec_splat(Vb[9], 3); + + Vc4[0] = vec_mul(VbS0, Vc4[0]); + Vc4[1] = vec_mul(VbS0, Vc4[1]); + Vc4[2] = vec_mul(VbS0, Vc4[2]); + Vc4[3] = vec_mul(VbS0, Vc4[3]); + Va[16] = Vc4[0]; + Va[17] = Vc4[1]; + Va[18] = Vc4[2]; + Va[19] = Vc4[3]; + Vc5[0] = vec_nmsub(VbS1, Va[16], Vc5[0]); + Vc5[1] = vec_nmsub(VbS1, Va[17], Vc5[1]); + Vc5[2] = vec_nmsub(VbS1, Va[18], Vc5[2]); + Vc5[3] = vec_nmsub(VbS1, Va[19], Vc5[3]); + Vc6[0] = vec_nmsub(VbS2, Va[16], Vc6[0]); + Vc6[1] = vec_nmsub(VbS2, Va[17], Vc6[1]); + Vc6[2] = vec_nmsub(VbS2, Va[18], Vc6[2]); + Vc6[3] = vec_nmsub(VbS2, Va[19], Vc6[3]); + Vc7[0] = vec_nmsub(VbS3, Va[16], Vc7[0]); + Vc7[1] = vec_nmsub(VbS3, Va[17], Vc7[1]); + Vc7[2] = vec_nmsub(VbS3, Va[18], Vc7[2]); + Vc7[3] = vec_nmsub(VbS3, Va[19], Vc7[3]); + + VbS0 = vec_splat(Vb[11], 1); + VbS1 = vec_splat(Vb[11], 2); + VbS2 = vec_splat(Vb[11], 3); + + Vc5[0] = vec_mul(VbS0, Vc5[0]); + Vc5[1] = vec_mul(VbS0, Vc5[1]); + Vc5[2] = vec_mul(VbS0, Vc5[2]); + Vc5[3] = vec_mul(VbS0, Vc5[3]); + Va[20] = Vc5[0]; + Va[21] = Vc5[1]; + Va[22] = Vc5[2]; + Va[23] = Vc5[3]; + Vc6[0] = vec_nmsub(VbS1, Va[20], Vc6[0]); + Vc6[1] = vec_nmsub(VbS1, Va[21], Vc6[1]); + Vc6[2] = vec_nmsub(VbS1, Va[22], Vc6[2]); + Vc6[3] = vec_nmsub(VbS1, Va[23], Vc6[3]); + Vc7[0] = vec_nmsub(VbS2, Va[20], Vc7[0]); + Vc7[1] = vec_nmsub(VbS2, Va[21], Vc7[1]); + 
Vc7[2] = vec_nmsub(VbS2, Va[22], Vc7[2]); + Vc7[3] = vec_nmsub(VbS2, Va[23], Vc7[3]); + + VbS0 = vec_splat(Vb[13], 2); + VbS1 = vec_splat(Vb[13], 3); + + Vc6[0] = vec_mul(VbS0, Vc6[0]); + Vc6[1] = vec_mul(VbS0, Vc6[1]); + Vc6[2] = vec_mul(VbS0, Vc6[2]); + Vc6[3] = vec_mul(VbS0, Vc6[3]); + Va[24] = Vc6[0]; + Va[25] = Vc6[1]; + Va[26] = Vc6[2]; + Va[27] = Vc6[3]; + Vc7[0] = vec_nmsub(VbS1, Va[24], Vc7[0]); + Vc7[1] = vec_nmsub(VbS1, Va[25], Vc7[1]); + Vc7[2] = vec_nmsub(VbS1, Va[26], Vc7[2]); + Vc7[3] = vec_nmsub(VbS1, Va[27], Vc7[3]); + + VbS0 = vec_splat(Vb[15], 3); + + Vc7[0] = vec_mul(VbS0, Vc7[0]); + Vc7[1] = vec_mul(VbS0, Vc7[1]); + Vc7[2] = vec_mul(VbS0, Vc7[2]); + Vc7[3] = vec_mul(VbS0, Vc7[3]); + Va[28] = Vc7[0]; + Va[29] = Vc7[1]; + Va[30] = Vc7[2]; + Va[31] = Vc7[3]; +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= 
cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + while (j > 0) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + if (i > 0) { + do { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * 
COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_RT_power10.c b/kernel/power/trsm_kernel_RT_power10.c new file mode 100644 index 000000000..529590f37 --- /dev/null +++ b/kernel/power/trsm_kernel_RT_power10.c @@ -0,0 +1,855 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. 
Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_R +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6; + + a[56] = (c7[0] *= b[63]); + a[57] = (c7[1] *= b[63]); + a[58] = (c7[2] *= b[63]); + a[59] = (c7[3] *= b[63]); + a[60] = (c7[4] *= b[63]); + a[61] = (c7[5] *= b[63]); + a[62] 
= (c7[6] *= b[63]); + a[63] = (c7[7] *= b[63]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[29], 0); + VbS3 = vec_splat(Vb[29], 1); + VbS4 = vec_splat(Vb[30], 0); + VbS5 = vec_splat(Vb[30], 1); + VbS6 = vec_splat(Vb[31], 0); + Vc0[0] = vec_nmsub(Vc7[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc7[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc7[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc7[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc7[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc7[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc7[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc7[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc7[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc7[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc7[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc7[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc7[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc7[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc7[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc7[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc7[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc7[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc7[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc7[3], VbS4, Vc4[3]); + Vc5[0] = vec_nmsub(Vc7[0], VbS5, Vc5[0]); + Vc5[1] = vec_nmsub(Vc7[1], VbS5, Vc5[1]); + Vc5[2] = vec_nmsub(Vc7[2], VbS5, Vc5[2]); + Vc5[3] = vec_nmsub(Vc7[3], VbS5, Vc5[3]); + Vc6[0] = vec_nmsub(Vc7[0], VbS6, Vc6[0]); + Vc6[1] = vec_nmsub(Vc7[1], VbS6, Vc6[1]); + Vc6[2] = vec_nmsub(Vc7[2], VbS6, Vc6[2]); + Vc6[3] = vec_nmsub(Vc7[3], VbS6, Vc6[3]); + + a[48] = (c6[0] *= b[54]); + a[49] = (c6[1] *= b[54]); + a[50] = (c6[2] *= b[54]); + a[51] = (c6[3] *= b[54]); + a[52] = (c6[4] *= b[54]); + a[53] = (c6[5] *= b[54]); + a[54] = (c6[6] *= b[54]); + a[55] = (c6[7] *= b[54]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[25], 0); + VbS3 = vec_splat(Vb[25], 1); + VbS4 = vec_splat(Vb[26], 0); + VbS5 = vec_splat(Vb[26], 1); + Vc0[0] = vec_nmsub(Vc6[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc6[1], VbS0, Vc0[1]); + 
Vc0[2] = vec_nmsub(Vc6[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc6[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc6[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc6[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc6[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc6[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc6[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc6[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc6[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc6[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc6[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc6[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc6[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc6[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc6[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc6[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc6[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc6[3], VbS4, Vc4[3]); + Vc5[0] = vec_nmsub(Vc6[0], VbS5, Vc5[0]); + Vc5[1] = vec_nmsub(Vc6[1], VbS5, Vc5[1]); + Vc5[2] = vec_nmsub(Vc6[2], VbS5, Vc5[2]); + Vc5[3] = vec_nmsub(Vc6[3], VbS5, Vc5[3]); + + a[40] = (c5[0] *= b[45]); + a[41] = (c5[1] *= b[45]); + a[42] = (c5[2] *= b[45]); + a[43] = (c5[3] *= b[45]); + a[44] = (c5[4] *= b[45]); + a[45] = (c5[5] *= b[45]); + a[46] = (c5[6] *= b[45]); + a[47] = (c5[7] *= b[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + Vc0[0] = vec_nmsub(Vc5[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc5[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc5[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc5[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc5[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc5[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc5[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc5[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc5[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc5[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc5[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc5[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc5[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc5[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc5[2], VbS3, 
Vc3[2]); + Vc3[3] = vec_nmsub(Vc5[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc5[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc5[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc5[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc5[3], VbS4, Vc4[3]); + + a[32] = (c4[0] *= b[36]); + a[33] = (c4[1] *= b[36]); + a[34] = (c4[2] *= b[36]); + a[35] = (c4[3] *= b[36]); + a[36] = (c4[4] *= b[36]); + a[37] = (c4[5] *= b[36]); + a[38] = (c4[6] *= b[36]); + a[39] = (c4[7] *= b[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + Vc0[0] = vec_nmsub(Vc4[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc4[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc4[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc4[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc4[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc4[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc4[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc4[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc4[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc4[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc4[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc4[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc4[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc4[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc4[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc4[3], VbS3, Vc3[3]); + + a[24] = (c3[0] *= b[27]); + a[25] = (c3[1] *= b[27]); + a[26] = (c3[2] *= b[27]); + a[27] = (c3[3] *= b[27]); + a[28] = (c3[4] *= b[27]); + a[29] = (c3[5] *= b[27]); + a[30] = (c3[6] *= b[27]); + a[31] = (c3[7] *= b[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + Vc0[0] = vec_nmsub(Vc3[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc3[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc3[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc3[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc3[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc3[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc3[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc3[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc3[0], VbS2, Vc2[0]); + 
Vc2[1] = vec_nmsub(Vc3[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc3[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc3[3], VbS2, Vc2[3]); + + a[16] = (c2[0] *= b[18]); + a[17] = (c2[1] *= b[18]); + a[18] = (c2[2] *= b[18]); + a[19] = (c2[3] *= b[18]); + a[20] = (c2[4] *= b[18]); + a[21] = (c2[5] *= b[18]); + a[22] = (c2[6] *= b[18]); + a[23] = (c2[7] *= b[18]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + Vc0[0] = vec_nmsub(Vc2[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc2[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc2[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc2[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc2[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc2[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc2[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc2[3], VbS1, Vc1[3]); + + a[ 8] = (c1[0] *= b[9]); + a[ 9] = (c1[1] *= b[9]); + a[10] = (c1[2] *= b[9]); + a[11] = (c1[3] *= b[9]); + a[12] = (c1[4] *= b[9]); + a[13] = (c1[5] *= b[9]); + a[14] = (c1[6] *= b[9]); + a[15] = (c1[7] *= b[9]); + VbS0 = vec_splat(Vb[4], 0); + Vc0[0] = vec_nmsub(Vc1[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc1[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc1[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc1[3], VbS0, Vc0[3]); + + a[0] = (c0[0] *= b[0]); + a[1] = (c0[1] *= b[0]); + a[2] = (c0[2] *= b[0]); + a[3] = (c0[3] *= b[0]); + a[4] = (c0[4] *= b[0]); + a[5] = (c0[5] *= b[0]); + a[6] = (c0[6] *= b[0]); + a[7] = (c0[7] *= b[0]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = 
(vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + + Vc7[0] = vec_mul(VbS7, Vc7[0]); + Vc7[1] = vec_mul(VbS7, Vc7[1]); + Vc7[2] = vec_mul(VbS7, Vc7[2]); + Vc7[3] = vec_mul(VbS7, Vc7[3]); + Va[28] = Vc7[0]; + Va[29] = Vc7[1]; + Va[30] = Vc7[2]; + Va[31] = Vc7[3]; + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]); + + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 
2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + + Vc6[0] = vec_mul(VbS6, Vc6[0]); + Vc6[1] = vec_mul(VbS6, Vc6[1]); + Vc6[2] = vec_mul(VbS6, Vc6[2]); + Vc6[3] = vec_mul(VbS6, Vc6[3]); + Va[24] = Vc6[0]; + Va[25] = Vc6[1]; + Va[26] = Vc6[2]; + Va[27] = Vc6[3]; + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]); + + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + + Vc5[0] = vec_mul(VbS5, Vc5[0]); + Vc5[1] = vec_mul(VbS5, Vc5[1]); + Vc5[2] = vec_mul(VbS5, Vc5[2]); + Vc5[3] = vec_mul(VbS5, Vc5[3]); + Va[20] = Vc5[0]; + Va[21] = Vc5[1]; + Va[22] = Vc5[2]; + Va[23] = Vc5[3]; + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[0] = 
vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[21], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[21], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + + Vc4[0] = vec_mul(VbS4, Vc4[0]); + Vc4[1] = vec_mul(VbS4, Vc4[1]); + Vc4[2] = vec_mul(VbS4, Vc4[2]); + Vc4[3] = vec_mul(VbS4, Vc4[3]); + Va[16] = Vc4[0]; + Va[17] = Vc4[1]; + Va[18] = Vc4[2]; + Va[19] = Vc4[3]; + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + + VbS0 = vec_splat(Vb[6], 0); + VbS1 = vec_splat(Vb[6], 1); + VbS2 = vec_splat(Vb[6], 2); + VbS3 = vec_splat(Vb[6], 3); + + Vc3[0] = vec_mul(VbS3, Vc3[0]); + Vc3[1] = vec_mul(VbS3, Vc3[1]); + Vc3[2] = vec_mul(VbS3, Vc3[2]); + Vc3[3] = 
vec_mul(VbS3, Vc3[3]); + Va[12] = Vc3[0]; + Va[13] = Vc3[1]; + Va[14] = Vc3[2]; + Va[15] = Vc3[3]; + Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[4], 2); + + Vc2[0] = vec_mul(VbS2, Vc2[0]); + Vc2[1] = vec_mul(VbS2, Vc2[1]); + Vc2[2] = vec_mul(VbS2, Vc2[2]); + Vc2[3] = vec_mul(VbS2, Vc2[3]); + Va[ 8] = Vc2[0]; + Va[ 9] = Vc2[1]; + Va[10] = Vc2[2]; + Va[11] = Vc2[3]; + Vc0[0] = vec_nmsub(VbS0, Va[ 8], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[ 8], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + + VbS0 = vec_splat(Vb[2], 0); + VbS1 = vec_splat(Vb[2], 1); + + Vc1[0] = vec_mul(VbS1, Vc1[0]); + Vc1[1] = vec_mul(VbS1, Vc1[1]); + Vc1[2] = vec_mul(VbS1, Vc1[2]); + Vc1[3] = vec_mul(VbS1, Vc1[3]); + Va[4] = Vc1[0]; + Va[5] = Vc1[1]; + Va[6] = Vc1[2]; + Va[7] = Vc1[3]; + Vc0[0] = vec_nmsub(VbS0, Va[4], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + + VbS0 = vec_splat(Vb[0], 0); + + Vc0[0] = vec_mul(VbS0, Vc0[0]); + Vc0[1] = vec_mul(VbS0, Vc0[1]); + Vc0[2] = vec_mul(VbS0, Vc0[2]); + Vc0[3] = vec_mul(VbS0, Vc0[3]); + Va[0] = Vc0[0]; + Va[1] = Vc0[1]; + Va[2] = Vc0[2]; + Va[3] = Vc0[3]; +} + +#endif + +static 
inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); 
+#endif + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + i >>= 1; + } while (i > 0); + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + 
solve16x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } while (i > 0); + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + From 65de6f5957f9940ed338c1fdef251dbad70eb908 Mon Sep 17 00:00:00 2001 From: Jin Bo Date: Sat, 5 Dec 2020 15:06:12 +0800 Subject: [PATCH 098/121] Fix test errors reported by cblas_cgemm & cblas_ctrmm The file cgemm_kernel_8x4_msa.c holds the MSA optimization codes of cblas_cgemm and cblas_ctrmm. It defines two macros: CGEMM_SCALE_1X2 and CGEMM_TRMM_SCALE_1X2. The pc1 array index in the two macros should be 0 and 1. --- kernel/mips/cgemm_kernel_8x4_msa.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/mips/cgemm_kernel_8x4_msa.c b/kernel/mips/cgemm_kernel_8x4_msa.c index 4b3637c7c..8b624be88 100644 --- a/kernel/mips/cgemm_kernel_8x4_msa.c +++ b/kernel/mips/cgemm_kernel_8x4_msa.c @@ -758,10 +758,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
pc0[1] += alphar * res1; \ pc0[1] += alphai * res0; \ \ - pc1[2] += alphar * res2; \ - pc1[2] -= alphai * res3; \ - pc1[3] += alphar * res3; \ - pc1[3] += alphai * res2; \ + pc1[0] += alphar * res2; \ + pc1[0] -= alphai * res3; \ + pc1[1] += alphar * res3; \ + pc1[1] += alphai * res2; \ } #define CGEMM_SCALE_1X1 \ @@ -1067,10 +1067,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. pc0[1] = alphar * res1; \ pc0[1] += alphai * res0; \ \ - pc1[2] = alphar * res2; \ - pc1[2] -= alphai * res3; \ - pc1[3] = alphar * res3; \ - pc1[3] += alphai * res2; \ + pc1[0] = alphar * res2; \ + pc1[0] -= alphai * res3; \ + pc1[1] = alphar * res3; \ + pc1[1] += alphai * res2; \ } #define CGEMM_TRMM_SCALE_1X1 \ From 04fa17322c09c497ad8f69ab12ec8684a0847c60 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Dec 2020 19:05:27 +0100 Subject: [PATCH 099/121] Fix build options for SolarisStudio compilers --- Makefile.sparc | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/Makefile.sparc b/Makefile.sparc index 8895b96dd..61c7aa36d 100644 --- a/Makefile.sparc +++ b/Makefile.sparc @@ -3,21 +3,29 @@ RANLIB = ranlib ifdef BINARY64 +ifeq ($(C_COMPILER), GCC) CCOMMON_OPT += -mcpu=v9 -m64 +else +CCOMMON_OPT += -m64 +endif ifeq ($(COMPILER_F77), g77) FCOMMON_OPT += -mcpu=v9 -m64 endif -ifeq ($(COMPILER_F77), f90) -FCOMMON_OPT += -xarch=v9 +ifeq ($(COMPILER_F77), f95) +FCOMMON_OPT += -m64 endif else +ifeq ($(C_COMPILER), GCC) CCOMMON_OPT += -mcpu=v9 +else +CCOMMON_OPT += -xarch=v9 +endif ifeq ($(COMPILER_F77), g77) FCOMMON_OPT += -mcpu=v9 endif -ifeq ($(COMPILER_F77), f90) +ifeq ($(COMPILER_F77), f95) FCOMMON_OPT += -xarch=v8plusb endif @@ -37,4 +45,4 @@ LIBSUNPERF = -L/opt/SUNWspro/lib/v9 -L/opt/SUNWspro/prod/lib/v9 \ else LIBSUNPERF = -L/opt/SUNWspro/lib -L/opt/SUNWspro/prod/lib \ -Wl,-R,/opt/SUNWspro/lib -lsunperf -lompstubs -lfui -lfsu -lsunmath -endif \ No newline at end of file +endif From 
da6d5d675c3db0cfd4926704a9b72f89dc4963b8 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Dec 2020 19:07:45 +0100 Subject: [PATCH 100/121] Fix hostarch detection for sparc --- c_check | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/c_check b/c_check index a841df153..fe9c53f0e 100644 --- a/c_check +++ b/c_check @@ -6,7 +6,8 @@ # Checking cross compile $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); -$hostarch = `uname -p` if ($hostos eq "AIX"); +$hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS"); +chop($hostarch); $hostarch = "x86_64" if ($hostarch eq "amd64"); $hostarch = "arm" if ($hostarch ne "arm64" && $hostarch =~ /^arm.*/); $hostarch = "arm64" if ($hostarch eq "aarch64"); From 3a1b1b7c8cc7081155a1f0d9411c9d68ab7559fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Dec 2020 19:08:43 +0100 Subject: [PATCH 101/121] Fix complex ABI for 32bit SolarisStudio builds --- common_sparc.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/common_sparc.h b/common_sparc.h index 85e29fffa..90a24ebf1 100644 --- a/common_sparc.h +++ b/common_sparc.h @@ -78,6 +78,12 @@ static __inline unsigned long rpcc(void){ #define __BIG_ENDIAN__ #endif +#ifdef C_SUN +#ifndef __64BIT +#define RETURN_BY_STACK +#endif +#endif + #ifdef DOUBLE #define GET_IMAGE(res) __asm__ __volatile__("fmovd %%f2, %0" : "=f"(res) : : "memory") #else From b0b14f4e9ba13331ab484010b7150495dccb8e83 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Dec 2020 19:12:02 +0100 Subject: [PATCH 102/121] Change comments to C style for compatibility --- param.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/param.h b/param.h index ee5ad17fb..a0d45c573 100644 --- a/param.h +++ b/param.h @@ -1454,22 +1454,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SGEMM_DEFAULT_P 768 #define SGEMM_DEFAULT_R sgemm_r -//#define SGEMM_DEFAULT_R 1024 +/*#define SGEMM_DEFAULT_R 1024*/ #define DGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_R dgemm_r -//#define DGEMM_DEFAULT_R 1024 +/*#define DGEMM_DEFAULT_R 1024*/ #define QGEMM_DEFAULT_P 504 #define QGEMM_DEFAULT_R qgemm_r #define CGEMM_DEFAULT_P 768 #define CGEMM_DEFAULT_R cgemm_r -//#define CGEMM_DEFAULT_R 1024 +/*#define CGEMM_DEFAULT_R 1024*/ #define ZGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_R zgemm_r -//#define ZGEMM_DEFAULT_R 1024 +/*#define ZGEMM_DEFAULT_R 1024*/ #define XGEMM_DEFAULT_P 252 #define XGEMM_DEFAULT_R xgemm_r @@ -2571,7 +2571,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef LOONGSON3A -////Copy from SICORTEX +/*Copy from SICORTEX*/ #define SNUMOPT 2 #define DNUMOPT 2 @@ -2863,7 +2863,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -// Common ARMv8 parameters +/* Common ARMv8 parameters */ #if defined(ARMV8) #define SNUMOPT 2 @@ -3066,7 +3066,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#else // Other/undetected ARMv8 cores +#else /* Other/undetected ARMv8 cores */ #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -3095,9 +3095,9 @@ is a big desktop or server with abundant cache rather than a phone or embedded d #define CGEMM_DEFAULT_R 4096 #define ZGEMM_DEFAULT_R 4096 -#endif // Cores +#endif /* Cores */ -#endif // ARMv8 +#endif /* ARMv8 */ #if defined(ARMV5) #define SNUMOPT 2 From 93473174d6f59b989f36ae0ce6994d347d9c33bb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Dec 2020 19:12:56 +0100 Subject: [PATCH 103/121] Fix utest build with SolarisStudio compilers --- utest/Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utest/Makefile b/utest/Makefile index 1fc30d088..fad3607de 100644 --- a/utest/Makefile +++ 
b/utest/Makefile @@ -35,6 +35,9 @@ endif ifeq ($(C_COMPILER), PGI) OBJS = utest_main2.o endif +ifeq ($(C_COMPILER), SUN) +OBJS = utest_main2.o +endif ifeq ($(OSNAME), AIX) OBJS = utest_main2.o endif From f8346603cf1794826cc2b04cd4708bb890f805b0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Dec 2020 19:14:16 +0100 Subject: [PATCH 104/121] Fix compilation with SolarisStudio --- kernel/arm/zdot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/arm/zdot.c b/kernel/arm/zdot.c index 73ae3acd7..9249b54f8 100644 --- a/kernel/arm/zdot.c +++ b/kernel/arm/zdot.c @@ -48,7 +48,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA dot[0]=0.0; dot[1]=0.0; -#if !defined(__PPC__) +#if !defined(__PPC__) && !defined(__SunOS) CREAL(result) = 0.0 ; CIMAG(result) = 0.0 ; #else @@ -73,7 +73,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA i++ ; } -#if !defined(__PPC__) +#if !defined(__PPC__) && !defined(__SunOS) CREAL(result) = dot[0]; CIMAG(result) = dot[1]; #else From b660008c7ef479d83f329e1aefbcf3dbed1653a6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 6 Dec 2020 19:15:37 +0100 Subject: [PATCH 105/121] Work around DOT and SWAP test failures --- kernel/sparc/KERNEL.sparc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/sparc/KERNEL.sparc b/kernel/sparc/KERNEL.sparc index 2e8319ce5..1a2e9671a 100644 --- a/kernel/sparc/KERNEL.sparc +++ b/kernel/sparc/KERNEL.sparc @@ -54,3 +54,13 @@ ZTRSMKERNEL_LN = ztrsm_kernel_LN.S ZTRSMKERNEL_LT = ztrsm_kernel_LT.S ZTRSMKERNEL_RN = ztrsm_kernel_LT.S ZTRSMKERNEL_RT = ztrsm_kernel_RT.S + + +SDOTKERNEL = ../generic/dot.c +SDSDOTKERNEL = ../generic/dot.c +DSDOTKERNEL = ../generic/dot.c +DDOTKERNEL = ../generic/dot.c +CDOTKERNEL = ../arm/zdot.c +ZDOTKERNEL = ../arm/zdot.c +CSWAPKERNEL = ../arm/zswap.c +ZSWAPKERNEL = ../arm/zswap.c From 6c7d557a166aaad44be389acb0ef6bf73935cdc3 Mon Sep 17 00:00:00 2001 From: Martin 
Kroeker Date: Sun, 6 Dec 2020 19:20:50 +0100 Subject: [PATCH 106/121] Fix compiler options for 32 and 64bit SPARC builds with SolarisStudio --- Makefile.system | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index b5974f872..c17cd3bd1 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1131,16 +1131,25 @@ CCOMMON_OPT += -w ifeq ($(ARCH), x86) CCOMMON_OPT += -m32 else -FCOMMON_OPT += -m64 +ifdef BINARY64 +CCOMMON_OPT += -m64 +else +CCOMMON_OPT += -m32 +endif endif endif ifeq ($(F_COMPILER), SUN) CCOMMON_OPT += -DF_INTERFACE_SUN +FCOMMON_OPT += -ftrap=%none -xrecursive ifeq ($(ARCH), x86) FCOMMON_OPT += -m32 else +ifdef BINARY64 FCOMMON_OPT += -m64 +else +FCOMMON_OPT += -m32 +endif endif ifeq ($(USE_OPENMP), 1) FCOMMON_OPT += -xopenmp=parallel @@ -1313,8 +1322,10 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH) include $(TOPDIR)/Makefile.$(ARCH) ifneq ($(C_COMPILER), PGI) +ifneq ($(C_COMPILER), SUN) CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME endif +endif CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" ifeq ($(CORE), PPC440) From 47b639cc9b4ff900f7b83751af9d1c4ff9dea3c1 Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Mon, 7 Dec 2020 10:04:00 +0800 Subject: [PATCH 107/121] Fix failed sswap and dswap case by using msa optimization The swap test case will call sswap_msa.c and dswap_msa.c files in MIPS environment. When inc_x or inc_y is equal to zero, the calculation result of the two functions will be wrong. This patch adds the processing of inc_x or inc_y equal to zero, and the swap test case has passed.
--- kernel/mips/dswap_msa.c | 30 ++++++++++++++++++++++++++++-- kernel/mips/sswap_msa.c | 29 ++++++++++++++++++++++++++++- 2 files changed, 56 insertions(+), 3 deletions(-) diff --git a/kernel/mips/dswap_msa.c b/kernel/mips/dswap_msa.c index 7b1f02477..67e97f710 100644 --- a/kernel/mips/dswap_msa.c +++ b/kernel/mips/dswap_msa.c @@ -184,7 +184,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - else + else if ((inc_x != 0) && (inc_y != 0)) { for (i = (n >> 3); i--;) { @@ -248,6 +248,32 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - + else + { + if (inc_x == inc_y) + { + if (n & 1) + { + x0 = *srcx; + *srcx = *srcy; + *srcy = x0; + } + else + return (0); + } + else + { + BLASLONG ix = 0, iy = 0; + while (i < n) + { + x0 = srcx[ix]; + srcx[ix] = srcy[iy]; + srcy[iy] = x0; + ix += inc_x; + iy += inc_y; + i++; + } + } + } return (0); } diff --git a/kernel/mips/sswap_msa.c b/kernel/mips/sswap_msa.c index 46fa8aa87..d412285b0 100644 --- a/kernel/mips/sswap_msa.c +++ b/kernel/mips/sswap_msa.c @@ -198,7 +198,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } - else + else if ((inc_x != 0) && (inc_y != 0)) { for (i = (n >> 3); i--;) { @@ -262,6 +262,33 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, } } } + else + { + if (inc_x == inc_y) + { + if (n & 1) + { + x0 = *srcx; + *srcx = *srcy; + *srcy = x0; + } + else + return (0); + } + else + { + BLASLONG ix = 0, iy = 0; + while (i < n) + { + x0 = srcx[ix]; + srcx[ix] = srcy[iy]; + srcy[iy] = x0; + ix += inc_x; + iy += inc_y; + i++; + } + } + } return (0); } From ad38bd0e89c4507476f1ad4ba566d27bb0dd6f9d Mon Sep 17 00:00:00 2001 From: Hao Chen Date: Mon, 7 Dec 2020 10:18:51 +0800 Subject: [PATCH 108/121] Fix failed cgemv and zgemv test case after using msa optimization The cgemv and zgemv test case will call cgemv_n/t_msa.c zgemv_n/t_msa.c files in MIPS environment. 
When the macro CONJ is defined, the calculation result will be wrong due to the wrong definition of OP2. This patch updates the value of OP2 and passes the corresponding test. --- kernel/mips/cgemv_n_msa.c | 4 ++-- kernel/mips/cgemv_t_msa.c | 26 +++++++++++++++++++------- kernel/mips/zgemv_n_msa.c | 4 ++-- kernel/mips/zgemv_t_msa.c | 26 +++++++++++++++++++------- 4 files changed, 42 insertions(+), 18 deletions(-) diff --git a/kernel/mips/cgemv_n_msa.c b/kernel/mips/cgemv_n_msa.c index 12fa7ca02..c1eb9bbfd 100644 --- a/kernel/mips/cgemv_n_msa.c +++ b/kernel/mips/cgemv_n_msa.c @@ -56,11 +56,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(XCONJ) #define OP0 += #define OP1 -= - #define OP2 -= + #define OP2 += #else #define OP0 -= #define OP1 -= - #define OP2 += + #define OP2 -= #endif #endif diff --git a/kernel/mips/cgemv_t_msa.c b/kernel/mips/cgemv_t_msa.c index 584e3de75..800667b6e 100644 --- a/kernel/mips/cgemv_t_msa.c +++ b/kernel/mips/cgemv_t_msa.c @@ -32,14 +32,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #undef OP1 #undef OP2 -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - #define OP0 -= - #define OP1 += - #define OP2 += +#if !defined(CONJ) + #if !defined(XCONJ) + #define OP0 -= + #define OP1 += + #define OP2 += + #else + #define OP0 += + #define OP1 += + #define OP2 -= + #endif #else - #define OP0 += - #define OP1 += - #define OP2 -= + #if !defined(XCONJ) + #define OP0 += + #define OP1 -= + #define OP2 += + #else + #define OP0 -= + #define OP1 -= + #define OP2 -= + #endif #endif #define CGEMV_T_8x4() \ diff --git a/kernel/mips/zgemv_n_msa.c b/kernel/mips/zgemv_n_msa.c index 669c25758..97a80b4ba 100644 --- a/kernel/mips/zgemv_n_msa.c +++ b/kernel/mips/zgemv_n_msa.c @@ -56,11 +56,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if !defined(XCONJ) #define OP0 += #define OP1 -= - #define OP2 -= + #define OP2 += #else #define OP0 -= #define OP1 -= - #define OP2 += + #define OP2 -= #endif #endif diff --git a/kernel/mips/zgemv_t_msa.c b/kernel/mips/zgemv_t_msa.c index e6febb577..6492f90be 100644 --- a/kernel/mips/zgemv_t_msa.c +++ b/kernel/mips/zgemv_t_msa.c @@ -34,14 +34,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #undef OP3 #undef OP4 -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - #define OP0 -= - #define OP1 += - #define OP2 += +#if !defined(CONJ) + #if !defined(XCONJ) + #define OP0 -= + #define OP1 += + #define OP2 += + #else + #define OP0 += + #define OP1 += + #define OP2 -= + #endif #else - #define OP0 += - #define OP1 += - #define OP2 -= + #if !defined(XCONJ) + #define OP0 += + #define OP1 -= + #define OP2 += + #else + #define OP0 -= + #define OP1 -= + #define OP2 -= + #endif #endif #define ZGEMV_T_8x1() \ From 7834c10e2f6288d0c7fe339375540ebe765f7efc Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 7 Dec 2020 16:55:05 +0800 Subject: [PATCH 109/121] Add PingTouGe contribution credit. --- CONTRIBUTORS.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 7b994885a..be9a32a7c 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -190,4 +190,7 @@ In chronological order: * [2020-09-07] Fix builds with clang on IBM z, including dynamic architecture support * Danfeng Zhang - * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53 \ No newline at end of file + * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53 + +* PingTouGe Semiconductor Co., Ltd. + * [2020-10] Add RISC-V Vector (0.7.1) support. 
Optimize BLAS kernels for Xuantie C910 From d67babf34536ffd0cba4142aa1ea4496394438cd Mon Sep 17 00:00:00 2001 From: gxw Date: Tue, 8 Dec 2020 19:16:39 +0800 Subject: [PATCH 110/121] Remove gcc unrecognized option '-msched-weight' when check msa --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index fe9c53f0e..970d475d7 100644 --- a/c_check +++ b/c_check @@ -199,7 +199,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { } else { $tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); $code = '"addvi.b $w0, $w1, 1"'; - $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; + $msa_flags = "-mmsa -mfp64 -mload-store-pairs"; print $tmpf "#include \n\n"; print $tmpf "void main(void){ __asm__ volatile($code); }\n"; From 5d26223f4a91e14ec711168f6e4a40f21729be38 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Dec 2020 20:59:56 +0100 Subject: [PATCH 111/121] remove extra/intermediate size step of min_jj from PR747 --- driver/level3/level3.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/driver/level3/level3.c b/driver/level3/level3.c index a38506585..9b44deb85 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -339,8 +339,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else - if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; +/* + if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else +*/ if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif From a5547124393a3ea7538998e98356cb052dc652d0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Dec 2020 21:01:36 +0100 Subject: [PATCH 112/121] remove extra/intermediate size step for min_jj introduced in PR747 --- driver/level3/level3_thread.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 6e1fd9e99..2b33c9589 100644 --- 
a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -373,8 +373,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else +/* if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else +*/ if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif /* Copy part of local region of B into workspace */ From d71fe4ed4eff491a9e6aae87fbd46cf9d2914d9e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 8 Dec 2020 21:07:57 +0100 Subject: [PATCH 113/121] Remove GEMM_DEFAULT_UNROLL_MN parameters for Haswell and ZEN (introduced in PR747) --- param.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index a0d45c573..42f63b4b5 100644 --- a/param.h +++ b/param.h @@ -644,9 +644,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 - +/* #define SGEMM_DEFAULT_UNROLL_MN 32 #define DGEMM_DEFAULT_UNROLL_MN 32 +*/ #endif #ifdef ARCH_X86 @@ -1552,9 +1553,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 - +/* #define SGEMM_DEFAULT_UNROLL_MN 32 #define DGEMM_DEFAULT_UNROLL_MN 32 +*/ #endif #ifdef ARCH_X86 From 4b548857d64e6f0fb3aefbd0bd5bd4d14f2a22d7 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 26 Nov 2020 14:59:41 +0800 Subject: [PATCH 114/121] Add msa support for loongson 1. Using core loongson3r3 and loongson3r4 for loongson 2. 
Add DYNAMIC_ARCH for loongson Change-Id: I1c6b54dbeca3a0cc31d1222af36a7e9bd6ab54c1 --- Makefile.system | 27 +- common_linux.h | 8 - common_mips64.h | 9 +- cpuid_mips64.c | 91 +++---- driver/others/Makefile | 8 + driver/others/blas_server.c | 2 + driver/others/dynamic_mips64.c | 230 ++++++++++++++++++ driver/others/parameter.c | 16 +- getarch.c | 24 +- kernel/Makefile | 5 + kernel/Makefile.L3 | 4 - kernel/mips/cgemm_kernel_8x4_msa.c | 4 +- kernel/mips/crot_msa.c | 6 +- kernel/mips/cscal_msa.c | 6 +- kernel/mips/dscal_msa.c | 4 +- kernel/mips/dtrsm_kernel_LN_8x4_msa.c | 38 +-- kernel/mips/dtrsm_kernel_LT_8x4_msa.c | 36 +-- kernel/mips/dtrsm_kernel_RN_8x4_msa.c | 21 +- kernel/mips/dtrsm_kernel_RT_8x4_msa.c | 21 +- kernel/mips/macros_msa.h | 8 +- kernel/mips/srot_msa.c | 6 +- kernel/mips/sscal_msa.c | 6 +- kernel/mips/zscal_msa.c | 8 +- kernel/mips64/KERNEL.LOONGSON3B | 64 ----- .../{KERNEL.LOONGSON3A => KERNEL.LOONGSON3R3} | 27 +- kernel/mips64/KERNEL.LOONGSON3R4 | 192 +++++++++++++++ kernel/setparam-ref.c | 72 ++++++ param.h | 48 ++-- 28 files changed, 656 insertions(+), 335 deletions(-) create mode 100644 driver/others/dynamic_mips64.c delete mode 100644 kernel/mips64/KERNEL.LOONGSON3B rename kernel/mips64/{KERNEL.LOONGSON3A => KERNEL.LOONGSON3R3} (75%) create mode 100644 kernel/mips64/KERNEL.LOONGSON3R4 diff --git a/Makefile.system b/Makefile.system index c17cd3bd1..6377f66ea 100644 --- a/Makefile.system +++ b/Makefile.system @@ -625,6 +625,10 @@ DYNAMIC_CORE += EMAG8180 DYNAMIC_CORE += THUNDERX3T110 endif +ifeq ($(ARCH), mips64) +DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 +endif + ifeq ($(ARCH), zarch) DYNAMIC_CORE = ZARCH_GENERIC @@ -787,14 +791,9 @@ CCOMMON_OPT += -mabi=32 BINARY_DEFINED = 1 endif -ifeq ($(CORE), LOONGSON3A) -CCOMMON_OPT += -march=mips64 -FCOMMON_OPT += -march=mips64 -endif - -ifeq ($(CORE), LOONGSON3B) -CCOMMON_OPT += -march=mips64 -FCOMMON_OPT += -march=mips64 +ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) +CCOMMON_OPT += 
-march=loongson3a +FCOMMON_OPT += -march=loongson3a endif ifeq ($(CORE), MIPS24K) @@ -1078,11 +1077,11 @@ FCOMMON_OPT += -n32 else FCOMMON_OPT += -n64 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3R3) FCOMMON_OPT += -loongson3 -static endif -ifeq ($(CORE), LOONGSON3B) +ifeq ($(CORE), LOONGSON3R4) FCOMMON_OPT += -loongson3 -static endif @@ -1108,11 +1107,11 @@ CCOMMON_OPT += -n32 else CCOMMON_OPT += -n64 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3R3) CCOMMON_OPT += -loongson3 -static endif -ifeq ($(CORE), LOONGSON3B) +ifeq ($(CORE), LOONGSON3R4) CCOMMON_OPT += -loongson3 -static endif @@ -1223,10 +1222,8 @@ ifdef SMP CCOMMON_OPT += -DSMP_SERVER ifeq ($(ARCH), mips64) -ifneq ($(CORE), LOONGSON3B) USE_SIMPLE_THREADED_LEVEL3 = 1 endif -endif ifeq ($(USE_OPENMP), 1) # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -1342,11 +1339,9 @@ endif ifneq ($(ARCH), x86_64) ifneq ($(ARCH), x86) -ifneq ($(CORE), LOONGSON3B) NO_AFFINITY = 1 endif endif -endif ifdef NO_AFFINITY ifeq ($(NO_AFFINITY), 0) diff --git a/common_linux.h b/common_linux.h index 35f3fb658..5a1c4e150 100644 --- a/common_linux.h +++ b/common_linux.h @@ -75,18 +75,10 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 return 0; #else -#if defined (LOONGSON3B) -#if defined (__64BIT__) - return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); -#else - return 0; //NULL Implementation on Loongson 3B 32bit. 
-#endif -#else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 // unsigned long null_nodemask=0; return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); #endif -#endif } static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { diff --git a/common_mips64.h b/common_mips64.h index a06edfe08..287459e7d 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -229,12 +229,7 @@ REALNAME: ;\ #define BUFFER_SIZE ( 32 << 21) -#if defined(LOONGSON3A) -#define PAGESIZE (16UL << 10) -#define FIXED_PAGESIZE (16UL << 10) -#endif - -#if defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10) #endif @@ -250,7 +245,7 @@ REALNAME: ;\ #define MAP_ANONYMOUS MAP_ANON #endif -#if defined(LOONGSON3A) || defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PREFETCHD_(x) ld $0, x #define PREFETCHD(x) PREFETCHD_(x) #else diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 0c19ac1e7..674b65908 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -70,19 +70,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -#define CPU_UNKNOWN 0 -#define CPU_SICORTEX 1 -#define CPU_LOONGSON3A 2 -#define CPU_LOONGSON3B 3 -#define CPU_I6400 4 -#define CPU_P6600 5 -#define CPU_I6500 6 +#define CPU_UNKNOWN 0 +#define CPU_SICORTEX 1 +#define CPU_LOONGSON3R3 2 +#define CPU_LOONGSON3R4 3 +#define CPU_I6400 4 +#define CPU_P6600 5 +#define CPU_I6500 6 static char *cpuname[] = { "UNKNOWN", "SICORTEX", - "LOONGSON3A", - "LOONGSON3B", + "LOONGSON3R3", + "LOONGSON3R4", "I6400", "P6600", "I6500" @@ -90,48 +90,13 @@ static char *cpuname[] = { int detect(void){ -#ifdef __linux +#ifdef linux FILE *infile; char buffer[512], *p; p = (char *)NULL; - infile = fopen("/proc/cpuinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("cpu", buffer, 3)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - }else if (strstr(p, "Loongson-3")){ - infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("system type", buffer, 11)){ - p = strchr(buffer, ':') + 2; - break; - } - } - fclose(infile); - if (strstr(p, "loongson3a")) - return CPU_LOONGSON3A; - }else{ - return CPU_SICORTEX; - } - } //Check model name for Loongson3 infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("model name", buffer, 10)){ p = strchr(buffer, ':') + 2; @@ -140,14 +105,16 @@ int detect(void){ } fclose(infile); if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - } + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ + return CPU_LOONGSON3R3; + }else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ + 
return CPU_LOONGSON3R4; + } else{ + return CPU_SICORTEX; } #endif return CPU_UNKNOWN; + } } char *get_corename(void){ @@ -159,10 +126,10 @@ void get_architecture(void){ } void get_subarchitecture(void){ - if(detect()==CPU_LOONGSON3A) { - printf("LOONGSON3A"); - }else if(detect()==CPU_LOONGSON3B){ - printf("LOONGSON3B"); + if(detect()==CPU_LOONGSON3R3) { + printf("LOONGSON3R3"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("LOONGSON3R4"); }else if(detect()==CPU_I6400){ printf("I6400"); }else if(detect()==CPU_P6600){ @@ -179,8 +146,8 @@ void get_subdirname(void){ } void get_cpuconfig(void){ - if(detect()==CPU_LOONGSON3A) { - printf("#define LOONGSON3A\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("#define LOONGSON3R3\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -188,8 +155,8 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); - }else if(detect()==CPU_LOONGSON3B){ - printf("#define LOONGSON3B\n"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("#define LOONGSON3R4\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -237,10 +204,10 @@ void get_cpuconfig(void){ } void get_libname(void){ - if(detect()==CPU_LOONGSON3A) { - printf("loongson3a\n"); - }else if(detect()==CPU_LOONGSON3B) { - printf("loongson3b\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("loongson3r3\n"); + }else if(detect()==CPU_LOONGSON3R4) { + printf("loongson3r4\n"); }else if(detect()==CPU_I6400) { printf("i6400\n"); }else if(detect()==CPU_P6600) { diff --git a/driver/others/Makefile b/driver/others/Makefile index d09444f56..4a421ef31 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -24,10 +24,14 @@ else ifeq ($(ARCH),zarch) COMMONOBJS += dynamic_zarch.$(SUFFIX) else +ifeq ($(ARCH),mips64) +COMMONOBJS += dynamic_mips64.$(SUFFIX) +else 
COMMONOBJS += dynamic.$(SUFFIX) endif endif endif +endif else COMMONOBJS += parameter.$(SUFFIX) endif @@ -92,10 +96,14 @@ else ifeq ($(ARCH),zarch) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX) else +ifeq ($(ARCH),mips64) +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX) +else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) endif endif endif +endif else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 30e0cc6c2..5e0943c2e 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -967,9 +967,11 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; #if defined(ARCH_MIPS64) +#ifndef DYNAMIC_ARCH //set parameters for different number of threads. blas_set_parameter(); #endif +#endif } diff --git a/driver/others/dynamic_mips64.c b/driver/others/dynamic_mips64.c new file mode 100644 index 000000000..9fd19d739 --- /dev/null +++ b/driver/others/dynamic_mips64.c @@ -0,0 +1,230 @@ +/***************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include "common.h" + +extern gotoblas_t gotoblas_LOONGSON3R3; +extern gotoblas_t gotoblas_LOONGSON3R4; + +extern void openblas_warning(int verbose, const char * msg); + +#define NUM_CORETYPES 2 + +static char *corename[] = { + "loongson3r3", + "loongson3r4", + "UNKNOWN" +}; + +char *gotoblas_corename(void) { + if (gotoblas == &gotoblas_LOONGSON3R3) return corename[0]; + if (gotoblas == &gotoblas_LOONGSON3R4) return corename[1]; + return corename[NUM_CORETYPES]; +} + +static gotoblas_t *force_coretype(char *coretype) { + int i; + int found = -1; + char message[128]; + + for ( i=0 ; i < NUM_CORETYPES; i++) + { + if (!strncasecmp(coretype, corename[i], 20)) + { + found = i; + break; + } + } + + switch (found) + { + case 0: return (&gotoblas_LOONGSON3R3); + case 1: return (&gotoblas_LOONGSON3R4); + } + snprintf(message, 128, "Core not found: %s\n", coretype); + openblas_warning(1, message); + return NULL; +} + +#define MMI_MASK 0x00000010 +#define MSA_MASK 0x00000020 + +int fd[2]; +int support_cpucfg; + +static void handler(int signum) +{ + close(fd[1]); + exit(1); +} + 
+/* Brief : Function to check if cpucfg supported on loongson + * Return: 1 supported + * 0 not supported + */ +static int cpucfg_test(void) { + pid_t pid; + int status = 0; + + support_cpucfg = 0; + pipe(fd); + pid = fork(); + if (pid == 0) { /* Subprocess */ + struct sigaction act; + close(fd[0]); + /* Set signal action for SIGILL. */ + act.sa_handler = handler; + sigaction(SIGILL,&act,NULL); + + /* Execute cpucfg in subprocess. */ + __asm__ volatile( + ".insn \n\t" + ".word (0xc8080118) \n\t" + ::: + ); + support_cpucfg = 1; + write(fd[1],&support_cpucfg,sizeof(support_cpucfg)); + close(fd[1]); + exit(0); + } else if (pid > 0){ /* Parent process*/ + close(fd[1]); + if ((waitpid(pid,&status,0) <= 0) || + (read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0)) + support_cpucfg = 0; + close(fd[0]); + } else { + support_cpucfg = 0; + } + + return support_cpucfg; +} + +static gotoblas_t *get_coretype_from_cpucfg(void) { + int flag = 0; + __asm__ volatile( + ".insn \n\t" + "dli $8, 0x01 \n\t" + ".word (0xc9084918) \n\t" + "usw $9, 0x00(%0) \n\t" + : + : "r"(&flag) + : "memory" + ); + if (flag & MSA_MASK) + return (&gotoblas_LOONGSON3R4); + if (flag & MMI_MASK) + return (&gotoblas_LOONGSON3R3); + return NULL; +} + +static gotoblas_t *get_coretype_from_cpuinfo(void) { +#ifdef linux + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + //Check model name for Loongson3 + infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("model name", buffer, 10)){ + p = strchr(buffer, ':') + 2; + break; + } + } + fclose(infile); + if(p != NULL){ + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")) + return (&gotoblas_LOONGSON3R3); + else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")) + return (&gotoblas_LOONGSON3R4); + else + return NULL; + } +#endif + return NULL; +} + +static gotoblas_t *get_coretype(void) { + int ret = 0; + + ret = cpucfg_test(); + if (ret == 1) + return 
get_coretype_from_cpucfg(); + else + return get_coretype_from_cpuinfo(); +} + +void gotoblas_dynamic_init(void) { + char coremsg[128]; + char coren[22]; + char *p; + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + + if (gotoblas == NULL) + { + snprintf(coremsg, 128, "Falling back to loongson3r3 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_LOONGSON3R3; + } + + if (gotoblas && gotoblas->init) { + strncpy(coren, gotoblas_corename(), 20); + sprintf(coremsg, "Core: %s\n", coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 35fc0a253..36da13369 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -717,7 +717,7 @@ void blas_set_parameter(void){ #if defined(ARCH_MIPS64) void blas_set_parameter(void){ -#if defined(LOONGSON3A) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #ifdef SMP if(blas_num_threads == 1){ #endif @@ -731,20 +731,6 @@ void blas_set_parameter(void){ #endif #endif -#if defined(LOONGSON3B) -#ifdef SMP - if(blas_num_threads == 1 || blas_num_threads == 2){ -#endif - //single thread - dgemm_r = 640; -#ifdef SMP - }else{ - //multi thread - dgemm_r = 160; - } -#endif -#endif - } #endif diff --git a/getarch.c b/getarch.c index 9344defb5..e59a4e9b7 100644 --- a/getarch.c +++ b/getarch.c @@ -140,8 +140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
/* #define FORCE_PPC440FP2 */ /* #define FORCE_CELL */ /* #define FORCE_SICORTEX */ -/* #define FORCE_LOONGSON3A */ -/* #define FORCE_LOONGSON3B */ +/* #define FORCE_LOONGSON3R3 */ +/* #define FORCE_LOONGSON3R4 */ /* #define FORCE_I6400 */ /* #define FORCE_P6600 */ /* #define FORCE_P5600 */ @@ -814,31 +814,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef FORCE_LOONGSON3A +#ifdef FORCE_LOONGSON3R3 #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3A" +#define SUBARCHITECTURE "LOONGSON3R3" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3A " \ +#define ARCHCONFIG "-DLOONGSON3R3 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3a" -#define CORENAME "LOONGSON3A" +#define LIBNAME "loongson3r3" +#define CORENAME "LOONGSON3R3" #else #endif -#ifdef FORCE_LOONGSON3B +#ifdef FORCE_LOONGSON3R4 #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3B" +#define SUBARCHITECTURE "LOONGSON3R4" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3B " \ +#define ARCHCONFIG "-DLOONGSON3R4 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3b" -#define CORENAME "LOONGSON3B" +#define LIBNAME "loongson3r4" +#define CORENAME "LOONGSON3R4" #else #endif diff --git a/kernel/Makefile b/kernel/Makefile index fb1d5d39a..4e86546b9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -58,6 +58,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) endif else ifeq ($(TARGET_CORE), HASWELL) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) +else ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) else override CFLAGS += -DBUILD_KERNEL 
-DTABLE_NAME=gotoblas_$(TARGET_CORE) endif @@ -68,6 +70,9 @@ else TARGET_CORE = $(CORE) KDIR = TSUFFIX = +ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += $(MSA_FLAGS) +endif endif -include $(KERNELDIR)/KERNEL.$(TARGET_CORE) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 893713769..d8d739965 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -29,10 +29,6 @@ ifeq ($(ARCH), riscv64) USE_TRMM = 1 endif -ifeq ($(TARGET), LOONGSON3B) -USE_TRMM = 1 -endif - ifneq ($(DYNAMIC_ARCH), 1) ifeq ($(TARGET), GENERIC) USE_TRMM = 1 diff --git a/kernel/mips/cgemm_kernel_8x4_msa.c b/kernel/mips/cgemm_kernel_8x4_msa.c index 8b624be88..aa3f1dcfa 100644 --- a/kernel/mips/cgemm_kernel_8x4_msa.c +++ b/kernel/mips/cgemm_kernel_8x4_msa.c @@ -121,7 +121,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ - src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ @@ -200,7 +200,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP2_INC(pa0, 4, src_a0, src_a1); \ - src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ diff --git a/kernel/mips/crot_msa.c b/kernel/mips/crot_msa.c index 5273e38a3..84eb54d6d 100644 --- a/kernel/mips/crot_msa.c +++ b/kernel/mips/crot_msa.c @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, { if ((0 == c) && (0 == s)) { - v4f32 zero = __msa_cast_to_vector_float(0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + v4f32 zero = {0.0, 0.0, 0.0, 0.0}; /* process 2 elements */ for (j = (n >> 1); j--;) diff --git a/kernel/mips/cscal_msa.c b/kernel/mips/cscal_msa.c index 11a1450cf..451d0c921 100644 --- a/kernel/mips/cscal_msa.c +++ b/kernel/mips/cscal_msa.c @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dscal_msa.c b/kernel/mips/dscal_msa.c index 6ce0375ab..2e41d8bef 100644 --- a/kernel/mips/dscal_msa.c +++ b/kernel/mips/dscal_msa.c @@ -44,9 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) 
__msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c index 9fb5141ca..e2cd3aa4b 100644 --- a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c @@ -186,8 +186,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); - src_a54 = __msa_cast_to_vector_double(*(a + 54)); - src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a + 54)); src_a62 = LD_DP(a + 62); src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); @@ -200,8 +199,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a44 = LD_DP(a + 44); src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); - src_a36 = __msa_cast_to_vector_double(*(a + 36)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a + 36)); res_c7 *= src_a63; res_c6 -= res_c7 * src_a62; @@ -271,8 +269,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a26 = LD_DP(a + 26); src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); - src_a18 = __msa_cast_to_vector_double(*(a + 18)); - src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a + 18)); res_c3 -= res_c7 * src_a59; res_c2 -= res_c7 * src_a58; @@ -358,8 +355,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a8 = LD_DP(a + 8); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 
0); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); res_c1 -= res_c2 * src_a17; res_c1 *= src_a9; @@ -488,8 +484,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a52 = LD_DP(a - 12); src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); - src_a54 = __msa_cast_to_vector_double(*(a - 10)); - src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a -10)); src_a40 = LD_DP(a - 24); src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); @@ -526,8 +521,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a34 = LD_DP(a - 30); src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); - src_a36 = __msa_cast_to_vector_double(*(a - 28)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a -28)); res_c4 *= src_a36; res_c3 -= res_c4 * src_a35; @@ -544,10 +538,8 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a16 = LD_DP(a - 48); src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); - src_a18 = __msa_cast_to_vector_double(*(a - 46)); - src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); - src_a0 = __msa_cast_to_vector_double(*(a - 64)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a - 46)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a - 64)); src_a8 = LD_DP(a - 56); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); @@ -785,11 +777,8 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) 
__msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); @@ -890,11 +879,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c index 525fc8585..74cc1278a 100644 --- a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c @@ -215,8 +215,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c8 * src_a6; res_c15 -= res_c8 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -280,8 +279,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c10 * src_a22; res_c15 -= res_c10 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); @@ -326,8 +324,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT 
*c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c12 * src_a38; res_c15 -= res_c12 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -353,8 +350,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -478,8 +474,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c0 * src_a6; res_c7 -= res_c0 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -515,8 +510,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c2 * src_a22; res_c7 -= res_c2 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); @@ -553,8 +547,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a38; res_c7 -= res_c4 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 
0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -563,8 +556,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c5 * src_a46; res_c7 -= res_c5 * src_a47; - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -786,8 +778,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a2; res_c7 -= res_c4 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -803,8 +794,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; @@ -881,8 +871,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c2 -= res_c0 * src_a2; res_c3 -= res_c0 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -894,8 +883,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT 
*c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c index cb361c511..03036f1c7 100644 --- a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c @@ -161,16 +161,14 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -294,8 +292,7 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -347,8 +344,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) } } - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) 
src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 *= src_b0; src_c1 *= src_b0; @@ -407,16 +403,14 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -490,8 +484,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c index 581a90f71..4c55a0f37 100644 --- a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c @@ -168,11 +168,9 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) 
__msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -298,8 +296,7 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 16; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); @@ -377,8 +374,7 @@ static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) a -= 8; b -= 1; - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 *= src_b0; src_c1 *= src_b0; @@ -445,11 +441,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -527,8 +521,7 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 8; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h 
index ee0dea0b7..b887800ed 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -63,16 +63,12 @@ inline static void prefetch_load_lf(unsigned char *src) #define ST_DP(...) ST_D(v2f64, __VA_ARGS__) #define COPY_FLOAT_TO_VECTOR(a) ( { \ - v4f32 out; \ - out = __msa_cast_to_vector_float(a); \ - out = (v4f32) __msa_splati_w((v4i32) out, 0); \ + v4f32 out = {a, a, a, a}; \ out; \ } ) #define COPY_DOUBLE_TO_VECTOR(a) ( { \ - v2f64 out; \ - out = __msa_cast_to_vector_double(a); \ - out = (v2f64) __msa_splati_d((v2i64) out, 0); \ + v2f64 out = {a, a}; \ out; \ } ) diff --git a/kernel/mips/srot_msa.c b/kernel/mips/srot_msa.c index 75730241a..79d921b7a 100644 --- a/kernel/mips/srot_msa.c +++ b/kernel/mips/srot_msa.c @@ -48,11 +48,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, { if ((0 == c) && (0 == s)) { - v4f32 zero = __msa_cast_to_vector_float(0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + v4f32 zero = {0.0, 0.0, 0.0, 0.0}; /* process 4 floats */ for (j = (n >> 2); j--;) diff --git a/kernel/mips/sscal_msa.c b/kernel/mips/sscal_msa.c index 64b62d659..66e17b844 100644 --- a/kernel/mips/sscal_msa.c +++ b/kernel/mips/sscal_msa.c @@ -44,11 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 6); i--;) { diff --git a/kernel/mips/zscal_msa.c b/kernel/mips/zscal_msa.c index 5a8766d3c..a45c3cecd 100644 --- a/kernel/mips/zscal_msa.c +++ b/kernel/mips/zscal_msa.c @@ -49,9 
+49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { @@ -475,9 +473,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { diff --git a/kernel/mips64/KERNEL.LOONGSON3B b/kernel/mips64/KERNEL.LOONGSON3B deleted file mode 100644 index e476c631e..000000000 --- a/kernel/mips64/KERNEL.LOONGSON3B +++ /dev/null @@ -1,64 +0,0 @@ -SAXPYKERNEL=axpy_loongson3a.S -DAXPYKERNEL=daxpy_loongson3a_simd.S - -SGEMVNKERNEL = gemv_n_loongson3a.c -SGEMVTKERNEL = gemv_t_loongson3a.c -DGEMVNKERNEL = gemv_n_loongson3a.c -DGEMVTKERNEL = gemv_t_loongson3a.c -CGEMVNKERNEL = zgemv_n_loongson3a.c -CGEMVTKERNEL = zgemv_t_loongson3a.c -ZGEMVNKERNEL = zgemv_n_loongson3a.c -ZGEMVTKERNEL = zgemv_t_loongson3a.c - -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o 
-CGEMMOTCOPYOBJ = cgemm_otcopy.o - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3R3 similarity index 75% rename from kernel/mips64/KERNEL.LOONGSON3A rename to kernel/mips64/KERNEL.LOONGSON3R3 index 0298faaad..904828d57 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3R3 @@ -16,32 +16,32 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o 
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -64,6 +64,3 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DSDOTKERNEL = ../mips/dot.c - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3R4 b/kernel/mips64/KERNEL.LOONGSON3R4 new file mode 100644 index 000000000..b81e5441d --- /dev/null +++ b/kernel/mips64/KERNEL.LOONGSON3R4 @@ -0,0 +1,192 @@ +ifdef HAVE_MSA +SAXPYKERNEL = ../mips/saxpy_msa.c +DAXPYKERNEL = ../mips/daxpy_msa.c +CAXPYKERNEL = ../mips/caxpy_msa.c +ZAXPYKERNEL = ../mips/zaxpy_msa.c +else +SAXPYKERNEL = axpy_loongson3a.S +DAXPYKERNEL = daxpy_loongson3a_simd.S +endif + +ifdef HAVE_MSA +SCOPYKERNEL = ../mips/scopy_msa.c +DCOPYKERNEL = ../mips/dcopy_msa.c +CCOPYKERNEL = ../mips/ccopy_msa.c +ZCOPYKERNEL = ../mips/zcopy_msa.c +endif + +ifdef HAVE_MSA +SDOTKERNEL = ../mips/sdot_msa.c +DDOTKERNEL = ../mips/ddot_msa.c +CDOTKERNEL = ../mips/cdot_msa.c +ZDOTKERNEL = ../mips/zdot_msa.c +endif +DSDOTKERNEL = ../mips/dot.c + +ifdef HAVE_MSA +SROTKERNEL = 
../mips/srot_msa.c +DROTKERNEL = ../mips/drot_msa.c +CROTKERNEL = ../mips/crot_msa.c +ZROTKERNEL = ../mips/zrot_msa.c +endif + +ifdef HAVE_MSA +SSCALKERNEL = ../mips/sscal_msa.c +DSCALKERNEL = ../mips/dscal_msa.c +CSCALKERNEL = ../mips/cscal_msa.c +ZSCALKERNEL = ../mips/zscal_msa.c +endif + +ifdef HAVE_MSA +SGEMVNKERNEL = ../mips/sgemv_n_msa.c +DGEMVNKERNEL = ../mips/dgemv_n_msa.c +SGEMVTKERNEL = ../mips/sgemv_t_msa.c +DGEMVTKERNEL = ../mips/dgemv_t_msa.c +CGEMVNKERNEL = ../mips/cgemv_n_msa.c +CGEMVTKERNEL = ../mips/cgemv_t_msa.c +ZGEMVNKERNEL = ../mips/zgemv_n_msa.c +ZGEMVTKERNEL = ../mips/zgemv_t_msa.c +else +SGEMVNKERNEL = gemv_n_loongson3a.c +SGEMVTKERNEL = gemv_t_loongson3a.c +DGEMVNKERNEL = gemv_n_loongson3a.c +DGEMVTKERNEL = gemv_t_loongson3a.c +CGEMVNKERNEL = zgemv_n_loongson3a.c +CGEMVTKERNEL = zgemv_t_loongson3a.c +ZGEMVNKERNEL = zgemv_n_loongson3a.c +ZGEMVTKERNEL = zgemv_t_loongson3a.c +endif + +ifdef HAVE_MSA +SASUMKERNEL = ../mips/sasum_msa.c +DASUMKERNEL = ../mips/dasum_msa.c +CASUMKERNEL = ../mips/casum_msa.c +ZASUMKERNEL = ../mips/zasum_msa.c +endif + +ifdef HAVE_MSA +SSWAPKERNEL = ../mips/sswap_msa.c +DSWAPKERNEL = ../mips/dswap_msa.c +CSWAPKERNEL = ../mips/cswap_msa.c +ZSWAPKERNEL = ../mips/zswap_msa.c +endif + +ifdef HAVE_MSA +SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c +SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c +SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +SGEMMKERNEL = sgemm_kernel_8x4_ps.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c +DGEMMINCOPY = 
../mips/dgemm_ncopy_8_msa.c +DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c +DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c +DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c +CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c +CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c +CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c +CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c +ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c +ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA 
+STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c +STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c +STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c +STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c +else +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c +DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c +DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c +DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c +else +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index d0317a745..1e846a61c 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -933,6 +933,77 @@ static void init_parameter(void) { } #else // (ARCH_ARM64) +#if defined(ARCH_MIPS64) +static void init_parameter(void) { + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + 
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; + + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; + + TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; + TABLE_NAME.dgemm_r = 640; + TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; + TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; + +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; + TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; + TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; + TABLE_NAME.qgemm_r = QGEMM_DEFAULT_R; + TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R; +#endif + +#if defined(USE_GEMM3M) +#ifdef CGEMM3M_DEFAULT_P + TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; +#else + TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p; +#endif + +#ifdef ZGEMM3M_DEFAULT_P + TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; +#else + TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p; +#endif + +#ifdef CGEMM3M_DEFAULT_Q + TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.cgemm3m_q = TABLE_NAME.sgemm_q; +#endif + +#ifdef ZGEMM3M_DEFAULT_Q + TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.zgemm3m_q = TABLE_NAME.dgemm_q; +#endif + +#ifdef CGEMM3M_DEFAULT_R + TABLE_NAME.cgemm3m_r = CGEMM3M_DEFAULT_R; +#else + TABLE_NAME.cgemm3m_r = TABLE_NAME.sgemm_r; +#endif + +#ifdef ZGEMM3M_DEFAULT_R + TABLE_NAME.zgemm3m_r = ZGEMM3M_DEFAULT_R; +#else + TABLE_NAME.zgemm3m_r = TABLE_NAME.dgemm_r; +#endif + +#ifdef EXPRECISION + TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p; + TABLE_NAME.xgemm3m_q = TABLE_NAME.qgemm_q; + TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; +#endif +#endif +} +#else // (ARCH_MIPS64) #if (ARCH_POWER) static void init_parameter(void) { @@ -1780,4 +1851,5 @@ static void init_parameter(void) { } #endif //POWER #endif //ZARCH +#endif //(ARCH_MIPS64) #endif //(ARCH_ARM64) diff --git a/param.h b/param.h index a0d45c573..6946c2b41 100644 --- a/param.h +++ b/param.h @@ -2570,8 +2570,7 @@ USE OF THIS 
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef LOONGSON3A -/*Copy from SICORTEX*/ +#if defined(LOONGSON3R4) #define SNUMOPT 2 #define DNUMOPT 2 @@ -2579,6 +2578,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL +#ifdef HAVE_MSA +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#else #define SGEMM_DEFAULT_UNROLL_M 8 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2590,6 +2602,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 +#endif #define SGEMM_DEFAULT_P 64 #define DGEMM_DEFAULT_P 44 @@ -2612,7 +2625,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef LOONGSON3B +#if defined(LOONGSON3R3) +////Copy from SICORTEX #define SNUMOPT 2 #define DNUMOPT 2 @@ -2620,32 +2634,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define GEMM_DEFAULT_OFFSET_B 0 #define GEMM_DEFAULT_ALIGN 0x03fffUL -#define SGEMM_DEFAULT_UNROLL_M 2 -#define SGEMM_DEFAULT_UNROLL_N 2 +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define SGEMM_DEFAULT_P 64 -#define DGEMM_DEFAULT_P 24 -#define CGEMM_DEFAULT_P 24 -#define ZGEMM_DEFAULT_P 20 +#define DGEMM_DEFAULT_P 44 +#define CGEMM_DEFAULT_P 64 +#define ZGEMM_DEFAULT_P 32 #define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 128 +#define DGEMM_DEFAULT_Q 92 #define CGEMM_DEFAULT_Q 128 -#define ZGEMM_DEFAULT_Q 64 +#define ZGEMM_DEFAULT_Q 80 -#define SGEMM_DEFAULT_R 512 -#define DGEMM_DEFAULT_R 512 -#define CGEMM_DEFAULT_R 512 -#define ZGEMM_DEFAULT_R 512 +#define SGEMM_DEFAULT_R 640 +#define DGEMM_DEFAULT_R dgemm_r +#define CGEMM_DEFAULT_R 640 +#define ZGEMM_DEFAULT_R 640 #define GEMM_OFFSET_A1 0x10000 #define GEMM_OFFSET_B1 0x100000 From be24c66a7c3b746dd9c27db09e4b0e28785025f2 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 10 Dec 2020 10:48:53 +0800 Subject: [PATCH 115/121] Keep LOONGSON3A and LOONGSON3B for loongson --- getarch.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/getarch.c b/getarch.c index e59a4e9b7..29671736e 100644 --- a/getarch.c +++ b/getarch.c @@ -814,7 +814,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef FORCE_LOONGSON3R3 +#if defined FORCE_LOONGSON3R3 || defined FORCE_LOONGSON3A || defined FORCE_LOONGSON3B #define FORCE #define ARCHITECTURE "MIPS" #define SUBARCHITECTURE "LOONGSON3R3" From 346e30a46a4758eb4d9b8e5783c0b9c3c6b3ce6f Mon Sep 17 00:00:00 2001 From: Rajalakshmi Srinivasaraghavan Date: Thu, 10 Dec 2020 11:51:42 -0600 Subject: [PATCH 116/121] POWER10: Improve axpy performance This patch aligns the stores to 32 byte boundary for saxpy and daxpy before entering into vector pair loop. Fox caxpy, changed the store instructions to stxv to improve performance of unaligned cases. --- kernel/power/caxpy_microk_power10.c | 24 ++++++++++++++++-------- kernel/power/daxpy_power10.c | 17 ++++++++++++----- kernel/power/saxpy_power10.c | 14 ++++++++++---- 3 files changed, 38 insertions(+), 17 deletions(-) diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c index 0d13416b3..56a5ab47a 100644 --- a/kernel/power/caxpy_microk_power10.c +++ b/kernel/power/caxpy_microk_power10.c @@ -112,10 +112,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" - "stxvp 48, 0(%4) \n\t" - "stxvp 50, 32(%4) \n\t" - "stxvp 34, 64(%4) \n\t" - "stxvp 38, 96(%4) \n\t" + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" "addi %4, %4, 128 \n\t" "xxperm 52, 40, %x10 \n\t" // exchange real and imag part @@ -159,10 +163,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" - "stxvp 48, 0(%4) \n\t" - "stxvp 50, 32(%4) \n\t" - "stxvp 34, 64(%4) \n\t" - "stxvp 38, 96(%4) \n\t" + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" "#n=%1 
x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" : diff --git a/kernel/power/daxpy_power10.c b/kernel/power/daxpy_power10.c index ebe91a80f..8640efcfd 100644 --- a/kernel/power/daxpy_power10.c +++ b/kernel/power/daxpy_power10.c @@ -66,12 +66,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -16; + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 ) + daxpy_kernel_8(n1, &x[i], &y[i], da); + + i += n1; - if ( n1 ) - daxpy_kernel_8(n1, x, y, da); - - i = n1; while(i < n) { diff --git a/kernel/power/saxpy_power10.c b/kernel/power/saxpy_power10.c index 8c7c22390..4a13c1f88 100644 --- a/kernel/power/saxpy_power10.c +++ b/kernel/power/saxpy_power10.c @@ -64,12 +64,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -64; - + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -64; if ( n1 ) - saxpy_kernel_64(n1, x, y, da); + saxpy_kernel_64(n1, &x[i], &y[i], da); - i = n1; + i += n1; while(i < n) { From 6232237dba7bdd7e185216f7bb0d733ba4c0486e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 11 Dec 2020 23:41:17 +0100 Subject: [PATCH 117/121] Make fallback from P10 to P9 conditional on suitable compiler --- driver/others/dynamic_power.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index d60ae68fc..a2f56d839 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -53,8 +53,10 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_POWER10; #endif /* Fall back to the POWER9 implementation if the toolchain is too old 
or the MMA feature is not set */ +#if (!defined __GNUC__) || ( __GNUC__ >= 6) if (__builtin_cpu_is("power10")) return &gotoblas_POWER9; +#endif return NULL; } From 77edf82c7faf9af1412b0f0c9de7a7543341b2e2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 Dec 2020 01:25:20 +0100 Subject: [PATCH 118/121] Update Changelog.txt for 0.3.13 --- Changelog.txt | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/Changelog.txt b/Changelog.txt index edd3563ec..807c5ff20 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,54 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.13 + 12-Dec-2020 + + common: + * Added a generic bfloat16 SBGEMV kernel + * Fixed a potentially severe memory leak after fork in OpenMP builds + that was introduces in 0.3.12 + * Added detection of the Fujitsu Fortran compiler + * Added detection of the (e)gfortran compiler on OpenBSD + * Added support for overriding the default name of the library independently + from symbol suffixing in the gmake builds (already supported in cmake) + +RISCV: + * Added a RISC V port optimized for C910V + +POWER: + * Added optimized POWER10 kernels for SAXPY, CAXPY, SDOT, DDOT and DGEMV_N + * Improved DGEMM performance on POWER10 + * Improved STRSM and DTRSM performance on POWER9 and POWER10 + * Fixed segmemtation faults in DYNAMIC_ARCH builds + * Fixed compilation with the PGI compiler + +x86: + * Fixed compilation of kernels that require SSE2 intrinsics since 0.3.12 + +x86_64: + * Added an optimized bfloat16 SBGEMV kernel for SkylakeX and Cooperlake + * Improved the performance of SASUM and DASUM kernels through parallelization + * Improved the performance of SROT and DROT kernels + * Improved the performance of multithreaded xSYRK + * Fixed OpenMP builds that use the LLVM Clang compiler together with GNU gfortran + (where linking of both the LLVM libomp and GNU libgomp could lead to lockups or + wrong 
results) + * Fixed miscompilations by old gcc 4.6 + * Fixed misdetection of AVX2 capability in some Sandybridge cpus + * Fixed lockups in builds combining DYNAMIC_ARCH with TARGET=GENERIC on OpenBSD + +ARM64: + * Fixed segmemtation faults in DYNAMIC_ARCH builds + +MIPS: + * Improved kernels for Loongson 3R3 ("3A") and 3R4 ("3B") models, including MSA + * Fixed bugs in the MSA kernels for CGEMM, CTRMM, CGEMV and ZGEMV + * Added handling of zero increments in the MSA kernels for SSWAP and DSWAP + * Added DYNAMIC_ARCH support for MIPS64 (currently Loongson3R3/3R4 only) + +SPARC: + * Fixed building 32 and 64 bit SPARC kernels with the SolarisStudio compilers + ==================================================================== Version 0.3.12 24-Oct-2020 From 3dec81200cdac01651681a3e36f77179a0815eb4 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 Dec 2020 14:27:37 +0100 Subject: [PATCH 119/121] Update Changelog.txt Co-authored-by: h-vetinari --- Changelog.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Changelog.txt b/Changelog.txt index 807c5ff20..cbc7007ac 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -6,7 +6,7 @@ Version 0.3.13 common: * Added a generic bfloat16 SBGEMV kernel * Fixed a potentially severe memory leak after fork in OpenMP builds - that was introduces in 0.3.12 + that was introduced in 0.3.12 * Added detection of the Fujitsu Fortran compiler * Added detection of the (e)gfortran compiler on OpenBSD * Added support for overriding the default name of the library independently From d3ec787f774bc678ec13f0ed87fe2f3d67af1a11 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 Dec 2020 18:14:49 +0100 Subject: [PATCH 120/121] Update version to 0.3.13 for release --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 1a0965d08..e4b82104e 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.12.dev 
+VERSION = 0.3.13 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library From 7bc0e4a2e001117d7e51f0ef8ea1abc4b734d079 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 12 Dec 2020 18:15:33 +0100 Subject: [PATCH 121/121] Update version to 0.3.13 for release --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aeb4399e4..12730e0e3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 12.dev) +set(OpenBLAS_PATCH_VERSION 13) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions