Browse Source

Fix #686. Merge branch 'ashwinyes-develop' into develop

tags/v0.2.16.rc1
Zhang Xianyi 10 years ago
parent
commit
e31948ceb0
89 changed files with 19268 additions and 771 deletions
  1. +1
    -0
      .gitignore
  2. +4
    -0
      Makefile.arm64
  3. +2
    -0
      TargetList.txt
  4. +1
    -1
      benchmark/gemm.c
  5. +1
    -0
      common.h
  6. +7
    -1
      common_arm64.h
  7. +53
    -31
      cpuid_arm64.c
  8. +1
    -2
      driver/level2/spmv_thread.c
  9. +1
    -2
      driver/level2/spr2_thread.c
  10. +2
    -2
      driver/level2/spr_thread.c
  11. +1
    -2
      driver/level2/symv_thread.c
  12. +0
    -2
      driver/level2/tbmv_L.c
  13. +0
    -2
      driver/level2/tbmv_U.c
  14. +0
    -2
      driver/level2/tbsv_L.c
  15. +0
    -2
      driver/level2/tbsv_U.c
  16. +0
    -2
      driver/level2/tpsv_L.c
  17. +0
    -2
      driver/level2/tpsv_U.c
  18. +0
    -2
      driver/level2/ztbmv_L.c
  19. +0
    -2
      driver/level2/ztbmv_U.c
  20. +0
    -2
      driver/level2/ztbsv_L.c
  21. +0
    -2
      driver/level2/ztbsv_U.c
  22. +0
    -2
      driver/level2/ztpmv_L.c
  23. +0
    -2
      driver/level2/ztpmv_U.c
  24. +0
    -2
      driver/level2/ztpsv_L.c
  25. +0
    -2
      driver/level2/ztpsv_U.c
  26. +1
    -1
      driver/level3/gemm_thread_mn.c
  27. +2
    -4
      driver/level3/level3_thread.c
  28. +0
    -1
      driver/others/memory.c
  29. +16
    -2
      getarch.c
  30. +23
    -42
      interface/symm.c
  31. +1
    -2
      interface/syr.c
  32. +1
    -2
      interface/syr2.c
  33. +1
    -2
      interface/zhemv.c
  34. +1
    -2
      interface/zher.c
  35. +1
    -2
      interface/zher2.c
  36. +18
    -18
      kernel/Makefile.L1
  37. +4
    -2
      kernel/arm/amax.c
  38. +4
    -2
      kernel/arm/amin.c
  39. +1
    -1
      kernel/arm/asum.c
  40. +4
    -2
      kernel/arm/iamax.c
  41. +3
    -1
      kernel/arm/iamin.c
  42. +3
    -1
      kernel/arm/imax.c
  43. +3
    -1
      kernel/arm/imin.c
  44. +7
    -7
      kernel/arm/izamax.c
  45. +7
    -7
      kernel/arm/izamin.c
  46. +3
    -1
      kernel/arm/max.c
  47. +3
    -1
      kernel/arm/min.c
  48. +1
    -1
      kernel/arm/nrm2.c
  49. +8
    -10
      kernel/arm/zamax.c
  50. +8
    -10
      kernel/arm/zamin.c
  51. +2
    -1
      kernel/arm/zasum.c
  52. +2
    -4
      kernel/arm/zaxpby.c
  53. +1
    -1
      kernel/arm/znrm2.c
  54. +91
    -0
      kernel/arm64/KERNEL.CORTEXA57
  55. +249
    -0
      kernel/arm64/amax.S
  56. +194
    -0
      kernel/arm64/asum.S
  57. +209
    -0
      kernel/arm64/axpy.S
  58. +170
    -0
      kernel/arm64/casum.S
  59. +1667
    -0
      kernel/arm64/cgemm_kernel_4x4.S
  60. +232
    -0
      kernel/arm64/copy.S
  61. +1621
    -0
      kernel/arm64/ctrmm_kernel_4x4.S
  62. +1338
    -0
      kernel/arm64/dgemm_kernel_4x4.S
  63. +169
    -0
      kernel/arm64/dnrm2.S
  64. +227
    -0
      kernel/arm64/dot.S
  65. +1398
    -0
      kernel/arm64/dtrmm_kernel_4x4.S
  66. +320
    -0
      kernel/arm64/gemv_n.S
  67. +347
    -0
      kernel/arm64/gemv_t.S
  68. +124
    -0
      kernel/arm64/idamax.S
  69. +213
    -0
      kernel/arm64/isamax.S
  70. +151
    -0
      kernel/arm64/izamax.S
  71. +243
    -0
      kernel/arm64/rot.S
  72. +253
    -0
      kernel/arm64/scal.S
  73. +807
    -571
      kernel/arm64/sgemm_kernel_4x4.S
  74. +178
    -0
      kernel/arm64/snrm2.S
  75. +1405
    -0
      kernel/arm64/strmm_kernel_4x4.S
  76. +266
    -0
      kernel/arm64/swap.S
  77. +273
    -0
      kernel/arm64/zamax.S
  78. +164
    -0
      kernel/arm64/zasum.S
  79. +301
    -0
      kernel/arm64/zaxpy.S
  80. +302
    -0
      kernel/arm64/zdot.S
  81. +1617
    -0
      kernel/arm64/zgemm_kernel_4x4.S
  82. +514
    -0
      kernel/arm64/zgemv_n.S
  83. +448
    -0
      kernel/arm64/zgemv_t.S
  84. +228
    -0
      kernel/arm64/znrm2.S
  85. +256
    -0
      kernel/arm64/zrot.S
  86. +274
    -0
      kernel/arm64/zscal.S
  87. +1893
    -0
      kernel/arm64/ztrmm_kernel_4x4.S
  88. +883
    -0
      kernel/generic/ztrmmkernel_4x4.c
  89. +40
    -0
      param.h

+ 1
- 0
.gitignore View File

@@ -68,3 +68,4 @@ test/zblat2
test/zblat3 test/zblat3
build build
build.* build.*
*.swp

+ 4
- 0
Makefile.arm64 View File

@@ -4,4 +4,8 @@ CCOMMON_OPT += -march=armv8-a
FCOMMON_OPT += -march=armv8-a FCOMMON_OPT += -march=armv8-a
endif endif


ifeq ($(CORE), CORTEXA57)
CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
endif



+ 2
- 0
TargetList.txt View File

@@ -74,3 +74,5 @@ ARMV5


7.ARM 64-bit CPU: 7.ARM 64-bit CPU:
ARMV8 ARMV8
CORTEXA57


+ 1
- 1
benchmark/gemm.c View File

@@ -172,7 +172,7 @@ int main(int argc, char *argv[]){
srandom(getpid()); srandom(getpid());
#endif #endif
for(j = 0; j < m; j++){
for(j = 0; j < to; j++){
for(i = 0; i < to * COMPSIZE; i++){ for(i = 0; i < to * COMPSIZE; i++){
a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;


+ 1
- 0
common.h View File

@@ -86,6 +86,7 @@ extern "C" {
#if !defined(_MSC_VER) #if !defined(_MSC_VER)
#include <unistd.h> #include <unistd.h>
#endif #endif
#include <time.h>


#ifdef OS_LINUX #ifdef OS_LINUX
#include <malloc.h> #include <malloc.h>


+ 7
- 1
common_arm64.h View File

@@ -89,8 +89,10 @@ static inline int blas_quickdivide(blasint x, blasint y){
#if defined(ASSEMBLER) && !defined(NEEDPARAM) #if defined(ASSEMBLER) && !defined(NEEDPARAM)


#define PROLOGUE \ #define PROLOGUE \
.text ;\
.align 4 ;\
.global REALNAME ;\ .global REALNAME ;\
.func REALNAME ;\
.type REALNAME, %function ;\
REALNAME: REALNAME:


#define EPILOGUE #define EPILOGUE
@@ -107,7 +109,11 @@ REALNAME:
#endif #endif
#define HUGE_PAGESIZE ( 4 << 20) #define HUGE_PAGESIZE ( 4 << 20)


#if defined(CORTEXA57)
#define BUFFER_SIZE (40 << 20)
#else
#define BUFFER_SIZE (16 << 20) #define BUFFER_SIZE (16 << 20)
#endif




#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)


+ 53
- 31
cpuid_arm64.c View File

@@ -29,12 +29,19 @@


#define CPU_UNKNOWN 0 #define CPU_UNKNOWN 0
#define CPU_ARMV8 1 #define CPU_ARMV8 1
#define CPU_CORTEXA57 2


static char *cpuname[] = { static char *cpuname[] = {
"UNKOWN",
"ARMV8"
"UNKNOWN",
"ARMV8" ,
"CORTEXA57"
}; };


static char *cpuname_lower[] = {
"unknown",
"armv8" ,
"cortexa57"
};


int get_feature(char *search) int get_feature(char *search)
{ {
@@ -53,13 +60,13 @@ int get_feature(char *search)
{ {
p = strchr(buffer, ':') + 2; p = strchr(buffer, ':') + 2;
break; break;
}
}
}
}


fclose(infile);
fclose(infile);




if( p == NULL ) return;
if( p == NULL ) return 0;


t = strtok(p," "); t = strtok(p," ");
while( t = strtok(NULL," ")) while( t = strtok(NULL," "))
@@ -82,11 +89,30 @@ int detect(void)
p = (char *) NULL ; p = (char *) NULL ;


infile = fopen("/proc/cpuinfo", "r"); infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile))
{


if (!strncmp("CPU part", buffer, 8))
{
p = strchr(buffer, ':') + 2;
break;
}
}

fclose(infile);
if(p != NULL) {
if (strstr(p, "0xd07")) {
return CPU_CORTEXA57;
}
}

p = (char *) NULL ;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)) while (fgets(buffer, sizeof(buffer), infile))
{ {


if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)))
if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)) ||
(!strncmp("CPU architecture", buffer, 16)))
{ {
p = strchr(buffer, ':') + 2; p = strchr(buffer, ':') + 2;
break; break;
@@ -100,7 +126,7 @@ int detect(void)


if (strstr(p, "AArch64")) if (strstr(p, "AArch64"))
{ {
return CPU_ARMV8;
return CPU_ARMV8;


} }


@@ -118,23 +144,13 @@ char *get_corename(void)


void get_architecture(void) void get_architecture(void)
{ {
printf("ARM");
printf("ARM64");
} }


void get_subarchitecture(void) void get_subarchitecture(void)
{ {
int d = detect(); int d = detect();
switch (d)
{

case CPU_ARMV8:
printf("ARMV8");
break;

default:
printf("UNKNOWN");
break;
}
printf("%s", cpuname[d]);
} }


void get_subdirname(void) void get_subdirname(void)
@@ -160,26 +176,32 @@ void get_cpuconfig(void)
printf("#define L2_ASSOCIATIVE 4\n"); printf("#define L2_ASSOCIATIVE 4\n");
break; break;



case CPU_CORTEXA57:
printf("#define CORTEXA57\n");
printf("#define HAVE_VFP\n");
printf("#define HAVE_VFPV3\n");
printf("#define HAVE_NEON\n");
printf("#define HAVE_VFPV4\n");
printf("#define L1_CODE_SIZE 49152\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 3\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 2\n");
printf("#define L2_SIZE 2097152\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n");
break;
} }
} }




void get_libname(void) void get_libname(void)
{ {

int d = detect(); int d = detect();
switch (d)
{

case CPU_ARMV8:
printf("armv8\n");
break;

}
printf("%s", cpuname_lower[d]);
} }



void get_features(void) void get_features(void)
{ {




+ 1
- 2
driver/level2/spmv_thread.c View File

@@ -55,7 +55,7 @@
static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){


FLOAT *a, *x, *y; FLOAT *a, *x, *y;
BLASLONG incx, incy;
BLASLONG incx;
BLASLONG m_from, m_to, i; BLASLONG m_from, m_to, i;
#ifndef COMPLEX #ifndef COMPLEX
FLOAT result; FLOAT result;
@@ -68,7 +68,6 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
y = (FLOAT *)args -> c; y = (FLOAT *)args -> c;


incx = args -> ldb; incx = args -> ldb;
incy = args -> ldc;


m_from = 0; m_from = 0;
m_to = args -> m; m_to = args -> m;


+ 1
- 2
driver/level2/spr2_thread.c View File

@@ -43,7 +43,7 @@
static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){


FLOAT *a, *x, *y; FLOAT *a, *x, *y;
BLASLONG lda, incx, incy;
BLASLONG incx, incy;
BLASLONG i, m_from, m_to; BLASLONG i, m_from, m_to;
FLOAT alpha_r; FLOAT alpha_r;
#ifdef COMPLEX #ifdef COMPLEX
@@ -56,7 +56,6 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL


incx = args -> lda; incx = args -> lda;
incy = args -> ldb; incy = args -> ldb;
lda = args -> ldc;


alpha_r = *((FLOAT *)args -> alpha + 0); alpha_r = *((FLOAT *)args -> alpha + 0);
#ifdef COMPLEX #ifdef COMPLEX


+ 2
- 2
driver/level2/spr_thread.c View File

@@ -46,7 +46,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
BLASLONG incx; BLASLONG incx;
BLASLONG i, m_from, m_to; BLASLONG i, m_from, m_to;
FLOAT alpha_r; FLOAT alpha_r;
#if defined(COMPLEX) && !defined(HER) && !defined(HERREV)
#if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV)
FLOAT alpha_i; FLOAT alpha_i;
#endif #endif


@@ -56,7 +56,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
incx = args -> lda; incx = args -> lda;


alpha_r = *((FLOAT *)args -> alpha + 0); alpha_r = *((FLOAT *)args -> alpha + 0);
#if defined(COMPLEX) && !defined(HER) && !defined(HERREV)
#if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV)
alpha_i = *((FLOAT *)args -> alpha + 1); alpha_i = *((FLOAT *)args -> alpha + 1);
#endif #endif




+ 1
- 2
driver/level2/symv_thread.c View File

@@ -55,7 +55,7 @@
static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){


FLOAT *a, *x, *y; FLOAT *a, *x, *y;
BLASLONG lda, incx, incy;
BLASLONG lda, incx;
BLASLONG m_from, m_to; BLASLONG m_from, m_to;


a = (FLOAT *)args -> a; a = (FLOAT *)args -> a;
@@ -64,7 +64,6 @@ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F


lda = args -> lda; lda = args -> lda;
incx = args -> ldb; incx = args -> ldb;
incy = args -> ldc;


m_from = 0; m_from = 0;
m_to = args -> m; m_to = args -> m;


+ 0
- 2
driver/level2/tbmv_L.c View File

@@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;
BLASLONG length; BLASLONG length;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1); COPY_K(n, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/tbmv_U.c View File

@@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;
BLASLONG length; BLASLONG length;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1); COPY_K(n, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/tbsv_L.c View File

@@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;
BLASLONG length; BLASLONG length;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1); COPY_K(n, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/tbsv_U.c View File

@@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;
BLASLONG length; BLASLONG length;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1); COPY_K(n, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/tpsv_L.c View File

@@ -43,12 +43,10 @@
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1); COPY_K(m, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/tpsv_U.c View File

@@ -43,12 +43,10 @@
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1); COPY_K(m, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/ztbmv_L.c View File

@@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;
BLASLONG length; BLASLONG length;
#if (TRANSA == 2) || (TRANSA == 4) #if (TRANSA == 2) || (TRANSA == 4)
@@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1); COPY_K(n, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/ztbmv_U.c View File

@@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;
BLASLONG length; BLASLONG length;
#if (TRANSA == 2) || (TRANSA == 4) #if (TRANSA == 2) || (TRANSA == 4)
@@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1); COPY_K(n, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/ztbsv_L.c View File

@@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;
BLASLONG length; BLASLONG length;
#if (TRANSA == 2) || (TRANSA == 4) #if (TRANSA == 2) || (TRANSA == 4)
@@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1); COPY_K(n, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/ztbsv_U.c View File

@@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;
BLASLONG length; BLASLONG length;
#if (TRANSA == 2) || (TRANSA == 4) #if (TRANSA == 2) || (TRANSA == 4)
@@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1); COPY_K(n, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/ztpmv_L.c View File

@@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
#ifndef UNIT #ifndef UNIT
FLOAT atemp1, atemp2, btemp1, btemp2; FLOAT atemp1, atemp2, btemp1, btemp2;
#endif #endif
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1); COPY_K(m, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/ztpmv_U.c View File

@@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
#ifndef UNIT #ifndef UNIT
FLOAT atemp1, atemp2, btemp1, btemp2; FLOAT atemp1, atemp2, btemp1, btemp2;
#endif #endif
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1); COPY_K(m, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/ztpsv_L.c View File

@@ -51,12 +51,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
#ifndef UNIT #ifndef UNIT
FLOAT ar, ai, br, bi, ratio, den; FLOAT ar, ai, br, bi, ratio, den;
#endif #endif
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1); COPY_K(m, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/ztpsv_U.c View File

@@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
#ifndef UNIT #ifndef UNIT
FLOAT ar, ai, br, bi, ratio, den; FLOAT ar, ai, br, bi, ratio, den;
#endif #endif
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1); COPY_K(m, b, incb, buffer, 1);
} }




+ 1
- 1
driver/level3/gemm_thread_mn.c View File

@@ -65,7 +65,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
blas_queue_t queue[MAX_CPU_NUMBER]; blas_queue_t queue[MAX_CPU_NUMBER];


BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1]; BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1];
BLASLONG procs, total_procs, num_cpu_m, num_cpu_n;
BLASLONG procs, num_cpu_m, num_cpu_n;


BLASLONG width, i, j; BLASLONG width, i, j;
BLASLONG divM, divN; BLASLONG divM, divN;


+ 2
- 4
driver/level3/level3_thread.c View File

@@ -230,7 +230,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
BLASLONG is, min_i, div_n; BLASLONG is, min_i, div_n;


BLASLONG i, current; BLASLONG i, current;
BLASLONG l1stride, l2size;
BLASLONG l1stride;


#ifdef TIMING #ifdef TIMING
BLASULONG rpcc_counter; BLASULONG rpcc_counter;
@@ -298,8 +298,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif #endif
) return 0; ) return 0;


l2size = GEMM_P * GEMM_Q;

#if 0 #if 0
fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n", fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n",
mypos, m_from, m_to, n_from, n_to, N_from, N_to); mypos, m_from, m_to, n_from, n_to, N_from, N_to);
@@ -706,7 +704,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
n = n_to - n_from; n = n_to - n_from;
} }


if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
if ((m < nthreads * SWITCH_RATIO) || (n < nthreads * SWITCH_RATIO)) {
GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); GEMM_LOCAL(args, range_m, range_n, sa, sb, 0);
return 0; return 0;
} }


+ 0
- 1
driver/others/memory.c View File

@@ -914,7 +914,6 @@ static volatile struct {
} memory[NUM_BUFFERS]; } memory[NUM_BUFFERS];


static int memory_initialized = 0; static int memory_initialized = 0;
static void gotoblas_memory_init(void);


/* Memory allocation routine */ /* Memory allocation routine */
/* procpos ... indicates where it comes from */ /* procpos ... indicates where it comes from */


+ 16
- 2
getarch.c View File

@@ -819,10 +819,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 "
#define LIBNAME "armv8" #define LIBNAME "armv8"
#define CORENAME "XGENE1"
#else
#define CORENAME "ARMV8"
#endif #endif


#ifdef FORCE_CORTEXA57
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "ARMV8"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA57 " \
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
#define LIBNAME "cortexa57"
#define CORENAME "CORTEXA57"
#else
#endif


#ifndef FORCE #ifndef FORCE




+ 23
- 42
interface/symm.c View File

@@ -91,6 +91,27 @@
#endif #endif
#endif #endif



#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
#define MODE (BLAS_XDOUBLE | BLAS_REAL)
#elif defined(DOUBLE)
#define MODE (BLAS_DOUBLE | BLAS_REAL)
#else
#define MODE (BLAS_SINGLE | BLAS_REAL)
#endif
#else
#ifdef XDOUBLE
#define MODE (BLAS_XDOUBLE | BLAS_COMPLEX)
#elif defined(DOUBLE)
#define MODE (BLAS_DOUBLE | BLAS_COMPLEX)
#else
#define MODE (BLAS_SINGLE | BLAS_COMPLEX)
#endif
#endif
#endif

static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
#ifndef GEMM3M #ifndef GEMM3M
#ifndef HEMM #ifndef HEMM
@@ -135,26 +156,6 @@ void NAME(char *SIDE, char *UPLO,
FLOAT *buffer; FLOAT *buffer;
FLOAT *sa, *sb; FLOAT *sa, *sb;


#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif
#endif
#endif

#if defined(SMP) && !defined(NO_AFFINITY) #if defined(SMP) && !defined(NO_AFFINITY)
int nodes; int nodes;
#endif #endif
@@ -246,26 +247,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
FLOAT *buffer; FLOAT *buffer;
FLOAT *sa, *sb; FLOAT *sa, *sb;


#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif
#endif
#endif

#if defined(SMP) && !defined(NO_AFFINITY) #if defined(SMP) && !defined(NO_AFFINITY)
int nodes; int nodes;
#endif #endif
@@ -407,7 +388,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,


args.nthreads /= nodes; args.nthreads /= nodes;


gemm_thread_mn(mode, &args, NULL, NULL,
gemm_thread_mn(MODE, &args, NULL, NULL,
symm[4 | (side << 1) | uplo ], sa, sb, nodes); symm[4 | (side << 1) | uplo ], sa, sb, nodes);


} else { } else {
@@ -419,7 +400,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,


#else #else


GEMM_THREAD(mode, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads);
GEMM_THREAD(MODE, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads);


#endif #endif




+ 1
- 2
interface/syr.c View File

@@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) {


FLOAT *buffer; FLOAT *buffer;
int trans, uplo;
int uplo;
blasint info; blasint info;
#ifdef SMP #ifdef SMP
int nthreads; int nthreads;
@@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,


PRINT_DEBUG_CNAME; PRINT_DEBUG_CNAME;


trans = -1;
uplo = -1; uplo = -1;
info = 0; info = 0;




+ 1
- 2
interface/syr2.c View File

@@ -118,7 +118,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) {


FLOAT *buffer; FLOAT *buffer;
int trans, uplo;
int uplo;
blasint info; blasint info;
#ifdef SMP #ifdef SMP
int nthreads; int nthreads;
@@ -126,7 +126,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,


PRINT_DEBUG_CNAME; PRINT_DEBUG_CNAME;


trans = -1;
uplo = -1; uplo = -1;
info = 0; info = 0;




+ 1
- 2
interface/zhemv.c View File

@@ -117,7 +117,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
FLOAT beta_i = BETA[1]; FLOAT beta_i = BETA[1];


FLOAT *buffer; FLOAT *buffer;
int trans, uplo;
int uplo;
blasint info; blasint info;
#ifdef SMP #ifdef SMP
int nthreads; int nthreads;
@@ -135,7 +135,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA


PRINT_DEBUG_CNAME; PRINT_DEBUG_CNAME;


trans = -1;
uplo = -1; uplo = -1;
info = 0; info = 0;




+ 1
- 2
interface/zher.c View File

@@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) {


FLOAT *buffer; FLOAT *buffer;
int trans, uplo;
int uplo;
blasint info; blasint info;
#ifdef SMP #ifdef SMP
int nthreads; int nthreads;
@@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,


PRINT_DEBUG_CNAME; PRINT_DEBUG_CNAME;


trans = -1;
uplo = -1; uplo = -1;
info = 0; info = 0;




+ 1
- 2
interface/zher2.c View File

@@ -121,7 +121,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
FLOAT alpha_r = ALPHA[0]; FLOAT alpha_r = ALPHA[0];
FLOAT alpha_i = ALPHA[1]; FLOAT alpha_i = ALPHA[1];
FLOAT *buffer; FLOAT *buffer;
int trans, uplo;
int uplo;
blasint info; blasint info;
#ifdef SMP #ifdef SMP
int nthreads; int nthreads;
@@ -129,7 +129,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA


PRINT_DEBUG_CNAME; PRINT_DEBUG_CNAME;


trans = -1;
uplo = -1; uplo = -1;
info = 0; info = 0;




+ 18
- 18
kernel/Makefile.L1 View File

@@ -637,49 +637,49 @@ $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@


$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@


$(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPYKERNEL) $(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@


$(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL) $(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@


$(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) $(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@


$(KDIR)zaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) $(KDIR)zaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DDOUBLE $< -o $@


$(KDIR)xaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) $(KDIR)xaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@


$(KDIR)caxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) $(KDIR)caxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -UDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -UDOUBLE $< -o $@


$(KDIR)zaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) $(KDIR)zaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -DDOUBLE $< -o $@


$(KDIR)xaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) $(KDIR)xaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@


$(KDIR)scopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)scopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOPYKERNEL) $(KDIR)scopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)scopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DC_INTERFACE $< -o $@


$(KDIR)dcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)dcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOPYKERNEL) $(KDIR)dcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)dcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DC_INTERFACE $< -o $@


$(KDIR)qcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QCOPYKERNEL) $(KDIR)qcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QCOPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DC_INTERFACE $< -o $@


$(KDIR)ccopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)ccopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOPYKERNEL) $(KDIR)ccopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)ccopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DC_INTERFACE $< -o $@


$(KDIR)zcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOPYKERNEL) $(KDIR)zcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DC_INTERFACE $< -o $@


$(KDIR)xcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XCOPYKERNEL) $(KDIR)xcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XCOPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DC_INTERFACE $< -o $@


$(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DDOTKERNEL) $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@
@@ -799,15 +799,15 @@ $(KDIR)xswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)xswap_k$(TPSUFFIX).$(PSUFFIX) : $(KE
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@


$(KDIR)saxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPBYKERNEL) $(KDIR)saxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPBYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@


$(KDIR)daxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPBYKERNEL) $(KDIR)daxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPBYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@


$(KDIR)caxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPBYKERNEL) $(KDIR)caxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPBYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -UDOUBLE $< -o $@


$(KDIR)zaxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPBYKERNEL) $(KDIR)zaxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPBYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DDOUBLE $< -o $@





+ 4
- 2
kernel/arm/amax.c View File

@@ -54,13 +54,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG ix=0; BLASLONG ix=0;
FLOAT maxf=0.0; FLOAT maxf=0.0;


if (n < 0 || inc_x < 1 ) return(maxf);
if (n <= 0 || inc_x <= 0) return(maxf);


maxf=ABS(x[0]); maxf=ABS(x[0]);
ix += inc_x;
i++;


while(i < n) while(i < n)
{ {
if( ABS(x[ix]) > ABS(maxf) )
if( ABS(x[ix]) > maxf )
{ {
maxf = ABS(x[ix]); maxf = ABS(x[ix]);
} }


+ 4
- 2
kernel/arm/amin.c View File

@@ -54,13 +54,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG ix=0; BLASLONG ix=0;
FLOAT minf=0.0; FLOAT minf=0.0;


if (n < 0 || inc_x < 1 ) return(minf);
if (n <= 0 || inc_x <= 0) return(minf);


minf=ABS(x[0]); minf=ABS(x[0]);
ix += inc_x;
i++;


while(i < n) while(i < n)
{ {
if( ABS(x[ix]) < ABS(minf) )
if( ABS(x[ix]) < minf )
{ {
minf = ABS(x[ix]); minf = ABS(x[ix]);
} }


+ 1
- 1
kernel/arm/asum.c View File

@@ -53,7 +53,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
BLASLONG i=0; BLASLONG i=0;
FLOAT sumf = 0.0; FLOAT sumf = 0.0;
if (n < 0 || inc_x < 1 ) return(sumf);
if (n <= 0 || inc_x <= 0) return(sumf);


n *= inc_x; n *= inc_x;
while(i < n) while(i < n)


+ 4
- 2
kernel/arm/iamax.c View File

@@ -55,13 +55,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT maxf=0.0; FLOAT maxf=0.0;
BLASLONG max=0; BLASLONG max=0;


if (n < 0 || inc_x < 1 ) return(max);
if (n <= 0 || inc_x <= 0) return(max);


maxf=ABS(x[0]); maxf=ABS(x[0]);
ix += inc_x;
i++;


while(i < n) while(i < n)
{ {
if( ABS(x[ix]) > ABS(maxf) )
if( ABS(x[ix]) > maxf )
{ {
max = i; max = i;
maxf = ABS(x[ix]); maxf = ABS(x[ix]);


+ 3
- 1
kernel/arm/iamin.c View File

@@ -55,9 +55,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT minf=0.0; FLOAT minf=0.0;
BLASLONG min=0; BLASLONG min=0;


if (n < 0 || inc_x < 1 ) return(min);
if (n <= 0 || inc_x <= 0) return(min);


minf=ABS(x[0]); minf=ABS(x[0]);
ix += inc_x;
i++;


while(i < n) while(i < n)
{ {


+ 3
- 1
kernel/arm/imax.c View File

@@ -47,9 +47,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT maxf=0.0; FLOAT maxf=0.0;
BLASLONG max=0; BLASLONG max=0;


if (n < 0 || inc_x < 1 ) return(max);
if (n <= 0 || inc_x <= 0) return(max);


maxf=x[0]; maxf=x[0];
ix += inc_x;
i++;


while(i < n) while(i < n)
{ {


+ 3
- 1
kernel/arm/imin.c View File

@@ -45,9 +45,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT minf=0.0; FLOAT minf=0.0;
BLASLONG min=0; BLASLONG min=0;


if (n < 0 || inc_x < 1 ) return(min);
if (n <= 0 || inc_x <= 0) return(min);


minf=x[0]; minf=x[0];
ix += inc_x;
i++;


while(i < n) while(i < n)
{ {


+ 7
- 7
kernel/arm/izamax.c View File

@@ -53,24 +53,24 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
BLASLONG i=0; BLASLONG i=0;
BLASLONG ix=0; BLASLONG ix=0;
FLOAT maxf[2];
FLOAT maxf;
BLASLONG max=0; BLASLONG max=0;
BLASLONG inc_x2; BLASLONG inc_x2;


if (n < 0 || inc_x < 1 ) return(max);
if (n <= 0 || inc_x <= 0) return(max);


inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;


maxf[0] = ABS(x[ix]);
maxf[1] = ABS(x[ix+1]);
maxf = CABS1(x,0);
ix += inc_x2;
i++;


while(i < n) while(i < n)
{ {
if( CABS1(x,ix) > CABS1(maxf,0) )
if( CABS1(x,ix) > maxf )
{ {
max = i; max = i;
maxf[0] = ABS(x[ix]);
maxf[1] = ABS(x[ix+1]);
maxf = CABS1(x,ix);
} }
ix += inc_x2; ix += inc_x2;
i++; i++;


+ 7
- 7
kernel/arm/izamin.c View File

@@ -53,24 +53,24 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
BLASLONG i=0; BLASLONG i=0;
BLASLONG ix=0; BLASLONG ix=0;
FLOAT minf[2];
FLOAT minf;
BLASLONG min=0; BLASLONG min=0;
BLASLONG inc_x2; BLASLONG inc_x2;


if (n < 0 || inc_x < 1 ) return(min);
if (n <= 0 || inc_x <= 0) return(min);


inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;


minf[0] = ABS(x[ix]);
minf[1] = ABS(x[ix+1]);
minf = CABS1(x,0);
ix += inc_x2;
i++;


while(i < n) while(i < n)
{ {
if( CABS1(x,ix) < CABS1(minf,0) )
if( CABS1(x,ix) < minf )
{ {
min = i; min = i;
minf[0] = ABS(x[ix]);
minf[1] = ABS(x[ix+1]);
minf = CABS1(x,ix);
} }
ix += inc_x2; ix += inc_x2;
i++; i++;


+ 3
- 1
kernel/arm/max.c View File

@@ -44,9 +44,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG ix=0; BLASLONG ix=0;
FLOAT maxf=0.0; FLOAT maxf=0.0;


if (n < 0 || inc_x < 1 ) return(maxf);
if (n <= 0 || inc_x <= 0) return(maxf);


maxf=x[0]; maxf=x[0];
ix += inc_x;
i++;


while(i < n) while(i < n)
{ {


+ 3
- 1
kernel/arm/min.c View File

@@ -44,9 +44,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG ix=0; BLASLONG ix=0;
FLOAT minf=0.0; FLOAT minf=0.0;


if (n < 0 || inc_x < 1 ) return(minf);
if (n <= 0 || inc_x <= 0) return(minf);


minf=x[0]; minf=x[0];
ix += inc_x;
i++;


while(i < n) while(i < n)
{ {


+ 1
- 1
kernel/arm/nrm2.c View File

@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT absxi = 0.0; FLOAT absxi = 0.0;




if (n < 0 || inc_x < 1 ) return(0.0);
if (n <= 0 || inc_x <= 0) return(0.0);
if ( n == 1 ) return( ABS(x[0]) ); if ( n == 1 ) return( ABS(x[0]) );


n *= inc_x; n *= inc_x;


+ 8
- 10
kernel/arm/zamax.c View File

@@ -53,29 +53,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
BLASLONG i=0; BLASLONG i=0;
BLASLONG ix=0; BLASLONG ix=0;
FLOAT maxf[2];
BLASLONG max=0;
FLOAT maxf;
BLASLONG inc_x2; BLASLONG inc_x2;


if (n < 0 || inc_x < 1 ) return(0.0);
if (n <= 0 || inc_x <= 0) return(0.0);


inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;


maxf[0] = ABS(x[ix]);
maxf[1] = ABS(x[ix+1]);
maxf = CABS1(x,0);
ix += inc_x2;
i++;


while(i < n) while(i < n)
{ {
if( CABS1(x,ix) > CABS1(maxf,0) )
if( CABS1(x,ix) > maxf )
{ {
max = i;
maxf[0] = ABS(x[ix]);
maxf[1] = ABS(x[ix+1]);
maxf = CABS1(x,ix);
} }
ix += inc_x2; ix += inc_x2;
i++; i++;
} }
return(CABS1(maxf,0));
return(maxf);
} }





+ 8
- 10
kernel/arm/zamin.c View File

@@ -53,29 +53,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
BLASLONG i=0; BLASLONG i=0;
BLASLONG ix=0; BLASLONG ix=0;
FLOAT minf[2];
BLASLONG min=0;
FLOAT minf;
BLASLONG inc_x2; BLASLONG inc_x2;


if (n < 0 || inc_x < 1 ) return(0.0);
if (n <= 0 || inc_x <= 0) return(0.0);


inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;


minf[0] = ABS(x[ix]);
minf[1] = ABS(x[ix+1]);
minf = CABS1(x,0);
ix += inc_x2;
i++;


while(i < n) while(i < n)
{ {
if( CABS1(x,ix) < CABS1(minf,0) )
if( CABS1(x,ix) < minf )
{ {
min = i;
minf[0] = ABS(x[ix]);
minf[1] = ABS(x[ix+1]);
minf = CABS1(x,ix);
} }
ix += inc_x2; ix += inc_x2;
i++; i++;
} }
return(CABS1(minf,0));
return(minf);
} }





+ 2
- 1
kernel/arm/zasum.c View File

@@ -55,7 +55,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG i=0; BLASLONG i=0;
FLOAT sumf = 0.0; FLOAT sumf = 0.0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n < 0 || inc_x < 1 ) return(sumf);

if (n <= 0 || inc_x <= 0) return(sumf);


inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;




+ 2
- 4
kernel/arm/zaxpby.c View File

@@ -37,11 +37,9 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL
BLASLONG i=0; BLASLONG i=0;
BLASLONG ix,iy; BLASLONG ix,iy;
FLOAT temp; FLOAT temp;
BLASLONG inc_x2, inc_y2;


BLASLONG inc_x2;
BLASLONG inc_y2;

if ( n < 0 ) return(0);
if ( n <= 0 ) return(0);


ix = 0; ix = 0;
iy = 0; iy = 0;


+ 1
- 1
kernel/arm/znrm2.c View File

@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG inc_x2; BLASLONG inc_x2;
FLOAT temp; FLOAT temp;


if (n < 0 || inc_x < 1 ) return(0.0);
if (n <= 0 || inc_x <= 0) return(0.0);


inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;




+ 91
- 0
kernel/arm64/KERNEL.CORTEXA57 View File

@@ -0,0 +1,91 @@
include $(KERNELDIR)/KERNEL.ARMV8

SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S

ISAMAXKERNEL = isamax.S
IDAMAXKERNEL = idamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S

SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S

SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S

SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S

DOTKERNEL = dot.S
DDOTKERNEL = dot.S
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S

SNRM2KERNEL = snrm2.S
DNRM2KERNEL = dnrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S

SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S

SCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S

SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S

SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S

SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S

STRMMKERNEL = strmm_kernel_4x4.S
DTRMMKERNEL = dtrmm_kernel_4x4.S
CTRMMKERNEL = ctrmm_kernel_4x4.S
ZTRMMKERNEL = ztrmm_kernel_4x4.S

SGEMMKERNEL = sgemm_kernel_4x4.S
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o

DGEMMKERNEL = dgemm_kernel_4x4.S
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o

CGEMMKERNEL = cgemm_kernel_4x4.S
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o

ZGEMMKERNEL = zgemm_kernel_4x4.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o


+ 249
- 0
kernel/arm64/amax.S View File

@@ -0,0 +1,249 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if defined(USE_MIN)
#define COND le
#else
#define COND ge
#endif

#if !defined(DOUBLE)
#define REG0 wzr
#define MAXF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define REG0 xzr
#define MAXF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
#endif

/******************************************************************************/

.macro INIT_F1
ldr MAXF, [X], #SZ
#if defined(USE_ABS)
fabs MAXF, MAXF
#endif
.endm

.macro KERNEL_F1
ldr TMPF, [X], #SZ
#if defined(USE_ABS)
fabs TMPF, TMPF
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
.endm

.macro INIT_F4
#if !defined(DOUBLE)
ld1 {v0.4s}, [X], #16
#if defined(USE_ABS)
fabs v0.4s, v0.4s
#endif
#if defined(USE_MIN)
fminv MAXF, v0.4s
#else
fmaxv MAXF, v0.4s
#endif
#else // DOUBLE
ld2 {v0.2d,v1.2d}, [X], #32
#if defined(USE_ABS)
fabs v0.2d, v0.2d
fabs v1.2d, v1.2d
#endif
#if defined(USE_MIN)
fmin v0.2d, v0.2d, v1.2d
fminp MAXF, v0.2d
#else
fmax v0.2d, v0.2d, v1.2d
fmaxp MAXF, v0.2d
#endif
#endif
.endm

.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v1.4s}, [X], #16
#if defined(USE_ABS)
fabs v1.4s, v1.4s
#endif
#if defined(USE_MIN)
fminv TMPF, v1.4s
#else
fmaxv TMPF, v1.4s
#endif
#else // DOUBLE
ld2 {v1.2d,v2.2d}, [X], #32
#if defined(USE_ABS)
fabs v1.2d, v1.2d
fabs v2.2d, v2.2d
#endif
#if defined(USE_MIN)
fmin v1.2d, v1.2d, v2.2d
fminp TMPF, v1.2d
#else
fmax v1.2d, v1.2d, v2.2d
fmaxp TMPF, v1.2d
#endif
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
.endm

.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
ld1 {v0.s}[0], [X], INC_X
#else
lsl INC_X, INC_X, #3
ld1 {v0.d}[0], [X], INC_X
#endif
#if defined(USE_ABS)
fabs MAXF, MAXF
#endif
.endm

.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
#if defined(USE_ABS)
fabs TMPF, TMPF
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble amax_kernel_zero
cmp INC_X, xzr
ble amax_kernel_zero

cmp INC_X, #1
bne amax_kernel_S_BEGIN

amax_kernel_F_BEGIN:

asr I, N, #2
cmp I, xzr
beq amax_kernel_F1_INIT

INIT_F4
subs I, I, #1
beq amax_kernel_F1

amax_kernel_F4:

KERNEL_F4

subs I, I, #1
bne amax_kernel_F4

amax_kernel_F1:

ands I, N, #3
ble amax_kernel_L999

amax_kernel_F10:

KERNEL_F1

subs I, I, #1
bne amax_kernel_F10

ret

amax_kernel_F1_INIT:

INIT_F1
subs N, N, #1
b amax_kernel_F1

amax_kernel_S_BEGIN:

INIT_S

subs N, N, #1
ble amax_kernel_L999

asr I, N, #2
cmp I, xzr
ble amax_kernel_S1

amax_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne amax_kernel_S4

amax_kernel_S1:

ands I, N, #3
ble amax_kernel_L999

amax_kernel_S10:

KERNEL_S1

subs I, I, #1
bne amax_kernel_S10

amax_kernel_L999:

ret

amax_kernel_zero:

fmov MAXF, REG0
ret

EPILOGUE

+ 194
- 0
kernel/arm64/asum.S View File

@@ -0,0 +1,194 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define REG0 wzr
#define SUMF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define REG0 xzr
#define SUMF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
#endif

/******************************************************************************/

.macro KERNEL_F1
ldr TMPF, [X], #SZ
fabs TMPF, TMPF
fadd SUMF, SUMF, TMPF
.endm

.macro KERNEL_F8
#if !defined(DOUBLE)
ld1 {v1.4s, v2.4s}, [X], #32 // Load [X3, X2, X1, X0]
fabs v1.4s, v1.4s // ABS() each value
fabs v2.4s, v2.4s // ABS() each value
fadd v1.4s, v1.4s, v2.4s // [X3+X1, X2+X0]
fadd v0.4s, v0.4s, v1.4s // [X3+X1, X2+X0]
PRFM PLDL1KEEP, [X, #1024]
#else // DOUBLE
ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X]
add X, X, #64
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d
fabs v5.2d, v5.2d

PRFM PLDL1KEEP, [X, #1024]

fadd v2.2d, v2.2d, v3.2d
fadd v4.2d, v4.2d, v5.2d
fadd v0.2d, v0.2d, v2.2d
fadd v0.2d, v0.2d, v4.2d
#endif
.endm

.macro KERNEL_F8_FINALIZE
#if !defined(DOUBLE)
ext v1.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v1.2s
faddp SUMF, v0.2s
#else
faddp SUMF, v0.2d
#endif
.endm

.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
#else
lsl INC_X, INC_X, #3
#endif
.endm

.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
fabs TMPF, TMPF
fadd SUMF, SUMF, TMPF
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

fmov SUMF, REG0
#if !defined(DOUBLE)
fmov s1, SUMF
#else
fmov d1, SUMF
#endif

cmp N, xzr
ble asum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999

cmp INC_X, #1
bne asum_kernel_S_BEGIN

asum_kernel_F_BEGIN:

asr I, N, #3
cmp I, xzr
beq asum_kernel_F1

asum_kernel_F8:

KERNEL_F8

subs I, I, #1
bne asum_kernel_F8

KERNEL_F8_FINALIZE

asum_kernel_F1:

ands I, N, #7
ble asum_kernel_L999

asum_kernel_F10:

KERNEL_F1

subs I, I, #1
bne asum_kernel_F10

asum_kernel_L999:
ret

asum_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble asum_kernel_S1

asum_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne asum_kernel_S4

asum_kernel_S1:

ands I, N, #3
ble asum_kernel_L999

asum_kernel_S10:

KERNEL_S1

subs I, I, #1
bne asum_kernel_S10

ret

EPILOGUE

+ 209
- 0
kernel/arm64/axpy.S View File

@@ -0,0 +1,209 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x3 /* X vector address */
#define INC_X x4 /* X stride */
#define Y x5 /* Y vector address */
#define INC_Y x6 /* Y stride */
#define I x1 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define DA s0 /* scale input value */
#define TMPX s1
#define TMPVX {v1.s}[0]
#define TMPY s2
#define TMPVY {v2.s}[0]
#define SZ 4
#else
#define DA d0 /* scale input value */
#define TMPX d1
#define TMPVX {v1.d}[0]
#define TMPY d2
#define TMPVY {v2.d}[0]
#define SZ 8
#endif

/******************************************************************************/

.macro KERNEL_F1

ldr TMPX, [X], #SZ
ldr TMPY, [Y]
fmadd TMPY, TMPX, DA, TMPY
str TMPY, [Y], #SZ

.endm

.macro KERNEL_F4

#if !defined(DOUBLE)
ld1 {v1.4s}, [X], #16
ld1 {v2.4s}, [Y]
fmla v2.4s, v1.4s, v0.s[0]
st1 {v2.4s}, [Y], #16
#else // DOUBLE
ld1 {v1.2d, v2.2d}, [X], #32
ld1 {v3.2d, v4.2d}, [Y]
fmla v3.2d, v1.2d, v0.d[0]
fmla v4.2d, v2.2d, v0.d[0]
st1 {v3.2d, v4.2d}, [Y], #32
#endif

.endm

.macro KERNEL_F8
#if !defined(DOUBLE)
ld1 {v1.4s, v2.4s}, [X], #32
ld1 {v3.4s, v4.4s}, [Y]

fmla v3.4s, v1.4s, v0.s[0]
fmla v4.4s, v2.4s, v0.s[0]

st1 {v3.4s, v4.4s}, [Y], #32
#else // DOUBLE
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [Y]

fmla v16.2d, v1.2d, v0.d[0]
fmla v17.2d, v2.2d, v0.d[0]
fmla v18.2d, v3.2d, v0.d[0]
fmla v19.2d, v4.2d, v0.d[0]

st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [Y], #64
#endif
PRFM PLDL1KEEP, [X, #512]
PRFM PLDL1KEEP, [Y, #512]
.endm

.macro INIT_S

#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
lsl INC_Y, INC_Y, #2
#else
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#endif

.endm

.macro KERNEL_S1

ld1 TMPVX, [X], INC_X
ldr TMPY, [Y]
fmadd TMPY, TMPX, DA, TMPY
st1 TMPVY, [Y], INC_Y

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble axpy_kernel_L999

fcmp DA, #0.0
beq axpy_kernel_L999

cmp INC_X, #1
bne axpy_kernel_S_BEGIN
cmp INC_Y, #1
bne axpy_kernel_S_BEGIN

axpy_kernel_F_BEGIN:

asr I, N, #3
cmp I, xzr
beq axpy_kernel_F1

axpy_kernel_F8:

KERNEL_F8

subs I, I, #1
bne axpy_kernel_F8

axpy_kernel_F1:

ands I, N, #7
ble axpy_kernel_L999

axpy_kernel_F10:

KERNEL_F1

subs I, I, #1
bne axpy_kernel_F10

mov w0, wzr
ret

axpy_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble axpy_kernel_S1

axpy_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne axpy_kernel_S4

axpy_kernel_S1:

ands I, N, #3
ble axpy_kernel_L999

axpy_kernel_S10:

KERNEL_S1

subs I, I, #1
bne axpy_kernel_S10

axpy_kernel_L999:

mov w0, wzr
ret

+ 170
- 0
kernel/arm64/casum.S View File

@@ -0,0 +1,170 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/* scasum kernel: computes sum(|Re(x[i])| + |Im(x[i])|) over a
   single-precision complex vector of length N and returns the result
   in s0.  This is the BLAS asum convention (sum of absolute values of
   the components), not a sum of complex moduli. */

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#define REG0 wzr
/* SUMF (s0) is the running sum; TMPF (s1) aliases lane 0 of v1. */
#define SUMF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4

/******************************************************************************/

/* Process one complex element (contiguous path).  After fabs,
   v1 = {|re|, |im|}; ext rotates v1 by 4 bytes so s2 = |im|.
   TMPF (lane 0 of v1) already holds |re|, so TMPF += s2 forms
   |re| + |im|, which is then added into SUMF. */
.macro KERNEL_F1
ld1 {v1.2s}, [X], #8
fabs v1.2s, v1.2s
ext v2.8b, v1.8b, v1.8b, #4
fadd TMPF, TMPF, s2
fadd SUMF, SUMF, TMPF
.endm

/* Process 8 complex elements (16 floats, 64 bytes): accumulate the
   absolute values into the four lanes of v0; reduced later by
   KERNEL_F8_FINALIZE. */
.macro KERNEL_F8
ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X]
add X, X, #64
fabs v1.4s, v1.4s
fabs v2.4s, v2.4s
fabs v3.4s, v3.4s
fabs v4.4s, v4.4s

PRFM PLDL1KEEP, [X, #1024] // prefetch well ahead of the load stream

fadd v1.4s, v1.4s, v2.4s
fadd v3.4s, v3.4s, v4.4s
fadd v0.4s, v0.4s, v1.4s
fadd v0.4s, v0.4s, v3.4s
.endm

/* Horizontal reduction of the four partial sums in v0 into SUMF (s0). */
.macro KERNEL_F8_FINALIZE
ext v1.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v1.2s
faddp SUMF, v0.2s
.endm

/* Convert the element stride into a byte stride (8 bytes per complex). */
.macro INIT_S
lsl INC_X, INC_X, #3
.endm

/* Process one complex element (strided path); same math as KERNEL_F1. */
.macro KERNEL_S1
ld1 {v1.2s}, [X], INC_X
fabs v1.2s, v1.2s
ext v2.8b, v1.8b, v1.8b, #4
fadd TMPF, TMPF, s2
fadd SUMF, SUMF, TMPF

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

fmov SUMF, REG0 // zero the accumulator (a write to s0 also clears the rest of v0)
fmov s1, SUMF // zero TMPF

cmp N, xzr
ble asum_kernel_L999 // N <= 0: return 0
cmp INC_X, xzr
ble asum_kernel_L999 // non-positive stride: return 0

cmp INC_X, #1
bne asum_kernel_S_BEGIN // strided path unless INC_X == 1

asum_kernel_F_BEGIN:

asr I, N, #3 // I = N / 8 vectorized iterations
cmp I, xzr
beq asum_kernel_F1

asum_kernel_F8:

KERNEL_F8

subs I, I, #1
bne asum_kernel_F8

KERNEL_F8_FINALIZE

asum_kernel_F1:

ands I, N, #7 // remaining N % 8 elements
ble asum_kernel_L999

asum_kernel_F10:

KERNEL_F1

subs I, I, #1
bne asum_kernel_F10

asum_kernel_L999:
ret // result in SUMF (s0)

asum_kernel_S_BEGIN:

INIT_S

asr I, N, #2 // I = N / 4 unrolled strided iterations
cmp I, xzr
ble asum_kernel_S1

asum_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne asum_kernel_S4

asum_kernel_S1:

ands I, N, #3 // remaining N % 4 elements
ble asum_kernel_L999

asum_kernel_S10:

KERNEL_S1

subs I, I, #1
bne asum_kernel_S10

ret // result in SUMF (s0)

EPILOGUE

+ 1667
- 0
kernel/arm64/cgemm_kernel_4x4.S
File diff suppressed because it is too large
View File


+ 232
- 0
kernel/arm64/copy.S View File

@@ -0,0 +1,232 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/* copy kernel: Y[i] = X[i] for i = 0..N-1.  Element type is selected by
   the DOUBLE / COMPLEX preprocessor flags.  Because the data is only
   moved, never operated on, several of the wide transfers below use
   .4s lane types regardless of the real element type -- only the number
   of bytes moved matters. */

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define Y x3 /* Y vector address */
#define INC_Y x4 /* Y stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define TMPF s0
#define TMPVF {v0.s}[0]
#define SZ 4
#else
#define TMPF d0
#define TMPVF {v0.d}[0]
#define SZ 8
#endif

/******************************************************************************/

/* Copy one element (contiguous path): SZ bytes for real data,
   2*SZ bytes for complex data. */
.macro KERNEL_F1

#if !defined(COMPLEX)
ldr TMPF, [X], #SZ
str TMPF, [Y], #SZ
#else
#if !defined(DOUBLE)
ld1 {v0.2s}, [X], #8
st1 {v0.2s}, [Y], #8
#else
ld1 {v0.2d}, [X], #16
st1 {v0.2d}, [Y], #16
#endif
#endif

.endm

/* Copy four elements (contiguous path): 16, 32, 32 or 64 bytes
   depending on element type; moved as 16-byte q-register chunks. */
.macro KERNEL_F4

#if !defined(COMPLEX)
#if !defined(DOUBLE)
ld1 {v0.4s}, [X], #16
st1 {v0.4s}, [Y], #16
#else // DOUBLE
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
#endif
#else // COMPLEX
#if !defined(DOUBLE)
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
#else // DOUBLE
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
ld1 {v2.4s}, [X], #16
ld1 {v3.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
st1 {v2.4s}, [Y], #16
st1 {v3.4s}, [Y], #16
#endif
#endif

.endm

/* Convert the element strides into byte strides (shift = log2 of the
   per-element byte size). */
.macro INIT_S

#if !defined(COMPLEX)
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
lsl INC_Y, INC_Y, #2
#else
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#endif
#else
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#else
lsl INC_X, INC_X, #4
lsl INC_Y, INC_Y, #4
#endif
#endif

.endm

/* Copy one element (strided path).  Real data goes through
   general-purpose registers (w10/x10); complex data through v0. */
.macro KERNEL_S1

#if !defined(COMPLEX)
#if !defined(DOUBLE)
ldr w10, [X]
add X, X, INC_X
str w10, [Y]
add Y, Y, INC_Y
#else
ldr x10, [X]
add X, X, INC_X
str x10, [Y]
add Y, Y, INC_Y
#endif
#else
#if !defined(DOUBLE)
ld1 {v0.2s}, [X]
add X, X, INC_X
st1 {v0.2s}, [Y]
add Y, Y, INC_Y
#else
ld1 {v0.2d}, [X]
add X, X, INC_X
st1 {v0.2d}, [Y]
add Y, Y, INC_Y
#endif
#endif

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble copy_kernel_L999 // N <= 0: nothing to do

cmp INC_X, #1
bne copy_kernel_S_BEGIN // strided path unless both strides are 1
cmp INC_Y, #1
bne copy_kernel_S_BEGIN

copy_kernel_F_BEGIN:

asr I, N, #2 // I = N / 4 unrolled iterations
cmp I, xzr
beq copy_kernel_F1

copy_kernel_F4:

KERNEL_F4

subs I, I, #1
bne copy_kernel_F4

copy_kernel_F1:

ands I, N, #3 // remaining N % 4 elements
ble copy_kernel_L999

copy_kernel_F10:

KERNEL_F1

subs I, I, #1
bne copy_kernel_F10

mov w0, wzr
ret

copy_kernel_S_BEGIN:

INIT_S

asr I, N, #2 // I = N / 4 unrolled strided iterations
cmp I, xzr
ble copy_kernel_S1

copy_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne copy_kernel_S4

copy_kernel_S1:

ands I, N, #3 // remaining N % 4 elements
ble copy_kernel_L999

copy_kernel_S10:

KERNEL_S1

subs I, I, #1
bne copy_kernel_S10

copy_kernel_L999:

mov w0, wzr
ret

EPILOGUE

+ 1621
- 0
kernel/arm64/ctrmm_kernel_4x4.S
File diff suppressed because it is too large
View File


+ 1338
- 0
kernel/arm64/dgemm_kernel_4x4.S
File diff suppressed because it is too large
View File


+ 169
- 0
kernel/arm64/dnrm2.S View File

@@ -0,0 +1,169 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/* dnrm2 kernel: returns sqrt(sum(x[i]^2)) for a double-precision vector
   in d0.  NOTE(review): this is a plain sum of squares with no scaling
   pass, unlike the reference netlib dnrm2 -- it can overflow/underflow
   for elements near DBL_MAX / DBL_MIN; confirm this is acceptable for
   the targets using this kernel. */

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#define TMPF d6
#define SSQ d0
#define TMPVF {v6.d}[0]
#define SZ 8

/******************************************************************************/

/* Accumulate one element's square into SSQ (contiguous path). */
.macro KERNEL_F1
ldr TMPF, [X], #SZ
fmul TMPF, TMPF, TMPF
fadd SSQ, SSQ, TMPF
.endm

/* Accumulate 8 elements' squares into the two vector accumulators
   v0 and v5 (reduced by nrm2_kernel_F8_FINALIZE). */
.macro KERNEL_F8
ld1 {v1.2d, v2.2d}, [X], #32
fmla v0.2d, v1.2d, v1.2d
fmla v5.2d, v2.2d, v2.2d
ld1 {v3.2d, v4.2d}, [X], #32
fmla v0.2d, v3.2d, v3.2d
fmla v5.2d, v4.2d, v4.2d
PRFM PLDL1KEEP, [X, #1024] // prefetch well ahead of the load stream
.endm

/* Reduce v0 and v5 into the scalar SSQ (d0). */
.macro nrm2_kernel_F8_FINALIZE
fadd v0.2d, v0.2d, v5.2d
faddp SSQ, v0.2d
.endm

/* Strided-path init: convert the stride to bytes, then consume the
   first element so that SSQ starts as x[0]^2 (the caller's loop below
   compensates with "subs N, N, #1"). */
.macro INIT_S
lsl INC_X, INC_X, #3
ld1 TMPVF, [X], INC_X
fmul SSQ, TMPF, TMPF
.endm

/* Accumulate one element's square into SSQ (strided path). */
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
fmul TMPF, TMPF, TMPF
fadd SSQ, SSQ, TMPF
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

fmov SSQ, xzr // zero the d0/v0 accumulator
fmov d5, SSQ // zero the second accumulator (v5)

cmp N, xzr
ble nrm2_kernel_zero // N <= 0: return +0.0
cmp INC_X, xzr
ble nrm2_kernel_zero // non-positive stride: return +0.0
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN

nrm2_kernel_F_BEGIN:

asr I, N, #3 // I = N / 8 vectorized iterations
cmp I, xzr
beq nrm2_kernel_F1_INIT

nrm2_kernel_F8:

KERNEL_F8

subs I, I, #1
bne nrm2_kernel_F8

nrm2_kernel_F8_FINALIZE

nrm2_kernel_F1:

ands I, N, #7 // remaining N % 8 elements
ble nrm2_kernel_L999

nrm2_kernel_F10:

KERNEL_F1

subs I, I, #1
bne nrm2_kernel_F10

b nrm2_kernel_L999

nrm2_kernel_F1_INIT:

/* N < 8: SSQ is already zeroed, no vector reduction needed. */
b nrm2_kernel_F1

nrm2_kernel_S_BEGIN:

INIT_S

subs N, N, #1 // first element already consumed by INIT_S
ble nrm2_kernel_L999

asr I, N, #2 // I = N / 4 unrolled strided iterations
cmp I, xzr
ble nrm2_kernel_S1

nrm2_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne nrm2_kernel_S4

nrm2_kernel_S1:

ands I, N, #3 // remaining N % 4 elements
ble nrm2_kernel_L999

nrm2_kernel_S10:

KERNEL_S1

subs I, I, #1
bne nrm2_kernel_S10

nrm2_kernel_L999:
fsqrt SSQ, SSQ // norm = sqrt(sum of squares), returned in d0
ret

nrm2_kernel_zero:
/* Return value is d0, zeroed in the prologue. */
ret

EPILOGUE

+ 227
- 0
kernel/arm64/dot.S View File

@@ -0,0 +1,227 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/* dot kernel: returns sum(X[i] * Y[i]).  Three build variants:
   single precision (result in s0), double precision (result in d0),
   and DSDOT (single-precision inputs, products accumulated and
   returned in double precision, d0). */

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define Y x3 /* Y vector address */
#define INC_Y x4 /* Y stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#if !defined(DSDOT)
#define REG0 wzr
#define DOTF s0
#else // DSDOT
#define REG0 xzr
#define DOTF d0
#endif
#define DOTI s1
#define TMPX s2
#define LD1VX {v2.s}[0]
#define TMPY s3
#define LD1VY {v3.s}[0]
#define TMPVY v3.s[0]
#define SZ 4
#else
#define REG0 xzr
#define DOTF d0
#define DOTI d1
#define TMPX d2
#define LD1VX {v2.d}[0]
#define TMPY d3
#define LD1VY {v3.d}[0]
#define TMPVY v3.d[0]
#define SZ 8
#endif

/******************************************************************************/

/* Accumulate one product into DOTF (contiguous path).  For DSDOT the
   single-precision product is widened to double before accumulating. */
.macro KERNEL_F1
ldr TMPX, [X], #SZ
ldr TMPY, [Y], #SZ
#if !defined(DSDOT)
fmadd DOTF, TMPX, TMPY, DOTF
#else // DSDOT
fmul TMPX, TMPX, TMPY
fcvt d2, TMPX
fadd DOTF, DOTF, d2
#endif
.endm

/* Accumulate four products into the vector accumulator v0
   (reduced later by KERNEL_F4_FINALIZE). */
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v2.4s}, [X], #16
ld1 {v3.4s}, [Y], #16
#if !defined(DSDOT)
fmla v0.4s, v2.4s, v3.4s
#else
/* DSDOT: widen the four single products to doubles (low pair via
   fcvtl on v2, high pair via the ext-rotated copy in v3). */
fmul v2.4s, v2.4s, v3.4s
ext v3.16b, v2.16b, v2.16b, #8
fcvtl v2.2d, v2.2s
fcvtl v3.2d, v3.2s
fadd v0.2d, v0.2d, v2.2d
fadd v0.2d, v0.2d, v3.2d
#endif
#else //DOUBLE
ld1 {v2.2d, v3.2d}, [X], #32
ld1 {v4.2d, v5.2d}, [Y], #32
fmul v2.2d, v2.2d, v4.2d
fmul v3.2d, v3.2d, v5.2d
fadd v0.2d, v0.2d, v2.2d
fadd v0.2d, v0.2d, v3.2d
#endif
PRFM PLDL1KEEP, [X, #1024] // prefetch ahead of both streams
PRFM PLDL1KEEP, [Y, #1024]
.endm

/* Horizontal reduction of v0 into the scalar DOTF. */
.macro KERNEL_F4_FINALIZE
#if !defined(DOUBLE)
#if !defined(DSDOT)
ext v1.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v1.2s
faddp DOTF, v0.2s
#else
faddp DOTF, v0.2d
#endif
#else //DOUBLE
faddp DOTF, v0.2d
#endif
.endm

/* Convert the element strides into byte strides. */
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
lsl INC_Y, INC_Y, #2
#else
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#endif
.endm

/* Accumulate one product into DOTF (strided path). */
.macro KERNEL_S1
ld1 LD1VX, [X], INC_X
ld1 LD1VY, [Y], INC_Y
#if !defined(DSDOT)
fmadd DOTF, TMPX, TMPY, DOTF
#else // DSDOT
fmul TMPX, TMPX, TMPY
fcvt d2, TMPX
fadd DOTF, DOTF, d2
#endif
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

fmov DOTF, REG0 // zero the accumulator (also clears the rest of v0)
#if defined(DOUBLE)
fmov d6, DOTF // NOTE(review): d6 is not read anywhere below -- looks vestigial; verify
#endif

cmp N, xzr
ble dot_kernel_L999 // N <= 0: return 0

cmp INC_X, #1
bne dot_kernel_S_BEGIN // strided path unless both strides are 1
cmp INC_Y, #1
bne dot_kernel_S_BEGIN

dot_kernel_F_BEGIN:

asr I, N, #2 // I = N / 4 vectorized iterations
cmp I, xzr
beq dot_kernel_F1

dot_kernel_F4:

KERNEL_F4

subs I, I, #1
bne dot_kernel_F4

KERNEL_F4_FINALIZE

dot_kernel_F1:

ands I, N, #3 // remaining N % 4 elements
ble dot_kernel_L999

dot_kernel_F10:

KERNEL_F1

subs I, I, #1
bne dot_kernel_F10

ret // result in DOTF

dot_kernel_S_BEGIN:

INIT_S

asr I, N, #2 // I = N / 4 unrolled strided iterations
cmp I, xzr
ble dot_kernel_S1

dot_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne dot_kernel_S4

dot_kernel_S1:

ands I, N, #3 // remaining N % 4 elements
ble dot_kernel_L999

dot_kernel_S10:

KERNEL_S1

subs I, I, #1
bne dot_kernel_S10

dot_kernel_L999:

ret // result in DOTF

EPILOGUE

+ 1398
- 0
kernel/arm64/dtrmm_kernel_4x4.S
File diff suppressed because it is too large
View File


+ 320
- 0
kernel/arm64/gemv_n.S View File

@@ -0,0 +1,320 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/* gemv_n kernel: y := y + alpha * A * x (non-transposed GEMV) for a
   column-major M x N matrix A.  The outer loop walks the N columns;
   for each column j it computes y += (alpha * x[j]) * A[:,j], using a
   vectorized inner loop when INC_Y == 1 and a scalar strided loop
   otherwise. */

#define M x0 /* Y vector length */
#define N x1 /* X vector length */
#define A x3 /* A vector address */
#define LDA x4 /* A stride */
#define X x5 /* X vector address */
#define INC_X x6 /* X stride */
#define Y x7 /* Y vector address */
#define INC_Y x2 /* Y stride */
#define A_PTR x9 /* loop A vector address */
#define Y_IPTR x10 /* loop Y vector address */
#define J x11 /* loop variable */
#define I x12 /* loop variable */
#define Y_OPTR x13 /* loop Y vector address */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define ALPHA s0
#define TEMP s1
#define TEMPV {v1.s}[0]
#define TMP1 s2
#define TMPV1 {v2.s}[0]
#define TMP2 s3
#define TMPV2 {v3.s}[0]
#define SZ 4
#define SHZ 2
#else
#define ALPHA d0
#define TEMP d1
#define TEMPV {v1.d}[0]
#define TMP1 d2
#define TMPV1 {v2.d}[0]
#define TMP2 d3
#define TMPV2 {v3.d}[0]
#define SZ 8
#define SHZ 3
#endif

/******************************************************************************/

/* Spill the callee-saved registers used below (d8-d15 per AAPCS64,
   plus x19-x28; x18 is the platform register and is saved/restored
   but never written here). */
.macro SAVE_REGS
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
.endm

.macro RESTORE_REGS
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
.endm

/* y[i:i+16] += scale * A[i:i+16, j], where the scale (alpha * x[j])
   has been broadcast into every lane of v1 by the outer loop. */
.macro KERNEL_F16
#if !defined(DOUBLE)
ld1 {v2.4s, v3.4s}, [A_PTR], #32
ld1 {v4.4s, v5.4s}, [Y_IPTR], #32
fmla v4.4s, v1.4s, v2.4s
fmla v5.4s, v1.4s, v3.4s
st1 {v4.4s, v5.4s}, [Y_OPTR], #32

ld1 {v6.4s, v7.4s}, [A_PTR], #32
ld1 {v8.4s, v9.4s}, [Y_IPTR], #32
fmla v8.4s, v1.4s, v6.4s
fmla v9.4s, v1.4s, v7.4s
st1 {v8.4s, v9.4s}, [Y_OPTR], #32
#else //DOUBLE
ld1 {v2.2d, v3.2d}, [A_PTR], #32
ld1 {v4.2d, v5.2d}, [Y_IPTR], #32
fmla v4.2d, v1.2d, v2.2d
fmla v5.2d, v1.2d, v3.2d
st1 {v4.2d, v5.2d}, [Y_OPTR], #32

ld1 {v6.2d, v7.2d}, [A_PTR], #32
ld1 {v8.2d, v9.2d}, [Y_IPTR], #32
fmla v8.2d, v1.2d, v6.2d
fmla v9.2d, v1.2d, v7.2d
st1 {v8.2d, v9.2d}, [Y_OPTR], #32

ld1 {v10.2d, v11.2d}, [A_PTR], #32
ld1 {v12.2d, v13.2d}, [Y_IPTR], #32
fmla v12.2d, v1.2d, v10.2d
fmla v13.2d, v1.2d, v11.2d
st1 {v12.2d, v13.2d}, [Y_OPTR], #32

ld1 {v14.2d, v15.2d}, [A_PTR], #32
ld1 {v16.2d, v17.2d}, [Y_IPTR], #32
fmla v16.2d, v1.2d, v14.2d
fmla v17.2d, v1.2d, v15.2d
st1 {v16.2d, v17.2d}, [Y_OPTR], #32
#endif
.endm

/* y[i:i+4] += scale * A[i:i+4, j]. */
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v2.4s}, [A_PTR], #16
ld1 {v3.4s}, [Y_IPTR], #16
fmla v3.4s, v1.4s, v2.4s
st1 {v3.4s}, [Y_OPTR], #16
#else
ld1 {v2.2d}, [A_PTR], #16
ld1 {v3.2d}, [Y_IPTR], #16
fmla v3.2d, v1.2d, v2.2d
st1 {v3.2d}, [Y_OPTR], #16

ld1 {v4.2d}, [A_PTR], #16
ld1 {v5.2d}, [Y_IPTR], #16
fmla v5.2d, v1.2d, v4.2d
st1 {v5.2d}, [Y_OPTR], #16
#endif
.endm

/* Scalar tail: y[i] += TEMP * A[i, j] (TEMP = alpha * x[j]). */
.macro KERNEL_F1

ld1 TMPV1, [A_PTR], #SZ
ld1 TMPV2, [Y_IPTR]
fmadd TMP2, TEMP, TMP1, TMP2
st1 TMPV2, [Y_IPTR], #SZ

.endm

/* Strided-path init: convert INC_Y to a byte stride. */
.macro INIT_S

lsl INC_Y, INC_Y, #SHZ

.endm

/* Strided scalar step: y[i] += TEMP * A[i, j], advancing Y by INC_Y. */
.macro KERNEL_S1

ld1 TMPV1, [A_PTR], #SZ
ld1 TMPV2, [Y_IPTR]
fmadd TMP2, TEMP, TMP1, TMP2
st1 TMPV2, [Y_IPTR], INC_Y

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

ldr INC_Y, [sp] // 9th argument is passed on the stack

SAVE_REGS

cmp N, xzr
ble gemv_n_kernel_L999 // empty matrix: nothing to do
cmp M, xzr
ble gemv_n_kernel_L999

lsl LDA, LDA, #SHZ // convert LDA and INC_X to byte strides
lsl INC_X, INC_X, #SHZ
mov J, N // J counts the remaining columns

cmp INC_Y, #1
bne gemv_n_kernel_S_BEGIN

gemv_n_kernel_F_LOOP:

/* TEMP = alpha * x[j]; broadcast it to all lanes of v1 for the
   vectorized kernels. */
ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP
#if !defined(DOUBLE)
ins v1.s[1], v1.s[0]
ins v1.s[2], v1.s[0]
ins v1.s[3], v1.s[0]
#else
ins v1.d[1], v1.d[0]
#endif
mov A_PTR, A
mov Y_IPTR, Y
mov Y_OPTR, Y

gemv_n_kernel_F32:

asr I, M, #5 // I = M / 32 iterations of 2 x KERNEL_F16
cmp I, xzr
beq gemv_n_kernel_F4

gemv_n_kernel_F320:

KERNEL_F16
KERNEL_F16

subs I, I, #1
bne gemv_n_kernel_F320

gemv_n_kernel_F4:
ands I, M, #31 // (M % 32) / 4 iterations of KERNEL_F4
asr I, I, #2
cmp I, xzr
beq gemv_n_kernel_F1

gemv_n_kernel_F40:

KERNEL_F4

subs I, I, #1
bne gemv_n_kernel_F40

gemv_n_kernel_F1:
ands I, M, #3 // remaining M % 4 rows
ble gemv_n_kernel_F_END

gemv_n_kernel_F10:

KERNEL_F1

subs I, I, #1
bne gemv_n_kernel_F10

gemv_n_kernel_F_END:

add A, A, LDA // advance to the next column of A
subs J, J, #1
bne gemv_n_kernel_F_LOOP

b gemv_n_kernel_L999

gemv_n_kernel_S_BEGIN:

INIT_S

gemv_n_kernel_S_LOOP:

ld1 TEMPV, [X], INC_X // TEMP = alpha * x[j]
fmul TEMP, ALPHA, TEMP
mov A_PTR, A
mov Y_IPTR, Y

asr I, M, #2 // I = M / 4 unrolled strided iterations
cmp I, xzr
ble gemv_n_kernel_S1

gemv_n_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne gemv_n_kernel_S4

gemv_n_kernel_S1:

ands I, M, #3 // remaining M % 4 rows
ble gemv_n_kernel_S_END

gemv_n_kernel_S10:

KERNEL_S1

subs I, I, #1
bne gemv_n_kernel_S10

gemv_n_kernel_S_END:

add A, A, LDA // advance to the next column of A
subs J, J, #1
bne gemv_n_kernel_S_LOOP

gemv_n_kernel_L999:

mov w0, wzr

RESTORE_REGS

ret

EPILOGUE

+ 347
- 0
kernel/arm64/gemv_t.S View File

@@ -0,0 +1,347 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/* gemv_t kernel: y := y + alpha * A^T * x (transposed GEMV) for a
   column-major M x N matrix A.  The outer loop walks the N columns;
   for each column j it computes the dot product dot(A[:,j], x) and
   then updates y[j] += alpha * dot. */

#define M x0 /* Y vector length */
#define N x1 /* X vector length */
#define A x3 /* A vector address */
#define LDA x4 /* A stride */
#define X x5 /* X vector address */
#define INC_X x6 /* X stride */
#define Y x7 /* Y vector address */
#define INC_Y x2 /* Y stride */
#define A_PTR x9 /* loop A vector address */
#define X_PTR x10 /* loop X vector address */
#define J x11 /* loop variable */
#define I x12 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define REG0 wzr
#define ALPHA s0
#define TEMP s1
#define TEMP1 s2
#define TEMP2 s3
#define TEMP3 s4
#define TEMPV {v1.s}[0]
#define TMP1 s2
#define TMPV1 {v2.s}[0]
#define TMP2 s3
#define TMPV2 {v3.s}[0]
#define SZ 4
#define SHZ 2
#else
#define REG0 xzr
#define ALPHA d0
#define TEMP d1
#define TEMP1 d2
#define TEMP2 d3
#define TEMP3 d4
#define TEMPV {v1.d}[0]
#define TMP1 d2
#define TMPV1 {v2.d}[0]
#define TMP2 d3
#define TMPV2 {v3.d}[0]
#define SZ 8
#define SHZ 3
#endif

/******************************************************************************/

/* Spill the callee-saved registers used below (d8-d15 per AAPCS64,
   plus x19-x28; x18 is the platform register and is saved/restored
   but never written here). */
.macro SAVE_REGS
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
.endm

.macro RESTORE_REGS
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
.endm

/* Accumulate 32 products A[i,j]*x[i] into the four vector
   accumulators v1-v4 (reduced later by KERNEL_F32_FINALIZE and
   KERNEL_F4_FINALIZE). */
.macro KERNEL_F32
#if !defined(DOUBLE)
ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64
ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64
fmla v1.4s, v5.4s, v9.4s
fmla v2.4s, v6.4s, v10.4s
fmla v3.4s, v7.4s, v11.4s
fmla v4.4s, v8.4s, v12.4s

ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64
fmla v1.4s, v13.4s, v17.4s
fmla v2.4s, v14.4s, v18.4s
fmla v3.4s, v15.4s, v19.4s
fmla v4.4s, v16.4s, v20.4s
#else
ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
fmla v1.2d, v5.2d, v9.2d
fmla v2.2d, v6.2d, v10.2d
fmla v3.2d, v7.2d, v11.2d
fmla v4.2d, v8.2d, v12.2d

ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
fmla v1.2d, v13.2d, v17.2d
fmla v2.2d, v14.2d, v18.2d
fmla v3.2d, v15.2d, v19.2d
fmla v4.2d, v16.2d, v20.2d

ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
fmla v1.2d, v5.2d, v9.2d
fmla v2.2d, v6.2d, v10.2d
fmla v3.2d, v7.2d, v11.2d
fmla v4.2d, v8.2d, v12.2d

ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
fmla v1.2d, v13.2d, v17.2d
fmla v2.2d, v14.2d, v18.2d
fmla v3.2d, v15.2d, v19.2d
fmla v4.2d, v16.2d, v20.2d
#endif
.endm

/* Fold the v2/v3/v4 partial sums into v1. */
.macro KERNEL_F32_FINALIZE
#if !defined(DOUBLE)
fadd v1.4s, v1.4s, v2.4s
fadd v1.4s, v1.4s, v3.4s
fadd v1.4s, v1.4s, v4.4s
#else
fadd v1.2d, v1.2d, v2.2d
fadd v1.2d, v1.2d, v3.2d
fadd v1.2d, v1.2d, v4.2d
#endif
.endm

/* Accumulate 4 products into v1. */
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v2.4s}, [A_PTR], #16
ld1 {v3.4s}, [X_PTR], #16
fmla v1.4s, v2.4s, v3.4s
#else
ld1 {v2.2d}, [A_PTR], #16
ld1 {v3.2d}, [X_PTR], #16
fmla v1.2d, v2.2d, v3.2d

ld1 {v4.2d}, [A_PTR], #16
ld1 {v5.2d}, [X_PTR], #16
fmla v1.2d, v4.2d, v5.2d
#endif
.endm

/* Horizontal reduction of v1 into the scalar TEMP. */
.macro KERNEL_F4_FINALIZE
#if !defined(DOUBLE)
ext v2.16b, v1.16b, v1.16b, #8
fadd v1.2s, v1.2s, v2.2s
faddp TEMP, v1.2s
#else
faddp TEMP, v1.2d
#endif
.endm

/* Scalar tail: TEMP += A[i,j] * x[i]. */
.macro KERNEL_F1
ld1 TMPV1, [A_PTR], #SZ
ld1 TMPV2, [X_PTR], #SZ
fmadd TEMP, TMP1, TMP2, TEMP
.endm

/* Strided-path init: convert INC_X to a byte stride. */
.macro INIT_S
lsl INC_X, INC_X, #SHZ
.endm

/* Strided scalar step: TEMP += A[i,j] * x[i], advancing X by INC_X. */
.macro KERNEL_S1
ld1 TMPV1, [A_PTR], #SZ
ld1 TMPV2, [X_PTR], INC_X
fmadd TEMP, TMP1, TMP2, TEMP
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

ldr INC_Y, [sp] // 9th argument is passed on the stack

SAVE_REGS

cmp N, xzr
ble gemv_t_kernel_L999 // empty matrix: nothing to do
cmp M, xzr
ble gemv_t_kernel_L999

lsl LDA, LDA, #SHZ // convert LDA and INC_Y to byte strides
lsl INC_Y, INC_Y, #SHZ
mov J, N // J counts the remaining columns

cmp INC_X, #1
bne gemv_t_kernel_S_BEGIN

gemv_t_kernel_F_LOOP:

/* Zero the scalar and vector accumulators (a scalar fmov also
   clears the rest of the corresponding v register). */
fmov TEMP, REG0
fmov TEMP1, REG0
fmov TEMP2, REG0
fmov TEMP3, REG0

mov A_PTR, A
mov X_PTR, X

gemv_t_kernel_F32:

asr I, M, #5 // I = M / 32 vectorized iterations
cmp I, xzr
beq gemv_t_kernel_F4

gemv_t_kernel_F320:

KERNEL_F32

subs I, I, #1
bne gemv_t_kernel_F320

KERNEL_F32_FINALIZE

gemv_t_kernel_F4:
ands I, M, #31 // (M % 32) / 4 iterations of KERNEL_F4
asr I, I, #2
cmp I, xzr
beq gemv_t_kernel_F1

gemv_t_kernel_F40:

KERNEL_F4

subs I, I, #1
bne gemv_t_kernel_F40

gemv_t_kernel_F1:

KERNEL_F4_FINALIZE // reduce v1 into TEMP before the scalar tail

ands I, M, #3 // remaining M % 4 rows
ble gemv_t_kernel_F_END

gemv_t_kernel_F10:

KERNEL_F1

subs I, I, #1
bne gemv_t_kernel_F10

gemv_t_kernel_F_END:

/* y[j] += alpha * dot(A[:,j], x); advance to the next column. */
ld1 TMPV1, [Y]
add A, A, LDA
subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_F_LOOP

b gemv_t_kernel_L999

gemv_t_kernel_S_BEGIN:

INIT_S

gemv_t_kernel_S_LOOP:

fmov TEMP, REG0 // zero the dot-product accumulator
mov A_PTR, A
mov X_PTR, X

asr I, M, #2 // I = M / 4 unrolled strided iterations
cmp I, xzr
ble gemv_t_kernel_S1

gemv_t_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne gemv_t_kernel_S4

gemv_t_kernel_S1:

ands I, M, #3 // remaining M % 4 rows
ble gemv_t_kernel_S_END

gemv_t_kernel_S10:

KERNEL_S1

subs I, I, #1
bne gemv_t_kernel_S10

gemv_t_kernel_S_END:

/* y[j] += alpha * dot(A[:,j], x); advance to the next column. */
ld1 TMPV1, [Y]
add A, A, LDA
subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_S_LOOP

gemv_t_kernel_L999:

RESTORE_REGS

mov w0, wzr
ret

EPILOGUE

+ 124
- 0
kernel/arm64/idamax.S View File

@@ -0,0 +1,124 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

/* BLAS idamax/idamin kernel (double precision, AArch64).
 *
 * Returns the 1-based index of the vector element with the largest
 * (or, when built with USE_MIN, smallest) absolute value.
 * Arguments: N = x0 (element count), X = x1 (vector base), INC_X = x2
 * (stride in elements).  Returns 0 when N <= 0 or INC_X <= 0.
 * Only a strided scalar loop is implemented; it is also used for INC_X == 1.
 */
#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define INDEX x3 /* index of max/min value */
#define Z x4 /* vector index */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* COND is the condition under which the CURRENT best value/index is kept.
 * On equality the old value wins, so the first occurrence of the extreme
 * value is the one reported. */
#if defined(USE_MIN)
#define COND le
#else
#define COND ge
#endif

#define MAXF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8

/******************************************************************************/

/* Consume X[0] as the initial best; Z tracks the 1-based index of the
 * element most recently processed. */
.macro INIT_S
lsl INC_X, INC_X, #3 // element stride -> byte stride (8 = sizeof(double))
ld1 {v0.d}[0], [X], INC_X
mov Z, #1
mov INDEX, Z
fabs MAXF, MAXF
.endm

/* Process one element: replace best value/index when |x| beats it.
 * NOTE(review): a NaN candidate makes the fcmp condition false and thus
 * replaces the current best -- confirm this matches the reference BLAS. */
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
add Z, Z, #1
fabs TMPF, TMPF
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND // keep old best on COND, else take candidate
csel INDEX, INDEX, Z, COND // index updated in lockstep with the value
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble iamax_kernel_zero // N <= 0 -> return 0
cmp INC_X, xzr
ble iamax_kernel_zero // INC_X <= 0 -> return 0

INIT_S

subs N, N, #1 // first element already consumed by INIT_S
ble iamax_kernel_L999

asr I, N, #2 // I = remaining / 4 (main loop is unrolled 4x)
cmp I, xzr
ble iamax_kernel_S1

iamax_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne iamax_kernel_S4

iamax_kernel_S1:

ands I, N, #3 // tail: remaining % 4 elements
ble iamax_kernel_L999

iamax_kernel_S10:

KERNEL_S1

subs I, I, #1
bne iamax_kernel_S10

iamax_kernel_L999:

mov x0, INDEX // 1-based index of the extreme element
ret

iamax_kernel_zero:

mov x0, xzr // invalid input: index 0
ret

EPILOGUE

+ 213
- 0
kernel/arm64/isamax.S View File

@@ -0,0 +1,213 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

/* BLAS isamax kernel (single precision, AArch64).
 *
 * Returns the 1-based index of the element with the largest absolute value.
 * Arguments: N = x0 (element count), X = x1 (vector base), INC_X = x2
 * (stride in elements).  Returns 0 when N <= 0 or INC_X <= 0.
 *
 * For INC_X == 1 a vectorized path processes 4 elements per iteration and
 * tracks only the best 4-element block (MAXF_Z); KERNEL_F4_FINALIZE then
 * rescans the winning block to recover the exact element index.  Other
 * strides use a scalar loop.  Ties keep the earlier index, so the first
 * occurrence of the maximum is reported.
 *
 * BUGFIX: the vector path leaves Z holding the index of the NEXT
 * unprocessed element (INIT_F4 sets Z = 5 after consuming elements 1..4),
 * while the scalar remainder loop KERNEL_F1 expects Z to hold the index of
 * the LAST processed element (it increments Z before using it).  Without
 * correction, a maximum found in the tail is reported one position too
 * high (e.g. N = 5 with the max at element 5 returned 6).  A `sub Z, Z, #1`
 * is inserted on the vector->remainder transition; the scalar-init path
 * (N < 4) branches directly to iamax_kernel_F1 and is unaffected.
 */
#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define INDEX x3 /* index of max/min value */
#define Z x4 /* vector index */
#define I x5 /* loop variable */
#define X_COPY x6 /* copy of X address */
#define MAXF_Z x7 /* 1-based index of the first element of the best block */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#define MAXF s5
#define TMPF s6
#define TMPVF {v6.s}[0]
#define SZ 4

/******************************************************************************/

/* Scalar init: consume X[0]; Z = index of last processed element (1). */
.macro INIT_F1
ldr MAXF, [X], #SZ
mov Z, #1
mov INDEX, Z
fabs MAXF, MAXF
.endm

/* Scalar step: expects Z = index of last processed element on entry.
 * Keeps the old best on ties (le), i.e. first occurrence wins. */
.macro KERNEL_F1
ldr TMPF, [X], #SZ
add Z, Z, #1
fabs TMPF, TMPF
fcmp TMPF, MAXF
fcsel MAXF, MAXF, TMPF, le
csel INDEX, INDEX, Z, le
.endm

/* Vector init: consume elements 1..4, MAXF = max |x| of the block,
 * MAXF_Z = 1 (block start), Z = 5 = index of the next unprocessed element. */
.macro INIT_F4
ld1 {v0.4s}, [X], #16
fabs v0.4s, v0.4s
fmaxv MAXF, v0.4s
mov Z, #5
mov MAXF_Z, #1
.endm

/* Vector step: Z holds the block's starting index while comparing; the
 * best block start is latched into MAXF_Z, then Z advances by 4. */
.macro KERNEL_F4
ld1 {v0.4s}, [X], #16
fabs v0.4s, v0.4s
fmaxv TMPF, v0.4s
PRFM PLDL1KEEP, [X, #512]
fcmp TMPF, MAXF
fcsel MAXF, MAXF, TMPF, le
csel MAXF_Z, MAXF_Z, Z, le
add Z, Z, #4
.endm


/* Rescan the winning 4-element block (via X_COPY) to find the exact
 * element whose |x| equals MAXF; only 3 compares are needed -- if none
 * matched, the 4th element must be the maximum. */
.macro KERNEL_F4_FINALIZE
mov INDEX, MAXF_Z
sub MAXF_Z, MAXF_Z, #1
lsl MAXF_Z, MAXF_Z, #2
add X_COPY, X_COPY, MAXF_Z
ldr TMPF, [X_COPY], #SZ
fabs TMPF, TMPF
fcmp TMPF, MAXF
beq KERNEL_F4_FINALIZE_DONE
add INDEX, INDEX, #1
ldr TMPF, [X_COPY], #SZ
fabs TMPF, TMPF
fcmp TMPF, MAXF
beq KERNEL_F4_FINALIZE_DONE
add INDEX, INDEX, #1
ldr TMPF, [X_COPY], #SZ
fabs TMPF, TMPF
fcmp TMPF, MAXF
beq KERNEL_F4_FINALIZE_DONE
add INDEX, INDEX, #1
KERNEL_F4_FINALIZE_DONE:
.endm


/* Strided init: consume X[0]; Z = index of last processed element (1). */
.macro INIT_S
lsl INC_X, INC_X, #2 // element stride -> byte stride (4 = sizeof(float))
ld1 TMPVF, [X], INC_X
mov Z, #1
mov INDEX, Z
fabs MAXF, TMPF
.endm

/* Strided step: same tie-keeping comparison as KERNEL_F1. */
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
add Z, Z, #1
fabs TMPF, TMPF
fcmp TMPF, MAXF
fcsel MAXF, MAXF, TMPF, le
csel INDEX, INDEX, Z, le
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble iamax_kernel_zero // N <= 0 -> return 0
cmp INC_X, xzr
ble iamax_kernel_zero // INC_X <= 0 -> return 0

PRFM PLDL1KEEP, [X]
mov X_COPY, X // keep base for the finalize rescan

cmp INC_X, #1
bne iamax_kernel_S_BEGIN

iamax_kernel_F_BEGIN:
asr I, N, #2 // I = N / 4 vector blocks
cmp I, xzr
beq iamax_kernel_F1_INIT // N < 4: purely scalar

INIT_F4
subs I, I, #1
beq iamax_kernel_F4_FINALIZE

iamax_kernel_F4:
KERNEL_F4
subs I, I, #1
bne iamax_kernel_F4

iamax_kernel_F4_FINALIZE:
KERNEL_F4_FINALIZE
sub Z, Z, #1 // BUGFIX: Z = last processed index, as KERNEL_F1 expects

iamax_kernel_F1:
ands I, N, #3 // scalar tail: N % 4 elements
ble iamax_kernel_L999

iamax_kernel_F10:
KERNEL_F1
subs I, I, #1
bne iamax_kernel_F10
b iamax_kernel_L999

iamax_kernel_F1_INIT:
INIT_F1
subs N, N, #1 // first element consumed; Z = 1 is already correct
b iamax_kernel_F1

iamax_kernel_S_BEGIN:
INIT_S

subs N, N, #1
ble iamax_kernel_L999

asr I, N, #2 // 4x-unrolled strided loop
cmp I, xzr
ble iamax_kernel_S1

iamax_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne iamax_kernel_S4

iamax_kernel_S1:
ands I, N, #3
ble iamax_kernel_L999

iamax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S10

iamax_kernel_L999:
mov x0, INDEX // 1-based index of the maximum |x|
ret

iamax_kernel_zero:
mov x0, xzr // invalid input: index 0
ret

EPILOGUE

+ 151
- 0
kernel/arm64/izamax.S View File

@@ -0,0 +1,151 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

/* BLAS icamax/izamax (and *amin) kernel (complex, AArch64).
 *
 * Returns the 1-based index of the complex element with the largest
 * (USE_MIN: smallest) magnitude, where magnitude is |Re| + |Im| as in the
 * reference BLAS.  Arguments: N = x0 (complex element count), X = x1,
 * INC_X = x2 (stride in complex elements).  Returns 0 when N <= 0 or
 * INC_X <= 0.  Only a strided scalar loop is implemented; it is also used
 * for INC_X == 1.
 */
#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define INDEX x3 /* index of max/min value */
#define Z x4 /* vector index */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* COND is the condition under which the CURRENT best value/index is kept.
 * On equality the old value wins, so the first occurrence is reported. */
#if defined(USE_MIN)
#define COND le
#else
#define COND ge
#endif

#if !defined(DOUBLE)
#define MAXF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define MAXF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
#endif

/******************************************************************************/

/* Consume X[0] = (re, im) as the initial best: MAXF = |re| + |im|.
 * Z tracks the 1-based index of the element most recently processed. */
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3 // byte stride: 8 = sizeof(complex float)
ld1 {v0.2s}, [X], INC_X
mov Z, #1
mov INDEX, Z
fabs v0.2s, v0.2s
ext v1.8b, v0.8b, v0.8b, #4 // swap lanes so |re| and |im| can be added
fadd MAXF, s0, s1
#else
lsl INC_X, INC_X, #4 // byte stride: 16 = sizeof(complex double)
ld1 {v0.2d}, [X], INC_X
mov Z, #1
mov INDEX, Z
fabs v0.2d, v0.2d
faddp MAXF, v0.2d // pairwise add -> |re| + |im|
#endif
.endm

/* Process one complex element: TMPF = |re| + |im|, then replace the best
 * value/index when it beats the current one (ties keep the old index). */
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], INC_X
add Z, Z, #1
fabs v1.2s, v1.2s
ext v2.8b, v1.8b, v1.8b, #4
fadd TMPF, s1, s2
#else
ld1 {v1.2d}, [X], INC_X
add Z, Z, #1
fabs v1.2d, v1.2d
faddp TMPF, v1.2d
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND // keep old best on COND, else take candidate
csel INDEX, INDEX, Z, COND // index updated in lockstep with the value
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble iamax_kernel_zero // N <= 0 -> return 0
cmp INC_X, xzr
ble iamax_kernel_zero // INC_X <= 0 -> return 0

INIT_S

subs N, N, #1 // first element already consumed by INIT_S
ble iamax_kernel_L999

asr I, N, #2 // I = remaining / 4 (main loop is unrolled 4x)
cmp I, xzr
ble iamax_kernel_S1

iamax_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne iamax_kernel_S4

iamax_kernel_S1:

ands I, N, #3 // tail: remaining % 4 elements
ble iamax_kernel_L999

iamax_kernel_S10:

KERNEL_S1

subs I, I, #1
bne iamax_kernel_S10

iamax_kernel_L999:

mov x0, INDEX // 1-based index of the extreme element
ret

iamax_kernel_zero:

mov x0, xzr // invalid input: index 0
ret

EPILOGUE

+ 243
- 0
kernel/arm64/rot.S View File

@@ -0,0 +1,243 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

/* BLAS srot/drot kernel (AArch64): apply a Givens plane rotation.
 *
 * For each i:  x[i] = C*x[i] + S*y[i];  y[i] = C*y[i] - S*x[i]
 * Arguments: N = x0, X = x1, INC_X = x2, Y = x3, INC_Y = x4; the scalars
 * C and S arrive in s0/s1 (d0/d1 for DOUBLE).  A contiguous 4x-vectorized
 * path is used when both strides are 1, otherwise a scalar strided loop.
 * NOTE(review): negative strides walk backwards from the given base
 * address rather than offsetting to the far end -- verify against the
 * reference BLAS handling of inc < 0.
 */
#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define Y x3 /* Y vector address */
#define INC_Y x4 /* Y stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define C s0 /* scale input value */
#define S s1 /* scale input value */
#else
#define C d0 /* scale input value */
#define S d1 /* scale input value */
#endif

/******************************************************************************/

/* Duplicate C into both lanes of v0 so one fmul covers x and y. */
.macro INIT
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // [C, C]
#else
ins v0.d[1], v0.d[0] // [C, C]
#endif
.endm

/* Build v1 = [-S, S] for the paired scalar kernel below. */
.macro INIT_F1
#if !defined(DOUBLE)
fneg s2, S
ins v1.s[1], v2.s[0] // [-S, S]
#else
fneg d2, S
ins v1.d[1], v2.d[0] // [-S, S]
#endif
.endm

/* One element pair: pack (x, y) into one vector so both rotation outputs
 * are produced by a single fmul/fmla with v0 = [C,C] and v1 = [-S,S]. */
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1 {v2.s}[0], [X]
ld1 {v2.s}[1], [Y] // [Y, X]
ext v3.8b, v2.8b, v2.8b, #4 // [X, Y]
fmul v4.2s, v2.2s, v0.2s // [C*Y, C*X]
fmla v4.2s, v3.2s, v1.2s // [C*Y - S*X, C*X + S*Y]
st1 {v4.s}[0], [X], #4
st1 {v4.s}[1], [Y], #4
#else
ld1 {v2.d}[0], [X]
ld1 {v2.d}[1], [Y] // [Y, X]
ext v3.16b, v2.16b, v2.16b, #8 // [X, Y]
fmul v4.2d, v2.2d, v0.2d // [C*Y, C*X]
fmla v4.2d, v3.2d, v1.2d // [C*Y - S*X, C*X + S*Y]
st1 {v4.d}[0], [X], #8
st1 {v4.d}[1], [Y], #8
#endif
.endm

/* Broadcast C and S across full vectors for the 4-per-iteration path. */
.macro KERNEL_INIT_F4
#if !defined(DOUBLE)
ins v0.d[1], v0.d[0] // [C, C, C, C]
ins v1.s[1], v1.s[0]
ins v1.d[1], v1.d[0] // [S, S, S, S]
#else
ins v1.d[1], v1.d[0] // [S, S]
#endif
.endm

/* Four element pairs per iteration, contiguous x and y (stride 1). */
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v2.4s}, [X]
fmul v4.4s, v0.4s, v2.4s // C*X3, C*X2, C*X1, C*X0
ld1 {v3.4s}, [Y]
fmla v4.4s, v1.4s, v3.4s // C*X3+S*Y3, ..., C*X0+S*Y0
st1 {v4.4s}, [X], #16
fmul v5.4s, v0.4s, v3.4s // C*Y3, C*Y2, C*Y1, C*Y0
fmls v5.4s, v1.4s, v2.4s // C*Y3-S*X3, ..., C*Y0-S*X0
st1 {v5.4s}, [Y], #16
#else // DOUBLE
ld1 {v2.2d, v3.2d}, [X]
fmul v6.2d, v0.2d, v2.2d // C*X1, C*X0
fmul v7.2d, v0.2d, v3.2d // C*X3, C*X2
ld1 {v4.2d, v5.2d}, [Y]
fmla v6.2d, v1.2d, v4.2d // C*X1+S*Y1, C*X0+S*Y0
fmla v7.2d, v1.2d, v5.2d // C*X3+S*Y3, C*X2+S*Y2
st1 {v6.2d, v7.2d}, [X], #32
fmul v16.2d, v0.2d, v4.2d // C*Y1, C*Y0
fmul v17.2d, v0.2d, v5.2d // C*Y3, C*Y2
fmls v16.2d, v1.2d, v2.2d // C*Y1-S*X1, C*Y0-S*X0
fmls v17.2d, v1.2d, v3.2d // C*Y3-S*X3, C*Y2-S*X2
st1 {v16.2d, v17.2d}, [Y], #32
PRFM PLDL1KEEP, [X, #512]
PRFM PLDL1KEEP, [Y, #512]
#endif
.endm

/* Convert both strides from elements to bytes for the strided loop. */
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
lsl INC_Y, INC_Y, #2
#else
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#endif
.endm

/* Same paired rotation as KERNEL_F1, advancing by byte strides. */
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v2.s}[0], [X]
ld1 {v2.s}[1], [Y] // [Y, X]
ext v3.8b, v2.8b, v2.8b, #4 // [X, Y]
fmul v4.2s, v2.2s, v0.2s // [C*Y, C*X]
fmla v4.2s, v3.2s, v1.2s // [C*Y - S*X, C*X + S*Y]
st1 {v4.s}[0], [X], INC_X
st1 {v4.s}[1], [Y], INC_Y
#else
ld1 {v2.d}[0], [X]
ld1 {v2.d}[1], [Y] // [Y, X]
ext v3.16b, v2.16b, v2.16b, #8 // [X, Y]
fmul v4.2d, v2.2d, v0.2d // [C*Y, C*X]
fmla v4.2d, v3.2d, v1.2d // [C*Y - S*X, C*X + S*Y]
st1 {v4.d}[0], [X], INC_X
st1 {v4.d}[1], [Y], INC_Y
#endif

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble rot_kernel_L999 // nothing to do for N <= 0

INIT

cmp INC_X, #1
bne rot_kernel_S_BEGIN
cmp INC_Y, #1
bne rot_kernel_S_BEGIN

rot_kernel_F_BEGIN: // contiguous path (both strides == 1)

asr I, N, #2 // I = N / 4 vector iterations
cmp I, xzr
beq rot_kernel_F1

KERNEL_INIT_F4

rot_kernel_F4:

KERNEL_F4

subs I, I, #1
bne rot_kernel_F4

rot_kernel_F1:

ands I, N, #3 // scalar tail: N % 4 pairs
ble rot_kernel_L999

INIT_F1

rot_kernel_F10:

KERNEL_F1

subs I, I, #1
bne rot_kernel_F10

mov w0, wzr
ret

rot_kernel_S_BEGIN: // strided path

INIT_S
INIT_F1


asr I, N, #2 // 4x-unrolled strided loop
cmp I, xzr
ble rot_kernel_S1

rot_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne rot_kernel_S4

rot_kernel_S1:

ands I, N, #3
ble rot_kernel_L999


rot_kernel_S10:

KERNEL_S1

subs I, I, #1
bne rot_kernel_S10

rot_kernel_L999:

mov w0, wzr
ret

+ 253
- 0
kernel/arm64/scal.S View File

@@ -0,0 +1,253 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

/* BLAS sscal/dscal kernel (AArch64):  x[i] *= alpha  for i = 0..N-1.
 *
 * NOTE(review): the register assignments (N = x0, X = x3, INC_X = x4, with
 * alpha in s0/d0) presumably follow the OpenBLAS scal kernel calling
 * convention where x1/x2 carry unused dummy arguments -- confirm against
 * the kernel interface.  alpha == 0 takes a fast path that simply stores
 * zeros.  There is no guard for INC_X <= 0; a non-positive stride walks
 * in place or backwards from the base address.
 */
#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x3 /* X vector address */
#define X_COPY x5 /* X vector address */
#define INC_X x4 /* X stride */
#define I x1 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define DA s0 /* scale input value */
#define DAV {v0.s}[0]
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define DA d0 /* scale input value */
#define DAV {v0.d}[0]
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
#endif

/******************************************************************************/

/* Scale one contiguous element in place. */
.macro KERNEL_F1

ldr TMPF, [X]
fmul TMPF, TMPF, DA
str TMPF, [X], #SZ

.endm

/* Broadcast alpha across all lanes of v0 for the vectorized loop. */
.macro KERNEL_INIT_F8

#if !defined(DOUBLE)
ins v0.s[1], v0.s[0]
ins v0.s[2], v0.s[0]
ins v0.s[3], v0.s[0]
#else
ins v0.d[1], v0.d[0]
#endif

.endm

/* Scale 8 contiguous elements per iteration (32B single / 64B double). */
.macro KERNEL_F8
#if !defined(DOUBLE)
ld1 {v1.4s, v2.4s}, [X]
fmul v1.4s, v1.4s, v0.4s
fmul v2.4s, v2.4s, v0.4s
st1 {v1.4s, v2.4s}, [X], #32
#else // DOUBLE
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X]
fmul v1.2d, v1.2d, v0.2d
fmul v2.2d, v2.2d, v0.2d
fmul v3.2d, v3.2d, v0.2d
fmul v4.2d, v4.2d, v0.2d
st1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64
#endif
PRFM PLDL1KEEP, [X, #1024]
.endm

/* Convert the stride from elements to bytes for the strided loops. */
.macro INIT_S

#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
#else
lsl INC_X, INC_X, #3
#endif

.endm

/* Scale one strided element in place (store to the load address, then
 * advance X by the byte stride via the post-indexed st1). */
.macro KERNEL_S1
ldr TMPF, [X]
fmul TMPF, TMPF, DA
st1 TMPVF, [X], INC_X
.endm

/* 4 strided elements per iteration; X reads run ahead of X_COPY writes,
 * decoupling the load and store address streams. */
.macro KERNEL_S4
#if !defined(DOUBLE)
ldr s1, [X]
add X, X, INC_X
fmul s1, s1, s0
str s1, [X_COPY]
add X_COPY, X_COPY, INC_X

ldr s2, [X]
add X, X, INC_X
fmul s2, s2, s0
str s2, [X_COPY]
add X_COPY, X_COPY, INC_X

ldr s3, [X]
add X, X, INC_X
fmul s3, s3, s0
str s3, [X_COPY]
add X_COPY, X_COPY, INC_X

ldr s4, [X]
add X, X, INC_X
fmul s4, s4, s0
str s4, [X_COPY]
add X_COPY, X_COPY, INC_X
#else
ldr d1, [X]
add X, X, INC_X
fmul d1, d1, d0
str d1, [X_COPY]
add X_COPY, X_COPY, INC_X

ldr d2, [X]
add X, X, INC_X
fmul d2, d2, d0
str d2, [X_COPY]
add X_COPY, X_COPY, INC_X

ldr d3, [X]
add X, X, INC_X
fmul d3, d3, d0
str d3, [X_COPY]
add X_COPY, X_COPY, INC_X

ldr d4, [X]
add X, X, INC_X
fmul d4, d4, d0
str d4, [X_COPY]
add X_COPY, X_COPY, INC_X
#endif
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble scal_kernel_L999 // nothing to do for N <= 0

fcmp DA, #0.0
beq scal_kernel_zero // alpha == 0: just store zeros

cmp INC_X, #1
bne scal_kernel_S_BEGIN

scal_kernel_F_BEGIN: // contiguous path (stride == 1)

asr I, N, #3 // I = N / 8 vector iterations
cmp I, xzr
beq scal_kernel_F1

KERNEL_INIT_F8

scal_kernel_F8:

KERNEL_F8

subs I, I, #1
bne scal_kernel_F8

scal_kernel_F1:

ands I, N, #7 // scalar tail: N % 8 elements
ble scal_kernel_L999

scal_kernel_F10:

KERNEL_F1

subs I, I, #1
bne scal_kernel_F10

mov w0, wzr
ret

scal_kernel_S_BEGIN: // strided path

INIT_S
mov X_COPY, X

asr I, N, #2 // 4x-unrolled strided loop
cmp I, xzr
ble scal_kernel_S1

scal_kernel_S4:

KERNEL_S4

subs I, I, #1
bne scal_kernel_S4

scal_kernel_S1:

ands I, N, #3
ble scal_kernel_L999

scal_kernel_S10:

KERNEL_S1

subs I, I, #1
bne scal_kernel_S10

scal_kernel_L999:

mov w0, wzr
ret

scal_kernel_zero: // alpha == 0: store v0 lane 0 (== 0.0) N times

INIT_S

scal_kernel_Z1:

st1 DAV, [X], INC_X
subs N, N, #1
bne scal_kernel_Z1

mov w0, wzr
ret

EPILOGUE

+ 807
- 571
kernel/arm64/sgemm_kernel_4x4.S
File diff suppressed because it is too large
View File


+ 178
- 0
kernel/arm64/snrm2.S View File

@@ -0,0 +1,178 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

/* BLAS snrm2 kernel (single precision, AArch64):
 * returns sqrt(sum(x[i]^2)), the Euclidean norm of X.
 *
 * Arguments: N = x0, X = x1, INC_X = x2 (stride in elements); result in s0.
 * Returns 0 when N <= 0 or INC_X <= 0.
 * NOTE(review): this is a naive sum of squares with no scaling, unlike the
 * reference snrm2 which rescales to avoid overflow/underflow of x^2 for
 * extreme inputs -- confirm this precision trade-off is intended.
 */
#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#define TMPF s6
#define SSQ s0
#define TMPVF {v6.s}[0]
#define SZ 4

/******************************************************************************/

/* Seed the accumulator with X[0]^2 (contiguous path). */
.macro INIT_F1
ldr TMPF, [X], #SZ
fmul SSQ, TMPF, TMPF
.endm

/* Accumulate one contiguous element: SSQ += x^2. */
.macro KERNEL_F1
ldr TMPF, [X], #SZ
fmul TMPF, TMPF, TMPF
fadd SSQ, SSQ, TMPF
.endm

/* Seed the accumulator with the sum of squares of elements 0..3:
 * square all four lanes, then reduce via a lane rotate + two adds. */
.macro INIT_F4
ld1 {v1.4s}, [X], #16
fmul v1.4s, v1.4s, v1.4s
ext v2.16b, v1.16b, v1.16b, #8
fadd v2.2s, v1.2s, v2.2s
faddp SSQ, v2.2s
.endm

/* Accumulate 4 contiguous elements: SSQ += x0^2 + x1^2 + x2^2 + x3^2. */
.macro KERNEL_F4
ld1 {v1.4s}, [X], #16
fmul v1.4s, v1.4s, v1.4s
ext v2.16b, v1.16b, v1.16b, #8
fadd v2.2s, v1.2s, v2.2s
faddp TMPF, v2.2s
fadd SSQ, SSQ, TMPF
.endm

/* Strided init: byte stride, then seed SSQ with X[0]^2. */
.macro INIT_S
lsl INC_X, INC_X, #2 // element stride -> byte stride (4 = sizeof(float))
ld1 TMPVF, [X], INC_X
fmul SSQ, TMPF, TMPF
.endm

/* Accumulate one strided element: SSQ += x^2. */
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
fmul TMPF, TMPF, TMPF
fadd SSQ, SSQ, TMPF
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble nrm2_kernel_zero // N <= 0 -> return 0
cmp INC_X, xzr
ble nrm2_kernel_zero // INC_X <= 0 -> return 0
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN

nrm2_kernel_F_BEGIN: // contiguous path

asr I, N, #2 // I = N / 4 vector blocks
cmp I, xzr
beq nrm2_kernel_F1_INIT // N < 4: purely scalar

INIT_F4
subs I, I, #1
beq nrm2_kernel_F1

nrm2_kernel_F4:

KERNEL_F4

subs I, I, #1
bne nrm2_kernel_F4

nrm2_kernel_F1:

ands I, N, #3 // scalar tail: N % 4 elements
ble nrm2_kernel_L999

nrm2_kernel_F10:

KERNEL_F1

subs I, I, #1
bne nrm2_kernel_F10

b nrm2_kernel_L999

nrm2_kernel_F1_INIT:
INIT_F1
subs N, N, #1 // first element consumed by INIT_F1
b nrm2_kernel_F1

nrm2_kernel_S_BEGIN: // strided path

INIT_S

subs N, N, #1
ble nrm2_kernel_L999

asr I, N, #2 // 4x-unrolled strided loop
cmp I, xzr
ble nrm2_kernel_S1

nrm2_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne nrm2_kernel_S4

nrm2_kernel_S1:

ands I, N, #3
ble nrm2_kernel_L999

nrm2_kernel_S10:

KERNEL_S1

subs I, I, #1
bne nrm2_kernel_S10

nrm2_kernel_L999:
fsqrt SSQ, SSQ // norm = sqrt(sum of squares), returned in s0
ret

nrm2_kernel_zero:
fmov SSQ, wzr // invalid input: return 0.0

ret

EPILOGUE

+ 1405
- 0
kernel/arm64/strmm_kernel_4x4.S
File diff suppressed because it is too large
View File


+ 266
- 0
kernel/arm64/swap.S View File

@@ -0,0 +1,266 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

/* BLAS sswap/dswap/cswap/zswap kernel (AArch64): exchange vectors X and Y.
 *
 * NOTE(review): the register assignments (N = x0, X = x3, INC_X = x4,
 * Y = x5, INC_Y = x6) presumably follow the OpenBLAS swap kernel calling
 * convention where x1/x2 carry unused dummy arguments -- confirm against
 * the kernel interface.  A bulk 8-elements-per-iteration path is used when
 * both strides are 1; otherwise a scalar strided loop.  Since this is a
 * pure byte exchange, the vector loads use a .4s arrangement regardless of
 * the actual element type -- only the byte count matters.
 * NOTE(review): negative strides walk backwards from the given base
 * address rather than offsetting to the far end -- verify against the
 * reference BLAS handling of inc < 0.
 */
#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x3 /* X vector address */
#define INC_X x4 /* X stride */
#define Y x5 /* Y vector address */
#define INC_Y x6 /* Y stride */
#define I x1 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define TMP0 s0
#define TMPV0 {v0.s}[0]
#define TMP1 s1
#define TMPV1 {v1.s}[0]
#define SZ 4
#else
#define TMP0 d0
#define TMPV0 {v0.d}[0]
#define TMP1 d1
#define TMPV1 {v1.d}[0]
#define SZ 8
#endif

/******************************************************************************/

/* Swap one element (one complex pair when COMPLEX) between X and Y. */
.macro KERNEL_F1

#if !defined(COMPLEX)
ldr TMP0, [X]
ldr TMP1, [Y]
str TMP0, [Y], #SZ
str TMP1, [X], #SZ
#else
#if !defined(DOUBLE)
ld1 {v0.2s}, [X]
ld1 {v1.2s}, [Y]
st1 {v0.2s}, [Y], #8
st1 {v1.2s}, [X], #8
#else
ld1 {v0.2d}, [X]
ld1 {v1.2d}, [Y]
st1 {v0.2d}, [Y], #16
st1 {v1.2d}, [X], #16
#endif
#endif

.endm

/* Swap 8 elements between contiguous X and Y.  Each ld1/st1 group moves
 * 32 bytes; the group is repeated until 8 * element-size bytes are done
 * (1x/2x/2x/4x for s/d/c/z respectively). */
.macro KERNEL_F8

#if !defined(COMPLEX)
#if !defined(DOUBLE)
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
#else // DOUBLE
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
#endif
#else // COMPLEX
#if !defined(DOUBLE)
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
#else // DOUBLE
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
#endif
#endif

.endm

/* Convert both strides from elements to bytes (complex doubles the size). */
.macro INIT_S

#if !defined(COMPLEX)
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
lsl INC_Y, INC_Y, #2
#else
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#endif
#else
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#else
lsl INC_X, INC_X, #4
lsl INC_Y, INC_Y, #4
#endif
#endif

.endm

/* Swap one strided element via integer registers (w10/w11 for 4 bytes,
 * x10..x13 for 8/16 bytes), then advance both pointers. */
.macro KERNEL_S1

#if !defined(COMPLEX)
#if !defined(DOUBLE)
ldr w10, [X]
ldr w11, [Y]
str w10, [Y]
str w11, [X]
#else
ldr x10, [X]
ldr x11, [Y]
str x10, [Y]
str x11, [X]
#endif
#else
#if !defined(DOUBLE)
ldr x10, [X]
ldr x11, [Y]
str x10, [Y]
str x11, [X]
#else
ldr x10, [X]
ldr x11, [Y]
str x10, [Y]
str x11, [X]

ldr x12, [X, #8]
ldr x13, [Y, #8]
str x12, [Y, #8]
str x13, [X, #8]
#endif
#endif
add Y, Y, INC_Y
add X, X, INC_X
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble swap_kernel_L999 // nothing to do for N <= 0

cmp INC_X, #1
bne swap_kernel_S_BEGIN
cmp INC_Y, #1
bne swap_kernel_S_BEGIN

swap_kernel_F_BEGIN: // contiguous path (both strides == 1)

asr I, N, #3 // I = N / 8 bulk iterations
cmp I, xzr
beq swap_kernel_F1

swap_kernel_F8:

KERNEL_F8

subs I, I, #1
bne swap_kernel_F8

swap_kernel_F1:

ands I, N, #7 // scalar tail: N % 8 elements
ble swap_kernel_L999

swap_kernel_F10:

KERNEL_F1

subs I, I, #1
bne swap_kernel_F10

b swap_kernel_L999


swap_kernel_S_BEGIN: // strided path

INIT_S

asr I, N, #2 // 4x-unrolled strided loop
cmp I, xzr
ble swap_kernel_S1

swap_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne swap_kernel_S4

swap_kernel_S1:

ands I, N, #3
ble swap_kernel_L999

swap_kernel_S10:

KERNEL_S1

subs I, I, #1
bne swap_kernel_S10

swap_kernel_L999:

mov w0, wzr
ret

EPILOGUE

+ 273
- 0
kernel/arm64/zamax.S View File

@@ -0,0 +1,273 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* fcsel condition used to keep the running extreme: keep MAXF when it is
   less-or-equal (min search) / greater-or-equal (max search) than the
   candidate TMPF. */
#if defined(USE_MIN)
#define COND le
#else
#define COND ge
#endif

/* Precision-dependent aliases: single precision uses s-registers / 4-byte
   elements, double precision uses d-registers / 8-byte elements. */
#if !defined(DOUBLE)
#define REG0 wzr
#define MAXF s0 /* running extreme; also the return value */
#define TMPF s1 /* per-iteration candidate |re|+|im| */
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define REG0 xzr
#define MAXF d0 /* running extreme; also the return value */
#define TMPF d1 /* per-iteration candidate |re|+|im| */
#define TMPVF {v1.d}[0]
#define SZ 8
#endif

/******************************************************************************/

/* Load the first complex element (unit stride) and seed the running
   extreme: MAXF = |re| + |im|.  Advances X by one element. */
.macro INIT_F1
#if !defined(DOUBLE)
ld1 {v0.2s}, [X], #8
fabs v0.2s, v0.2s
ext v1.8b, v0.8b, v0.8b, #4 // v1 = [|re|, |im|] (rotated copy)
fadd MAXF, s0, s1 // MAXF = |re| + |im|
#else
ld1 {v0.2d}, [X], #16
fabs v0.2d, v0.2d
faddp MAXF, v0.2d // pairwise add: MAXF = |re| + |im|
#endif
.endm

/* Process one complex element (unit stride): candidate TMPF = |re| + |im|,
   then merge into MAXF via fcmp/fcsel using COND. */
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], #8
fabs v1.2s, v1.2s
ext v2.8b, v1.8b, v1.8b, #4 // v2 = [|re|, |im|] (rotated copy)
fadd TMPF, s1, s2 // TMPF = |re| + |im|
#else
ld1 {v1.2d}, [X], #16
fabs v1.2d, v1.2d
faddp TMPF, v1.2d // TMPF = |re| + |im|
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND // keep the better of MAXF / TMPF
.endm

/* Seed the running extreme from the first 4 complex elements (unit stride):
   de-interleave re/im, take |re|+|im| per element, then reduce across the
   4 candidates into MAXF. Advances X by 4 elements. */
.macro INIT_F4
#if !defined(DOUBLE)
ld2 {v0.4s,v1.4s}, [X], #32 // ld2 de-interleaves: v0 = reals, v1 = imags
fabs v0.4s, v0.4s // [X6, X4, X2, X0]
fabs v1.4s, v1.4s // [X7, X5, X3, X1]
fadd v0.4s, v0.4s, v1.4s // [X7+X6, X5+X4, X3+X2, X1+X0]
#if defined(USE_MIN)
fminv MAXF, v0.4s // across-lanes min of the 4 candidates
#else
fmaxv MAXF, v0.4s // across-lanes max of the 4 candidates
#endif
#else // DOUBLE
ld4 {v0.2d,v1.2d,v2.2d,v3.2d}, [X], #64 // ld4 de-interleaves stride 4
fabs v0.2d, v0.2d
fabs v1.2d, v1.2d
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fadd v0.2d, v0.2d, v1.2d // |re|+|im| for elements 0, 2
fadd v2.2d, v2.2d, v3.2d // |re|+|im| for elements 1, 3
#if defined(USE_MIN)
fmin v0.2d, v0.2d, v2.2d
fminp MAXF, v0.2d // pairwise min -> scalar seed
#else
fmax v0.2d, v0.2d, v2.2d
fmaxp MAXF, v0.2d // pairwise max -> scalar seed
#endif
#endif
.endm

/* Process 4 complex elements (unit stride): compute |re|+|im| per element,
   reduce the block's extreme into the candidate TMPF, then merge TMPF into
   the running extreme MAXF with fcmp/fcsel.
   BUG FIX: the DOUBLE path previously reduced with
   "fminp/fmaxp MAXF, v1.2d", clobbering the running extreme and then
   comparing it against a stale TMPF — wrong results for double zamax/zamin.
   The reduction must target TMPF, as the single-precision path already does. */
.macro KERNEL_F4
#if !defined(DOUBLE)
ld2 {v1.4s,v2.4s}, [X], #32 // ld2 de-interleaves: v1 = reals, v2 = imags
fabs v1.4s, v1.4s // [X6, X4, X2, X0]
fabs v2.4s, v2.4s // [X7, X5, X3, X1]
fadd v1.4s, v1.4s, v2.4s // [X7+X6, X5+X4, X3+X2, X1+X0]
#if defined(USE_MIN)
fminv TMPF, v1.4s // block candidate -> TMPF
#else
fmaxv TMPF, v1.4s // block candidate -> TMPF
#endif
#else // DOUBLE
ld4 {v1.2d,v2.2d,v3.2d,v4.2d}, [X], #64
fabs v1.2d, v1.2d
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d
fadd v1.2d, v1.2d, v2.2d // |re|+|im| for elements 0, 2
fadd v3.2d, v3.2d, v4.2d // |re|+|im| for elements 1, 3
#if defined(USE_MIN)
fmin v1.2d, v1.2d, v3.2d
fminp TMPF, v1.2d // block candidate -> TMPF (was MAXF: bug)
#else
fmax v1.2d, v1.2d, v3.2d
fmaxp TMPF, v1.2d // block candidate -> TMPF (was MAXF: bug)
#endif
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND // keep the better of MAXF / TMPF
.endm

/* Strided-path seed: convert INC_X from elements to bytes (one complex is
   2*SZ bytes), load the first element, and set MAXF = |re| + |im|.
   Advances X by one stride. */
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3 // stride *= 8 bytes (2 floats)
ld1 {v0.2s}, [X], INC_X
fabs v0.2s, v0.2s
ext v1.8b, v0.8b, v0.8b, #4 // v1 = [|re|, |im|] (rotated copy)
fadd MAXF, s0, s1 // MAXF = |re| + |im|
#else
lsl INC_X, INC_X, #4 // stride *= 16 bytes (2 doubles)
ld1 {v0.2d}, [X], INC_X
fabs v0.2d, v0.2d
faddp MAXF, v0.2d // MAXF = |re| + |im|
#endif
.endm

/* Strided-path step: one complex element at byte stride INC_X; candidate
   TMPF = |re| + |im|, merged into MAXF via fcmp/fcsel. */
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], INC_X
fabs v1.2s, v1.2s
ext v2.8b, v1.8b, v1.8b, #4 // v2 = [|re|, |im|] (rotated copy)
fadd TMPF, s1, s2 // TMPF = |re| + |im|
#else
ld1 {v1.2d}, [X], INC_X
fabs v1.2d, v1.2d
faddp TMPF, v1.2d // TMPF = |re| + |im|
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND // keep the better of MAXF / TMPF
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

/* zamax/zamin driver: returns in MAXF (s0/d0) the extreme of |re|+|im|
   over N complex elements. N <= 0 or INC_X <= 0 returns 0. */
cmp N, xzr
ble amax_kernel_zero
cmp INC_X, xzr
ble amax_kernel_zero

cmp INC_X, #1
bne amax_kernel_S_BEGIN // non-unit stride -> strided path

amax_kernel_F_BEGIN: // unit-stride (fast) path

asr I, N, #2 // I = number of 4-element blocks
cmp I, xzr
beq amax_kernel_F1_INIT // N < 4: seed from a single element

INIT_F4 // seed MAXF from the first block
subs I, I, #1
beq amax_kernel_F1

amax_kernel_F4: // main 4-element loop

KERNEL_F4

subs I, I, #1
bne amax_kernel_F4

amax_kernel_F1: // remainder of N & 3 elements

ands I, N, #3
ble amax_kernel_L999

amax_kernel_F10:

KERNEL_F1

subs I, I, #1
bne amax_kernel_F10

ret

amax_kernel_F1_INIT: // N < 4: seed with one element, rest via F1 loop

INIT_F1
subs N, N, #1 // one element consumed by the seed
b amax_kernel_F1

amax_kernel_S_BEGIN: // strided path

INIT_S // seed MAXF from the first element

subs N, N, #1 // one element consumed by the seed
ble amax_kernel_L999

asr I, N, #2 // I = number of unrolled 4-iteration blocks
cmp I, xzr
ble amax_kernel_S1

amax_kernel_S4: // 4x unrolled strided loop

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne amax_kernel_S4

amax_kernel_S1: // strided remainder

ands I, N, #3
ble amax_kernel_L999

amax_kernel_S10:

KERNEL_S1

subs I, I, #1
bne amax_kernel_S10

amax_kernel_L999:

ret

amax_kernel_zero: // degenerate sizes: return 0.0

fmov MAXF, REG0
ret

EPILOGUE

+ 164
- 0
kernel/arm64/zasum.S View File

@@ -0,0 +1,164 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* zasum is double-complex only here, so the aliases are unconditional. */
#define REG0 xzr
#define SUMF d0 /* running sum; also the return value */
#define TMPF d1 /* per-element |re|+|im| */
#define TMPVF {v1.d}[0]
#define SZ 8

/******************************************************************************/

/* One complex element (unit stride): SUMF += |re| + |im|. */
.macro KERNEL_F1
ld1 {v1.2d}, [X], #16
fabs v1.2d, v1.2d
faddp TMPF, v1.2d // TMPF = |re| + |im|
fadd SUMF, SUMF, TMPF
.endm

/* Four complex elements (unit stride), accumulated vector-wise into v0;
   v0 holds two partial sums until KERNEL_F4_FINALIZE collapses them. */
.macro KERNEL_F4
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64
fabs v1.2d, v1.2d
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d

fadd v1.2d, v1.2d, v2.2d
fadd v3.2d, v3.2d, v4.2d

fadd v0.2d, v0.2d, v1.2d
fadd v0.2d, v0.2d, v3.2d

PRFM PLDL1KEEP, [X, #1024] // prefetch ahead on the streaming load
.endm

/* Collapse the two partial sums in v0 into the scalar SUMF. */
.macro KERNEL_F4_FINALIZE
faddp SUMF, v0.2d
.endm

/* Convert INC_X from elements to bytes (one double complex = 16 bytes). */
.macro INIT_S
lsl INC_X, INC_X, #4
.endm

/* One complex element at byte stride INC_X: SUMF += |re| + |im|. */
.macro KERNEL_S1
ld1 {v1.2d}, [X], INC_X
fabs v1.2d, v1.2d
faddp TMPF, v1.2d // TMPF = |re| + |im|
fadd SUMF, SUMF, TMPF
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

/* zasum driver: returns in SUMF (d0) the sum of |re|+|im| over N complex
   elements. N <= 0 or INC_X <= 0 returns 0. */
fmov SUMF, REG0 // zero the accumulator (clears all of v0)

cmp N, xzr
ble asum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999

cmp INC_X, #1
bne asum_kernel_S_BEGIN // non-unit stride -> strided path

asum_kernel_F_BEGIN: // unit-stride (fast) path

asr I, N, #2 // I = number of 4-element blocks
cmp I, xzr
beq asum_kernel_F1

asum_kernel_F4:

KERNEL_F4

subs I, I, #1
bne asum_kernel_F4

KERNEL_F4_FINALIZE // collapse vector partial sums into SUMF

asum_kernel_F1: // remainder of N & 3 elements

ands I, N, #3
ble asum_kernel_L999

asum_kernel_F10:

KERNEL_F1

subs I, I, #1
bne asum_kernel_F10

asum_kernel_L999:
ret

asum_kernel_S_BEGIN: // strided path

INIT_S

asr I, N, #2
cmp I, xzr
ble asum_kernel_S1

asum_kernel_S4: // 4x unrolled strided loop

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne asum_kernel_S4

asum_kernel_S1: // strided remainder

ands I, N, #3
ble asum_kernel_L999

asum_kernel_S10:

KERNEL_S1

subs I, I, #1
bne asum_kernel_S10

ret

EPILOGUE

+ 301
- 0
kernel/arm64/zaxpy.S View File

@@ -0,0 +1,301 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x3 /* X vector address */
#define INC_X x4 /* X stride */
#define Y x5 /* Y vector address */
#define INC_Y x6 /* Y stride */
#define I x1 /* loop variable */
#define Y_COPY x7 /* second Y read pointer used by the double F4 kernel */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define DA_R s0 /* scale input value (real part of alpha) */
#define DA_I s1 /* scale input value (imag part of alpha) */
#define TMPX v2.2s
#define TMPY v3.2s
#define SZ 4
#else
#define DA_R d0 /* scale input value (real part of alpha) */
#define DA_I d1 /* scale input value (imag part of alpha) */
#define TMPX v2.2d
#define TMPY v3.2d
#define SZ 8
#endif

/******************************************************************************/

/* Build the complex-multiply constant vectors from alpha:
   v0 = [DA_R, DA_R] and v1 = [+-DA_I, -+DA_I], with signs chosen so that
   "y += v0*x + v1*swap(x)" performs y += alpha*x (or conj(alpha)*x when
   CONJ is defined). */
.macro INIT

#if !defined(CONJ)
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
fneg s2, DA_I
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I
ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I
#else
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
fneg d2, DA_I
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I
#endif
#else
#if !defined(DOUBLE)
fneg s2, DA_R
ins v0.s[1], v2.s[0] // v0 = -DA_R, DA_R
ins v1.s[1], v1.s[0] // v1 = DA_I, DA_I
#else
fneg d2, DA_R
ins v0.d[1], v2.d[0] // v0 = -DA_R, DA_R
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I
#endif
#endif

.endm

/* One complex element (unit stride): Y[i] += alpha * X[i]. */
.macro KERNEL_F1

#if !defined(DOUBLE)
ld1 {v2.2s}, [X], #8 // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2s}, [Y] // V3 = Y[iy+1], Y[iy]
ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1]
fmla v3.2s, v0.2s, v2.2s // Y[iy] += DA_R * X[ix]
// Y[iy+1] += +-DA_R * X[ix+1]
fmla v3.2s, v1.2s, v4.2s // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v3.2s}, [Y], #8
#else
ld1 {v2.2d}, [X], #16 // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2d}, [Y] // V3 = Y[iy+1], Y[iy]
ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1]
fmla v3.2d, v0.2d, v2.2d // Y[iy] += DA_R * X[ix]
// Y[iy+1] += +-DA_R * X[ix+1]
fmla v3.2d, v1.2d, v4.2d // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v3.2d}, [Y], #16
#endif

.endm

/* Widen the 2-lane alpha constants to 4 lanes for the single-precision
   F4 kernel (no-op for double, which stays 2-lane). */
.macro KERNEL_INIT_F4

#if !defined(DOUBLE)
// Replicate the lower 2 floats into the upper 2 slots
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I
#endif

.endm

/* Four complex elements (unit stride): Y[i..i+3] += alpha * X[i..i+3].
   The double path reads Y through Y_COPY and writes through Y; both
   pointers advance in lockstep. */
.macro KERNEL_F4

#if !defined(DOUBLE)
ld1 {v2.4s,v3.4s}, [X], #32 // V2 = X[3], X[2], X[1], X[0]
// V3 = X[7], X[6], X[5], X[4]
ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1]
ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1]
ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1]

ld1 {v4.4s,v5.4s}, [Y] // V4 = Y[3], Y[2], Y[1], Y[0]
// V5 = Y[7], Y[6], Y[5], Y[4]

ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5]
ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5]
ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5]

fmla v4.4s, v0.4s, v2.4s // Y[iy] += DA_R * X[ix]
// Y[iy+1] += +-DA_R * X[ix+1]
fmla v4.4s, v1.4s, v6.4s // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v4.4s}, [Y], #16

fmla v5.4s, v0.4s, v3.4s // Y[iy] += DA_R * X[ix]
fmla v5.4s, v1.4s, v7.4s // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += +-DA_R * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v5.4s}, [Y], #16
#else // DOUBLE
ld1 {v2.2d,v3.2d}, [X], #32 // CX0, CX1, CX2, CX3
ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1]
ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1]

ld1 {v4.2d,v5.2d}, [X], #32 // CX0, CX1, CX2, CX3
ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1]
ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1]

ld1 {v16.2d,v17.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3

fmla v16.2d, v0.2d, v2.2d
fmla v17.2d, v0.2d, v3.2d

ld1 {v18.2d,v19.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3

fmla v16.2d, v1.2d, v20.2d
fmla v17.2d, v1.2d, v21.2d
st1 {v16.2d,v17.2d}, [Y], #32

fmla v18.2d, v0.2d, v4.2d
fmla v19.2d, v0.2d, v5.2d
fmla v18.2d, v1.2d, v22.2d
fmla v19.2d, v1.2d, v23.2d
st1 {v18.2d,v19.2d}, [Y], #32
#endif
PRFM PLDL1KEEP, [X, #512] // prefetch ahead on both streams
PRFM PLDL1KEEP, [Y, #512]
.endm

/* Convert both strides from elements to bytes (one complex = 2*SZ bytes). */
.macro INIT_S

#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#else
lsl INC_X, INC_X, #4
lsl INC_Y, INC_Y, #4
#endif

.endm

/* One complex element at byte strides INC_X / INC_Y: Y[i] += alpha * X[i]. */
.macro KERNEL_S1

#if !defined(DOUBLE)
ld1 {v2.2s}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2s}, [Y] // V3 = Y[iy+1], Y[iy]
ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1]
fmla v3.2s, v0.2s, v2.2s // Y[iy] += DA_R * X[ix]
// Y[iy+1] += +-DA_R * X[ix+1]
fmla v3.2s, v1.2s, v4.2s // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v3.2s}, [Y], INC_Y
#else
ld1 {v2.2d}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2d}, [Y] // V3 = Y[iy+1], Y[iy]
ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1]
fmla v3.2d, v0.2d, v2.2d // Y[iy] += DA_R * X[ix]
// Y[iy+1] += +-DA_R * X[ix+1]
fmla v3.2d, v1.2d, v4.2d // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v3.2d}, [Y], INC_Y
#endif

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

/* zaxpy driver: Y += alpha * X over N complex elements.
   Returns 0 in w0; N <= 0 or alpha == 0 is a no-op. */
cmp N, xzr
ble zaxpy_kernel_L999

mov Y_COPY, Y // second read pointer for the double F4 kernel

fcmp DA_R, #0.0 // alpha == 0+0i -> nothing to do
bne .L1
fcmp DA_I, #0.0
beq zaxpy_kernel_L999

.L1:
INIT // build the alpha constant vectors

cmp INC_X, #1
bne zaxpy_kernel_S_BEGIN // either stride non-unit -> strided path
cmp INC_Y, #1
bne zaxpy_kernel_S_BEGIN

zaxpy_kernel_F_BEGIN: // unit-stride (fast) path

asr I, N, #2 // I = number of 4-element blocks
cmp I, xzr
beq zaxpy_kernel_F1

KERNEL_INIT_F4 // widen alpha constants for the 4-wide kernel

zaxpy_kernel_F4:

KERNEL_F4

subs I, I, #1
bne zaxpy_kernel_F4

zaxpy_kernel_F1: // remainder of N & 3 elements

ands I, N, #3
ble zaxpy_kernel_L999

zaxpy_kernel_F10:

KERNEL_F1

subs I, I, #1
bne zaxpy_kernel_F10

mov w0, wzr
ret

zaxpy_kernel_S_BEGIN: // strided path

INIT_S

asr I, N, #2
cmp I, xzr
ble zaxpy_kernel_S1

zaxpy_kernel_S4: // 4x unrolled strided loop

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne zaxpy_kernel_S4

zaxpy_kernel_S1: // strided remainder

ands I, N, #3
ble zaxpy_kernel_L999

zaxpy_kernel_S10:

KERNEL_S1

subs I, I, #1
bne zaxpy_kernel_S10

zaxpy_kernel_L999:

mov w0, wzr
ret

+ 302
- 0
kernel/arm64/zdot.S View File

@@ -0,0 +1,302 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define Y x3 /* Y vector address */
#define INC_Y x4 /* Y stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* NOTE(review): the DSDOT branch promotes the single-precision accumulator
   to d0, but the kernel bodies still operate on s-registers/2s lanes —
   DSDOT appears to be a dead configuration for this complex dot kernel;
   confirm it is never defined for zdot builds. */
#if !defined(DOUBLE)
#if !defined(DSDOT)
#define REG0 wzr
#define DOTF s0 /* real part of the result */
#else // DSDOT
#define REG0 xzr
#define DOTF d0
#endif
#define DOTI s1 /* imaginary part of the result */
#define TMPX s2
#define LD1VX {v2.s}[0]
#define TMPY s3
#define LD1VY {v3.s}[0]
#define TMPVY v3.s[0]
#define SZ 4
#else
#define REG0 xzr
#define DOTF d0 /* real part of the result */
#define DOTI d1 /* imaginary part of the result */
#define TMPX d2
#define LD1VX {v2.d}[0]
#define TMPY d3
#define LD1VY {v3.d}[0]
#define TMPVY v3.d[0]
#define SZ 8
#endif

/******************************************************************************/

/* One complex element (unit stride): accumulate X[i]*Y[i] (or
   conj(X[i])*Y[i] when CONJ is defined) into the scalar pair DOTF/DOTI. */
.macro KERNEL_F1

#if !defined(DOUBLE)
ld1 {v2.2s}, [X], #8 // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2s}, [Y], #8 // V3 = Y[iy+1], Y[iy]; Y += 2
ins v4.s[0], v2.s[1] // V4 = X[ix+1]
#if !defined(CONJ)
fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1]
fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy]
fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
#else
fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1]
fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy]
fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
#endif
#else // DOUBLE
ld1 {v2.2d}, [X], #16 // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2d}, [Y], #16 // V3 = Y[iy+1], Y[iy]; Y += 2
ins v4.d[0], v2.d[1] // V4 = X[ix+1]
#if !defined(CONJ)
fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1]
fmla DOTI, d4, v3.d[0] // dot[1] += X[ix+1] * Y[iy]
fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
#else
fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1]
fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy]
fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
#endif
#endif

.endm


/* Four complex elements (unit stride), accumulated vector-wise.
   Single precision: real parts in v0, imag parts in v1.
   Double precision: v0/v1 plus v20/v21 hold partial sums; all are
   collapsed by KERNEL_F4_FINALIZE. */
.macro KERNEL_F4

#if !defined(DOUBLE)
ld2 {v2.4s, v3.4s}, [X], #32 // v2 = X reals, v3 = X imags; X += 8
ld2 {v4.4s, v5.4s}, [Y], #32 // v4 = Y reals, v5 = Y imags; Y += 8

fmla v0.4s, v2.4s, v4.4s // dot[0] += X[ix] * Y[iy]
fmla v1.4s, v2.4s, v5.4s // dot[1] += X[ix] * Y[iy+1]
PRFM PLDL1KEEP, [X, #1024]
PRFM PLDL1KEEP, [Y, #1024]
#if !defined(CONJ)
fmls v0.4s, v3.4s, v5.4s // dot[0] -= X[ix+1] * Y[iy+1]
fmla v1.4s, v3.4s, v4.4s // dot[1] += X[ix+1] * Y[iy]
#else
fmla v0.4s, v3.4s, v5.4s // dot[0] += X[ix+1] * Y[iy+1]
fmls v1.4s, v3.4s, v4.4s // dot[1] -= X[ix+1] * Y[iy]
#endif
#else // DOUBLE
ld2 {v2.2d, v3.2d}, [X], #32 // v2 = X reals, v3 = X imags; X += 4
ld2 {v16.2d, v17.2d}, [Y], #32 // v16 = Y reals, v17 = Y imags; Y += 4

fmla v0.2d, v2.2d, v16.2d // dot[0] += X[ix] * Y[iy]
fmla v1.2d, v2.2d, v17.2d // dot[1] += X[ix] * Y[iy+1]
ld2 {v4.2d, v5.2d}, [X], #32
ld2 {v18.2d, v19.2d}, [Y], #32
fmla v0.2d, v4.2d, v18.2d // dot[0] += X[ix] * Y[iy] (second pair)
fmla v1.2d, v4.2d, v19.2d // dot[1] += X[ix] * Y[iy+1] (second pair)
PRFM PLDL1KEEP, [X, #1024]
PRFM PLDL1KEEP, [Y, #1024]
#if !defined(CONJ)
fmls v0.2d, v3.2d, v17.2d // dot[0] -= X[ix+1] * Y[iy+1]
fmls v20.2d, v5.2d, v19.2d // dot[0] -= X[ix+1] * Y[iy+1]
fmla v1.2d, v3.2d, v16.2d // dot[1] += X[ix+1] * Y[iy]
fmla v21.2d, v5.2d, v18.2d // dot[1] += X[ix+1] * Y[iy]
#else
fmla v0.2d, v3.2d, v17.2d // dot[0] += X[ix+1] * Y[iy+1]
fmla v20.2d, v5.2d, v19.2d // dot[0] += X[ix+1] * Y[iy+1]
fmls v1.2d, v3.2d, v16.2d // dot[1] -= X[ix+1] * Y[iy]
fmls v21.2d, v5.2d, v18.2d // dot[1] -= X[ix+1] * Y[iy]
#endif
#endif

.endm

/* Collapse the vector partial sums into the scalars DOTF / DOTI. */
.macro KERNEL_F4_FINALIZE
#if !defined(DOUBLE)
ext v2.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v2.2s
faddp DOTF, v0.2s
ext v3.16b, v1.16b, v1.16b, #8
fadd v1.2s, v1.2s, v3.2s
faddp DOTI, v1.2s
#else
fadd v0.2d, v0.2d, v20.2d // merge the auxiliary accumulators
faddp DOTF, v0.2d
fadd v1.2d, v1.2d, v21.2d
faddp DOTI, v1.2d
#endif
.endm

/* Convert both strides from elements to bytes (one complex = 2*SZ bytes). */
.macro INIT_S

#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#else
lsl INC_X, INC_X, #4
lsl INC_Y, INC_Y, #4
#endif

.endm

/* One complex element at byte strides INC_X / INC_Y, accumulated into
   the scalar pair DOTF/DOTI (same math as KERNEL_F1). */
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v2.2s}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2s}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += 2
ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1]
#if !defined(CONJ)
fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1]
fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy]
fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
#else
fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1]
fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy]
fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
#endif
#else // DOUBLE
ld1 {v2.2d}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2d}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += 2
ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1]
#if !defined(CONJ)
fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1]
fmla DOTI, d4, v3.d[0] // dot[1] += X[ix+1] * Y[iy]
fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
#else
fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1]
fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy]
fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
#endif
#endif

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

/* zdot driver: returns the complex dot product in DOTF/DOTI (d0/d1 or
   s0/s1). All accumulators are zeroed first (scalar fmov clears the
   full vector register). */
fmov DOTF, REG0
fmov DOTI, DOTF
#if !defined(DOUBLE)
fmov s20, DOTF
fmov s21, DOTI
#else
fmov d20, DOTF // auxiliary accumulators for the double F4 kernel
fmov d21, DOTI
#endif

cmp N, xzr
ble dot_kernel_L999

cmp INC_X, #1
bne dot_kernel_S_BEGIN // either stride non-unit -> strided path
cmp INC_Y, #1
bne dot_kernel_S_BEGIN

dot_kernel_F_BEGIN: // unit-stride (fast) path

asr I, N, #2 // I = number of 4-element blocks
cmp I, xzr
beq dot_kernel_F1

dot_kernel_F4:

KERNEL_F4

subs I, I, #1
bne dot_kernel_F4

KERNEL_F4_FINALIZE // collapse vector partial sums into DOTF/DOTI

dot_kernel_F1: // remainder of N & 3 elements

ands I, N, #3
ble dot_kernel_L999

dot_kernel_F10:

KERNEL_F1

subs I, I, #1
bne dot_kernel_F10

ret

dot_kernel_S_BEGIN: // strided path

INIT_S

asr I, N, #2
cmp I, xzr
ble dot_kernel_S1

dot_kernel_S4: // 4x unrolled strided loop

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne dot_kernel_S4

dot_kernel_S1: // strided remainder

ands I, N, #3
ble dot_kernel_L999

dot_kernel_S10:

KERNEL_S1

subs I, I, #1
bne dot_kernel_S10

dot_kernel_L999:

ret

EPILOGUE

+ 1617
- 0
kernel/arm64/zgemm_kernel_4x4.S
File diff suppressed because it is too large
View File


+ 514
- 0
kernel/arm64/zgemv_n.S View File

@@ -0,0 +1,514 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define M x0 /* Y vector length */
#define N x1 /* X vector length */
#define A x3 /* A vector address */
#define LDA x4 /* A stride */
#define X x5 /* X vector address */
#define INC_X x6 /* X stride */
#define Y x7 /* Y vector address */
#define INC_Y x2 /* Y stride */
#define A_PTR x9 /* loop A vector address */
#define Y_IPTR x10 /* loop Y vector address */
#define J x11 /* loop variable */
#define I x12 /* loop variable */
#define Y_OPTR x13 /* loop Y vector address */
#define X_PTR x14 /* loop X vector address */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* SHZ = log2(bytes per complex element), used to scale pointer steps. */
#if !defined(DOUBLE)
#define ALPHA_R s0
#define ALPHA_I s1
#define ALPHA_R_COPY s7 /* kept live across the whole kernel for F4 */
#define ALPHA_I_COPY s8
#define SHZ 3
#else
#define ALPHA_R d0
#define ALPHA_I d1
#define ALPHA_R_COPY d7 /* kept live across the whole kernel for F4 */
#define ALPHA_I_COPY d8
#define SHZ 4
#endif

/******************************************************************************/

/* Spill callee-saved registers used by the kernel.
   NOTE(review): this also saves/restores x18 — the AAPCS64 platform
   register — and d16/d17, which are caller-saved; harmless but worth
   confirming against the target platform's ABI. */
.macro SAVE_REGS
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
.endm

/* Restore the registers spilled by SAVE_REGS (exact mirror). */
.macro RESTORE_REGS
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
.endm


/* One-time setup: copy alpha into v7/v8 replicated across lanes for the
   F4 kernel, and build the [R(ALPHA),R(ALPHA)] / [+-I(ALPHA),-+I(ALPHA)]
   constants in v0/v1 for the scalar F1/S1 kernels (sign layout depends
   on XCONJ). */
.macro INIT
/********** INIT FOR F4 LOOP **********/
fmov ALPHA_R_COPY, ALPHA_R
fmov ALPHA_I_COPY, ALPHA_I
#if !defined(DOUBLE)
ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA)
ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA)
ins v7.d[1], v7.d[0] // replicate to all 4 lanes
ins v8.d[1], v8.d[0]
#else
ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA)
ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA)
#endif

/******* INIT FOR F1 AND S1 LOOP ******/
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA)
fneg s2, ALPHA_I
ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA)
#if !defined(XCONJ)
ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA)
#endif
#else
ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA)
fneg d2, ALPHA_I
ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA)
#if !defined(XCONJ)
ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA)
#endif
#endif
.endm

/* Per-column setup: compute TEMP = alpha * X[j] (with CONJ/XCONJ sign
   variants) in two layouts — v11/v12 replicated across lanes for the F4
   kernel, and v2/v3 arranged as the complex-multiply constant pair for
   the scalar F1/S1 kernels.  Reads the current column element via X_PTR. */
.macro INIT_LOOP
/********** INIT_LOOP FOR F4 LOOP **********/
#if !defined(DOUBLE)
ld1 {v9.2s}, [X_PTR] // [I(X), R(X)]
ins v10.s[0], v9.s[1]
ins v9.s[1], v9.s[0] // [R(X), R(X)]
ins v10.s[1], v10.s[0] // [I(X), I(X)]
ins v9.d[1], v9.d[0] // replicate to all 4 lanes
ins v10.d[1], v10.d[0]
#if !defined(CONJ)
#if !defined(XCONJ)
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
#else
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
#endif
#else // CONJ
#if !defined(XCONJ)
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
#else
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
fmul v12.4s, v9.4s, v8.4s // [R(X) * I(ALPHA)]
fneg v12.4s, v12.4s // [- R(X) * I(ALPHA)]
fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
#endif
#endif // CONJ

/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
ld1 {v2.2s}, [X_PTR] // [I(X), R(X)]
ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)]
fmul v2.2s, v0.2s, v2.2s
fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)]
ins v3.s[0], v2.s[1]
#if !defined(CONJ)
#if !defined(XCONJ)
fneg s4, s3
ins v3.s[1], v4.s[0]
ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)]
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)]
#else
fneg s4, s3
ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)]
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)]
#endif
#else // CONJ
#if !defined(XCONJ)
ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)]
fneg s4, s2
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)]
#else
fneg s3, s3
ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)]
fneg s4, s2
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)]
#endif
#endif // CONJ

#else // DOUBLE

/********** INIT_LOOP FOR F4 LOOP **********/
ld1 {v9.2d}, [X_PTR] // [I(X), R(X)]
ins v10.d[0], v9.d[1]
ins v9.d[1], v9.d[0] // [R(X), R(X)]
ins v10.d[1], v10.d[0] // [I(X), I(X)]
#if !defined(CONJ)
#if !defined(XCONJ)
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
#else
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
#endif
#else // CONJ
#if !defined(XCONJ)
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
#else
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
fmul v12.2d, v9.2d, v8.2d // [R(X) * I(ALPHA)]
fneg v12.2d, v12.2d // [- R(X) * I(ALPHA)]
fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
#endif
#endif // CONJ

/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
ld1 {v2.2d}, [X_PTR] // [I(X), R(X)]
ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)]
fmul v2.2d, v0.2d, v2.2d
fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)]
ins v3.d[0], v2.d[1] // I(TEMP)
#if !defined(CONJ)
#if !defined(XCONJ)
fneg d4, d3 // -I(TEMP)
ins v3.d[1], v4.d[0]
ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)]
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)]
#else
fneg d4, d3 // -I(TEMP)
ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)]
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)]
#endif
#else // CONJ
#if !defined(XCONJ)
ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)]
fneg d4, d2 // -R(TEMP)
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)]
#else
fneg d3, d3 // -I(TEMP)
ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)]
fneg d4, d2 // -R(TEMP)
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)]
#endif
#endif // CONJ

#endif // DOUBLE
.endm

/* KERNEL_F4: one step of the zgemv_n inner loop, updating consecutive
   complex elements of Y for one column of A:  Y[i] += TEMP * A[i].
   v11 holds R(TEMP) replicated across lanes, v12 holds I(TEMP)
   replicated (TEMP = ALPHA * X[j], with CONJ/XCONJ sign folding
   prepared in INIT_LOOP).  Single precision handles 4 complex
   elements; double precision handles 4 via two 2-wide passes. */
.macro KERNEL_F4
#if !defined(DOUBLE)

ld2 {v13.4s, v14.4s}, [A_PTR], #32 // de-interleave: v13 = A real, v14 = A imag
ld2 {v15.4s, v16.4s}, [Y_IPTR], #32 // v15 = Y real, v16 = Y imag
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
#else
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
#else
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
#endif
#endif // CONJ
st2 {v15.4s, v16.4s}, [Y_OPTR], #32

#else // DOUBLE

// First pair of complex elements.
ld2 {v13.2d, v14.2d}, [A_PTR], #32
ld2 {v15.2d, v16.2d}, [Y_IPTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
#endif
#endif // CONJ
st2 {v15.2d, v16.2d}, [Y_OPTR], #32

// Second pair of complex elements (same math, v17-v20).
ld2 {v17.2d, v18.2d}, [A_PTR], #32
ld2 {v19.2d, v20.2d}, [Y_IPTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmla v19.2d, v12.2d, v18.2d // [+ I(ALPHA * X) * A_I]
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmls v20.2d, v12.2d, v17.2d // [- I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmla v19.2d, v12.2d, v18.2d // [+ I(ALPHA * X) * A_I]
fmls v20.2d, v11.2d, v18.2d // [- R(ALPHA * X) * A_I]
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmls v20.2d, v11.2d, v18.2d // [- R(ALPHA * X) * A_I]
fmls v20.2d, v12.2d, v17.2d // [- I(ALPHA * X) * A_R]
#endif
#endif // CONJ
st2 {v19.2d, v20.2d}, [Y_OPTR], #32

#endif

.endm

/* KERNEL_F1: update one complex element of Y for contiguous storage:
   Y[i] += TEMP * A[i].  Uses v2/v3 prepared in INIT_LOOP, where v2/v3
   carry R(TEMP)/I(TEMP) with the CONJ/XCONJ signs already folded in;
   the ext swaps A's real/imag halves so one fmla pair forms the
   complex product. */
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1 {v4.2s}, [A_PTR], #8 // v4 = [A_I, A_R]
ld1 {v5.2s}, [Y_IPTR], #8 // v5 = [Y_I, Y_R]
ext v6.8b, v4.8b, v4.8b, #4 // v6 = [A_R, A_I] (halves swapped)
fmla v5.2s, v2.2s, v4.2s
fmla v5.2s, v3.2s, v6.2s
st1 {v5.2s}, [Y_OPTR], #8
#else // DOUBLE
ld1 {v4.2d}, [A_PTR], #16 // v4 = [A_I, A_R]
ld1 {v5.2d}, [Y_IPTR], #16 // v5 = [Y_I, Y_R]
ext v6.16b, v4.16b, v4.16b, #8 // v6 = [A_R, A_I] (halves swapped)
fmla v5.2d, v2.2d, v4.2d
fmla v5.2d, v3.2d, v6.2d
st1 {v5.2d}, [Y_OPTR], #16
#endif
.endm

/* INIT_S: convert INC_Y from a complex-element count to a byte
   stride (SHZ = 3 for single-complex = 8 bytes, 4 for double = 16). */
.macro INIT_S
lsl INC_Y, INC_Y, #SHZ
.endm

/* KERNEL_S1: same single-element update as KERNEL_F1, but Y is read
   and written with the strided byte increment INC_Y (A is still
   contiguous within a column). */
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v4.2s}, [A_PTR], #8 // v4 = [A_I, A_R]
ld1 {v5.2s}, [Y_IPTR], INC_Y // v5 = [Y_I, Y_R]
ext v6.8b, v4.8b, v4.8b, #4 // v6 = [A_R, A_I]
fmla v5.2s, v2.2s, v4.2s
fmla v5.2s, v3.2s, v6.2s
st1 {v5.2s}, [Y_OPTR], INC_Y
#else // DOUBLE
ld1 {v4.2d}, [A_PTR], #16 // v4 = [A_I, A_R]
ld1 {v5.2d}, [Y_IPTR], INC_Y // v5 = [Y_I, Y_R]
ext v6.16b, v4.16b, v4.16b, #8 // v6 = [A_R, A_I]
fmla v5.2d, v2.2d, v4.2d
fmla v5.2d, v3.2d, v6.2d
st1 {v5.2d}, [Y_OPTR], INC_Y
#endif
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

/* zgemv_n entry point: Y += ALPHA * op(A) * X, column-major A,
   iterating over N columns (J) and M rows (I) per column.
   Fast path (F_*) requires INC_Y == 1; strided path (S_*) handles
   any positive INC_Y.  The F4 loop body is KERNEL_F1 x4 (unrolled). */
PROLOGUE

ldr INC_Y, [sp] // 9th argument (INC_Y) is passed on the stack

SAVE_REGS

// Nothing to do for empty dimensions.
cmp N, xzr
ble zgemv_n_kernel_L999
cmp M, xzr
ble zgemv_n_kernel_L999

lsl LDA, LDA, #SHZ // column stride in bytes
lsl INC_X, INC_X, #SHZ // X stride in bytes
mov J, N // J counts remaining columns

INIT

cmp INC_Y, #1
bne zgemv_n_kernel_S_BEGIN

zgemv_n_kernel_F_LOOP: // per-column loop, contiguous Y
mov A_PTR, A
mov Y_IPTR, Y
mov Y_OPTR, Y
mov X_PTR, X
add X, X, INC_X // advance X to the next element for the next column
INIT_LOOP // compute TEMP = ALPHA * X[j] into v2/v3 (and v11/v12)

asr I, M, #2 // I = M / 4 (unrolled iterations)
cmp I, xzr
beq zgemv_n_kernel_F1

zgemv_n_kernel_F4:

KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1

subs I, I, #1
bne zgemv_n_kernel_F4

zgemv_n_kernel_F1:

ands I, M, #3 // remainder rows
ble zgemv_n_kernel_F_END

zgemv_n_kernel_F10:

KERNEL_F1

subs I, I, #1
bne zgemv_n_kernel_F10

zgemv_n_kernel_F_END:

add A, A, LDA // next column of A
subs J, J, #1
bne zgemv_n_kernel_F_LOOP

b zgemv_n_kernel_L999

zgemv_n_kernel_S_BEGIN: // strided-Y variant

INIT_S

zgemv_n_kernel_S_LOOP:
mov A_PTR, A
mov Y_IPTR, Y
mov Y_OPTR, Y
mov X_PTR, X
add X, X, INC_X
INIT_LOOP

asr I, M, #2
cmp I, xzr
ble zgemv_n_kernel_S1

zgemv_n_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne zgemv_n_kernel_S4

zgemv_n_kernel_S1:

ands I, M, #3
ble zgemv_n_kernel_S_END

zgemv_n_kernel_S10:

KERNEL_S1

subs I, I, #1
bne zgemv_n_kernel_S10

zgemv_n_kernel_S_END:

add A, A, LDA
subs J, J, #1
bne zgemv_n_kernel_S_LOOP

zgemv_n_kernel_L999:
RESTORE_REGS

mov w0, wzr // return 0
ret

EPILOGUE

+ 448
- 0
kernel/arm64/zgemv_t.S View File

@@ -0,0 +1,448 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define M x0 /* Y vector length */
#define N x1 /* X vector length */
#define A x3 /* A vector address */
#define LDA x4 /* A stride */
#define X x5 /* X vector address */
#define INC_X x6 /* X stride */
#define Y x7 /* Y vector address */
#define INC_Y x2 /* Y stride */
#define A_PTR x9 /* loop A vector address */
#define X_PTR x10 /* loop Y vector address */
#define J x11 /* loop variable */
#define I x12 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define ALPHA_R s0
#define ALPHA_I s1
#define ALPHA_R_COPY s7
#define ALPHA_I_COPY s8
#define SHZ 3
#else
#define ALPHA_R d0
#define ALPHA_I d1
#define ALPHA_R_COPY d7
#define ALPHA_I_COPY d8
#define SHZ 4
#endif

/******************************************************************************/


/* Save registers on the stack before the kernel body runs.
   NOTE(review): AAPCS64 only requires d8-d15 and x19-x28 to be
   preserved; d16/d17 are caller-saved and x18 is the platform
   register -- saving them is harmless but likely unnecessary.
   Confirm against the other arm64 kernels before changing. */
.macro SAVE_REGS
add sp, sp, #-(11 * 16) // reserve 176 bytes (11 pairs)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
.endm

/* Restore the registers saved by SAVE_REGS and release the frame.
   Offsets and ordering mirror SAVE_REGS exactly. */
.macro RESTORE_REGS
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
.endm

/* INIT (zgemv_t): broadcast ALPHA into v0/v1 so the final
   Y update can be done with two fused multiply-adds:
     !XCONJ: v0 = [ALPHA_R, ALPHA_R], v1 = [ALPHA_I, -ALPHA_I]
      XCONJ: v0 = [-ALPHA_R, ALPHA_R], v1 = [ALPHA_I, ALPHA_I]
   (lane order written high, low in the comments below). */
.macro INIT
#if !defined(XCONJ)
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R
fneg s2, ALPHA_I
ins v1.s[1], v2.s[0] // v1 = -ALPHA_I, ALPHA_I
ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I
#else
ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R
fneg d2, ALPHA_I
ins v1.d[1], v2.d[0] // v1 = -ALPHA_I, ALPHA_I
ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I
#endif
#else // XCONJ
#if !defined(DOUBLE)
fneg s2, ALPHA_R
ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R
ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I
#else
fneg d2, ALPHA_R
ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R
ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I
#endif
#endif
.endm

/* INIT_LOOP (zgemv_t): zero the per-column dot-product accumulators.
   fmov dN, xzr clears the full 128-bit vector register, so the
   vector accumulators v9/v10 (and v15/v16 in the double unroll)
   start at [0, 0]; v2 is the scalar complex accumulator for the
   KERNEL_F1/KERNEL_S1 tail. */
.macro INIT_LOOP
fmov d9, xzr // TEMP_R = [0, 0]
fmov d10, xzr // TEMP_I = [0, 0]
#if !defined(DOUBLE)
#else
fmov d15, xzr // TEMP_R = [0, 0] (second accumulator pair)
fmov d16, xzr // TEMP_I = [0, 0]
#endif

fmov d2, xzr // TEMP = [0, 0]
.endm

/* KERNEL_F4 (zgemv_t): accumulate the dot product of 4 complex
   elements of X with one column of A into v9 (real) / v10 (imag);
   the double-precision path uses a second accumulator pair
   v15/v16 to break the dependency chain.  The four CONJ/XCONJ
   variants choose fmla/fmls to realize the conjugation signs. */
.macro KERNEL_F4
#if !defined(DOUBLE)

ld2 {v11.4s, v12.4s}, [X_PTR], #32 // v11 = X real, v12 = X imag
ld2 {v13.4s, v14.4s}, [A_PTR], #32 // v13 = A real, v14 = A imag

#if !defined(CONJ)
#if !defined(XCONJ)
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I]
fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I]
fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R]
#else
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I]
fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I]
fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I]
fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I]
fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R]
#else
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I]
fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I]
fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R]
#endif
#endif // CONJ

#else // DOUBLE
ld2 {v11.2d, v12.2d}, [X_PTR], #32
ld2 {v13.2d, v14.2d}, [A_PTR], #32
prfm PLDL1STRM, [X_PTR, #512]
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I]
fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I]
fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R]
#else
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I]
fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I]
fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I]
fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I]
fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R]
#else
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I]
fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I]
fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R]
#endif
#endif // CONJ
ld2 {v17.2d, v18.2d}, [X_PTR], #32
ld2 {v19.2d, v20.2d}, [A_PTR], #32
prfm PLDL1STRM, [A_PTR, #512]
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
#else
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I]
fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I]
fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I]
fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
#else
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I]
fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R]
#endif
#endif // CONJ
#endif //DOUBLE
.endm

/* KERNEL_F4_FINALIZE: horizontally reduce the vector accumulators
   into the scalar complex accumulator v2 = [I(TEMP), R(TEMP)] so the
   scalar tail (KERNEL_F1) can continue adding to it. */
.macro KERNEL_F4_FINALIZE
#if !defined(DOUBLE)
ext v21.16b, v9.16b, v9.16b, #8 // swap upper/lower halves of v9
fadd v9.2s, v9.2s, v21.2s // pairwise-reduce 4 lanes to 2
faddp s9, v9.2s // ... and 2 lanes to 1 (real sum)

ext v21.16b, v10.16b, v10.16b, #8
fadd v10.2s, v10.2s, v21.2s
faddp s10, v10.2s // imaginary sum

ins v2.s[0], v9.s[0]
ins v2.s[1], v10.s[0]
#else
fadd v9.2d, v9.2d, v15.2d // merge the two accumulator pairs
fadd v10.2d, v10.2d, v16.2d

faddp d9, v9.2d // horizontal add: real sum
faddp d10, v10.2d // horizontal add: imaginary sum

ins v2.d[0], v9.d[0]
ins v2.d[1], v10.d[0]
#endif
.endm


/* KERNEL_F1 (zgemv_t): accumulate one complex product A[i] * X[i]
   into v2 = [I(TEMP), R(TEMP)].  v4 = [A_R, A_R]; v5 is built as
   [±A_I, ∓A_I] so that together with the swapped X in v7 the two
   fmla form the complex product with the correct conjugation sign. */
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] (A0 = A_R broadcast)
ld1 {v5.s}[0], [A_PTR], #4 // A1 (= A_I)
ld1 {v6.2s}, [X_PTR], #8 // [X1, X0]
fneg s16, s5
ins v5.s[1], v16.s[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1]
#endif
ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1]
fmla v2.2s, v4.2s, v6.2s
fmla v2.2s, v5.2s, v7.2s
#else // DOUBLE
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0]
ld1 {v5.d}[0], [A_PTR], #8 // A1
ld1 {v6.2d}, [X_PTR], #16 // [X1, X0]
fneg d16, d5
ins v5.d[1], v16.d[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1]
#endif
ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1]
fmla v2.2d, v4.2d, v6.2d
fmla v2.2d, v5.2d, v7.2d
#endif
.endm

/* INIT_S (zgemv_t): convert INC_X from a complex-element count to a
   byte stride (SHZ = 3 single / 4 double). */
.macro INIT_S
lsl INC_X, INC_X, #SHZ
.endm

/* KERNEL_S1 (zgemv_t): same single-element accumulation as KERNEL_F1,
   but X advances by the strided byte increment INC_X. */
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0]
ld1 {v5.s}[0], [A_PTR], #4 // A1
ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0]
fneg s16, s5
ins v5.s[1], v16.s[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1]
#endif
ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1]
fmla v2.2s, v4.2s, v6.2s
fmla v2.2s, v5.2s, v7.2s
#else // DOUBLE
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0]
ld1 {v5.d}[0], [A_PTR], #8 // A1
ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0]
fneg d16, d5
ins v5.d[1], v16.d[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1]
#endif
ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1]
fmla v2.2d, v4.2d, v6.2d
fmla v2.2d, v5.2d, v7.2d
#endif
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

/* zgemv_t entry point: Y[j] += ALPHA * dot(op(A[:,j]), op(X)) for
   each of the N columns.  Fast path (F_*) requires INC_X == 1 and
   vectorizes the dot product 4 complex elements at a time; the
   strided path (S_*) uses the scalar kernel only.  Y is always
   walked with INC_Y (byte stride after the lsl below). */
PROLOGUE

ldr INC_Y, [sp] // 9th argument (INC_Y) is passed on the stack
SAVE_REGS

cmp N, xzr
ble zgemv_t_kernel_L999
cmp M, xzr
ble zgemv_t_kernel_L999

lsl LDA, LDA, #SHZ // column stride in bytes
lsl INC_Y, INC_Y, #SHZ // Y stride in bytes
mov J, N // J counts remaining columns

INIT

cmp INC_X, #1
bne zgemv_t_kernel_S_BEGIN

zgemv_t_kernel_F_LOOP:

mov A_PTR, A
mov X_PTR, X

INIT_LOOP // zero the dot-product accumulators

asr I, M, #2 // I = M / 4 vectorized iterations
cmp I, xzr
beq zgemv_t_kernel_F1

zgemv_t_kernel_F4:

KERNEL_F4

subs I, I, #1
bne zgemv_t_kernel_F4

KERNEL_F4_FINALIZE // fold vector accumulators into v2

zgemv_t_kernel_F1:

ands I, M, #3 // remainder elements, handled scalar
ble zgemv_t_kernel_F_END

zgemv_t_kernel_F10:

KERNEL_F1

subs I, I, #1
bne zgemv_t_kernel_F10

zgemv_t_kernel_F_END:

// Y[j] += ALPHA * TEMP, using v0/v1 prepared in INIT.
#if !defined(DOUBLE)
ld1 {v4.2s}, [Y]
ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I]
fmla v4.2s, v0.2s, v2.2s
fmla v4.2s, v1.2s, v3.2s
st1 {v4.2s}, [Y], INC_Y
#else // DOUBLE
ld1 {v4.2d}, [Y]
ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I]
fmla v4.2d, v0.2d, v2.2d
fmla v4.2d, v1.2d, v3.2d
st1 {v4.2d}, [Y], INC_Y
#endif

add A, A, LDA // next column
subs J, J, #1
bne zgemv_t_kernel_F_LOOP

b zgemv_t_kernel_L999

zgemv_t_kernel_S_BEGIN: // strided-X variant (scalar kernel only)

INIT_S

zgemv_t_kernel_S_LOOP:

mov A_PTR, A
mov X_PTR, X
INIT_LOOP

asr I, M, #2
cmp I, xzr
ble zgemv_t_kernel_S1

zgemv_t_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne zgemv_t_kernel_S4

zgemv_t_kernel_S1:

ands I, M, #3
ble zgemv_t_kernel_S_END

zgemv_t_kernel_S10:

KERNEL_S1

subs I, I, #1
bne zgemv_t_kernel_S10

zgemv_t_kernel_S_END:

// Y[j] += ALPHA * TEMP (same epilogue as the fast path).
#if !defined(DOUBLE)
ld1 {v4.2s}, [Y]
ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I]
fmla v4.2s, v0.2s, v2.2s
fmla v4.2s, v1.2s, v3.2s
st1 {v4.2s}, [Y], INC_Y
#else // DOUBLE
ld1 {v4.2d}, [Y]
ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I]
fmla v4.2d, v0.2d, v2.2d
fmla v4.2d, v1.2d, v3.2d
st1 {v4.2d}, [Y], INC_Y
#endif

add A, A, LDA
subs J, J, #1
bne zgemv_t_kernel_S_LOOP

zgemv_t_kernel_L999:
RESTORE_REGS
mov w0, wzr // return 0
ret

EPILOGUE

+ 228
- 0
kernel/arm64/znrm2.S View File

@@ -0,0 +1,228 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define TMPF s6
#define SSQ s0
#define TMPVF {v6.s}[0]
#define SZ 4
#else
#define TMPF d6
#define SSQ d0
#define TMPVF {v6.d}[0]
#define SZ 8
#endif

/******************************************************************************/

/* KERNEL_F1 (znrm2): accumulate the squared magnitude of one complex
   element (re^2 + im^2) into the scalar SSQ. */
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], #8 // one complex: [im, re]
fmul v1.2s, v1.2s, v1.2s // [im^2, re^2]
faddp TMPF, v1.2s // re^2 + im^2
fadd SSQ, SSQ, TMPF
#else
ld1 {v1.2d}, [X], #16
fmul v1.2d, v1.2d, v1.2d
faddp TMPF, v1.2d
fadd SSQ, SSQ, TMPF
#endif
.endm

/* KERNEL_F8 (znrm2): accumulate squared magnitudes of 8 complex
   elements (16 scalars) into the vector accumulators v0 and v5;
   they are merged and reduced in nrm2_kernel_F8_FINALIZE. */
.macro KERNEL_F8
#if !defined(DOUBLE)
ld1 {v1.4s, v2.4s}, [X], #32
fmla v0.4s, v1.4s, v1.4s
fmla v5.4s, v2.4s, v2.4s
ld1 {v3.4s,v4.4s}, [X], #32
fmla v0.4s, v3.4s, v3.4s
fmla v5.4s, v4.4s, v4.4s
PRFM PLDL1KEEP, [X, #1024]
#else // DOUBLE
ld1 {v1.2d, v2.2d}, [X], #32
fmla v0.2d, v1.2d, v1.2d
fmla v5.2d, v2.2d, v2.2d
ld1 {v3.2d, v4.2d}, [X], #32
fmla v0.2d, v3.2d, v3.2d
fmla v5.2d, v4.2d, v4.2d

ld1 {v16.2d, v17.2d}, [X], #32
fmla v0.2d, v16.2d, v16.2d
fmla v5.2d, v17.2d, v17.2d
ld1 {v18.2d, v19.2d}, [X], #32
fmla v0.2d, v18.2d, v18.2d
fmla v5.2d, v19.2d, v19.2d
#endif
.endm

/* Reduce the vector accumulators v0 + v5 horizontally into the
   scalar SSQ so the scalar tail loop can keep adding to it. */
.macro nrm2_kernel_F8_FINALIZE
#if !defined(DOUBLE)
fadd v0.4s, v0.4s, v5.4s // merge accumulators
ext v1.16b, v0.16b, v0.16b, #8 // swap halves
fadd v0.2s, v0.2s, v1.2s // 4 lanes -> 2
faddp SSQ, v0.2s // 2 lanes -> 1
#else
fadd v0.2d, v0.2d, v5.2d
faddp SSQ, v0.2d
#endif
.endm

/* INIT_S (znrm2): convert INC_X to a byte stride (8 bytes per
   single complex, 16 per double complex) and consume the first
   element so the strided loop can run N-1 more times. */
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
ld1 {v1.2s}, [X], INC_X
fmul v1.2s, v1.2s, v1.2s
faddp SSQ, v1.2s
#else
lsl INC_X, INC_X, #4
ld1 {v1.2d}, [X], INC_X
fmul v1.2d, v1.2d, v1.2d
faddp SSQ, v1.2d
#endif
.endm

/* KERNEL_S1 (znrm2): strided variant of KERNEL_F1 -- one complex
   element's squared magnitude added to SSQ, X advanced by INC_X. */
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], INC_X
fmul v1.2s, v1.2s, v1.2s
faddp TMPF, v1.2s
fadd SSQ, SSQ, TMPF
#else
ld1 {v1.2d}, [X], INC_X
fmul v1.2d, v1.2d, v1.2d
faddp TMPF, v1.2d
fadd SSQ, SSQ, TMPF
#endif
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

/* znrm2 entry point: returns sqrt(sum |x_i|^2) in s0/d0.
   NOTE(review): this is the naive sum-of-squares formulation with no
   scaling pass, so it can overflow/underflow for extreme magnitudes
   (unlike the classic LAPACK-style scaled algorithm) -- confirm this
   is acceptable for this target before relying on it for edge inputs. */
PROLOGUE

// Zero both accumulators (SSQ and the second vector accumulator v5).
#if !defined(DOUBLE)
fmov SSQ, wzr
fmov s5, SSQ
#else
fmov SSQ, xzr
fmov d5, SSQ
#endif

// N <= 0 or INC_X <= 0: return 0 (SSQ already zero).
cmp N, xzr
ble nrm2_kernel_zero
cmp INC_X, xzr
ble nrm2_kernel_zero
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN

nrm2_kernel_F_BEGIN: // contiguous path

asr I, N, #3 // I = N / 8 unrolled iterations
cmp I, xzr
beq nrm2_kernel_F1_INIT

nrm2_kernel_F8:

KERNEL_F8

subs I, I, #1
bne nrm2_kernel_F8

nrm2_kernel_F8_FINALIZE // reduce vector accumulators into SSQ

nrm2_kernel_F1:

ands I, N, #7 // remainder elements
ble nrm2_kernel_L999

nrm2_kernel_F10:

KERNEL_F1

subs I, I, #1
bne nrm2_kernel_F10

b nrm2_kernel_L999

nrm2_kernel_F1_INIT: // N < 8: skip straight to the scalar loop

b nrm2_kernel_F1

nrm2_kernel_S_BEGIN: // strided path

INIT_S // consumes the first element

subs N, N, #1
ble nrm2_kernel_L999

asr I, N, #2
cmp I, xzr
ble nrm2_kernel_S1

nrm2_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne nrm2_kernel_S4

nrm2_kernel_S1:

ands I, N, #3
ble nrm2_kernel_L999

nrm2_kernel_S10:

KERNEL_S1

subs I, I, #1
bne nrm2_kernel_S10

nrm2_kernel_L999:
fsqrt SSQ, SSQ // result = sqrt(sum of squares), returned in s0/d0
ret

nrm2_kernel_zero: // empty/invalid input: return 0.0
ret

EPILOGUE

+ 256
- 0
kernel/arm64/zrot.S View File

@@ -0,0 +1,256 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define Y x3 /* Y vector address */
#define INC_Y x4 /* Y stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define C s0 /* scale input value */
#define S s1 /* scale input value */
#else
#define C d0 /* scale input value */
#define S d1 /* scale input value */
#endif

/******************************************************************************/

/* INIT (zrot): broadcast the real rotation scalars C and S across
   both lanes so one 2-lane op handles a full complex element. */
.macro INIT

#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // [C, C]
ins v1.s[1], v1.s[0] // [S, S]
#else
ins v0.d[1], v0.d[0] // [C, C]
ins v1.d[1], v1.d[0] // [S, S]
#endif

.endm

/* KERNEL_F1 (zrot): apply the plane rotation to one complex element:
   X' = C*X + S*Y,  Y' = C*Y - S*X  (C, S are real, so the same
   factor applies to both the real and imaginary lanes). */
.macro KERNEL_F1

#if !defined(DOUBLE)
ld1 {v2.2s}, [X]
ld1 {v3.2s}, [Y]
fmul v4.2s, v0.2s, v2.2s // [C*X1, C*X0]
fmla v4.2s, v1.2s, v3.2s // [C*X1 + S*Y1, C*X0 + S*Y0]
fmul v5.2s, v0.2s, v3.2s // [C*Y1, C*Y0]
fmls v5.2s, v1.2s, v2.2s // [C*Y1 - S*X1, C*Y0 - S*X0]
st1 {v4.2s}, [X], #8
st1 {v5.2s}, [Y], #8
#else
ld1 {v2.2d}, [X]
ld1 {v3.2d}, [Y]
fmul v4.2d, v0.2d, v2.2d // [C*X1, C*X0]
fmla v4.2d, v1.2d, v3.2d // [C*X1 + S*Y1, C*X0 + S*Y0]
fmul v5.2d, v0.2d, v3.2d // [C*Y1, C*Y0]
fmls v5.2d, v1.2d, v2.2d // [C*Y1 - S*X1, C*Y0 - S*X0]
st1 {v4.2d}, [X], #16
st1 {v5.2d}, [Y], #16
#endif

.endm

/* KERNEL_INIT_F4 (zrot): widen the C/S broadcasts from 2 to 4 lanes
   for the single-precision vectorized loop (double already uses
   full 2-lane vectors, so nothing is needed there). */
.macro KERNEL_INIT_F4

#if !defined(DOUBLE)
ins v0.d[1], v0.d[0] // [C, C, C, C]
ins v1.d[1], v1.d[0] // [S, S, S, S]
#endif

.endm

/* KERNEL_F4 (zrot): apply the rotation to 4 complex elements of
   contiguous X and Y (8 scalars; the double path does it in two
   4-scalar passes).  Same math as KERNEL_F1, vectorized. */
.macro KERNEL_F4

#if !defined(DOUBLE)
ld1 {v2.4s, v3.4s}, [X]
ld1 {v4.4s, v5.4s}, [Y]
fmul v6.4s, v0.4s, v2.4s // C*X3, C*X2, C*X1, C*X0
fmul v7.4s, v0.4s, v3.4s // C*X7, C*X6, C*X5, C*X4
fmla v6.4s, v1.4s, v4.4s // C*X3+S*Y3, ..., C*X0+S*Y0
fmla v7.4s, v1.4s, v5.4s // C*X7+S*Y7, ..., C*X4+S*Y4
fmul v16.4s, v0.4s, v4.4s // C*Y3, C*Y2, C*Y1, C*Y0
fmul v17.4s, v0.4s, v5.4s // C*Y7, C*Y6, C*Y5, C*Y4
fmls v16.4s, v1.4s, v2.4s // C*Y3-S*X3, ..., C*Y0-S*X0
fmls v17.4s, v1.4s, v3.4s // C*Y7-S*X7, ..., C*Y4-S*X4
st1 {v6.4s,v7.4s}, [X], #32
st1 {v16.4s,v17.4s}, [Y], #32
#else // DOUBLE
ld1 {v2.2d, v3.2d}, [X]
ld1 {v4.2d, v5.2d}, [Y]
fmul v6.2d, v0.2d, v2.2d // C*X3, C*X2, C*X1, C*X0
fmul v7.2d, v0.2d, v3.2d // C*X7, C*X6, C*X5, C*X4
fmla v6.2d, v1.2d, v4.2d // C*X3+S*Y3, ..., C*X0+S*Y0
fmla v7.2d, v1.2d, v5.2d // C*X7+S*Y7, ..., C*X4+S*Y4
fmul v16.2d, v0.2d, v4.2d // C*Y3, C*Y2, C*Y1, C*Y0
fmul v17.2d, v0.2d, v5.2d // C*Y7, C*Y6, C*Y5, C*Y4
fmls v16.2d, v1.2d, v2.2d // C*Y3-S*X3, ..., C*Y0-S*X0
fmls v17.2d, v1.2d, v3.2d // C*Y7-S*X7, ..., C*Y4-S*X4
st1 {v6.2d,v7.2d}, [X], #32
st1 {v16.2d,v17.2d}, [Y], #32
// Second half: next 2 complex elements.
ld1 {v2.2d, v3.2d}, [X]
ld1 {v4.2d, v5.2d}, [Y]
fmul v6.2d, v0.2d, v2.2d // C*X3, C*X2, C*X1, C*X0
fmul v7.2d, v0.2d, v3.2d // C*X7, C*X6, C*X5, C*X4
fmla v6.2d, v1.2d, v4.2d // C*X3+S*Y3, ..., C*X0+S*Y0
fmla v7.2d, v1.2d, v5.2d // C*X7+S*Y7, ..., C*X4+S*Y4
fmul v16.2d, v0.2d, v4.2d // C*Y3, C*Y2, C*Y1, C*Y0
fmul v17.2d, v0.2d, v5.2d // C*Y7, C*Y6, C*Y5, C*Y4
fmls v16.2d, v1.2d, v2.2d // C*Y3-S*X3, ..., C*Y0-S*X0
fmls v17.2d, v1.2d, v3.2d // C*Y7-S*X7, ..., C*Y4-S*X4
st1 {v6.2d,v7.2d}, [X], #32
st1 {v16.2d,v17.2d}, [Y], #32
#endif

.endm

/* INIT_S (zrot): convert both strides from complex-element counts
   to byte strides (8 bytes single / 16 bytes double per complex). */
.macro INIT_S

#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#else
lsl INC_X, INC_X, #4
lsl INC_Y, INC_Y, #4
#endif

.endm

/* KERNEL_S1 (zrot): strided variant of KERNEL_F1 -- one complex
   element rotated in place, pointers advanced by INC_X/INC_Y. */
.macro KERNEL_S1

#if !defined(DOUBLE)
ld1 {v2.2s}, [X]
ld1 {v3.2s}, [Y]
fmul v4.2s, v0.2s, v2.2s // [C*X1, C*X0]
fmla v4.2s, v1.2s, v3.2s // [C*X1 + S*Y1, C*X0 + S*Y0]
fmul v5.2s, v0.2s, v3.2s // [C*Y1, C*Y0]
fmls v5.2s, v1.2s, v2.2s // [C*Y1 - S*X1, C*Y0 - S*X0]
st1 {v4.2s}, [X], INC_X
st1 {v5.2s}, [Y], INC_Y
#else
ld1 {v2.2d}, [X]
ld1 {v3.2d}, [Y]
fmul v4.2d, v0.2d, v2.2d // [C*X1, C*X0]
fmla v4.2d, v1.2d, v3.2d // [C*X1 + S*Y1, C*X0 + S*Y0]
fmul v5.2d, v0.2d, v3.2d // [C*Y1, C*Y0]
fmls v5.2d, v1.2d, v2.2d // [C*Y1 - S*X1, C*Y0 - S*X0]
st1 {v4.2d}, [X], INC_X
st1 {v5.2d}, [Y], INC_Y
#endif

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

/* zrot entry point: apply the Givens rotation (C, S) to the complex
   vectors X and Y in place.  Fast path (F_*) requires both strides
   == 1; strided path (S_*) handles everything else. */
PROLOGUE

cmp N, xzr
ble rot_kernel_L999 // nothing to do

INIT // broadcast C and S to 2 lanes

cmp INC_X, #1
bne rot_kernel_S_BEGIN
cmp INC_Y, #1
bne rot_kernel_S_BEGIN

rot_kernel_F_BEGIN: // contiguous path

asr I, N, #2 // I = N / 4 vectorized iterations
cmp I, xzr
beq rot_kernel_F1

KERNEL_INIT_F4 // widen broadcasts to 4 lanes (single prec.)

rot_kernel_F4:

KERNEL_F4

subs I, I, #1
bne rot_kernel_F4

rot_kernel_F1:

ands I, N, #3 // remainder elements
ble rot_kernel_L999

rot_kernel_F10:

KERNEL_F1

subs I, I, #1
bne rot_kernel_F10

mov w0, wzr // return 0
ret

rot_kernel_S_BEGIN: // strided path

INIT_S

asr I, N, #2
cmp I, xzr
ble rot_kernel_S1

rot_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne rot_kernel_S4

rot_kernel_S1:

ands I, N, #3
ble rot_kernel_L999

rot_kernel_S10:

KERNEL_S1

subs I, I, #1
bne rot_kernel_S10

rot_kernel_L999:

mov w0, wzr // return 0
ret

+ 274
- 0
kernel/arm64/zscal.S View File

@@ -0,0 +1,274 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x3 /* X vector address */
#define INC_X x4 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define DA_R s0 /* real scale input value */
#define DA_I s1 /* imaginary scale input value */
#else
#define DA_R d0 /* real scale input value */
#define DA_I d1 /* imaginary scale input value */
#endif

/******************************************************************************/

/* INIT (zscal): prepare the complex scale factor for vector use:
   v0 = [DA_R, DA_R], v1 = [DA_I, -DA_I] (lanes written high, low),
   so X' = v0 * [X_I, X_R] + v1 * [X_R, X_I] gives the complex
   product DA * X in one fmul + fmla. */
.macro INIT

#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
fneg s2, DA_I
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I
ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I
#else
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
fneg d2, DA_I
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I
#endif

.endm

/* KERNEL_F1 (zscal): scale one contiguous complex element in place:
   X' = DA * X, using v0/v1 from INIT and a half-swapped copy of X. */
.macro KERNEL_F1

#if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0
ext v3.8b, v2.8b, v2.8b, #4 // X0, X1
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
st1 {v2.2s}, [X], #8
#else
ld1 {v2.2d}, [X] // X1, X0
ext v3.16b, v2.16b, v2.16b, #8 // X0, X1
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0
fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
st1 {v2.2d}, [X], #16
#endif

.endm

/* KERNEL_INIT_F4 (zscal): widen the scale vectors from 2 to 4 lanes
   for the single-precision vectorized loop (double needs nothing). */
.macro KERNEL_INIT_F4

#if !defined(DOUBLE)
// Replicate the lower 2 floats into the upper 2 slots
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I
#endif

.endm

/* KERNEL_F4 (zscal): scale 4 contiguous complex elements in place.
   Single precision builds the swapped real/imag companion with
   ext + 2 lane moves per 4-float group; double uses one ext per
   complex pair.  Same math as KERNEL_F1, vectorized. */
.macro KERNEL_F4

#if !defined(DOUBLE)
ld1 {v2.4s,v3.4s}, [X] // V2 = X[3], X[2], X[1], X[0]
// V3 = X[7], X[6], X[5], X[4]

ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1]
ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1]
ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1]
fmul v2.4s, v0.4s, v2.4s // X'[ix] += DA_R * X[ix]
// X'[ix+1] += DA_R * X[ix+1]
fmla v2.4s, v1.4s, v6.4s // X'[ix] += -DA_I * X[ix+1]
// X'[ix+1] += DA_I * X[ix]

ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5]
ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5]
ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5]
fmul v3.4s, v0.4s, v3.4s // X'[ix] += DA_R * X[ix]
// X'[ix+1] += DA_R * X[ix+1]
fmla v3.4s, v1.4s, v7.4s // X'[ix] += -DA_I * X[ix+1]
// X'[ix+1] += DA_I * X[ix]

st1 {v2.4s,v3.4s}, [X], #32
#else // DOUBLE
ld1 {v2.2d,v3.2d,v4.2d,v5.2d}, [X] // CX0, CX1, CX2, CX3
ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1]
ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1]
ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1]
ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1]

fmul v2.2d, v0.2d, v2.2d
fmla v2.2d, v1.2d, v20.2d

fmul v3.2d, v0.2d, v3.2d
fmla v3.2d, v1.2d, v21.2d
st1 {v2.2d,v3.2d}, [X], #32

fmul v4.2d, v0.2d, v4.2d
fmla v4.2d, v1.2d, v22.2d

fmul v5.2d, v0.2d, v5.2d
fmla v5.2d, v1.2d, v23.2d
st1 {v4.2d,v5.2d}, [X], #32
#endif
PRFM PLDL1KEEP, [X, #1024]
.endm

.macro INIT_S

#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
#else
lsl INC_X, INC_X, #4
#endif

.endm

.macro KERNEL_S1

#if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0
ext v3.8b, v2.8b, v2.8b, #4 // X0, X1
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
st1 {v2.2s}, [X], INC_X
#else
ld1 {v2.2d}, [X] // X1, X0
ext v3.16b, v2.16b, v2.16b, #8 // X0, X1
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0
fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
st1 {v2.2d}, [X], INC_X
#endif

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble zscal_kernel_L999

fcmp DA_R, #0.0
bne zscal_kernel_1

fcmp DA_I, #0.0
beq zscal_kernel_zero

// TODO: special case DA_R == 0 && DA_I != 0

zscal_kernel_1:

// TODO: special case DA_R != 0 && DA_I == 0

INIT

cmp INC_X, #1
bne zscal_kernel_S_BEGIN

zscal_kernel_F_BEGIN:

asr I, N, #2
cmp I, xzr
beq zscal_kernel_F1

KERNEL_INIT_F4

zscal_kernel_F4:

KERNEL_F4

subs I, I, #1
bne zscal_kernel_F4

zscal_kernel_F1:

ands I, N, #3
ble zscal_kernel_L999

zscal_kernel_F10:

KERNEL_F1

subs I, I, #1
bne zscal_kernel_F10

mov w0, wzr
ret

zscal_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble zscal_kernel_S1

zscal_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne zscal_kernel_S4

zscal_kernel_S1:

ands I, N, #3
ble zscal_kernel_L999

zscal_kernel_S10:

KERNEL_S1

subs I, I, #1
bne zscal_kernel_S10

zscal_kernel_L999:

mov w0, wzr
ret

zscal_kernel_zero:

INIT_S

zscal_kernel_Z1:

stp DA_R, DA_I, [X]
add X, X, INC_X
subs N, N, #1
bne zscal_kernel_Z1

mov w0, wzr
ret

EPILOGUE

+ 1893
- 0
kernel/arm64/ztrmm_kernel_4x4.S
File diff suppressed because it is too large
View File


+ 883
- 0
kernel/generic/ztrmmkernel_4x4.c View File

@@ -0,0 +1,883 @@
#include "common.h"

#define MADD_ALPHA_N_STORE(C, res, alpha) \
C[0] = res ## _r * alpha ## _r - res ## _i * alpha ## _i; \
C[1] = res ## _r * alpha ## _i + res ## _i * alpha ## _r;

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define MADD(res, op1, op2) \
res ## _r += op1 ## _r * op2 ## _r; \
res ## _r -= op1 ## _i * op2 ## _i; \
res ## _i += op1 ## _r * op2 ## _i; \
res ## _i += op1 ## _i * op2 ## _r;
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define MADD(res, op1, op2) \
res ## _r += op1 ## _r * op2 ## _r; \
res ## _r += op1 ## _i * op2 ## _i; \
res ## _i -= op1 ## _r * op2 ## _i; \
res ## _i += op1 ## _i * op2 ## _r;
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define MADD(res, op1, op2) \
res ## _r += op1 ## _r * op2 ## _r; \
res ## _r += op1 ## _i * op2 ## _i; \
res ## _i += op1 ## _r * op2 ## _i; \
res ## _i -= op1 ## _i * op2 ## _r;
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define MADD(res, op1, op2) \
res ## _r += op1 ## _r * op2 ## _r; \
res ## _r -= op1 ## _i * op2 ## _i; \
res ## _i -= op1 ## _r * op2 ## _i; \
res ## _i -= op1 ## _i * op2 ## _r;
#endif

int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha_r, FLOAT alpha_i,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
, BLASLONG offset
)
{

BLASLONG i,j,k;
FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
FLOAT res00_r, res01_r, res02_r, res03_r;
FLOAT res00_i, res01_i, res02_i, res03_i;
FLOAT res10_r, res11_r, res12_r, res13_r;
FLOAT res10_i, res11_i, res12_i, res13_i;
FLOAT res20_r, res21_r, res22_r, res23_r;
FLOAT res20_i, res21_i, res22_i, res23_i;
FLOAT res30_r, res31_r, res32_r, res33_r;
FLOAT res30_i, res31_i, res32_i, res33_i;
FLOAT a0_r, a1_r;
FLOAT a0_i, a1_i;
FLOAT b0_r, b1_r, b2_r, b3_r;
FLOAT b0_i, b1_i, b2_i, b3_i;
BLASLONG off, temp;

#if defined(TRMMKERNEL) && !defined(LEFT)
off = -offset;
#endif

for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops
{
C0 = C;
C1 = C0+2*ldc;
C2 = C1+2*ldc;
C3 = C2+2*ldc;


#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif

ptrba = ba;

for (i=0; i<bm/4; i+=1) // do blocks of 4x4
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else

ptrba += off*4*2; // number of values in A
ptrbb = bb + off*4*2; // number of values in B
#endif

res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;
res02_r = 0;
res02_i = 0;
res03_r = 0;
res03_i = 0;

res10_r = 0;
res10_i = 0;
res11_r = 0;
res11_i = 0;
res12_r = 0;
res12_i = 0;
res13_r = 0;
res13_i = 0;

res20_r = 0;
res20_i = 0;
res21_r = 0;
res21_i = 0;
res22_r = 0;
res22_i = 0;
res23_r = 0;
res23_i = 0;

res30_r = 0;
res30_i = 0;
res31_r = 0;
res31_i = 0;
res32_r = 0;
res32_i = 0;
res33_r = 0;
res33_i = 0;

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk - off;
#elif defined(LEFT)
temp = off + 4;
#else
temp = off + 4;
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1];
b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);
MADD(res20, a0, b2);
MADD(res30, a0, b3);

a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);
MADD(res11, a1, b1);
MADD(res21, a1, b2);
MADD(res31, a1, b3);

a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1];
MADD(res02, a0, b0);
MADD(res12, a0, b1);
MADD(res22, a0, b2);
MADD(res32, a0, b3);


a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1];
MADD(res03, a1, b0);
MADD(res13, a1, b1);
MADD(res23, a1, b2);
MADD(res33, a1, b3);

ptrba = ptrba+8;
ptrbb = ptrbb+8;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res02, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res03, alpha);
C0 = C0 + 2;

MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res11, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res12, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res13, alpha);
C1 = C1 + 2;

MADD_ALPHA_N_STORE(C2, res20, alpha);
C2 = C2 + 2;
MADD_ALPHA_N_STORE(C2, res21, alpha);
C2 = C2 + 2;
MADD_ALPHA_N_STORE(C2, res22, alpha);
C2 = C2 + 2;
MADD_ALPHA_N_STORE(C2, res23, alpha);
C2 = C2 + 2;

MADD_ALPHA_N_STORE(C3, res30, alpha);
C3 = C3 + 2;
MADD_ALPHA_N_STORE(C3, res31, alpha);
C3 = C3 + 2;
MADD_ALPHA_N_STORE(C3, res32, alpha);
C3 = C3 + 2;
MADD_ALPHA_N_STORE(C3, res33, alpha);
C3 = C3 + 2;


#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk-off;
#if defined(LEFT)
temp = temp - 4;
#else
temp = temp - 4;
#endif
ptrba += temp*4*2; // number of values in A
ptrbb += temp*4*2; // number of values in B
#endif
#ifdef LEFT
off += 4; // number of values in A
#endif


}

if ( bm & 2 ) // do any 2x4 loop
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2*2;
ptrbb = bb + off*4*2;
#endif


res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;

res10_r = 0;
res10_i = 0;
res11_r = 0;
res11_i = 0;

res20_r = 0;
res20_i = 0;
res21_r = 0;
res21_i = 0;

res30_r = 0;
res30_i = 0;
res31_r = 0;
res31_i = 0;

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+4; // number of values in B
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1];
b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);
MADD(res20, a0, b2);
MADD(res30, a0, b3);

a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);
MADD(res11, a1, b1);
MADD(res21, a1, b2);
MADD(res31, a1, b3);


ptrba = ptrba+4;
ptrbb = ptrbb+8;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;

MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res11, alpha);
C1 = C1 + 2;

MADD_ALPHA_N_STORE(C2, res20, alpha);
C2 = C2 + 2;
MADD_ALPHA_N_STORE(C2, res21, alpha);
C2 = C2 + 2;

MADD_ALPHA_N_STORE(C3, res30, alpha);
C3 = C3 + 2;
MADD_ALPHA_N_STORE(C3, res31, alpha);
C3 = C3 + 2;





#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 4; // number of values in B
#endif
ptrba += temp*2*2;
ptrbb += temp*4*2;
#endif

#ifdef LEFT
off += 2; // number of values in A
#endif


}

if ( bm & 1 ) // do any 1x4 loop
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1*2;
ptrbb = bb + off*4*2;
#endif

res00_r = 0;
res00_i = 0;
res10_r = 0;
res10_i = 0;
res20_r = 0;
res20_i = 0;
res30_r = 0;
res30_i = 0;


#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+4; // number of values in B
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1];
b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);
MADD(res20, a0, b2);
MADD(res30, a0, b3);


ptrba = ptrba+2;
ptrbb = ptrbb+8;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;

MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;

MADD_ALPHA_N_STORE(C2, res20, alpha);
C2 = C2 + 2;

MADD_ALPHA_N_STORE(C3, res30, alpha);
C3 = C3 + 2;


#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 4; // number of values in B
#endif
ptrba += temp*1*2;
ptrbb += temp*4*2;
#endif

#ifdef LEFT
off += 1; // number of values in A
#endif


}


#if defined(TRMMKERNEL) && !defined(LEFT)
off += 4;
#endif

k = (bk<<3);
bb = bb+k;
i = (ldc<<3);
C = C+i;
}

for (j=0; j<(bn&2); j+=2) // do the Mx2 loops
{
C0 = C;
C1 = C0+ldc*2;

#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif


ptrba = ba;

for (i=0; i<bm/4; i+=1) // do blocks of 4x2
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*4*2;
ptrbb = bb + off*2*2;
#endif

res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;
res02_r = 0;
res02_i = 0;
res03_r = 0;
res03_i = 0;

res10_r = 0;
res10_i = 0;
res11_r = 0;
res11_i = 0;
res12_r = 0;
res12_i = 0;
res13_r = 0;
res13_i = 0;


#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+2; // number of values in B
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);

a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);
MADD(res11, a1, b1);

a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1];
MADD(res02, a0, b0);
MADD(res12, a0, b1);

a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1];
MADD(res03, a1, b0);
MADD(res13, a1, b1);

ptrba = ptrba+8;
ptrbb = ptrbb+4;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res02, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res03, alpha);
C0 = C0 + 2;

MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res11, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res12, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res13, alpha);
C1 = C1 + 2;


#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*4*2;
ptrbb += temp*2*2;
#endif

#ifdef LEFT
off += 4; // number of values in A
#endif

}

if ( bm & 2 ) // do any 2x2 loop
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2*2;
ptrbb = bb + off*2*2;
#endif

res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;

res10_r = 0;
res10_i = 0;
res11_r = 0;
res11_i = 0;


#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+2; // number of values in B
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);

a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);
MADD(res11, a1, b1);


ptrba = ptrba+4;
ptrbb = ptrbb+4;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;

MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res11, alpha);
C1 = C1 + 2;

#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*2*2;
ptrbb += temp*2*2;
#endif

#ifdef LEFT
off += 2; // number of values in A
#endif

}

if ( bm & 1 ) // do any 1x2 loop
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1*2;
ptrbb = bb + off*2*2;
#endif

res00_r = 0;
res00_i = 0;

res10_r = 0;
res10_i = 0;


#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+2; // number of values in B
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);

ptrba = ptrba+2;
ptrbb = ptrbb+4;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;

MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;

#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*1*2;
ptrbb += temp*2*2;
#endif

#ifdef LEFT
off += 1; // number of values in A
#endif

}


#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2;
#endif

k = (bk<<2);
bb = bb+k;
i = (ldc<<2);
C = C+i;
}







for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
{
C0 = C;

#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif

ptrba = ba;

for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*4*2;
ptrbb = bb + off*1*2;
#endif

res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;
res02_r = 0;
res02_i = 0;
res03_r = 0;
res03_i = 0;


#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+1; // number of values in B
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);

a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);

a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1];
MADD(res02, a0, b0);

a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1];
MADD(res03, a1, b0);

ptrba = ptrba+8;
ptrbb = ptrbb+2;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res02, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res03, alpha);
C0 = C0 + 2;


#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*4*2;
ptrbb += temp*1*2;
#endif

#ifdef LEFT
off += 4; // number of values in A
#endif

}

if ( bm & 2 ) // do any 2x1 loop
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2*2;
ptrbb = bb + off*1*2;
#endif

res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+1; // number of values in B
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);

a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);


ptrba = ptrba+4;
ptrbb = ptrbb+2;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;


#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*2*2;
ptrbb += temp*1*2;
#endif

#ifdef LEFT
off += 2; // number of values in A
#endif

}

if ( bm & 1 ) // do any 1x1 loop
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1*2;
ptrbb = bb + off*1*2;
#endif

res00_r = 0;
res00_i = 0;


#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+1; // number of values in B
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);

ptrba = ptrba+2;
ptrbb = ptrbb+2;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;

#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*1*2;
ptrbb += temp*1*2;
#endif

#ifdef LEFT
off += 1; // number of values in A
#endif

}



#if defined(TRMMKERNEL) && !defined(LEFT)
off += 1;
#endif

k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
return 0;
}

+ 40
- 0
param.h View File

@@ -2214,6 +2214,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_R 4096




#define SYMV_P 16
#endif


#if defined(CORTEXA57)
#define SNUMOPT 2
#define DNUMOPT 2

#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL

#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4

#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_N 4

#define CGEMM_DEFAULT_UNROLL_M 4
#define CGEMM_DEFAULT_UNROLL_N 4

#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4

#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 256
#define CGEMM_DEFAULT_P 256
#define ZGEMM_DEFAULT_P 128

#define SGEMM_DEFAULT_Q 240
#define DGEMM_DEFAULT_Q 1024
#define CGEMM_DEFAULT_Q 1024
#define ZGEMM_DEFAULT_Q 512

#define SGEMM_DEFAULT_R 12288
#define DGEMM_DEFAULT_R 4096
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 2048


#define SYMV_P 16
#endif




Loading…
Cancel
Save