Browse Source

Fix #686. Merge branch 'ashwinyes-develop' into develop

tags/v0.2.16.rc1
Zhang Xianyi 10 years ago
parent
commit
e31948ceb0
89 changed files with 19268 additions and 771 deletions
  1. +1
    -0
      .gitignore
  2. +4
    -0
      Makefile.arm64
  3. +2
    -0
      TargetList.txt
  4. +1
    -1
      benchmark/gemm.c
  5. +1
    -0
      common.h
  6. +7
    -1
      common_arm64.h
  7. +53
    -31
      cpuid_arm64.c
  8. +1
    -2
      driver/level2/spmv_thread.c
  9. +1
    -2
      driver/level2/spr2_thread.c
  10. +2
    -2
      driver/level2/spr_thread.c
  11. +1
    -2
      driver/level2/symv_thread.c
  12. +0
    -2
      driver/level2/tbmv_L.c
  13. +0
    -2
      driver/level2/tbmv_U.c
  14. +0
    -2
      driver/level2/tbsv_L.c
  15. +0
    -2
      driver/level2/tbsv_U.c
  16. +0
    -2
      driver/level2/tpsv_L.c
  17. +0
    -2
      driver/level2/tpsv_U.c
  18. +0
    -2
      driver/level2/ztbmv_L.c
  19. +0
    -2
      driver/level2/ztbmv_U.c
  20. +0
    -2
      driver/level2/ztbsv_L.c
  21. +0
    -2
      driver/level2/ztbsv_U.c
  22. +0
    -2
      driver/level2/ztpmv_L.c
  23. +0
    -2
      driver/level2/ztpmv_U.c
  24. +0
    -2
      driver/level2/ztpsv_L.c
  25. +0
    -2
      driver/level2/ztpsv_U.c
  26. +1
    -1
      driver/level3/gemm_thread_mn.c
  27. +2
    -4
      driver/level3/level3_thread.c
  28. +0
    -1
      driver/others/memory.c
  29. +16
    -2
      getarch.c
  30. +23
    -42
      interface/symm.c
  31. +1
    -2
      interface/syr.c
  32. +1
    -2
      interface/syr2.c
  33. +1
    -2
      interface/zhemv.c
  34. +1
    -2
      interface/zher.c
  35. +1
    -2
      interface/zher2.c
  36. +18
    -18
      kernel/Makefile.L1
  37. +4
    -2
      kernel/arm/amax.c
  38. +4
    -2
      kernel/arm/amin.c
  39. +1
    -1
      kernel/arm/asum.c
  40. +4
    -2
      kernel/arm/iamax.c
  41. +3
    -1
      kernel/arm/iamin.c
  42. +3
    -1
      kernel/arm/imax.c
  43. +3
    -1
      kernel/arm/imin.c
  44. +7
    -7
      kernel/arm/izamax.c
  45. +7
    -7
      kernel/arm/izamin.c
  46. +3
    -1
      kernel/arm/max.c
  47. +3
    -1
      kernel/arm/min.c
  48. +1
    -1
      kernel/arm/nrm2.c
  49. +8
    -10
      kernel/arm/zamax.c
  50. +8
    -10
      kernel/arm/zamin.c
  51. +2
    -1
      kernel/arm/zasum.c
  52. +2
    -4
      kernel/arm/zaxpby.c
  53. +1
    -1
      kernel/arm/znrm2.c
  54. +91
    -0
      kernel/arm64/KERNEL.CORTEXA57
  55. +249
    -0
      kernel/arm64/amax.S
  56. +194
    -0
      kernel/arm64/asum.S
  57. +209
    -0
      kernel/arm64/axpy.S
  58. +170
    -0
      kernel/arm64/casum.S
  59. +1667
    -0
      kernel/arm64/cgemm_kernel_4x4.S
  60. +232
    -0
      kernel/arm64/copy.S
  61. +1621
    -0
      kernel/arm64/ctrmm_kernel_4x4.S
  62. +1338
    -0
      kernel/arm64/dgemm_kernel_4x4.S
  63. +169
    -0
      kernel/arm64/dnrm2.S
  64. +227
    -0
      kernel/arm64/dot.S
  65. +1398
    -0
      kernel/arm64/dtrmm_kernel_4x4.S
  66. +320
    -0
      kernel/arm64/gemv_n.S
  67. +347
    -0
      kernel/arm64/gemv_t.S
  68. +124
    -0
      kernel/arm64/idamax.S
  69. +213
    -0
      kernel/arm64/isamax.S
  70. +151
    -0
      kernel/arm64/izamax.S
  71. +243
    -0
      kernel/arm64/rot.S
  72. +253
    -0
      kernel/arm64/scal.S
  73. +807
    -571
      kernel/arm64/sgemm_kernel_4x4.S
  74. +178
    -0
      kernel/arm64/snrm2.S
  75. +1405
    -0
      kernel/arm64/strmm_kernel_4x4.S
  76. +266
    -0
      kernel/arm64/swap.S
  77. +273
    -0
      kernel/arm64/zamax.S
  78. +164
    -0
      kernel/arm64/zasum.S
  79. +301
    -0
      kernel/arm64/zaxpy.S
  80. +302
    -0
      kernel/arm64/zdot.S
  81. +1617
    -0
      kernel/arm64/zgemm_kernel_4x4.S
  82. +514
    -0
      kernel/arm64/zgemv_n.S
  83. +448
    -0
      kernel/arm64/zgemv_t.S
  84. +228
    -0
      kernel/arm64/znrm2.S
  85. +256
    -0
      kernel/arm64/zrot.S
  86. +274
    -0
      kernel/arm64/zscal.S
  87. +1893
    -0
      kernel/arm64/ztrmm_kernel_4x4.S
  88. +883
    -0
      kernel/generic/ztrmmkernel_4x4.c
  89. +40
    -0
      param.h

+ 1
- 0
.gitignore View File

@@ -68,3 +68,4 @@ test/zblat2
test/zblat3 test/zblat3
build build
build.* build.*
*.swp

+ 4
- 0
Makefile.arm64 View File

@@ -4,4 +4,8 @@ CCOMMON_OPT += -march=armv8-a
FCOMMON_OPT += -march=armv8-a FCOMMON_OPT += -march=armv8-a
endif endif


ifeq ($(CORE), CORTEXA57)
CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57
endif



+ 2
- 0
TargetList.txt View File

@@ -74,3 +74,5 @@ ARMV5


7.ARM 64-bit CPU: 7.ARM 64-bit CPU:
ARMV8 ARMV8
CORTEXA57


+ 1
- 1
benchmark/gemm.c View File

@@ -172,7 +172,7 @@ int main(int argc, char *argv[]){
srandom(getpid()); srandom(getpid());
#endif #endif
for(j = 0; j < m; j++){
for(j = 0; j < to; j++){
for(i = 0; i < to * COMPSIZE; i++){ for(i = 0; i < to * COMPSIZE; i++){
a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;


+ 1
- 0
common.h View File

@@ -86,6 +86,7 @@ extern "C" {
#if !defined(_MSC_VER) #if !defined(_MSC_VER)
#include <unistd.h> #include <unistd.h>
#endif #endif
#include <time.h>


#ifdef OS_LINUX #ifdef OS_LINUX
#include <malloc.h> #include <malloc.h>


+ 7
- 1
common_arm64.h View File

@@ -89,8 +89,10 @@ static inline int blas_quickdivide(blasint x, blasint y){
#if defined(ASSEMBLER) && !defined(NEEDPARAM) #if defined(ASSEMBLER) && !defined(NEEDPARAM)


#define PROLOGUE \ #define PROLOGUE \
.text ;\
.align 4 ;\
.global REALNAME ;\ .global REALNAME ;\
.func REALNAME ;\
.type REALNAME, %function ;\
REALNAME: REALNAME:


#define EPILOGUE #define EPILOGUE
@@ -107,7 +109,11 @@ REALNAME:
#endif #endif
#define HUGE_PAGESIZE ( 4 << 20) #define HUGE_PAGESIZE ( 4 << 20)


#if defined(CORTEXA57)
#define BUFFER_SIZE (40 << 20)
#else
#define BUFFER_SIZE (16 << 20) #define BUFFER_SIZE (16 << 20)
#endif




#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)


+ 53
- 31
cpuid_arm64.c View File

@@ -29,12 +29,19 @@


#define CPU_UNKNOWN 0 #define CPU_UNKNOWN 0
#define CPU_ARMV8 1 #define CPU_ARMV8 1
#define CPU_CORTEXA57 2


static char *cpuname[] = { static char *cpuname[] = {
"UNKOWN",
"ARMV8"
"UNKNOWN",
"ARMV8" ,
"CORTEXA57"
}; };


static char *cpuname_lower[] = {
"unknown",
"armv8" ,
"cortexa57"
};


int get_feature(char *search) int get_feature(char *search)
{ {
@@ -53,13 +60,13 @@ int get_feature(char *search)
{ {
p = strchr(buffer, ':') + 2; p = strchr(buffer, ':') + 2;
break; break;
}
}
}
}


fclose(infile);
fclose(infile);




if( p == NULL ) return;
if( p == NULL ) return 0;


t = strtok(p," "); t = strtok(p," ");
while( t = strtok(NULL," ")) while( t = strtok(NULL," "))
@@ -82,11 +89,30 @@ int detect(void)
p = (char *) NULL ; p = (char *) NULL ;


infile = fopen("/proc/cpuinfo", "r"); infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile))
{


if (!strncmp("CPU part", buffer, 8))
{
p = strchr(buffer, ':') + 2;
break;
}
}

fclose(infile);
if(p != NULL) {
if (strstr(p, "0xd07")) {
return CPU_CORTEXA57;
}
}

p = (char *) NULL ;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)) while (fgets(buffer, sizeof(buffer), infile))
{ {


if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)))
if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)) ||
(!strncmp("CPU architecture", buffer, 16)))
{ {
p = strchr(buffer, ':') + 2; p = strchr(buffer, ':') + 2;
break; break;
@@ -100,7 +126,7 @@ int detect(void)


if (strstr(p, "AArch64")) if (strstr(p, "AArch64"))
{ {
return CPU_ARMV8;
return CPU_ARMV8;


} }


@@ -118,23 +144,13 @@ char *get_corename(void)


void get_architecture(void) void get_architecture(void)
{ {
printf("ARM");
printf("ARM64");
} }


void get_subarchitecture(void) void get_subarchitecture(void)
{ {
int d = detect(); int d = detect();
switch (d)
{

case CPU_ARMV8:
printf("ARMV8");
break;

default:
printf("UNKNOWN");
break;
}
printf("%s", cpuname[d]);
} }


void get_subdirname(void) void get_subdirname(void)
@@ -160,26 +176,32 @@ void get_cpuconfig(void)
printf("#define L2_ASSOCIATIVE 4\n"); printf("#define L2_ASSOCIATIVE 4\n");
break; break;



case CPU_CORTEXA57:
printf("#define CORTEXA57\n");
printf("#define HAVE_VFP\n");
printf("#define HAVE_VFPV3\n");
printf("#define HAVE_NEON\n");
printf("#define HAVE_VFPV4\n");
printf("#define L1_CODE_SIZE 49152\n");
printf("#define L1_CODE_LINESIZE 64\n");
printf("#define L1_CODE_ASSOCIATIVE 3\n");
printf("#define L1_DATA_SIZE 32768\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L1_DATA_ASSOCIATIVE 2\n");
printf("#define L2_SIZE 2097152\n");
printf("#define L2_LINESIZE 64\n");
printf("#define L2_ASSOCIATIVE 16\n");
break;
} }
} }




void get_libname(void) void get_libname(void)
{ {

int d = detect(); int d = detect();
switch (d)
{

case CPU_ARMV8:
printf("armv8\n");
break;

}
printf("%s", cpuname_lower[d]);
} }



void get_features(void) void get_features(void)
{ {




+ 1
- 2
driver/level2/spmv_thread.c View File

@@ -55,7 +55,7 @@
static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){


FLOAT *a, *x, *y; FLOAT *a, *x, *y;
BLASLONG incx, incy;
BLASLONG incx;
BLASLONG m_from, m_to, i; BLASLONG m_from, m_to, i;
#ifndef COMPLEX #ifndef COMPLEX
FLOAT result; FLOAT result;
@@ -68,7 +68,6 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
y = (FLOAT *)args -> c; y = (FLOAT *)args -> c;


incx = args -> ldb; incx = args -> ldb;
incy = args -> ldc;


m_from = 0; m_from = 0;
m_to = args -> m; m_to = args -> m;


+ 1
- 2
driver/level2/spr2_thread.c View File

@@ -43,7 +43,7 @@
static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){


FLOAT *a, *x, *y; FLOAT *a, *x, *y;
BLASLONG lda, incx, incy;
BLASLONG incx, incy;
BLASLONG i, m_from, m_to; BLASLONG i, m_from, m_to;
FLOAT alpha_r; FLOAT alpha_r;
#ifdef COMPLEX #ifdef COMPLEX
@@ -56,7 +56,6 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL


incx = args -> lda; incx = args -> lda;
incy = args -> ldb; incy = args -> ldb;
lda = args -> ldc;


alpha_r = *((FLOAT *)args -> alpha + 0); alpha_r = *((FLOAT *)args -> alpha + 0);
#ifdef COMPLEX #ifdef COMPLEX


+ 2
- 2
driver/level2/spr_thread.c View File

@@ -46,7 +46,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
BLASLONG incx; BLASLONG incx;
BLASLONG i, m_from, m_to; BLASLONG i, m_from, m_to;
FLOAT alpha_r; FLOAT alpha_r;
#if defined(COMPLEX) && !defined(HER) && !defined(HERREV)
#if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV)
FLOAT alpha_i; FLOAT alpha_i;
#endif #endif


@@ -56,7 +56,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL
incx = args -> lda; incx = args -> lda;


alpha_r = *((FLOAT *)args -> alpha + 0); alpha_r = *((FLOAT *)args -> alpha + 0);
#if defined(COMPLEX) && !defined(HER) && !defined(HERREV)
#if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV)
alpha_i = *((FLOAT *)args -> alpha + 1); alpha_i = *((FLOAT *)args -> alpha + 1);
#endif #endif




+ 1
- 2
driver/level2/symv_thread.c View File

@@ -55,7 +55,7 @@
static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){


FLOAT *a, *x, *y; FLOAT *a, *x, *y;
BLASLONG lda, incx, incy;
BLASLONG lda, incx;
BLASLONG m_from, m_to; BLASLONG m_from, m_to;


a = (FLOAT *)args -> a; a = (FLOAT *)args -> a;
@@ -64,7 +64,6 @@ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F


lda = args -> lda; lda = args -> lda;
incx = args -> ldb; incx = args -> ldb;
incy = args -> ldc;


m_from = 0; m_from = 0;
m_to = args -> m; m_to = args -> m;


+ 0
- 2
driver/level2/tbmv_L.c View File

@@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;
BLASLONG length; BLASLONG length;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1); COPY_K(n, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/tbmv_U.c View File

@@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;
BLASLONG length; BLASLONG length;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1); COPY_K(n, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/tbsv_L.c View File

@@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;
BLASLONG length; BLASLONG length;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1); COPY_K(n, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/tbsv_U.c View File

@@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;
BLASLONG length; BLASLONG length;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1); COPY_K(n, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/tpsv_L.c View File

@@ -43,12 +43,10 @@
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1); COPY_K(m, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/tpsv_U.c View File

@@ -43,12 +43,10 @@
int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1); COPY_K(m, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/ztbmv_L.c View File

@@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;
BLASLONG length; BLASLONG length;
#if (TRANSA == 2) || (TRANSA == 4) #if (TRANSA == 2) || (TRANSA == 4)
@@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1); COPY_K(n, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/ztbmv_U.c View File

@@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;
BLASLONG length; BLASLONG length;
#if (TRANSA == 2) || (TRANSA == 4) #if (TRANSA == 2) || (TRANSA == 4)
@@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1); COPY_K(n, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/ztbsv_L.c View File

@@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;
BLASLONG length; BLASLONG length;
#if (TRANSA == 2) || (TRANSA == 4) #if (TRANSA == 2) || (TRANSA == 4)
@@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1); COPY_K(n, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/ztbsv_U.c View File

@@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.;
int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){


BLASLONG i; BLASLONG i;
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;
BLASLONG length; BLASLONG length;
#if (TRANSA == 2) || (TRANSA == 4) #if (TRANSA == 2) || (TRANSA == 4)
@@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095);
COPY_K(n, b, incb, buffer, 1); COPY_K(n, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/ztpmv_L.c View File

@@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
#ifndef UNIT #ifndef UNIT
FLOAT atemp1, atemp2, btemp1, btemp2; FLOAT atemp1, atemp2, btemp1, btemp2;
#endif #endif
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1); COPY_K(m, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/ztpmv_U.c View File

@@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
#ifndef UNIT #ifndef UNIT
FLOAT atemp1, atemp2, btemp1, btemp2; FLOAT atemp1, atemp2, btemp1, btemp2;
#endif #endif
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1); COPY_K(m, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/ztpsv_L.c View File

@@ -51,12 +51,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
#ifndef UNIT #ifndef UNIT
FLOAT ar, ai, br, bi, ratio, den; FLOAT ar, ai, br, bi, ratio, den;
#endif #endif
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1); COPY_K(m, b, incb, buffer, 1);
} }




+ 0
- 2
driver/level2/ztpsv_U.c View File

@@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){
#ifndef UNIT #ifndef UNIT
FLOAT ar, ai, br, bi, ratio, den; FLOAT ar, ai, br, bi, ratio, den;
#endif #endif
FLOAT *gemvbuffer = (FLOAT *)buffer;
FLOAT *B = b; FLOAT *B = b;


if (incb != 1) { if (incb != 1) {
B = buffer; B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
COPY_K(m, b, incb, buffer, 1); COPY_K(m, b, incb, buffer, 1);
} }




+ 1
- 1
driver/level3/gemm_thread_mn.c View File

@@ -65,7 +65,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int (
blas_queue_t queue[MAX_CPU_NUMBER]; blas_queue_t queue[MAX_CPU_NUMBER];


BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1]; BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1];
BLASLONG procs, total_procs, num_cpu_m, num_cpu_n;
BLASLONG procs, num_cpu_m, num_cpu_n;


BLASLONG width, i, j; BLASLONG width, i, j;
BLASLONG divM, divN; BLASLONG divM, divN;


+ 2
- 4
driver/level3/level3_thread.c View File

@@ -230,7 +230,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
BLASLONG is, min_i, div_n; BLASLONG is, min_i, div_n;


BLASLONG i, current; BLASLONG i, current;
BLASLONG l1stride, l2size;
BLASLONG l1stride;


#ifdef TIMING #ifdef TIMING
BLASULONG rpcc_counter; BLASULONG rpcc_counter;
@@ -298,8 +298,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n,
#endif #endif
) return 0; ) return 0;


l2size = GEMM_P * GEMM_Q;

#if 0 #if 0
fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n", fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n",
mypos, m_from, m_to, n_from, n_to, N_from, N_to); mypos, m_from, m_to, n_from, n_to, N_from, N_to);
@@ -706,7 +704,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
n = n_to - n_from; n = n_to - n_from;
} }


if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) {
if ((m < nthreads * SWITCH_RATIO) || (n < nthreads * SWITCH_RATIO)) {
GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); GEMM_LOCAL(args, range_m, range_n, sa, sb, 0);
return 0; return 0;
} }


+ 0
- 1
driver/others/memory.c View File

@@ -914,7 +914,6 @@ static volatile struct {
} memory[NUM_BUFFERS]; } memory[NUM_BUFFERS];


static int memory_initialized = 0; static int memory_initialized = 0;
static void gotoblas_memory_init(void);


/* Memory allocation routine */ /* Memory allocation routine */
/* procpos ... indicates where it comes from */ /* procpos ... indicates where it comes from */


+ 16
- 2
getarch.c View File

@@ -819,10 +819,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 "
#define LIBNAME "armv8" #define LIBNAME "armv8"
#define CORENAME "XGENE1"
#else
#define CORENAME "ARMV8"
#endif #endif


#ifdef FORCE_CORTEXA57
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "ARMV8"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DCORTEXA57 " \
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \
"-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON"
#define LIBNAME "cortexa57"
#define CORENAME "CORTEXA57"
#else
#endif


#ifndef FORCE #ifndef FORCE




+ 23
- 42
interface/symm.c View File

@@ -91,6 +91,27 @@
#endif #endif
#endif #endif



#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
#define MODE (BLAS_XDOUBLE | BLAS_REAL)
#elif defined(DOUBLE)
#define MODE (BLAS_DOUBLE | BLAS_REAL)
#else
#define MODE (BLAS_SINGLE | BLAS_REAL)
#endif
#else
#ifdef XDOUBLE
#define MODE (BLAS_XDOUBLE | BLAS_COMPLEX)
#elif defined(DOUBLE)
#define MODE (BLAS_DOUBLE | BLAS_COMPLEX)
#else
#define MODE (BLAS_SINGLE | BLAS_COMPLEX)
#endif
#endif
#endif

static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
#ifndef GEMM3M #ifndef GEMM3M
#ifndef HEMM #ifndef HEMM
@@ -135,26 +156,6 @@ void NAME(char *SIDE, char *UPLO,
FLOAT *buffer; FLOAT *buffer;
FLOAT *sa, *sb; FLOAT *sa, *sb;


#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif
#endif
#endif

#if defined(SMP) && !defined(NO_AFFINITY) #if defined(SMP) && !defined(NO_AFFINITY)
int nodes; int nodes;
#endif #endif
@@ -246,26 +247,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,
FLOAT *buffer; FLOAT *buffer;
FLOAT *sa, *sb; FLOAT *sa, *sb;


#ifdef SMP
#ifndef COMPLEX
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_REAL;
#elif defined(DOUBLE)
int mode = BLAS_DOUBLE | BLAS_REAL;
#else
int mode = BLAS_SINGLE | BLAS_REAL;
#endif
#else
#ifdef XDOUBLE
int mode = BLAS_XDOUBLE | BLAS_COMPLEX;
#elif defined(DOUBLE)
int mode = BLAS_DOUBLE | BLAS_COMPLEX;
#else
int mode = BLAS_SINGLE | BLAS_COMPLEX;
#endif
#endif
#endif

#if defined(SMP) && !defined(NO_AFFINITY) #if defined(SMP) && !defined(NO_AFFINITY)
int nodes; int nodes;
#endif #endif
@@ -407,7 +388,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,


args.nthreads /= nodes; args.nthreads /= nodes;


gemm_thread_mn(mode, &args, NULL, NULL,
gemm_thread_mn(MODE, &args, NULL, NULL,
symm[4 | (side << 1) | uplo ], sa, sb, nodes); symm[4 | (side << 1) | uplo ], sa, sb, nodes);


} else { } else {
@@ -419,7 +400,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo,


#else #else


GEMM_THREAD(mode, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads);
GEMM_THREAD(MODE, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads);


#endif #endif




+ 1
- 2
interface/syr.c View File

@@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) {


FLOAT *buffer; FLOAT *buffer;
int trans, uplo;
int uplo;
blasint info; blasint info;
#ifdef SMP #ifdef SMP
int nthreads; int nthreads;
@@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,


PRINT_DEBUG_CNAME; PRINT_DEBUG_CNAME;


trans = -1;
uplo = -1; uplo = -1;
info = 0; info = 0;




+ 1
- 2
interface/syr2.c View File

@@ -118,7 +118,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) {


FLOAT *buffer; FLOAT *buffer;
int trans, uplo;
int uplo;
blasint info; blasint info;
#ifdef SMP #ifdef SMP
int nthreads; int nthreads;
@@ -126,7 +126,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,


PRINT_DEBUG_CNAME; PRINT_DEBUG_CNAME;


trans = -1;
uplo = -1; uplo = -1;
info = 0; info = 0;




+ 1
- 2
interface/zhemv.c View File

@@ -117,7 +117,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
FLOAT beta_i = BETA[1]; FLOAT beta_i = BETA[1];


FLOAT *buffer; FLOAT *buffer;
int trans, uplo;
int uplo;
blasint info; blasint info;
#ifdef SMP #ifdef SMP
int nthreads; int nthreads;
@@ -135,7 +135,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA


PRINT_DEBUG_CNAME; PRINT_DEBUG_CNAME;


trans = -1;
uplo = -1; uplo = -1;
info = 0; info = 0;




+ 1
- 2
interface/zher.c View File

@@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA,
void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) {


FLOAT *buffer; FLOAT *buffer;
int trans, uplo;
int uplo;
blasint info; blasint info;
#ifdef SMP #ifdef SMP
int nthreads; int nthreads;
@@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha,


PRINT_DEBUG_CNAME; PRINT_DEBUG_CNAME;


trans = -1;
uplo = -1; uplo = -1;
info = 0; info = 0;




+ 1
- 2
interface/zher2.c View File

@@ -121,7 +121,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA
FLOAT alpha_r = ALPHA[0]; FLOAT alpha_r = ALPHA[0];
FLOAT alpha_i = ALPHA[1]; FLOAT alpha_i = ALPHA[1];
FLOAT *buffer; FLOAT *buffer;
int trans, uplo;
int uplo;
blasint info; blasint info;
#ifdef SMP #ifdef SMP
int nthreads; int nthreads;
@@ -129,7 +129,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA


PRINT_DEBUG_CNAME; PRINT_DEBUG_CNAME;


trans = -1;
uplo = -1; uplo = -1;
info = 0; info = 0;




+ 18
- 18
kernel/Makefile.L1 View File

@@ -637,49 +637,49 @@ $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@


$(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@


$(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPYKERNEL) $(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@


$(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL) $(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@


$(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) $(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@


$(KDIR)zaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) $(KDIR)zaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DDOUBLE $< -o $@


$(KDIR)xaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) $(KDIR)xaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@


$(KDIR)caxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) $(KDIR)caxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -UDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -UDOUBLE $< -o $@


$(KDIR)zaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) $(KDIR)zaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -DDOUBLE $< -o $@


$(KDIR)xaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) $(KDIR)xaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@


$(KDIR)scopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)scopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOPYKERNEL) $(KDIR)scopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)scopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DC_INTERFACE $< -o $@


$(KDIR)dcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)dcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOPYKERNEL) $(KDIR)dcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)dcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DC_INTERFACE $< -o $@


$(KDIR)qcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QCOPYKERNEL) $(KDIR)qcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QCOPYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DC_INTERFACE $< -o $@


$(KDIR)ccopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)ccopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOPYKERNEL) $(KDIR)ccopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)ccopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DC_INTERFACE $< -o $@


$(KDIR)zcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOPYKERNEL) $(KDIR)zcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DC_INTERFACE $< -o $@


$(KDIR)xcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XCOPYKERNEL) $(KDIR)xcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XCOPYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DC_INTERFACE $< -o $@


$(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DDOTKERNEL) $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@
@@ -799,15 +799,15 @@ $(KDIR)xswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)xswap_k$(TPSUFFIX).$(PSUFFIX) : $(KE
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@


$(KDIR)saxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPBYKERNEL) $(KDIR)saxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPBYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@


$(KDIR)daxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPBYKERNEL) $(KDIR)daxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPBYKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@


$(KDIR)caxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPBYKERNEL) $(KDIR)caxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPBYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -UDOUBLE $< -o $@


$(KDIR)zaxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPBYKERNEL) $(KDIR)zaxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPBYKERNEL)
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@
$(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DDOUBLE $< -o $@





+ 4
- 2
kernel/arm/amax.c View File

@@ -54,13 +54,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG ix=0; BLASLONG ix=0;
FLOAT maxf=0.0; FLOAT maxf=0.0;


if (n < 0 || inc_x < 1 ) return(maxf);
if (n <= 0 || inc_x <= 0) return(maxf);


maxf=ABS(x[0]); maxf=ABS(x[0]);
ix += inc_x;
i++;


while(i < n) while(i < n)
{ {
if( ABS(x[ix]) > ABS(maxf) )
if( ABS(x[ix]) > maxf )
{ {
maxf = ABS(x[ix]); maxf = ABS(x[ix]);
} }


+ 4
- 2
kernel/arm/amin.c View File

@@ -54,13 +54,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG ix=0; BLASLONG ix=0;
FLOAT minf=0.0; FLOAT minf=0.0;


if (n < 0 || inc_x < 1 ) return(minf);
if (n <= 0 || inc_x <= 0) return(minf);


minf=ABS(x[0]); minf=ABS(x[0]);
ix += inc_x;
i++;


while(i < n) while(i < n)
{ {
if( ABS(x[ix]) < ABS(minf) )
if( ABS(x[ix]) < minf )
{ {
minf = ABS(x[ix]); minf = ABS(x[ix]);
} }


+ 1
- 1
kernel/arm/asum.c View File

@@ -53,7 +53,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
BLASLONG i=0; BLASLONG i=0;
FLOAT sumf = 0.0; FLOAT sumf = 0.0;
if (n < 0 || inc_x < 1 ) return(sumf);
if (n <= 0 || inc_x <= 0) return(sumf);


n *= inc_x; n *= inc_x;
while(i < n) while(i < n)


+ 4
- 2
kernel/arm/iamax.c View File

@@ -55,13 +55,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT maxf=0.0; FLOAT maxf=0.0;
BLASLONG max=0; BLASLONG max=0;


if (n < 0 || inc_x < 1 ) return(max);
if (n <= 0 || inc_x <= 0) return(max);


maxf=ABS(x[0]); maxf=ABS(x[0]);
ix += inc_x;
i++;


while(i < n) while(i < n)
{ {
if( ABS(x[ix]) > ABS(maxf) )
if( ABS(x[ix]) > maxf )
{ {
max = i; max = i;
maxf = ABS(x[ix]); maxf = ABS(x[ix]);


+ 3
- 1
kernel/arm/iamin.c View File

@@ -55,9 +55,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT minf=0.0; FLOAT minf=0.0;
BLASLONG min=0; BLASLONG min=0;


if (n < 0 || inc_x < 1 ) return(min);
if (n <= 0 || inc_x <= 0) return(min);


minf=ABS(x[0]); minf=ABS(x[0]);
ix += inc_x;
i++;


while(i < n) while(i < n)
{ {


+ 3
- 1
kernel/arm/imax.c View File

@@ -47,9 +47,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT maxf=0.0; FLOAT maxf=0.0;
BLASLONG max=0; BLASLONG max=0;


if (n < 0 || inc_x < 1 ) return(max);
if (n <= 0 || inc_x <= 0) return(max);


maxf=x[0]; maxf=x[0];
ix += inc_x;
i++;


while(i < n) while(i < n)
{ {


+ 3
- 1
kernel/arm/imin.c View File

@@ -45,9 +45,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT minf=0.0; FLOAT minf=0.0;
BLASLONG min=0; BLASLONG min=0;


if (n < 0 || inc_x < 1 ) return(min);
if (n <= 0 || inc_x <= 0) return(min);


minf=x[0]; minf=x[0];
ix += inc_x;
i++;


while(i < n) while(i < n)
{ {


+ 7
- 7
kernel/arm/izamax.c View File

@@ -53,24 +53,24 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
BLASLONG i=0; BLASLONG i=0;
BLASLONG ix=0; BLASLONG ix=0;
FLOAT maxf[2];
FLOAT maxf;
BLASLONG max=0; BLASLONG max=0;
BLASLONG inc_x2; BLASLONG inc_x2;


if (n < 0 || inc_x < 1 ) return(max);
if (n <= 0 || inc_x <= 0) return(max);


inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;


maxf[0] = ABS(x[ix]);
maxf[1] = ABS(x[ix+1]);
maxf = CABS1(x,0);
ix += inc_x2;
i++;


while(i < n) while(i < n)
{ {
if( CABS1(x,ix) > CABS1(maxf,0) )
if( CABS1(x,ix) > maxf )
{ {
max = i; max = i;
maxf[0] = ABS(x[ix]);
maxf[1] = ABS(x[ix+1]);
maxf = CABS1(x,ix);
} }
ix += inc_x2; ix += inc_x2;
i++; i++;


+ 7
- 7
kernel/arm/izamin.c View File

@@ -53,24 +53,24 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
BLASLONG i=0; BLASLONG i=0;
BLASLONG ix=0; BLASLONG ix=0;
FLOAT minf[2];
FLOAT minf;
BLASLONG min=0; BLASLONG min=0;
BLASLONG inc_x2; BLASLONG inc_x2;


if (n < 0 || inc_x < 1 ) return(min);
if (n <= 0 || inc_x <= 0) return(min);


inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;


minf[0] = ABS(x[ix]);
minf[1] = ABS(x[ix+1]);
minf = CABS1(x,0);
ix += inc_x2;
i++;


while(i < n) while(i < n)
{ {
if( CABS1(x,ix) < CABS1(minf,0) )
if( CABS1(x,ix) < minf )
{ {
min = i; min = i;
minf[0] = ABS(x[ix]);
minf[1] = ABS(x[ix+1]);
minf = CABS1(x,ix);
} }
ix += inc_x2; ix += inc_x2;
i++; i++;


+ 3
- 1
kernel/arm/max.c View File

@@ -44,9 +44,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG ix=0; BLASLONG ix=0;
FLOAT maxf=0.0; FLOAT maxf=0.0;


if (n < 0 || inc_x < 1 ) return(maxf);
if (n <= 0 || inc_x <= 0) return(maxf);


maxf=x[0]; maxf=x[0];
ix += inc_x;
i++;


while(i < n) while(i < n)
{ {


+ 3
- 1
kernel/arm/min.c View File

@@ -44,9 +44,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG ix=0; BLASLONG ix=0;
FLOAT minf=0.0; FLOAT minf=0.0;


if (n < 0 || inc_x < 1 ) return(minf);
if (n <= 0 || inc_x <= 0) return(minf);


minf=x[0]; minf=x[0];
ix += inc_x;
i++;


while(i < n) while(i < n)
{ {


+ 1
- 1
kernel/arm/nrm2.c View File

@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
FLOAT absxi = 0.0; FLOAT absxi = 0.0;




if (n < 0 || inc_x < 1 ) return(0.0);
if (n <= 0 || inc_x <= 0) return(0.0);
if ( n == 1 ) return( ABS(x[0]) ); if ( n == 1 ) return( ABS(x[0]) );


n *= inc_x; n *= inc_x;


+ 8
- 10
kernel/arm/zamax.c View File

@@ -53,29 +53,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
BLASLONG i=0; BLASLONG i=0;
BLASLONG ix=0; BLASLONG ix=0;
FLOAT maxf[2];
BLASLONG max=0;
FLOAT maxf;
BLASLONG inc_x2; BLASLONG inc_x2;


if (n < 0 || inc_x < 1 ) return(0.0);
if (n <= 0 || inc_x <= 0) return(0.0);


inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;


maxf[0] = ABS(x[ix]);
maxf[1] = ABS(x[ix+1]);
maxf = CABS1(x,0);
ix += inc_x2;
i++;


while(i < n) while(i < n)
{ {
if( CABS1(x,ix) > CABS1(maxf,0) )
if( CABS1(x,ix) > maxf )
{ {
max = i;
maxf[0] = ABS(x[ix]);
maxf[1] = ABS(x[ix+1]);
maxf = CABS1(x,ix);
} }
ix += inc_x2; ix += inc_x2;
i++; i++;
} }
return(CABS1(maxf,0));
return(maxf);
} }





+ 8
- 10
kernel/arm/zamin.c View File

@@ -53,29 +53,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{ {
BLASLONG i=0; BLASLONG i=0;
BLASLONG ix=0; BLASLONG ix=0;
FLOAT minf[2];
BLASLONG min=0;
FLOAT minf;
BLASLONG inc_x2; BLASLONG inc_x2;


if (n < 0 || inc_x < 1 ) return(0.0);
if (n <= 0 || inc_x <= 0) return(0.0);


inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;


minf[0] = ABS(x[ix]);
minf[1] = ABS(x[ix+1]);
minf = CABS1(x,0);
ix += inc_x2;
i++;


while(i < n) while(i < n)
{ {
if( CABS1(x,ix) < CABS1(minf,0) )
if( CABS1(x,ix) < minf )
{ {
min = i;
minf[0] = ABS(x[ix]);
minf[1] = ABS(x[ix+1]);
minf = CABS1(x,ix);
} }
ix += inc_x2; ix += inc_x2;
i++; i++;
} }
return(CABS1(minf,0));
return(minf);
} }





+ 2
- 1
kernel/arm/zasum.c View File

@@ -55,7 +55,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG i=0; BLASLONG i=0;
FLOAT sumf = 0.0; FLOAT sumf = 0.0;
BLASLONG inc_x2; BLASLONG inc_x2;
if (n < 0 || inc_x < 1 ) return(sumf);

if (n <= 0 || inc_x <= 0) return(sumf);


inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;




+ 2
- 4
kernel/arm/zaxpby.c View File

@@ -37,11 +37,9 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL
BLASLONG i=0; BLASLONG i=0;
BLASLONG ix,iy; BLASLONG ix,iy;
FLOAT temp; FLOAT temp;
BLASLONG inc_x2, inc_y2;


BLASLONG inc_x2;
BLASLONG inc_y2;

if ( n < 0 ) return(0);
if ( n <= 0 ) return(0);


ix = 0; ix = 0;
iy = 0; iy = 0;


+ 1
- 1
kernel/arm/znrm2.c View File

@@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG inc_x2; BLASLONG inc_x2;
FLOAT temp; FLOAT temp;


if (n < 0 || inc_x < 1 ) return(0.0);
if (n <= 0 || inc_x <= 0) return(0.0);


inc_x2 = 2 * inc_x; inc_x2 = 2 * inc_x;




+ 91
- 0
kernel/arm64/KERNEL.CORTEXA57 View File

@@ -0,0 +1,91 @@
include $(KERNELDIR)/KERNEL.ARMV8

SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S

ISAMAXKERNEL = isamax.S
IDAMAXKERNEL = idamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S

SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S

SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S

SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S

DOTKERNEL = dot.S
DDOTKERNEL = dot.S
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S

SNRM2KERNEL = snrm2.S
DNRM2KERNEL = dnrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S

SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S

SCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S

SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S

SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S

SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S

STRMMKERNEL = strmm_kernel_4x4.S
DTRMMKERNEL = dtrmm_kernel_4x4.S
CTRMMKERNEL = ctrmm_kernel_4x4.S
ZTRMMKERNEL = ztrmm_kernel_4x4.S

SGEMMKERNEL = sgemm_kernel_4x4.S
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o

DGEMMKERNEL = dgemm_kernel_4x4.S
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o

CGEMMKERNEL = cgemm_kernel_4x4.S
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o

ZGEMMKERNEL = zgemm_kernel_4x4.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o


+ 249
- 0
kernel/arm64/amax.S View File

@@ -0,0 +1,249 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if defined(USE_MIN)
#define COND le
#else
#define COND ge
#endif

#if !defined(DOUBLE)
#define REG0 wzr
#define MAXF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define REG0 xzr
#define MAXF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
#endif

/******************************************************************************/

.macro INIT_F1
ldr MAXF, [X], #SZ
#if defined(USE_ABS)
fabs MAXF, MAXF
#endif
.endm

.macro KERNEL_F1
ldr TMPF, [X], #SZ
#if defined(USE_ABS)
fabs TMPF, TMPF
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
.endm

.macro INIT_F4
#if !defined(DOUBLE)
ld1 {v0.4s}, [X], #16
#if defined(USE_ABS)
fabs v0.4s, v0.4s
#endif
#if defined(USE_MIN)
fminv MAXF, v0.4s
#else
fmaxv MAXF, v0.4s
#endif
#else // DOUBLE
ld2 {v0.2d,v1.2d}, [X], #32
#if defined(USE_ABS)
fabs v0.2d, v0.2d
fabs v1.2d, v1.2d
#endif
#if defined(USE_MIN)
fmin v0.2d, v0.2d, v1.2d
fminp MAXF, v0.2d
#else
fmax v0.2d, v0.2d, v1.2d
fmaxp MAXF, v0.2d
#endif
#endif
.endm

.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v1.4s}, [X], #16
#if defined(USE_ABS)
fabs v1.4s, v1.4s
#endif
#if defined(USE_MIN)
fminv TMPF, v1.4s
#else
fmaxv TMPF, v1.4s
#endif
#else // DOUBLE
ld2 {v1.2d,v2.2d}, [X], #32
#if defined(USE_ABS)
fabs v1.2d, v1.2d
fabs v2.2d, v2.2d
#endif
#if defined(USE_MIN)
fmin v1.2d, v1.2d, v2.2d
fminp TMPF, v1.2d
#else
fmax v1.2d, v1.2d, v2.2d
fmaxp TMPF, v1.2d
#endif
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
.endm

.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
ld1 {v0.s}[0], [X], INC_X
#else
lsl INC_X, INC_X, #3
ld1 {v0.d}[0], [X], INC_X
#endif
#if defined(USE_ABS)
fabs MAXF, MAXF
#endif
.endm

.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
#if defined(USE_ABS)
fabs TMPF, TMPF
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble amax_kernel_zero
cmp INC_X, xzr
ble amax_kernel_zero

cmp INC_X, #1
bne amax_kernel_S_BEGIN

amax_kernel_F_BEGIN:

asr I, N, #2
cmp I, xzr
beq amax_kernel_F1_INIT

INIT_F4
subs I, I, #1
beq amax_kernel_F1

amax_kernel_F4:

KERNEL_F4

subs I, I, #1
bne amax_kernel_F4

amax_kernel_F1:

ands I, N, #3
ble amax_kernel_L999

amax_kernel_F10:

KERNEL_F1

subs I, I, #1
bne amax_kernel_F10

ret

amax_kernel_F1_INIT:

INIT_F1
subs N, N, #1
b amax_kernel_F1

amax_kernel_S_BEGIN:

INIT_S

subs N, N, #1
ble amax_kernel_L999

asr I, N, #2
cmp I, xzr
ble amax_kernel_S1

amax_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne amax_kernel_S4

amax_kernel_S1:

ands I, N, #3
ble amax_kernel_L999

amax_kernel_S10:

KERNEL_S1

subs I, I, #1
bne amax_kernel_S10

amax_kernel_L999:

ret

amax_kernel_zero:

fmov MAXF, REG0
ret

EPILOGUE

+ 194
- 0
kernel/arm64/asum.S View File

@@ -0,0 +1,194 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define REG0 wzr
#define SUMF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define REG0 xzr
#define SUMF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
#endif

/******************************************************************************/

.macro KERNEL_F1
ldr TMPF, [X], #SZ
fabs TMPF, TMPF
fadd SUMF, SUMF, TMPF
.endm

.macro KERNEL_F8
#if !defined(DOUBLE)
ld1 {v1.4s, v2.4s}, [X], #32 // Load [X3, X2, X1, X0]
fabs v1.4s, v1.4s // ABS() each value
fabs v2.4s, v2.4s // ABS() each value
fadd v1.4s, v1.4s, v2.4s // [X3+X1, X2+X0]
fadd v0.4s, v0.4s, v1.4s // [X3+X1, X2+X0]
PRFM PLDL1KEEP, [X, #1024]
#else // DOUBLE
ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X]
add X, X, #64
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d
fabs v5.2d, v5.2d

PRFM PLDL1KEEP, [X, #1024]

fadd v2.2d, v2.2d, v3.2d
fadd v4.2d, v4.2d, v5.2d
fadd v0.2d, v0.2d, v2.2d
fadd v0.2d, v0.2d, v4.2d
#endif
.endm

.macro KERNEL_F8_FINALIZE
#if !defined(DOUBLE)
ext v1.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v1.2s
faddp SUMF, v0.2s
#else
faddp SUMF, v0.2d
#endif
.endm

.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
#else
lsl INC_X, INC_X, #3
#endif
.endm

.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
fabs TMPF, TMPF
fadd SUMF, SUMF, TMPF
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

fmov SUMF, REG0
#if !defined(DOUBLE)
fmov s1, SUMF
#else
fmov d1, SUMF
#endif

cmp N, xzr
ble asum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999

cmp INC_X, #1
bne asum_kernel_S_BEGIN

asum_kernel_F_BEGIN:

asr I, N, #3
cmp I, xzr
beq asum_kernel_F1

asum_kernel_F8:

KERNEL_F8

subs I, I, #1
bne asum_kernel_F8

KERNEL_F8_FINALIZE

asum_kernel_F1:

ands I, N, #7
ble asum_kernel_L999

asum_kernel_F10:

KERNEL_F1

subs I, I, #1
bne asum_kernel_F10

asum_kernel_L999:
ret

asum_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble asum_kernel_S1

asum_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne asum_kernel_S4

asum_kernel_S1:

ands I, N, #3
ble asum_kernel_L999

asum_kernel_S10:

KERNEL_S1

subs I, I, #1
bne asum_kernel_S10

ret

EPILOGUE

+ 209
- 0
kernel/arm64/axpy.S View File

@@ -0,0 +1,209 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x3 /* X vector address */
#define INC_X x4 /* X stride */
#define Y x5 /* Y vector address */
#define INC_Y x6 /* Y stride */
#define I x1 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define DA s0 /* scale input value */
#define TMPX s1
#define TMPVX {v1.s}[0]
#define TMPY s2
#define TMPVY {v2.s}[0]
#define SZ 4
#else
#define DA d0 /* scale input value */
#define TMPX d1
#define TMPVX {v1.d}[0]
#define TMPY d2
#define TMPVY {v2.d}[0]
#define SZ 8
#endif

/******************************************************************************/

.macro KERNEL_F1

ldr TMPX, [X], #SZ
ldr TMPY, [Y]
fmadd TMPY, TMPX, DA, TMPY
str TMPY, [Y], #SZ

.endm

.macro KERNEL_F4

#if !defined(DOUBLE)
ld1 {v1.4s}, [X], #16
ld1 {v2.4s}, [Y]
fmla v2.4s, v1.4s, v0.s[0]
st1 {v2.4s}, [Y], #16
#else // DOUBLE
ld1 {v1.2d, v2.2d}, [X], #32
ld1 {v3.2d, v4.2d}, [Y]
fmla v3.2d, v1.2d, v0.d[0]
fmla v4.2d, v2.2d, v0.d[0]
st1 {v3.2d, v4.2d}, [Y], #32
#endif

.endm

.macro KERNEL_F8
#if !defined(DOUBLE)
ld1 {v1.4s, v2.4s}, [X], #32
ld1 {v3.4s, v4.4s}, [Y]

fmla v3.4s, v1.4s, v0.s[0]
fmla v4.4s, v2.4s, v0.s[0]

st1 {v3.4s, v4.4s}, [Y], #32
#else // DOUBLE
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64
ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [Y]

fmla v16.2d, v1.2d, v0.d[0]
fmla v17.2d, v2.2d, v0.d[0]
fmla v18.2d, v3.2d, v0.d[0]
fmla v19.2d, v4.2d, v0.d[0]

st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [Y], #64
#endif
PRFM PLDL1KEEP, [X, #512]
PRFM PLDL1KEEP, [Y, #512]
.endm

.macro INIT_S

#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
lsl INC_Y, INC_Y, #2
#else
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#endif

.endm

.macro KERNEL_S1

ld1 TMPVX, [X], INC_X
ldr TMPY, [Y]
fmadd TMPY, TMPX, DA, TMPY
st1 TMPVY, [Y], INC_Y

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble axpy_kernel_L999

fcmp DA, #0.0
beq axpy_kernel_L999

cmp INC_X, #1
bne axpy_kernel_S_BEGIN
cmp INC_Y, #1
bne axpy_kernel_S_BEGIN

axpy_kernel_F_BEGIN:

asr I, N, #3
cmp I, xzr
beq axpy_kernel_F1

axpy_kernel_F8:

KERNEL_F8

subs I, I, #1
bne axpy_kernel_F8

axpy_kernel_F1:

ands I, N, #7
ble axpy_kernel_L999

axpy_kernel_F10:

KERNEL_F1

subs I, I, #1
bne axpy_kernel_F10

mov w0, wzr
ret

axpy_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble axpy_kernel_S1

axpy_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne axpy_kernel_S4

axpy_kernel_S1:

ands I, N, #3
ble axpy_kernel_L999

axpy_kernel_S10:

KERNEL_S1

subs I, I, #1
bne axpy_kernel_S10

axpy_kernel_L999:

mov w0, wzr
ret

+ 170
- 0
kernel/arm64/casum.S View File

@@ -0,0 +1,170 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/* scasum kernel: computes sum(|Re(x[i])| + |Im(x[i])|) over a
   single-precision complex vector of length N and returns the result
   in s0.  This is the BLAS asum convention (sum of absolute values of
   the components), not a sum of complex moduli. */

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#define REG0 wzr
/* SUMF (s0) is the running sum; TMPF (s1) aliases lane 0 of v1. */
#define SUMF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4

/******************************************************************************/

/* Process one complex element (contiguous path).  After fabs,
   v1 = {|re|, |im|}; ext rotates v1 by 4 bytes so s2 = |im|.
   TMPF (lane 0 of v1) already holds |re|, so TMPF += s2 forms
   |re| + |im|, which is then added into SUMF. */
.macro KERNEL_F1
ld1 {v1.2s}, [X], #8
fabs v1.2s, v1.2s
ext v2.8b, v1.8b, v1.8b, #4
fadd TMPF, TMPF, s2
fadd SUMF, SUMF, TMPF
.endm

/* Process 8 complex elements (16 floats, 64 bytes): accumulate the
   absolute values into the four lanes of v0; reduced later by
   KERNEL_F8_FINALIZE. */
.macro KERNEL_F8
ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X]
add X, X, #64
fabs v1.4s, v1.4s
fabs v2.4s, v2.4s
fabs v3.4s, v3.4s
fabs v4.4s, v4.4s

PRFM PLDL1KEEP, [X, #1024] // prefetch well ahead of the load stream

fadd v1.4s, v1.4s, v2.4s
fadd v3.4s, v3.4s, v4.4s
fadd v0.4s, v0.4s, v1.4s
fadd v0.4s, v0.4s, v3.4s
.endm

/* Horizontal reduction of the four partial sums in v0 into SUMF (s0). */
.macro KERNEL_F8_FINALIZE
ext v1.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v1.2s
faddp SUMF, v0.2s
.endm

/* Convert the element stride into a byte stride (8 bytes per complex). */
.macro INIT_S
lsl INC_X, INC_X, #3
.endm

/* Process one complex element (strided path); same math as KERNEL_F1. */
.macro KERNEL_S1
ld1 {v1.2s}, [X], INC_X
fabs v1.2s, v1.2s
ext v2.8b, v1.8b, v1.8b, #4
fadd TMPF, TMPF, s2
fadd SUMF, SUMF, TMPF

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

fmov SUMF, REG0 // zero the accumulator (a write to s0 also clears the rest of v0)
fmov s1, SUMF // zero TMPF

cmp N, xzr
ble asum_kernel_L999 // N <= 0: return 0
cmp INC_X, xzr
ble asum_kernel_L999 // non-positive stride: return 0

cmp INC_X, #1
bne asum_kernel_S_BEGIN // strided path unless INC_X == 1

asum_kernel_F_BEGIN:

asr I, N, #3 // I = N / 8 vectorized iterations
cmp I, xzr
beq asum_kernel_F1

asum_kernel_F8:

KERNEL_F8

subs I, I, #1
bne asum_kernel_F8

KERNEL_F8_FINALIZE

asum_kernel_F1:

ands I, N, #7 // remaining N % 8 elements
ble asum_kernel_L999

asum_kernel_F10:

KERNEL_F1

subs I, I, #1
bne asum_kernel_F10

asum_kernel_L999:
ret // result in SUMF (s0)

asum_kernel_S_BEGIN:

INIT_S

asr I, N, #2 // I = N / 4 unrolled strided iterations
cmp I, xzr
ble asum_kernel_S1

asum_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne asum_kernel_S4

asum_kernel_S1:

ands I, N, #3 // remaining N % 4 elements
ble asum_kernel_L999

asum_kernel_S10:

KERNEL_S1

subs I, I, #1
bne asum_kernel_S10

ret // result in SUMF (s0)

EPILOGUE

+ 1667
- 0
kernel/arm64/cgemm_kernel_4x4.S
File diff suppressed because it is too large
View File


+ 232
- 0
kernel/arm64/copy.S View File

@@ -0,0 +1,232 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/* copy kernel: Y[i] = X[i] for i = 0..N-1.  Element type is selected by
   the DOUBLE / COMPLEX preprocessor flags.  Because the data is only
   moved, never operated on, several of the wide transfers below use
   .4s lane types regardless of the real element type -- only the number
   of bytes moved matters. */

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define Y x3 /* Y vector address */
#define INC_Y x4 /* Y stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define TMPF s0
#define TMPVF {v0.s}[0]
#define SZ 4
#else
#define TMPF d0
#define TMPVF {v0.d}[0]
#define SZ 8
#endif

/******************************************************************************/

/* Copy one element (contiguous path): SZ bytes for real data,
   2*SZ bytes for complex data. */
.macro KERNEL_F1

#if !defined(COMPLEX)
ldr TMPF, [X], #SZ
str TMPF, [Y], #SZ
#else
#if !defined(DOUBLE)
ld1 {v0.2s}, [X], #8
st1 {v0.2s}, [Y], #8
#else
ld1 {v0.2d}, [X], #16
st1 {v0.2d}, [Y], #16
#endif
#endif

.endm

/* Copy four elements (contiguous path): 16, 32, 32 or 64 bytes
   depending on element type; moved as 16-byte q-register chunks. */
.macro KERNEL_F4

#if !defined(COMPLEX)
#if !defined(DOUBLE)
ld1 {v0.4s}, [X], #16
st1 {v0.4s}, [Y], #16
#else // DOUBLE
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
#endif
#else // COMPLEX
#if !defined(DOUBLE)
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
#else // DOUBLE
ld1 {v0.4s}, [X], #16
ld1 {v1.4s}, [X], #16
ld1 {v2.4s}, [X], #16
ld1 {v3.4s}, [X], #16
st1 {v0.4s}, [Y], #16
st1 {v1.4s}, [Y], #16
st1 {v2.4s}, [Y], #16
st1 {v3.4s}, [Y], #16
#endif
#endif

.endm

/* Convert the element strides into byte strides (shift = log2 of the
   per-element byte size). */
.macro INIT_S

#if !defined(COMPLEX)
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
lsl INC_Y, INC_Y, #2
#else
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#endif
#else
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#else
lsl INC_X, INC_X, #4
lsl INC_Y, INC_Y, #4
#endif
#endif

.endm

/* Copy one element (strided path).  Real data goes through
   general-purpose registers (w10/x10); complex data through v0. */
.macro KERNEL_S1

#if !defined(COMPLEX)
#if !defined(DOUBLE)
ldr w10, [X]
add X, X, INC_X
str w10, [Y]
add Y, Y, INC_Y
#else
ldr x10, [X]
add X, X, INC_X
str x10, [Y]
add Y, Y, INC_Y
#endif
#else
#if !defined(DOUBLE)
ld1 {v0.2s}, [X]
add X, X, INC_X
st1 {v0.2s}, [Y]
add Y, Y, INC_Y
#else
ld1 {v0.2d}, [X]
add X, X, INC_X
st1 {v0.2d}, [Y]
add Y, Y, INC_Y
#endif
#endif

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble copy_kernel_L999 // N <= 0: nothing to do

cmp INC_X, #1
bne copy_kernel_S_BEGIN // strided path unless both strides are 1
cmp INC_Y, #1
bne copy_kernel_S_BEGIN

copy_kernel_F_BEGIN:

asr I, N, #2 // I = N / 4 unrolled iterations
cmp I, xzr
beq copy_kernel_F1

copy_kernel_F4:

KERNEL_F4

subs I, I, #1
bne copy_kernel_F4

copy_kernel_F1:

ands I, N, #3 // remaining N % 4 elements
ble copy_kernel_L999

copy_kernel_F10:

KERNEL_F1

subs I, I, #1
bne copy_kernel_F10

mov w0, wzr
ret

copy_kernel_S_BEGIN:

INIT_S

asr I, N, #2 // I = N / 4 unrolled strided iterations
cmp I, xzr
ble copy_kernel_S1

copy_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne copy_kernel_S4

copy_kernel_S1:

ands I, N, #3 // remaining N % 4 elements
ble copy_kernel_L999

copy_kernel_S10:

KERNEL_S1

subs I, I, #1
bne copy_kernel_S10

copy_kernel_L999:

mov w0, wzr
ret

EPILOGUE

+ 1621
- 0
kernel/arm64/ctrmm_kernel_4x4.S
File diff suppressed because it is too large
View File


+ 1338
- 0
kernel/arm64/dgemm_kernel_4x4.S
File diff suppressed because it is too large
View File


+ 169
- 0
kernel/arm64/dnrm2.S View File

@@ -0,0 +1,169 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/* dnrm2 kernel: returns sqrt(sum(x[i]^2)) for a double-precision vector
   in d0.  NOTE(review): this is a plain sum of squares with no scaling
   pass, unlike the reference netlib dnrm2 -- it can overflow/underflow
   for elements near DBL_MAX / DBL_MIN; confirm this is acceptable for
   the targets using this kernel. */

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#define TMPF d6
#define SSQ d0
#define TMPVF {v6.d}[0]
#define SZ 8

/******************************************************************************/

/* Accumulate one element's square into SSQ (contiguous path). */
.macro KERNEL_F1
ldr TMPF, [X], #SZ
fmul TMPF, TMPF, TMPF
fadd SSQ, SSQ, TMPF
.endm

/* Accumulate 8 elements' squares into the two vector accumulators
   v0 and v5 (reduced by nrm2_kernel_F8_FINALIZE). */
.macro KERNEL_F8
ld1 {v1.2d, v2.2d}, [X], #32
fmla v0.2d, v1.2d, v1.2d
fmla v5.2d, v2.2d, v2.2d
ld1 {v3.2d, v4.2d}, [X], #32
fmla v0.2d, v3.2d, v3.2d
fmla v5.2d, v4.2d, v4.2d
PRFM PLDL1KEEP, [X, #1024] // prefetch well ahead of the load stream
.endm

/* Reduce v0 and v5 into the scalar SSQ (d0). */
.macro nrm2_kernel_F8_FINALIZE
fadd v0.2d, v0.2d, v5.2d
faddp SSQ, v0.2d
.endm

/* Strided-path init: convert the stride to bytes, then consume the
   first element so that SSQ starts as x[0]^2 (the caller's loop below
   compensates with "subs N, N, #1"). */
.macro INIT_S
lsl INC_X, INC_X, #3
ld1 TMPVF, [X], INC_X
fmul SSQ, TMPF, TMPF
.endm

/* Accumulate one element's square into SSQ (strided path). */
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
fmul TMPF, TMPF, TMPF
fadd SSQ, SSQ, TMPF
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

fmov SSQ, xzr // zero the d0/v0 accumulator
fmov d5, SSQ // zero the second accumulator (v5)

cmp N, xzr
ble nrm2_kernel_zero // N <= 0: return +0.0
cmp INC_X, xzr
ble nrm2_kernel_zero // non-positive stride: return +0.0
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN

nrm2_kernel_F_BEGIN:

asr I, N, #3 // I = N / 8 vectorized iterations
cmp I, xzr
beq nrm2_kernel_F1_INIT

nrm2_kernel_F8:

KERNEL_F8

subs I, I, #1
bne nrm2_kernel_F8

nrm2_kernel_F8_FINALIZE

nrm2_kernel_F1:

ands I, N, #7 // remaining N % 8 elements
ble nrm2_kernel_L999

nrm2_kernel_F10:

KERNEL_F1

subs I, I, #1
bne nrm2_kernel_F10

b nrm2_kernel_L999

nrm2_kernel_F1_INIT:

/* N < 8: SSQ is already zeroed, no vector reduction needed. */
b nrm2_kernel_F1

nrm2_kernel_S_BEGIN:

INIT_S

subs N, N, #1 // first element already consumed by INIT_S
ble nrm2_kernel_L999

asr I, N, #2 // I = N / 4 unrolled strided iterations
cmp I, xzr
ble nrm2_kernel_S1

nrm2_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne nrm2_kernel_S4

nrm2_kernel_S1:

ands I, N, #3 // remaining N % 4 elements
ble nrm2_kernel_L999

nrm2_kernel_S10:

KERNEL_S1

subs I, I, #1
bne nrm2_kernel_S10

nrm2_kernel_L999:
fsqrt SSQ, SSQ // norm = sqrt(sum of squares), returned in d0
ret

nrm2_kernel_zero:
/* Return value is d0, zeroed in the prologue. */
ret

EPILOGUE

+ 227
- 0
kernel/arm64/dot.S View File

@@ -0,0 +1,227 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/* dot kernel: returns sum(X[i] * Y[i]).  Three build variants:
   single precision (result in s0), double precision (result in d0),
   and DSDOT (single-precision inputs, products accumulated and
   returned in double precision, d0). */

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define Y x3 /* Y vector address */
#define INC_Y x4 /* Y stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#if !defined(DSDOT)
#define REG0 wzr
#define DOTF s0
#else // DSDOT
#define REG0 xzr
#define DOTF d0
#endif
#define DOTI s1
#define TMPX s2
#define LD1VX {v2.s}[0]
#define TMPY s3
#define LD1VY {v3.s}[0]
#define TMPVY v3.s[0]
#define SZ 4
#else
#define REG0 xzr
#define DOTF d0
#define DOTI d1
#define TMPX d2
#define LD1VX {v2.d}[0]
#define TMPY d3
#define LD1VY {v3.d}[0]
#define TMPVY v3.d[0]
#define SZ 8
#endif

/******************************************************************************/

/* Accumulate one product into DOTF (contiguous path).  For DSDOT the
   single-precision product is widened to double before accumulating. */
.macro KERNEL_F1
ldr TMPX, [X], #SZ
ldr TMPY, [Y], #SZ
#if !defined(DSDOT)
fmadd DOTF, TMPX, TMPY, DOTF
#else // DSDOT
fmul TMPX, TMPX, TMPY
fcvt d2, TMPX
fadd DOTF, DOTF, d2
#endif
.endm

/* Accumulate four products into the vector accumulator v0
   (reduced later by KERNEL_F4_FINALIZE). */
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v2.4s}, [X], #16
ld1 {v3.4s}, [Y], #16
#if !defined(DSDOT)
fmla v0.4s, v2.4s, v3.4s
#else
/* DSDOT: widen the four single products to doubles (low pair via
   fcvtl on v2, high pair via the ext-rotated copy in v3). */
fmul v2.4s, v2.4s, v3.4s
ext v3.16b, v2.16b, v2.16b, #8
fcvtl v2.2d, v2.2s
fcvtl v3.2d, v3.2s
fadd v0.2d, v0.2d, v2.2d
fadd v0.2d, v0.2d, v3.2d
#endif
#else //DOUBLE
ld1 {v2.2d, v3.2d}, [X], #32
ld1 {v4.2d, v5.2d}, [Y], #32
fmul v2.2d, v2.2d, v4.2d
fmul v3.2d, v3.2d, v5.2d
fadd v0.2d, v0.2d, v2.2d
fadd v0.2d, v0.2d, v3.2d
#endif
PRFM PLDL1KEEP, [X, #1024] // prefetch ahead of both streams
PRFM PLDL1KEEP, [Y, #1024]
.endm

/* Horizontal reduction of v0 into the scalar DOTF. */
.macro KERNEL_F4_FINALIZE
#if !defined(DOUBLE)
#if !defined(DSDOT)
ext v1.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v1.2s
faddp DOTF, v0.2s
#else
faddp DOTF, v0.2d
#endif
#else //DOUBLE
faddp DOTF, v0.2d
#endif
.endm

/* Convert the element strides into byte strides. */
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
lsl INC_Y, INC_Y, #2
#else
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#endif
.endm

/* Accumulate one product into DOTF (strided path). */
.macro KERNEL_S1
ld1 LD1VX, [X], INC_X
ld1 LD1VY, [Y], INC_Y
#if !defined(DSDOT)
fmadd DOTF, TMPX, TMPY, DOTF
#else // DSDOT
fmul TMPX, TMPX, TMPY
fcvt d2, TMPX
fadd DOTF, DOTF, d2
#endif
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

fmov DOTF, REG0 // zero the accumulator (also clears the rest of v0)
#if defined(DOUBLE)
fmov d6, DOTF // NOTE(review): d6 is not read anywhere below -- looks vestigial; verify
#endif

cmp N, xzr
ble dot_kernel_L999 // N <= 0: return 0

cmp INC_X, #1
bne dot_kernel_S_BEGIN // strided path unless both strides are 1
cmp INC_Y, #1
bne dot_kernel_S_BEGIN

dot_kernel_F_BEGIN:

asr I, N, #2 // I = N / 4 vectorized iterations
cmp I, xzr
beq dot_kernel_F1

dot_kernel_F4:

KERNEL_F4

subs I, I, #1
bne dot_kernel_F4

KERNEL_F4_FINALIZE

dot_kernel_F1:

ands I, N, #3 // remaining N % 4 elements
ble dot_kernel_L999

dot_kernel_F10:

KERNEL_F1

subs I, I, #1
bne dot_kernel_F10

ret // result in DOTF

dot_kernel_S_BEGIN:

INIT_S

asr I, N, #2 // I = N / 4 unrolled strided iterations
cmp I, xzr
ble dot_kernel_S1

dot_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne dot_kernel_S4

dot_kernel_S1:

ands I, N, #3 // remaining N % 4 elements
ble dot_kernel_L999

dot_kernel_S10:

KERNEL_S1

subs I, I, #1
bne dot_kernel_S10

dot_kernel_L999:

ret // result in DOTF

EPILOGUE

+ 1398
- 0
kernel/arm64/dtrmm_kernel_4x4.S
File diff suppressed because it is too large
View File


+ 320
- 0
kernel/arm64/gemv_n.S View File

@@ -0,0 +1,320 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/* gemv_n kernel: y := y + alpha * A * x (non-transposed GEMV) for a
   column-major M x N matrix A.  The outer loop walks the N columns;
   for each column j it computes y += (alpha * x[j]) * A[:,j], using a
   vectorized inner loop when INC_Y == 1 and a scalar strided loop
   otherwise. */

#define M x0 /* Y vector length */
#define N x1 /* X vector length */
#define A x3 /* A vector address */
#define LDA x4 /* A stride */
#define X x5 /* X vector address */
#define INC_X x6 /* X stride */
#define Y x7 /* Y vector address */
#define INC_Y x2 /* Y stride */
#define A_PTR x9 /* loop A vector address */
#define Y_IPTR x10 /* loop Y vector address */
#define J x11 /* loop variable */
#define I x12 /* loop variable */
#define Y_OPTR x13 /* loop Y vector address */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define ALPHA s0
#define TEMP s1
#define TEMPV {v1.s}[0]
#define TMP1 s2
#define TMPV1 {v2.s}[0]
#define TMP2 s3
#define TMPV2 {v3.s}[0]
#define SZ 4
#define SHZ 2
#else
#define ALPHA d0
#define TEMP d1
#define TEMPV {v1.d}[0]
#define TMP1 d2
#define TMPV1 {v2.d}[0]
#define TMP2 d3
#define TMPV2 {v3.d}[0]
#define SZ 8
#define SHZ 3
#endif

/******************************************************************************/

/* Spill the callee-saved registers used below (d8-d15 per AAPCS64,
   plus x19-x28; x18 is the platform register and is saved/restored
   but never written here). */
.macro SAVE_REGS
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
.endm

.macro RESTORE_REGS
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
.endm

/* y[i:i+16] += scale * A[i:i+16, j], where the scale (alpha * x[j])
   has been broadcast into every lane of v1 by the outer loop. */
.macro KERNEL_F16
#if !defined(DOUBLE)
ld1 {v2.4s, v3.4s}, [A_PTR], #32
ld1 {v4.4s, v5.4s}, [Y_IPTR], #32
fmla v4.4s, v1.4s, v2.4s
fmla v5.4s, v1.4s, v3.4s
st1 {v4.4s, v5.4s}, [Y_OPTR], #32

ld1 {v6.4s, v7.4s}, [A_PTR], #32
ld1 {v8.4s, v9.4s}, [Y_IPTR], #32
fmla v8.4s, v1.4s, v6.4s
fmla v9.4s, v1.4s, v7.4s
st1 {v8.4s, v9.4s}, [Y_OPTR], #32
#else //DOUBLE
ld1 {v2.2d, v3.2d}, [A_PTR], #32
ld1 {v4.2d, v5.2d}, [Y_IPTR], #32
fmla v4.2d, v1.2d, v2.2d
fmla v5.2d, v1.2d, v3.2d
st1 {v4.2d, v5.2d}, [Y_OPTR], #32

ld1 {v6.2d, v7.2d}, [A_PTR], #32
ld1 {v8.2d, v9.2d}, [Y_IPTR], #32
fmla v8.2d, v1.2d, v6.2d
fmla v9.2d, v1.2d, v7.2d
st1 {v8.2d, v9.2d}, [Y_OPTR], #32

ld1 {v10.2d, v11.2d}, [A_PTR], #32
ld1 {v12.2d, v13.2d}, [Y_IPTR], #32
fmla v12.2d, v1.2d, v10.2d
fmla v13.2d, v1.2d, v11.2d
st1 {v12.2d, v13.2d}, [Y_OPTR], #32

ld1 {v14.2d, v15.2d}, [A_PTR], #32
ld1 {v16.2d, v17.2d}, [Y_IPTR], #32
fmla v16.2d, v1.2d, v14.2d
fmla v17.2d, v1.2d, v15.2d
st1 {v16.2d, v17.2d}, [Y_OPTR], #32
#endif
.endm

/* y[i:i+4] += scale * A[i:i+4, j]. */
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v2.4s}, [A_PTR], #16
ld1 {v3.4s}, [Y_IPTR], #16
fmla v3.4s, v1.4s, v2.4s
st1 {v3.4s}, [Y_OPTR], #16
#else
ld1 {v2.2d}, [A_PTR], #16
ld1 {v3.2d}, [Y_IPTR], #16
fmla v3.2d, v1.2d, v2.2d
st1 {v3.2d}, [Y_OPTR], #16

ld1 {v4.2d}, [A_PTR], #16
ld1 {v5.2d}, [Y_IPTR], #16
fmla v5.2d, v1.2d, v4.2d
st1 {v5.2d}, [Y_OPTR], #16
#endif
.endm

/* Scalar tail: y[i] += TEMP * A[i, j] (TEMP = alpha * x[j]). */
.macro KERNEL_F1

ld1 TMPV1, [A_PTR], #SZ
ld1 TMPV2, [Y_IPTR]
fmadd TMP2, TEMP, TMP1, TMP2
st1 TMPV2, [Y_IPTR], #SZ

.endm

/* Strided-path init: convert INC_Y to a byte stride. */
.macro INIT_S

lsl INC_Y, INC_Y, #SHZ

.endm

/* Strided scalar step: y[i] += TEMP * A[i, j], advancing Y by INC_Y. */
.macro KERNEL_S1

ld1 TMPV1, [A_PTR], #SZ
ld1 TMPV2, [Y_IPTR]
fmadd TMP2, TEMP, TMP1, TMP2
st1 TMPV2, [Y_IPTR], INC_Y

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

ldr INC_Y, [sp] // 9th argument is passed on the stack

SAVE_REGS

cmp N, xzr
ble gemv_n_kernel_L999 // empty matrix: nothing to do
cmp M, xzr
ble gemv_n_kernel_L999

lsl LDA, LDA, #SHZ // convert LDA and INC_X to byte strides
lsl INC_X, INC_X, #SHZ
mov J, N // J counts the remaining columns

cmp INC_Y, #1
bne gemv_n_kernel_S_BEGIN

gemv_n_kernel_F_LOOP:

/* TEMP = alpha * x[j]; broadcast it to all lanes of v1 for the
   vectorized kernels. */
ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP
#if !defined(DOUBLE)
ins v1.s[1], v1.s[0]
ins v1.s[2], v1.s[0]
ins v1.s[3], v1.s[0]
#else
ins v1.d[1], v1.d[0]
#endif
mov A_PTR, A
mov Y_IPTR, Y
mov Y_OPTR, Y

gemv_n_kernel_F32:

asr I, M, #5 // I = M / 32 iterations of 2 x KERNEL_F16
cmp I, xzr
beq gemv_n_kernel_F4

gemv_n_kernel_F320:

KERNEL_F16
KERNEL_F16

subs I, I, #1
bne gemv_n_kernel_F320

gemv_n_kernel_F4:
ands I, M, #31 // (M % 32) / 4 iterations of KERNEL_F4
asr I, I, #2
cmp I, xzr
beq gemv_n_kernel_F1

gemv_n_kernel_F40:

KERNEL_F4

subs I, I, #1
bne gemv_n_kernel_F40

gemv_n_kernel_F1:
ands I, M, #3 // remaining M % 4 rows
ble gemv_n_kernel_F_END

gemv_n_kernel_F10:

KERNEL_F1

subs I, I, #1
bne gemv_n_kernel_F10

gemv_n_kernel_F_END:

add A, A, LDA // advance to the next column of A
subs J, J, #1
bne gemv_n_kernel_F_LOOP

b gemv_n_kernel_L999

gemv_n_kernel_S_BEGIN:

INIT_S

gemv_n_kernel_S_LOOP:

ld1 TEMPV, [X], INC_X // TEMP = alpha * x[j]
fmul TEMP, ALPHA, TEMP
mov A_PTR, A
mov Y_IPTR, Y

asr I, M, #2 // I = M / 4 unrolled strided iterations
cmp I, xzr
ble gemv_n_kernel_S1

gemv_n_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne gemv_n_kernel_S4

gemv_n_kernel_S1:

ands I, M, #3 // remaining M % 4 rows
ble gemv_n_kernel_S_END

gemv_n_kernel_S10:

KERNEL_S1

subs I, I, #1
bne gemv_n_kernel_S10

gemv_n_kernel_S_END:

add A, A, LDA // advance to the next column of A
subs J, J, #1
bne gemv_n_kernel_S_LOOP

gemv_n_kernel_L999:

mov w0, wzr

RESTORE_REGS

ret

EPILOGUE

+ 347
- 0
kernel/arm64/gemv_t.S View File

@@ -0,0 +1,347 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/* gemv_t kernel: y := y + alpha * A^T * x (transposed GEMV) for a
   column-major M x N matrix A.  The outer loop walks the N columns;
   for each column j it computes the dot product dot(A[:,j], x) and
   then updates y[j] += alpha * dot. */

#define M x0 /* Y vector length */
#define N x1 /* X vector length */
#define A x3 /* A vector address */
#define LDA x4 /* A stride */
#define X x5 /* X vector address */
#define INC_X x6 /* X stride */
#define Y x7 /* Y vector address */
#define INC_Y x2 /* Y stride */
#define A_PTR x9 /* loop A vector address */
#define X_PTR x10 /* loop X vector address */
#define J x11 /* loop variable */
#define I x12 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define REG0 wzr
#define ALPHA s0
#define TEMP s1
#define TEMP1 s2
#define TEMP2 s3
#define TEMP3 s4
#define TEMPV {v1.s}[0]
#define TMP1 s2
#define TMPV1 {v2.s}[0]
#define TMP2 s3
#define TMPV2 {v3.s}[0]
#define SZ 4
#define SHZ 2
#else
#define REG0 xzr
#define ALPHA d0
#define TEMP d1
#define TEMP1 d2
#define TEMP2 d3
#define TEMP3 d4
#define TEMPV {v1.d}[0]
#define TMP1 d2
#define TMPV1 {v2.d}[0]
#define TMP2 d3
#define TMPV2 {v3.d}[0]
#define SZ 8
#define SHZ 3
#endif

/******************************************************************************/

/* Spill the callee-saved registers used below (d8-d15 per AAPCS64,
   plus x19-x28; x18 is the platform register and is saved/restored
   but never written here). */
.macro SAVE_REGS
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
.endm

.macro RESTORE_REGS
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
.endm

/* Accumulate 32 products A[i,j]*x[i] into the four vector
   accumulators v1-v4 (reduced later by KERNEL_F32_FINALIZE and
   KERNEL_F4_FINALIZE). */
.macro KERNEL_F32
#if !defined(DOUBLE)
ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64
ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64
fmla v1.4s, v5.4s, v9.4s
fmla v2.4s, v6.4s, v10.4s
fmla v3.4s, v7.4s, v11.4s
fmla v4.4s, v8.4s, v12.4s

ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64
ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64
fmla v1.4s, v13.4s, v17.4s
fmla v2.4s, v14.4s, v18.4s
fmla v3.4s, v15.4s, v19.4s
fmla v4.4s, v16.4s, v20.4s
#else
ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
fmla v1.2d, v5.2d, v9.2d
fmla v2.2d, v6.2d, v10.2d
fmla v3.2d, v7.2d, v11.2d
fmla v4.2d, v8.2d, v12.2d

ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
fmla v1.2d, v13.2d, v17.2d
fmla v2.2d, v14.2d, v18.2d
fmla v3.2d, v15.2d, v19.2d
fmla v4.2d, v16.2d, v20.2d

ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64
ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64
fmla v1.2d, v5.2d, v9.2d
fmla v2.2d, v6.2d, v10.2d
fmla v3.2d, v7.2d, v11.2d
fmla v4.2d, v8.2d, v12.2d

ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64
ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64
fmla v1.2d, v13.2d, v17.2d
fmla v2.2d, v14.2d, v18.2d
fmla v3.2d, v15.2d, v19.2d
fmla v4.2d, v16.2d, v20.2d
#endif
.endm

/* Fold the v2/v3/v4 partial sums into v1. */
.macro KERNEL_F32_FINALIZE
#if !defined(DOUBLE)
fadd v1.4s, v1.4s, v2.4s
fadd v1.4s, v1.4s, v3.4s
fadd v1.4s, v1.4s, v4.4s
#else
fadd v1.2d, v1.2d, v2.2d
fadd v1.2d, v1.2d, v3.2d
fadd v1.2d, v1.2d, v4.2d
#endif
.endm

/* Accumulate 4 products into v1. */
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v2.4s}, [A_PTR], #16
ld1 {v3.4s}, [X_PTR], #16
fmla v1.4s, v2.4s, v3.4s
#else
ld1 {v2.2d}, [A_PTR], #16
ld1 {v3.2d}, [X_PTR], #16
fmla v1.2d, v2.2d, v3.2d

ld1 {v4.2d}, [A_PTR], #16
ld1 {v5.2d}, [X_PTR], #16
fmla v1.2d, v4.2d, v5.2d
#endif
.endm

/* Horizontal reduction of v1 into the scalar TEMP. */
.macro KERNEL_F4_FINALIZE
#if !defined(DOUBLE)
ext v2.16b, v1.16b, v1.16b, #8
fadd v1.2s, v1.2s, v2.2s
faddp TEMP, v1.2s
#else
faddp TEMP, v1.2d
#endif
.endm

/* Scalar tail: TEMP += A[i,j] * x[i]. */
.macro KERNEL_F1
ld1 TMPV1, [A_PTR], #SZ
ld1 TMPV2, [X_PTR], #SZ
fmadd TEMP, TMP1, TMP2, TEMP
.endm

/* Strided-path init: convert INC_X to a byte stride. */
.macro INIT_S
lsl INC_X, INC_X, #SHZ
.endm

/* Strided scalar step: TEMP += A[i,j] * x[i], advancing X by INC_X. */
.macro KERNEL_S1
ld1 TMPV1, [A_PTR], #SZ
ld1 TMPV2, [X_PTR], INC_X
fmadd TEMP, TMP1, TMP2, TEMP
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

ldr INC_Y, [sp] // 9th argument is passed on the stack

SAVE_REGS

cmp N, xzr
ble gemv_t_kernel_L999 // empty matrix: nothing to do
cmp M, xzr
ble gemv_t_kernel_L999

lsl LDA, LDA, #SHZ // convert LDA and INC_Y to byte strides
lsl INC_Y, INC_Y, #SHZ
mov J, N // J counts the remaining columns

cmp INC_X, #1
bne gemv_t_kernel_S_BEGIN

gemv_t_kernel_F_LOOP:

/* Zero the scalar and vector accumulators (a scalar fmov also
   clears the rest of the corresponding v register). */
fmov TEMP, REG0
fmov TEMP1, REG0
fmov TEMP2, REG0
fmov TEMP3, REG0

mov A_PTR, A
mov X_PTR, X

gemv_t_kernel_F32:

asr I, M, #5 // I = M / 32 vectorized iterations
cmp I, xzr
beq gemv_t_kernel_F4

gemv_t_kernel_F320:

KERNEL_F32

subs I, I, #1
bne gemv_t_kernel_F320

KERNEL_F32_FINALIZE

gemv_t_kernel_F4:
ands I, M, #31 // (M % 32) / 4 iterations of KERNEL_F4
asr I, I, #2
cmp I, xzr
beq gemv_t_kernel_F1

gemv_t_kernel_F40:

KERNEL_F4

subs I, I, #1
bne gemv_t_kernel_F40

gemv_t_kernel_F1:

KERNEL_F4_FINALIZE // reduce v1 into TEMP before the scalar tail

ands I, M, #3 // remaining M % 4 rows
ble gemv_t_kernel_F_END

gemv_t_kernel_F10:

KERNEL_F1

subs I, I, #1
bne gemv_t_kernel_F10

gemv_t_kernel_F_END:

/* y[j] += alpha * dot(A[:,j], x); advance to the next column. */
ld1 TMPV1, [Y]
add A, A, LDA
subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_F_LOOP

b gemv_t_kernel_L999

gemv_t_kernel_S_BEGIN:

INIT_S

gemv_t_kernel_S_LOOP:

fmov TEMP, REG0 // zero the dot-product accumulator
mov A_PTR, A
mov X_PTR, X

asr I, M, #2 // I = M / 4 unrolled strided iterations
cmp I, xzr
ble gemv_t_kernel_S1

gemv_t_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne gemv_t_kernel_S4

gemv_t_kernel_S1:

ands I, M, #3 // remaining M % 4 rows
ble gemv_t_kernel_S_END

gemv_t_kernel_S10:

KERNEL_S1

subs I, I, #1
bne gemv_t_kernel_S10

gemv_t_kernel_S_END:

/* y[j] += alpha * dot(A[:,j], x); advance to the next column. */
ld1 TMPV1, [Y]
add A, A, LDA
subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_S_LOOP

gemv_t_kernel_L999:

RESTORE_REGS

mov w0, wzr
ret

EPILOGUE

+ 124
- 0
kernel/arm64/idamax.S View File

@@ -0,0 +1,124 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

/* BLAS idamax/idamin kernel (double precision, AArch64).
 *
 * Returns the 1-based index of the vector element with the largest
 * (or, when built with USE_MIN, smallest) absolute value.
 * Arguments: N = x0 (element count), X = x1 (vector base), INC_X = x2
 * (stride in elements).  Returns 0 when N <= 0 or INC_X <= 0.
 * Only a strided scalar loop is implemented; it is also used for INC_X == 1.
 */
#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define INDEX x3 /* index of max/min value */
#define Z x4 /* vector index */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* COND is the condition under which the CURRENT best value/index is kept.
 * On equality the old value wins, so the first occurrence of the extreme
 * value is the one reported. */
#if defined(USE_MIN)
#define COND le
#else
#define COND ge
#endif

#define MAXF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8

/******************************************************************************/

/* Consume X[0] as the initial best; Z tracks the 1-based index of the
 * element most recently processed. */
.macro INIT_S
lsl INC_X, INC_X, #3 // element stride -> byte stride (8 = sizeof(double))
ld1 {v0.d}[0], [X], INC_X
mov Z, #1
mov INDEX, Z
fabs MAXF, MAXF
.endm

/* Process one element: replace best value/index when |x| beats it.
 * NOTE(review): a NaN candidate makes the fcmp condition false and thus
 * replaces the current best -- confirm this matches the reference BLAS. */
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
add Z, Z, #1
fabs TMPF, TMPF
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND // keep old best on COND, else take candidate
csel INDEX, INDEX, Z, COND // index updated in lockstep with the value
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble iamax_kernel_zero // N <= 0 -> return 0
cmp INC_X, xzr
ble iamax_kernel_zero // INC_X <= 0 -> return 0

INIT_S

subs N, N, #1 // first element already consumed by INIT_S
ble iamax_kernel_L999

asr I, N, #2 // I = remaining / 4 (main loop is unrolled 4x)
cmp I, xzr
ble iamax_kernel_S1

iamax_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne iamax_kernel_S4

iamax_kernel_S1:

ands I, N, #3 // tail: remaining % 4 elements
ble iamax_kernel_L999

iamax_kernel_S10:

KERNEL_S1

subs I, I, #1
bne iamax_kernel_S10

iamax_kernel_L999:

mov x0, INDEX // 1-based index of the extreme element
ret

iamax_kernel_zero:

mov x0, xzr // invalid input: index 0
ret

EPILOGUE

+ 213
- 0
kernel/arm64/isamax.S View File

@@ -0,0 +1,213 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

/* BLAS isamax kernel (single precision, AArch64).
 *
 * Returns the 1-based index of the element with the largest absolute value.
 * Arguments: N = x0 (element count), X = x1 (vector base), INC_X = x2
 * (stride in elements).  Returns 0 when N <= 0 or INC_X <= 0.
 *
 * For INC_X == 1 a vectorized path processes 4 elements per iteration and
 * tracks only the best 4-element block (MAXF_Z); KERNEL_F4_FINALIZE then
 * rescans the winning block to recover the exact element index.  Other
 * strides use a scalar loop.  Ties keep the earlier index, so the first
 * occurrence of the maximum is reported.
 *
 * BUGFIX: the vector path leaves Z holding the index of the NEXT
 * unprocessed element (INIT_F4 sets Z = 5 after consuming elements 1..4),
 * while the scalar remainder loop KERNEL_F1 expects Z to hold the index of
 * the LAST processed element (it increments Z before using it).  Without
 * correction, a maximum found in the tail is reported one position too
 * high (e.g. N = 5 with the max at element 5 returned 6).  A `sub Z, Z, #1`
 * is inserted on the vector->remainder transition; the scalar-init path
 * (N < 4) branches directly to iamax_kernel_F1 and is unaffected.
 */
#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define INDEX x3 /* index of max/min value */
#define Z x4 /* vector index */
#define I x5 /* loop variable */
#define X_COPY x6 /* copy of X address */
#define MAXF_Z x7 /* 1-based index of the first element of the best block */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#define MAXF s5
#define TMPF s6
#define TMPVF {v6.s}[0]
#define SZ 4

/******************************************************************************/

/* Scalar init: consume X[0]; Z = index of last processed element (1). */
.macro INIT_F1
ldr MAXF, [X], #SZ
mov Z, #1
mov INDEX, Z
fabs MAXF, MAXF
.endm

/* Scalar step: expects Z = index of last processed element on entry.
 * Keeps the old best on ties (le), i.e. first occurrence wins. */
.macro KERNEL_F1
ldr TMPF, [X], #SZ
add Z, Z, #1
fabs TMPF, TMPF
fcmp TMPF, MAXF
fcsel MAXF, MAXF, TMPF, le
csel INDEX, INDEX, Z, le
.endm

/* Vector init: consume elements 1..4, MAXF = max |x| of the block,
 * MAXF_Z = 1 (block start), Z = 5 = index of the next unprocessed element. */
.macro INIT_F4
ld1 {v0.4s}, [X], #16
fabs v0.4s, v0.4s
fmaxv MAXF, v0.4s
mov Z, #5
mov MAXF_Z, #1
.endm

/* Vector step: Z holds the block's starting index while comparing; the
 * best block start is latched into MAXF_Z, then Z advances by 4. */
.macro KERNEL_F4
ld1 {v0.4s}, [X], #16
fabs v0.4s, v0.4s
fmaxv TMPF, v0.4s
PRFM PLDL1KEEP, [X, #512]
fcmp TMPF, MAXF
fcsel MAXF, MAXF, TMPF, le
csel MAXF_Z, MAXF_Z, Z, le
add Z, Z, #4
.endm


/* Rescan the winning 4-element block (via X_COPY) to find the exact
 * element whose |x| equals MAXF; only 3 compares are needed -- if none
 * matched, the 4th element must be the maximum. */
.macro KERNEL_F4_FINALIZE
mov INDEX, MAXF_Z
sub MAXF_Z, MAXF_Z, #1
lsl MAXF_Z, MAXF_Z, #2
add X_COPY, X_COPY, MAXF_Z
ldr TMPF, [X_COPY], #SZ
fabs TMPF, TMPF
fcmp TMPF, MAXF
beq KERNEL_F4_FINALIZE_DONE
add INDEX, INDEX, #1
ldr TMPF, [X_COPY], #SZ
fabs TMPF, TMPF
fcmp TMPF, MAXF
beq KERNEL_F4_FINALIZE_DONE
add INDEX, INDEX, #1
ldr TMPF, [X_COPY], #SZ
fabs TMPF, TMPF
fcmp TMPF, MAXF
beq KERNEL_F4_FINALIZE_DONE
add INDEX, INDEX, #1
KERNEL_F4_FINALIZE_DONE:
.endm


/* Strided init: consume X[0]; Z = index of last processed element (1). */
.macro INIT_S
lsl INC_X, INC_X, #2 // element stride -> byte stride (4 = sizeof(float))
ld1 TMPVF, [X], INC_X
mov Z, #1
mov INDEX, Z
fabs MAXF, TMPF
.endm

/* Strided step: same tie-keeping comparison as KERNEL_F1. */
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
add Z, Z, #1
fabs TMPF, TMPF
fcmp TMPF, MAXF
fcsel MAXF, MAXF, TMPF, le
csel INDEX, INDEX, Z, le
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble iamax_kernel_zero // N <= 0 -> return 0
cmp INC_X, xzr
ble iamax_kernel_zero // INC_X <= 0 -> return 0

PRFM PLDL1KEEP, [X]
mov X_COPY, X // keep base for the finalize rescan

cmp INC_X, #1
bne iamax_kernel_S_BEGIN

iamax_kernel_F_BEGIN:
asr I, N, #2 // I = N / 4 vector blocks
cmp I, xzr
beq iamax_kernel_F1_INIT // N < 4: purely scalar

INIT_F4
subs I, I, #1
beq iamax_kernel_F4_FINALIZE

iamax_kernel_F4:
KERNEL_F4
subs I, I, #1
bne iamax_kernel_F4

iamax_kernel_F4_FINALIZE:
KERNEL_F4_FINALIZE
sub Z, Z, #1 // BUGFIX: Z = last processed index, as KERNEL_F1 expects

iamax_kernel_F1:
ands I, N, #3 // scalar tail: N % 4 elements
ble iamax_kernel_L999

iamax_kernel_F10:
KERNEL_F1
subs I, I, #1
bne iamax_kernel_F10
b iamax_kernel_L999

iamax_kernel_F1_INIT:
INIT_F1
subs N, N, #1 // first element consumed; Z = 1 is already correct
b iamax_kernel_F1

iamax_kernel_S_BEGIN:
INIT_S

subs N, N, #1
ble iamax_kernel_L999

asr I, N, #2 // 4x-unrolled strided loop
cmp I, xzr
ble iamax_kernel_S1

iamax_kernel_S4:
KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne iamax_kernel_S4

iamax_kernel_S1:
ands I, N, #3
ble iamax_kernel_L999

iamax_kernel_S10:
KERNEL_S1
subs I, I, #1
bne iamax_kernel_S10

iamax_kernel_L999:
mov x0, INDEX // 1-based index of the maximum |x|
ret

iamax_kernel_zero:
mov x0, xzr // invalid input: index 0
ret

EPILOGUE

+ 151
- 0
kernel/arm64/izamax.S View File

@@ -0,0 +1,151 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

/* BLAS icamax/izamax (and *amin) kernel (complex, AArch64).
 *
 * Returns the 1-based index of the complex element with the largest
 * (USE_MIN: smallest) magnitude, where magnitude is |Re| + |Im| as in the
 * reference BLAS.  Arguments: N = x0 (complex element count), X = x1,
 * INC_X = x2 (stride in complex elements).  Returns 0 when N <= 0 or
 * INC_X <= 0.  Only a strided scalar loop is implemented; it is also used
 * for INC_X == 1.
 */
#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define INDEX x3 /* index of max/min value */
#define Z x4 /* vector index */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* COND is the condition under which the CURRENT best value/index is kept.
 * On equality the old value wins, so the first occurrence is reported. */
#if defined(USE_MIN)
#define COND le
#else
#define COND ge
#endif

#if !defined(DOUBLE)
#define MAXF s0
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define MAXF d0
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
#endif

/******************************************************************************/

/* Consume X[0] = (re, im) as the initial best: MAXF = |re| + |im|.
 * Z tracks the 1-based index of the element most recently processed. */
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3 // byte stride: 8 = sizeof(complex float)
ld1 {v0.2s}, [X], INC_X
mov Z, #1
mov INDEX, Z
fabs v0.2s, v0.2s
ext v1.8b, v0.8b, v0.8b, #4 // swap lanes so |re| and |im| can be added
fadd MAXF, s0, s1
#else
lsl INC_X, INC_X, #4 // byte stride: 16 = sizeof(complex double)
ld1 {v0.2d}, [X], INC_X
mov Z, #1
mov INDEX, Z
fabs v0.2d, v0.2d
faddp MAXF, v0.2d // pairwise add -> |re| + |im|
#endif
.endm

/* Process one complex element: TMPF = |re| + |im|, then replace the best
 * value/index when it beats the current one (ties keep the old index). */
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], INC_X
add Z, Z, #1
fabs v1.2s, v1.2s
ext v2.8b, v1.8b, v1.8b, #4
fadd TMPF, s1, s2
#else
ld1 {v1.2d}, [X], INC_X
add Z, Z, #1
fabs v1.2d, v1.2d
faddp TMPF, v1.2d
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND // keep old best on COND, else take candidate
csel INDEX, INDEX, Z, COND // index updated in lockstep with the value
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble iamax_kernel_zero // N <= 0 -> return 0
cmp INC_X, xzr
ble iamax_kernel_zero // INC_X <= 0 -> return 0

INIT_S

subs N, N, #1 // first element already consumed by INIT_S
ble iamax_kernel_L999

asr I, N, #2 // I = remaining / 4 (main loop is unrolled 4x)
cmp I, xzr
ble iamax_kernel_S1

iamax_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne iamax_kernel_S4

iamax_kernel_S1:

ands I, N, #3 // tail: remaining % 4 elements
ble iamax_kernel_L999

iamax_kernel_S10:

KERNEL_S1

subs I, I, #1
bne iamax_kernel_S10

iamax_kernel_L999:

mov x0, INDEX // 1-based index of the extreme element
ret

iamax_kernel_zero:

mov x0, xzr // invalid input: index 0
ret

EPILOGUE

+ 243
- 0
kernel/arm64/rot.S View File

@@ -0,0 +1,243 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

/* BLAS srot/drot kernel (AArch64): apply a Givens plane rotation.
 *
 * For each i:  x[i] = C*x[i] + S*y[i];  y[i] = C*y[i] - S*x[i]
 * Arguments: N = x0, X = x1, INC_X = x2, Y = x3, INC_Y = x4; the scalars
 * C and S arrive in s0/s1 (d0/d1 for DOUBLE).  A contiguous 4x-vectorized
 * path is used when both strides are 1, otherwise a scalar strided loop.
 * NOTE(review): negative strides walk backwards from the given base
 * address rather than offsetting to the far end -- verify against the
 * reference BLAS handling of inc < 0.
 */
#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define Y x3 /* Y vector address */
#define INC_Y x4 /* Y stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define C s0 /* scale input value */
#define S s1 /* scale input value */
#else
#define C d0 /* scale input value */
#define S d1 /* scale input value */
#endif

/******************************************************************************/

/* Duplicate C into both lanes of v0 so one fmul covers x and y. */
.macro INIT
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // [C, C]
#else
ins v0.d[1], v0.d[0] // [C, C]
#endif
.endm

/* Build v1 = [-S, S] for the paired scalar kernel below. */
.macro INIT_F1
#if !defined(DOUBLE)
fneg s2, S
ins v1.s[1], v2.s[0] // [-S, S]
#else
fneg d2, S
ins v1.d[1], v2.d[0] // [-S, S]
#endif
.endm

/* One element pair: pack (x, y) into one vector so both rotation outputs
 * are produced by a single fmul/fmla with v0 = [C,C] and v1 = [-S,S]. */
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1 {v2.s}[0], [X]
ld1 {v2.s}[1], [Y] // [Y, X]
ext v3.8b, v2.8b, v2.8b, #4 // [X, Y]
fmul v4.2s, v2.2s, v0.2s // [C*Y, C*X]
fmla v4.2s, v3.2s, v1.2s // [C*Y - S*X, C*X + S*Y]
st1 {v4.s}[0], [X], #4
st1 {v4.s}[1], [Y], #4
#else
ld1 {v2.d}[0], [X]
ld1 {v2.d}[1], [Y] // [Y, X]
ext v3.16b, v2.16b, v2.16b, #8 // [X, Y]
fmul v4.2d, v2.2d, v0.2d // [C*Y, C*X]
fmla v4.2d, v3.2d, v1.2d // [C*Y - S*X, C*X + S*Y]
st1 {v4.d}[0], [X], #8
st1 {v4.d}[1], [Y], #8
#endif
.endm

/* Broadcast C and S across full vectors for the 4-per-iteration path. */
.macro KERNEL_INIT_F4
#if !defined(DOUBLE)
ins v0.d[1], v0.d[0] // [C, C, C, C]
ins v1.s[1], v1.s[0]
ins v1.d[1], v1.d[0] // [S, S, S, S]
#else
ins v1.d[1], v1.d[0] // [S, S]
#endif
.endm

/* Four element pairs per iteration, contiguous x and y (stride 1). */
.macro KERNEL_F4
#if !defined(DOUBLE)
ld1 {v2.4s}, [X]
fmul v4.4s, v0.4s, v2.4s // C*X3, C*X2, C*X1, C*X0
ld1 {v3.4s}, [Y]
fmla v4.4s, v1.4s, v3.4s // C*X3+S*Y3, ..., C*X0+S*Y0
st1 {v4.4s}, [X], #16
fmul v5.4s, v0.4s, v3.4s // C*Y3, C*Y2, C*Y1, C*Y0
fmls v5.4s, v1.4s, v2.4s // C*Y3-S*X3, ..., C*Y0-S*X0
st1 {v5.4s}, [Y], #16
#else // DOUBLE
ld1 {v2.2d, v3.2d}, [X]
fmul v6.2d, v0.2d, v2.2d // C*X1, C*X0
fmul v7.2d, v0.2d, v3.2d // C*X3, C*X2
ld1 {v4.2d, v5.2d}, [Y]
fmla v6.2d, v1.2d, v4.2d // C*X1+S*Y1, C*X0+S*Y0
fmla v7.2d, v1.2d, v5.2d // C*X3+S*Y3, C*X2+S*Y2
st1 {v6.2d, v7.2d}, [X], #32
fmul v16.2d, v0.2d, v4.2d // C*Y1, C*Y0
fmul v17.2d, v0.2d, v5.2d // C*Y3, C*Y2
fmls v16.2d, v1.2d, v2.2d // C*Y1-S*X1, C*Y0-S*X0
fmls v17.2d, v1.2d, v3.2d // C*Y3-S*X3, C*Y2-S*X2
st1 {v16.2d, v17.2d}, [Y], #32
PRFM PLDL1KEEP, [X, #512]
PRFM PLDL1KEEP, [Y, #512]
#endif
.endm

/* Convert both strides from elements to bytes for the strided loop. */
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
lsl INC_Y, INC_Y, #2
#else
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#endif
.endm

/* Same paired rotation as KERNEL_F1, advancing by byte strides. */
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v2.s}[0], [X]
ld1 {v2.s}[1], [Y] // [Y, X]
ext v3.8b, v2.8b, v2.8b, #4 // [X, Y]
fmul v4.2s, v2.2s, v0.2s // [C*Y, C*X]
fmla v4.2s, v3.2s, v1.2s // [C*Y - S*X, C*X + S*Y]
st1 {v4.s}[0], [X], INC_X
st1 {v4.s}[1], [Y], INC_Y
#else
ld1 {v2.d}[0], [X]
ld1 {v2.d}[1], [Y] // [Y, X]
ext v3.16b, v2.16b, v2.16b, #8 // [X, Y]
fmul v4.2d, v2.2d, v0.2d // [C*Y, C*X]
fmla v4.2d, v3.2d, v1.2d // [C*Y - S*X, C*X + S*Y]
st1 {v4.d}[0], [X], INC_X
st1 {v4.d}[1], [Y], INC_Y
#endif

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble rot_kernel_L999 // nothing to do for N <= 0

INIT

cmp INC_X, #1
bne rot_kernel_S_BEGIN
cmp INC_Y, #1
bne rot_kernel_S_BEGIN

rot_kernel_F_BEGIN: // contiguous path (both strides == 1)

asr I, N, #2 // I = N / 4 vector iterations
cmp I, xzr
beq rot_kernel_F1

KERNEL_INIT_F4

rot_kernel_F4:

KERNEL_F4

subs I, I, #1
bne rot_kernel_F4

rot_kernel_F1:

ands I, N, #3 // scalar tail: N % 4 pairs
ble rot_kernel_L999

INIT_F1

rot_kernel_F10:

KERNEL_F1

subs I, I, #1
bne rot_kernel_F10

mov w0, wzr
ret

rot_kernel_S_BEGIN: // strided path

INIT_S
INIT_F1


asr I, N, #2 // 4x-unrolled strided loop
cmp I, xzr
ble rot_kernel_S1

rot_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne rot_kernel_S4

rot_kernel_S1:

ands I, N, #3
ble rot_kernel_L999


rot_kernel_S10:

KERNEL_S1

subs I, I, #1
bne rot_kernel_S10

rot_kernel_L999:

mov w0, wzr
ret

+ 253
- 0
kernel/arm64/scal.S View File

@@ -0,0 +1,253 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

/* BLAS sscal/dscal kernel (AArch64):  x[i] *= alpha  for i = 0..N-1.
 *
 * NOTE(review): the register assignments (N = x0, X = x3, INC_X = x4, with
 * alpha in s0/d0) presumably follow the OpenBLAS scal kernel calling
 * convention where x1/x2 carry unused dummy arguments -- confirm against
 * the kernel interface.  alpha == 0 takes a fast path that simply stores
 * zeros.  There is no guard for INC_X <= 0; a non-positive stride walks
 * in place or backwards from the base address.
 */
#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x3 /* X vector address */
#define X_COPY x5 /* X vector address */
#define INC_X x4 /* X stride */
#define I x1 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define DA s0 /* scale input value */
#define DAV {v0.s}[0]
#define TMPF s1
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define DA d0 /* scale input value */
#define DAV {v0.d}[0]
#define TMPF d1
#define TMPVF {v1.d}[0]
#define SZ 8
#endif

/******************************************************************************/

/* Scale one contiguous element in place. */
.macro KERNEL_F1

ldr TMPF, [X]
fmul TMPF, TMPF, DA
str TMPF, [X], #SZ

.endm

/* Broadcast alpha across all lanes of v0 for the vectorized loop. */
.macro KERNEL_INIT_F8

#if !defined(DOUBLE)
ins v0.s[1], v0.s[0]
ins v0.s[2], v0.s[0]
ins v0.s[3], v0.s[0]
#else
ins v0.d[1], v0.d[0]
#endif

.endm

/* Scale 8 contiguous elements per iteration (32B single / 64B double). */
.macro KERNEL_F8
#if !defined(DOUBLE)
ld1 {v1.4s, v2.4s}, [X]
fmul v1.4s, v1.4s, v0.4s
fmul v2.4s, v2.4s, v0.4s
st1 {v1.4s, v2.4s}, [X], #32
#else // DOUBLE
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X]
fmul v1.2d, v1.2d, v0.2d
fmul v2.2d, v2.2d, v0.2d
fmul v3.2d, v3.2d, v0.2d
fmul v4.2d, v4.2d, v0.2d
st1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64
#endif
PRFM PLDL1KEEP, [X, #1024]
.endm

/* Convert the stride from elements to bytes for the strided loops. */
.macro INIT_S

#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
#else
lsl INC_X, INC_X, #3
#endif

.endm

/* Scale one strided element in place (store to the load address, then
 * advance X by the byte stride via the post-indexed st1). */
.macro KERNEL_S1
ldr TMPF, [X]
fmul TMPF, TMPF, DA
st1 TMPVF, [X], INC_X
.endm

/* 4 strided elements per iteration; X reads run ahead of X_COPY writes,
 * decoupling the load and store address streams. */
.macro KERNEL_S4
#if !defined(DOUBLE)
ldr s1, [X]
add X, X, INC_X
fmul s1, s1, s0
str s1, [X_COPY]
add X_COPY, X_COPY, INC_X

ldr s2, [X]
add X, X, INC_X
fmul s2, s2, s0
str s2, [X_COPY]
add X_COPY, X_COPY, INC_X

ldr s3, [X]
add X, X, INC_X
fmul s3, s3, s0
str s3, [X_COPY]
add X_COPY, X_COPY, INC_X

ldr s4, [X]
add X, X, INC_X
fmul s4, s4, s0
str s4, [X_COPY]
add X_COPY, X_COPY, INC_X
#else
ldr d1, [X]
add X, X, INC_X
fmul d1, d1, d0
str d1, [X_COPY]
add X_COPY, X_COPY, INC_X

ldr d2, [X]
add X, X, INC_X
fmul d2, d2, d0
str d2, [X_COPY]
add X_COPY, X_COPY, INC_X

ldr d3, [X]
add X, X, INC_X
fmul d3, d3, d0
str d3, [X_COPY]
add X_COPY, X_COPY, INC_X

ldr d4, [X]
add X, X, INC_X
fmul d4, d4, d0
str d4, [X_COPY]
add X_COPY, X_COPY, INC_X
#endif
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble scal_kernel_L999 // nothing to do for N <= 0

fcmp DA, #0.0
beq scal_kernel_zero // alpha == 0: just store zeros

cmp INC_X, #1
bne scal_kernel_S_BEGIN

scal_kernel_F_BEGIN: // contiguous path (stride == 1)

asr I, N, #3 // I = N / 8 vector iterations
cmp I, xzr
beq scal_kernel_F1

KERNEL_INIT_F8

scal_kernel_F8:

KERNEL_F8

subs I, I, #1
bne scal_kernel_F8

scal_kernel_F1:

ands I, N, #7 // scalar tail: N % 8 elements
ble scal_kernel_L999

scal_kernel_F10:

KERNEL_F1

subs I, I, #1
bne scal_kernel_F10

mov w0, wzr
ret

scal_kernel_S_BEGIN: // strided path

INIT_S
mov X_COPY, X

asr I, N, #2 // 4x-unrolled strided loop
cmp I, xzr
ble scal_kernel_S1

scal_kernel_S4:

KERNEL_S4

subs I, I, #1
bne scal_kernel_S4

scal_kernel_S1:

ands I, N, #3
ble scal_kernel_L999

scal_kernel_S10:

KERNEL_S1

subs I, I, #1
bne scal_kernel_S10

scal_kernel_L999:

mov w0, wzr
ret

scal_kernel_zero: // alpha == 0: store v0 lane 0 (== 0.0) N times

INIT_S

scal_kernel_Z1:

st1 DAV, [X], INC_X
subs N, N, #1
bne scal_kernel_Z1

mov w0, wzr
ret

EPILOGUE

+ 807
- 571
kernel/arm64/sgemm_kernel_4x4.S
File diff suppressed because it is too large
View File


+ 178
- 0
kernel/arm64/snrm2.S View File

@@ -0,0 +1,178 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

/* BLAS snrm2 kernel (single precision, AArch64):
 * returns sqrt(sum(x[i]^2)), the Euclidean norm of X.
 *
 * Arguments: N = x0, X = x1, INC_X = x2 (stride in elements); result in s0.
 * Returns 0 when N <= 0 or INC_X <= 0.
 * NOTE(review): this is a naive sum of squares with no scaling, unlike the
 * reference snrm2 which rescales to avoid overflow/underflow of x^2 for
 * extreme inputs -- confirm this precision trade-off is intended.
 */
#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#define TMPF s6
#define SSQ s0
#define TMPVF {v6.s}[0]
#define SZ 4

/******************************************************************************/

/* Seed the accumulator with X[0]^2 (contiguous path). */
.macro INIT_F1
ldr TMPF, [X], #SZ
fmul SSQ, TMPF, TMPF
.endm

/* Accumulate one contiguous element: SSQ += x^2. */
.macro KERNEL_F1
ldr TMPF, [X], #SZ
fmul TMPF, TMPF, TMPF
fadd SSQ, SSQ, TMPF
.endm

/* Seed the accumulator with the sum of squares of elements 0..3:
 * square all four lanes, then reduce via a lane rotate + two adds. */
.macro INIT_F4
ld1 {v1.4s}, [X], #16
fmul v1.4s, v1.4s, v1.4s
ext v2.16b, v1.16b, v1.16b, #8
fadd v2.2s, v1.2s, v2.2s
faddp SSQ, v2.2s
.endm

/* Accumulate 4 contiguous elements: SSQ += x0^2 + x1^2 + x2^2 + x3^2. */
.macro KERNEL_F4
ld1 {v1.4s}, [X], #16
fmul v1.4s, v1.4s, v1.4s
ext v2.16b, v1.16b, v1.16b, #8
fadd v2.2s, v1.2s, v2.2s
faddp TMPF, v2.2s
fadd SSQ, SSQ, TMPF
.endm

/* Strided init: byte stride, then seed SSQ with X[0]^2. */
.macro INIT_S
lsl INC_X, INC_X, #2 // element stride -> byte stride (4 = sizeof(float))
ld1 TMPVF, [X], INC_X
fmul SSQ, TMPF, TMPF
.endm

/* Accumulate one strided element: SSQ += x^2. */
.macro KERNEL_S1
ld1 TMPVF, [X], INC_X
fmul TMPF, TMPF, TMPF
fadd SSQ, SSQ, TMPF
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble nrm2_kernel_zero // N <= 0 -> return 0
cmp INC_X, xzr
ble nrm2_kernel_zero // INC_X <= 0 -> return 0
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN

nrm2_kernel_F_BEGIN: // contiguous path

asr I, N, #2 // I = N / 4 vector blocks
cmp I, xzr
beq nrm2_kernel_F1_INIT // N < 4: purely scalar

INIT_F4
subs I, I, #1
beq nrm2_kernel_F1

nrm2_kernel_F4:

KERNEL_F4

subs I, I, #1
bne nrm2_kernel_F4

nrm2_kernel_F1:

ands I, N, #3 // scalar tail: N % 4 elements
ble nrm2_kernel_L999

nrm2_kernel_F10:

KERNEL_F1

subs I, I, #1
bne nrm2_kernel_F10

b nrm2_kernel_L999

nrm2_kernel_F1_INIT:
INIT_F1
subs N, N, #1 // first element consumed by INIT_F1
b nrm2_kernel_F1

nrm2_kernel_S_BEGIN: // strided path

INIT_S

subs N, N, #1
ble nrm2_kernel_L999

asr I, N, #2 // 4x-unrolled strided loop
cmp I, xzr
ble nrm2_kernel_S1

nrm2_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne nrm2_kernel_S4

nrm2_kernel_S1:

ands I, N, #3
ble nrm2_kernel_L999

nrm2_kernel_S10:

KERNEL_S1

subs I, I, #1
bne nrm2_kernel_S10

nrm2_kernel_L999:
fsqrt SSQ, SSQ // norm = sqrt(sum of squares), returned in s0
ret

nrm2_kernel_zero:
fmov SSQ, wzr // invalid input: return 0.0

ret

EPILOGUE

+ 1405
- 0
kernel/arm64/strmm_kernel_4x4.S
File diff suppressed because it is too large
View File


+ 266
- 0
kernel/arm64/swap.S View File

@@ -0,0 +1,266 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

/* BLAS sswap/dswap/cswap/zswap kernel (AArch64): exchange vectors X and Y.
 *
 * NOTE(review): the register assignments (N = x0, X = x3, INC_X = x4,
 * Y = x5, INC_Y = x6) presumably follow the OpenBLAS swap kernel calling
 * convention where x1/x2 carry unused dummy arguments -- confirm against
 * the kernel interface.  A bulk 8-elements-per-iteration path is used when
 * both strides are 1; otherwise a scalar strided loop.  Since this is a
 * pure byte exchange, the vector loads use a .4s arrangement regardless of
 * the actual element type -- only the byte count matters.
 * NOTE(review): negative strides walk backwards from the given base
 * address rather than offsetting to the far end -- verify against the
 * reference BLAS handling of inc < 0.
 */
#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x3 /* X vector address */
#define INC_X x4 /* X stride */
#define Y x5 /* Y vector address */
#define INC_Y x6 /* Y stride */
#define I x1 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define TMP0 s0
#define TMPV0 {v0.s}[0]
#define TMP1 s1
#define TMPV1 {v1.s}[0]
#define SZ 4
#else
#define TMP0 d0
#define TMPV0 {v0.d}[0]
#define TMP1 d1
#define TMPV1 {v1.d}[0]
#define SZ 8
#endif

/******************************************************************************/

/* Swap one element (one complex pair when COMPLEX) between X and Y. */
.macro KERNEL_F1

#if !defined(COMPLEX)
ldr TMP0, [X]
ldr TMP1, [Y]
str TMP0, [Y], #SZ
str TMP1, [X], #SZ
#else
#if !defined(DOUBLE)
ld1 {v0.2s}, [X]
ld1 {v1.2s}, [Y]
st1 {v0.2s}, [Y], #8
st1 {v1.2s}, [X], #8
#else
ld1 {v0.2d}, [X]
ld1 {v1.2d}, [Y]
st1 {v0.2d}, [Y], #16
st1 {v1.2d}, [X], #16
#endif
#endif

.endm

/* Swap 8 elements between contiguous X and Y.  Each ld1/st1 group moves
 * 32 bytes; the group is repeated until 8 * element-size bytes are done
 * (1x/2x/2x/4x for s/d/c/z respectively). */
.macro KERNEL_F8

#if !defined(COMPLEX)
#if !defined(DOUBLE)
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
#else // DOUBLE
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
#endif
#else // COMPLEX
#if !defined(DOUBLE)
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
#else // DOUBLE
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
ld1 {v0.4s, v1.4s}, [X]
ld1 {v2.4s, v3.4s}, [Y]
st1 {v0.4s, v1.4s}, [Y], #32
st1 {v2.4s, v3.4s}, [X], #32
#endif
#endif

.endm

/* Convert both strides from elements to bytes (complex doubles the size). */
.macro INIT_S

#if !defined(COMPLEX)
#if !defined(DOUBLE)
lsl INC_X, INC_X, #2
lsl INC_Y, INC_Y, #2
#else
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#endif
#else
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#else
lsl INC_X, INC_X, #4
lsl INC_Y, INC_Y, #4
#endif
#endif

.endm

/* Swap one strided element via integer registers (w10/w11 for 4 bytes,
 * x10..x13 for 8/16 bytes), then advance both pointers. */
.macro KERNEL_S1

#if !defined(COMPLEX)
#if !defined(DOUBLE)
ldr w10, [X]
ldr w11, [Y]
str w10, [Y]
str w11, [X]
#else
ldr x10, [X]
ldr x11, [Y]
str x10, [Y]
str x11, [X]
#endif
#else
#if !defined(DOUBLE)
ldr x10, [X]
ldr x11, [Y]
str x10, [Y]
str x11, [X]
#else
ldr x10, [X]
ldr x11, [Y]
str x10, [Y]
str x11, [X]

ldr x12, [X, #8]
ldr x13, [Y, #8]
str x12, [Y, #8]
str x13, [X, #8]
#endif
#endif
add Y, Y, INC_Y
add X, X, INC_X
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble swap_kernel_L999 // nothing to do for N <= 0

cmp INC_X, #1
bne swap_kernel_S_BEGIN
cmp INC_Y, #1
bne swap_kernel_S_BEGIN

swap_kernel_F_BEGIN: // contiguous path (both strides == 1)

asr I, N, #3 // I = N / 8 bulk iterations
cmp I, xzr
beq swap_kernel_F1

swap_kernel_F8:

KERNEL_F8

subs I, I, #1
bne swap_kernel_F8

swap_kernel_F1:

ands I, N, #7 // scalar tail: N % 8 elements
ble swap_kernel_L999

swap_kernel_F10:

KERNEL_F1

subs I, I, #1
bne swap_kernel_F10

b swap_kernel_L999


swap_kernel_S_BEGIN: // strided path

INIT_S

asr I, N, #2 // 4x-unrolled strided loop
cmp I, xzr
ble swap_kernel_S1

swap_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne swap_kernel_S4

swap_kernel_S1:

ands I, N, #3
ble swap_kernel_L999

swap_kernel_S10:

KERNEL_S1

subs I, I, #1
bne swap_kernel_S10

swap_kernel_L999:

mov w0, wzr
ret

EPILOGUE

+ 273
- 0
kernel/arm64/zamax.S View File

@@ -0,0 +1,273 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* fcsel condition used to keep the running extreme: keep MAXF when it is
   less-or-equal (min search) / greater-or-equal (max search) than the
   candidate TMPF. */
#if defined(USE_MIN)
#define COND le
#else
#define COND ge
#endif

/* Precision-dependent aliases: single precision uses s-registers / 4-byte
   elements, double precision uses d-registers / 8-byte elements. */
#if !defined(DOUBLE)
#define REG0 wzr
#define MAXF s0 /* running extreme; also the return value */
#define TMPF s1 /* per-iteration candidate |re|+|im| */
#define TMPVF {v1.s}[0]
#define SZ 4
#else
#define REG0 xzr
#define MAXF d0 /* running extreme; also the return value */
#define TMPF d1 /* per-iteration candidate |re|+|im| */
#define TMPVF {v1.d}[0]
#define SZ 8
#endif

/******************************************************************************/

/* Load the first complex element (unit stride) and seed the running
   extreme: MAXF = |re| + |im|.  Advances X by one element. */
.macro INIT_F1
#if !defined(DOUBLE)
ld1 {v0.2s}, [X], #8
fabs v0.2s, v0.2s
ext v1.8b, v0.8b, v0.8b, #4 // v1 = [|re|, |im|] (rotated copy)
fadd MAXF, s0, s1 // MAXF = |re| + |im|
#else
ld1 {v0.2d}, [X], #16
fabs v0.2d, v0.2d
faddp MAXF, v0.2d // pairwise add: MAXF = |re| + |im|
#endif
.endm

/* Process one complex element (unit stride): candidate TMPF = |re| + |im|,
   then merge into MAXF via fcmp/fcsel using COND. */
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], #8
fabs v1.2s, v1.2s
ext v2.8b, v1.8b, v1.8b, #4 // v2 = [|re|, |im|] (rotated copy)
fadd TMPF, s1, s2 // TMPF = |re| + |im|
#else
ld1 {v1.2d}, [X], #16
fabs v1.2d, v1.2d
faddp TMPF, v1.2d // TMPF = |re| + |im|
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND // keep the better of MAXF / TMPF
.endm

/* Seed the running extreme from the first 4 complex elements (unit stride):
   de-interleave re/im, take |re|+|im| per element, then reduce across the
   4 candidates into MAXF. Advances X by 4 elements. */
.macro INIT_F4
#if !defined(DOUBLE)
ld2 {v0.4s,v1.4s}, [X], #32 // ld2 de-interleaves: v0 = reals, v1 = imags
fabs v0.4s, v0.4s // [X6, X4, X2, X0]
fabs v1.4s, v1.4s // [X7, X5, X3, X1]
fadd v0.4s, v0.4s, v1.4s // [X7+X6, X5+X4, X3+X2, X1+X0]
#if defined(USE_MIN)
fminv MAXF, v0.4s // across-lanes min of the 4 candidates
#else
fmaxv MAXF, v0.4s // across-lanes max of the 4 candidates
#endif
#else // DOUBLE
ld4 {v0.2d,v1.2d,v2.2d,v3.2d}, [X], #64 // ld4 de-interleaves stride 4
fabs v0.2d, v0.2d
fabs v1.2d, v1.2d
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fadd v0.2d, v0.2d, v1.2d // |re|+|im| for elements 0, 2
fadd v2.2d, v2.2d, v3.2d // |re|+|im| for elements 1, 3
#if defined(USE_MIN)
fmin v0.2d, v0.2d, v2.2d
fminp MAXF, v0.2d // pairwise min -> scalar seed
#else
fmax v0.2d, v0.2d, v2.2d
fmaxp MAXF, v0.2d // pairwise max -> scalar seed
#endif
#endif
.endm

/* Process 4 complex elements (unit stride): compute |re|+|im| per element,
   reduce the block's extreme into the candidate TMPF, then merge TMPF into
   the running extreme MAXF with fcmp/fcsel.
   BUG FIX: the DOUBLE path previously reduced with
   "fminp/fmaxp MAXF, v1.2d", clobbering the running extreme and then
   comparing it against a stale TMPF — wrong results for double zamax/zamin.
   The reduction must target TMPF, as the single-precision path already does. */
.macro KERNEL_F4
#if !defined(DOUBLE)
ld2 {v1.4s,v2.4s}, [X], #32 // ld2 de-interleaves: v1 = reals, v2 = imags
fabs v1.4s, v1.4s // [X6, X4, X2, X0]
fabs v2.4s, v2.4s // [X7, X5, X3, X1]
fadd v1.4s, v1.4s, v2.4s // [X7+X6, X5+X4, X3+X2, X1+X0]
#if defined(USE_MIN)
fminv TMPF, v1.4s // block candidate -> TMPF
#else
fmaxv TMPF, v1.4s // block candidate -> TMPF
#endif
#else // DOUBLE
ld4 {v1.2d,v2.2d,v3.2d,v4.2d}, [X], #64
fabs v1.2d, v1.2d
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d
fadd v1.2d, v1.2d, v2.2d // |re|+|im| for elements 0, 2
fadd v3.2d, v3.2d, v4.2d // |re|+|im| for elements 1, 3
#if defined(USE_MIN)
fmin v1.2d, v1.2d, v3.2d
fminp TMPF, v1.2d // block candidate -> TMPF (was MAXF: bug)
#else
fmax v1.2d, v1.2d, v3.2d
fmaxp TMPF, v1.2d // block candidate -> TMPF (was MAXF: bug)
#endif
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND // keep the better of MAXF / TMPF
.endm

/* Strided-path seed: convert INC_X from elements to bytes (one complex is
   2*SZ bytes), load the first element, and set MAXF = |re| + |im|.
   Advances X by one stride. */
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3 // stride *= 8 bytes (2 floats)
ld1 {v0.2s}, [X], INC_X
fabs v0.2s, v0.2s
ext v1.8b, v0.8b, v0.8b, #4 // v1 = [|re|, |im|] (rotated copy)
fadd MAXF, s0, s1 // MAXF = |re| + |im|
#else
lsl INC_X, INC_X, #4 // stride *= 16 bytes (2 doubles)
ld1 {v0.2d}, [X], INC_X
fabs v0.2d, v0.2d
faddp MAXF, v0.2d // MAXF = |re| + |im|
#endif
.endm

/* Strided-path step: one complex element at byte stride INC_X; candidate
   TMPF = |re| + |im|, merged into MAXF via fcmp/fcsel. */
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], INC_X
fabs v1.2s, v1.2s
ext v2.8b, v1.8b, v1.8b, #4 // v2 = [|re|, |im|] (rotated copy)
fadd TMPF, s1, s2 // TMPF = |re| + |im|
#else
ld1 {v1.2d}, [X], INC_X
fabs v1.2d, v1.2d
faddp TMPF, v1.2d // TMPF = |re| + |im|
#endif
fcmp MAXF, TMPF
fcsel MAXF, MAXF, TMPF, COND // keep the better of MAXF / TMPF
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

/* zamax/zamin driver: returns in MAXF (s0/d0) the extreme of |re|+|im|
   over N complex elements. N <= 0 or INC_X <= 0 returns 0. */
cmp N, xzr
ble amax_kernel_zero
cmp INC_X, xzr
ble amax_kernel_zero

cmp INC_X, #1
bne amax_kernel_S_BEGIN // non-unit stride -> strided path

amax_kernel_F_BEGIN: // unit-stride (fast) path

asr I, N, #2 // I = number of 4-element blocks
cmp I, xzr
beq amax_kernel_F1_INIT // N < 4: seed from a single element

INIT_F4 // seed MAXF from the first block
subs I, I, #1
beq amax_kernel_F1

amax_kernel_F4: // main 4-element loop

KERNEL_F4

subs I, I, #1
bne amax_kernel_F4

amax_kernel_F1: // remainder of N & 3 elements

ands I, N, #3
ble amax_kernel_L999

amax_kernel_F10:

KERNEL_F1

subs I, I, #1
bne amax_kernel_F10

ret

amax_kernel_F1_INIT: // N < 4: seed with one element, rest via F1 loop

INIT_F1
subs N, N, #1 // one element consumed by the seed
b amax_kernel_F1

amax_kernel_S_BEGIN: // strided path

INIT_S // seed MAXF from the first element

subs N, N, #1 // one element consumed by the seed
ble amax_kernel_L999

asr I, N, #2 // I = number of unrolled 4-iteration blocks
cmp I, xzr
ble amax_kernel_S1

amax_kernel_S4: // 4x unrolled strided loop

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne amax_kernel_S4

amax_kernel_S1: // strided remainder

ands I, N, #3
ble amax_kernel_L999

amax_kernel_S10:

KERNEL_S1

subs I, I, #1
bne amax_kernel_S10

amax_kernel_L999:

ret

amax_kernel_zero: // degenerate sizes: return 0.0

fmov MAXF, REG0
ret

EPILOGUE

+ 164
- 0
kernel/arm64/zasum.S View File

@@ -0,0 +1,164 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* zasum is double-complex only here, so the aliases are unconditional. */
#define REG0 xzr
#define SUMF d0 /* running sum; also the return value */
#define TMPF d1 /* per-element |re|+|im| */
#define TMPVF {v1.d}[0]
#define SZ 8

/******************************************************************************/

/* One complex element (unit stride): SUMF += |re| + |im|. */
.macro KERNEL_F1
ld1 {v1.2d}, [X], #16
fabs v1.2d, v1.2d
faddp TMPF, v1.2d // TMPF = |re| + |im|
fadd SUMF, SUMF, TMPF
.endm

/* Four complex elements (unit stride), accumulated vector-wise into v0;
   v0 holds two partial sums until KERNEL_F4_FINALIZE collapses them. */
.macro KERNEL_F4
ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64
fabs v1.2d, v1.2d
fabs v2.2d, v2.2d
fabs v3.2d, v3.2d
fabs v4.2d, v4.2d

fadd v1.2d, v1.2d, v2.2d
fadd v3.2d, v3.2d, v4.2d

fadd v0.2d, v0.2d, v1.2d
fadd v0.2d, v0.2d, v3.2d

PRFM PLDL1KEEP, [X, #1024] // prefetch ahead on the streaming load
.endm

/* Collapse the two partial sums in v0 into the scalar SUMF. */
.macro KERNEL_F4_FINALIZE
faddp SUMF, v0.2d
.endm

/* Convert INC_X from elements to bytes (one double complex = 16 bytes). */
.macro INIT_S
lsl INC_X, INC_X, #4
.endm

/* One complex element at byte stride INC_X: SUMF += |re| + |im|. */
.macro KERNEL_S1
ld1 {v1.2d}, [X], INC_X
fabs v1.2d, v1.2d
faddp TMPF, v1.2d // TMPF = |re| + |im|
fadd SUMF, SUMF, TMPF
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

/* zasum driver: returns in SUMF (d0) the sum of |re|+|im| over N complex
   elements. N <= 0 or INC_X <= 0 returns 0. */
fmov SUMF, REG0 // zero the accumulator (clears all of v0)

cmp N, xzr
ble asum_kernel_L999
cmp INC_X, xzr
ble asum_kernel_L999

cmp INC_X, #1
bne asum_kernel_S_BEGIN // non-unit stride -> strided path

asum_kernel_F_BEGIN: // unit-stride (fast) path

asr I, N, #2 // I = number of 4-element blocks
cmp I, xzr
beq asum_kernel_F1

asum_kernel_F4:

KERNEL_F4

subs I, I, #1
bne asum_kernel_F4

KERNEL_F4_FINALIZE // collapse vector partial sums into SUMF

asum_kernel_F1: // remainder of N & 3 elements

ands I, N, #3
ble asum_kernel_L999

asum_kernel_F10:

KERNEL_F1

subs I, I, #1
bne asum_kernel_F10

asum_kernel_L999:
ret

asum_kernel_S_BEGIN: // strided path

INIT_S

asr I, N, #2
cmp I, xzr
ble asum_kernel_S1

asum_kernel_S4: // 4x unrolled strided loop

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne asum_kernel_S4

asum_kernel_S1: // strided remainder

ands I, N, #3
ble asum_kernel_L999

asum_kernel_S10:

KERNEL_S1

subs I, I, #1
bne asum_kernel_S10

ret

EPILOGUE

+ 301
- 0
kernel/arm64/zaxpy.S View File

@@ -0,0 +1,301 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x3 /* X vector address */
#define INC_X x4 /* X stride */
#define Y x5 /* Y vector address */
#define INC_Y x6 /* Y stride */
#define I x1 /* loop variable */
#define Y_COPY x7 /* second Y read pointer used by the double F4 kernel */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define DA_R s0 /* scale input value (real part of alpha) */
#define DA_I s1 /* scale input value (imag part of alpha) */
#define TMPX v2.2s
#define TMPY v3.2s
#define SZ 4
#else
#define DA_R d0 /* scale input value (real part of alpha) */
#define DA_I d1 /* scale input value (imag part of alpha) */
#define TMPX v2.2d
#define TMPY v3.2d
#define SZ 8
#endif

/******************************************************************************/

/* Build the complex-multiply constant vectors from alpha:
   v0 = [DA_R, DA_R] and v1 = [+-DA_I, -+DA_I], with signs chosen so that
   "y += v0*x + v1*swap(x)" performs y += alpha*x (or conj(alpha)*x when
   CONJ is defined). */
.macro INIT

#if !defined(CONJ)
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
fneg s2, DA_I
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I
ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I
#else
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
fneg d2, DA_I
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I
#endif
#else
#if !defined(DOUBLE)
fneg s2, DA_R
ins v0.s[1], v2.s[0] // v0 = -DA_R, DA_R
ins v1.s[1], v1.s[0] // v1 = DA_I, DA_I
#else
fneg d2, DA_R
ins v0.d[1], v2.d[0] // v0 = -DA_R, DA_R
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I
#endif
#endif

.endm

/* One complex element (unit stride): Y[i] += alpha * X[i]. */
.macro KERNEL_F1

#if !defined(DOUBLE)
ld1 {v2.2s}, [X], #8 // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2s}, [Y] // V3 = Y[iy+1], Y[iy]
ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1]
fmla v3.2s, v0.2s, v2.2s // Y[iy] += DA_R * X[ix]
// Y[iy+1] += +-DA_R * X[ix+1]
fmla v3.2s, v1.2s, v4.2s // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v3.2s}, [Y], #8
#else
ld1 {v2.2d}, [X], #16 // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2d}, [Y] // V3 = Y[iy+1], Y[iy]
ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1]
fmla v3.2d, v0.2d, v2.2d // Y[iy] += DA_R * X[ix]
// Y[iy+1] += +-DA_R * X[ix+1]
fmla v3.2d, v1.2d, v4.2d // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v3.2d}, [Y], #16
#endif

.endm

/* Widen the 2-lane alpha constants to 4 lanes for the single-precision
   F4 kernel (no-op for double, which stays 2-lane). */
.macro KERNEL_INIT_F4

#if !defined(DOUBLE)
// Replicate the lower 2 floats into the upper 2 slots
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I
#endif

.endm

/* Four complex elements (unit stride): Y[i..i+3] += alpha * X[i..i+3].
   The double path reads Y through Y_COPY and writes through Y; both
   pointers advance in lockstep. */
.macro KERNEL_F4

#if !defined(DOUBLE)
ld1 {v2.4s,v3.4s}, [X], #32 // V2 = X[3], X[2], X[1], X[0]
// V3 = X[7], X[6], X[5], X[4]
ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1]
ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1]
ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1]

ld1 {v4.4s,v5.4s}, [Y] // V4 = Y[3], Y[2], Y[1], Y[0]
// V5 = Y[7], Y[6], Y[5], Y[4]

ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5]
ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5]
ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5]

fmla v4.4s, v0.4s, v2.4s // Y[iy] += DA_R * X[ix]
// Y[iy+1] += +-DA_R * X[ix+1]
fmla v4.4s, v1.4s, v6.4s // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v4.4s}, [Y], #16

fmla v5.4s, v0.4s, v3.4s // Y[iy] += DA_R * X[ix]
fmla v5.4s, v1.4s, v7.4s // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += +-DA_R * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v5.4s}, [Y], #16
#else // DOUBLE
ld1 {v2.2d,v3.2d}, [X], #32 // CX0, CX1, CX2, CX3
ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1]
ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1]

ld1 {v4.2d,v5.2d}, [X], #32 // CX0, CX1, CX2, CX3
ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1]
ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1]

ld1 {v16.2d,v17.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3

fmla v16.2d, v0.2d, v2.2d
fmla v17.2d, v0.2d, v3.2d

ld1 {v18.2d,v19.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3

fmla v16.2d, v1.2d, v20.2d
fmla v17.2d, v1.2d, v21.2d
st1 {v16.2d,v17.2d}, [Y], #32

fmla v18.2d, v0.2d, v4.2d
fmla v19.2d, v0.2d, v5.2d
fmla v18.2d, v1.2d, v22.2d
fmla v19.2d, v1.2d, v23.2d
st1 {v18.2d,v19.2d}, [Y], #32
#endif
PRFM PLDL1KEEP, [X, #512] // prefetch ahead on both streams
PRFM PLDL1KEEP, [Y, #512]
.endm

/* Convert both strides from elements to bytes (one complex = 2*SZ bytes). */
.macro INIT_S

#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#else
lsl INC_X, INC_X, #4
lsl INC_Y, INC_Y, #4
#endif

.endm

/* One complex element at byte strides INC_X / INC_Y: Y[i] += alpha * X[i]. */
.macro KERNEL_S1

#if !defined(DOUBLE)
ld1 {v2.2s}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2s}, [Y] // V3 = Y[iy+1], Y[iy]
ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1]
fmla v3.2s, v0.2s, v2.2s // Y[iy] += DA_R * X[ix]
// Y[iy+1] += +-DA_R * X[ix+1]
fmla v3.2s, v1.2s, v4.2s // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v3.2s}, [Y], INC_Y
#else
ld1 {v2.2d}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2d}, [Y] // V3 = Y[iy+1], Y[iy]
ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1]
fmla v3.2d, v0.2d, v2.2d // Y[iy] += DA_R * X[ix]
// Y[iy+1] += +-DA_R * X[ix+1]
fmla v3.2d, v1.2d, v4.2d // Y[iy] += +-DA_I * X[ix+1]
// Y[iy+1] += DA_I * X[ix]
st1 {v3.2d}, [Y], INC_Y
#endif

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

/* zaxpy driver: Y += alpha * X over N complex elements.
   Returns 0 in w0; N <= 0 or alpha == 0 is a no-op. */
cmp N, xzr
ble zaxpy_kernel_L999

mov Y_COPY, Y // second read pointer for the double F4 kernel

fcmp DA_R, #0.0 // alpha == 0+0i -> nothing to do
bne .L1
fcmp DA_I, #0.0
beq zaxpy_kernel_L999

.L1:
INIT // build the alpha constant vectors

cmp INC_X, #1
bne zaxpy_kernel_S_BEGIN // either stride non-unit -> strided path
cmp INC_Y, #1
bne zaxpy_kernel_S_BEGIN

zaxpy_kernel_F_BEGIN: // unit-stride (fast) path

asr I, N, #2 // I = number of 4-element blocks
cmp I, xzr
beq zaxpy_kernel_F1

KERNEL_INIT_F4 // widen alpha constants for the 4-wide kernel

zaxpy_kernel_F4:

KERNEL_F4

subs I, I, #1
bne zaxpy_kernel_F4

zaxpy_kernel_F1: // remainder of N & 3 elements

ands I, N, #3
ble zaxpy_kernel_L999

zaxpy_kernel_F10:

KERNEL_F1

subs I, I, #1
bne zaxpy_kernel_F10

mov w0, wzr
ret

zaxpy_kernel_S_BEGIN: // strided path

INIT_S

asr I, N, #2
cmp I, xzr
ble zaxpy_kernel_S1

zaxpy_kernel_S4: // 4x unrolled strided loop

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne zaxpy_kernel_S4

zaxpy_kernel_S1: // strided remainder

ands I, N, #3
ble zaxpy_kernel_L999

zaxpy_kernel_S10:

KERNEL_S1

subs I, I, #1
bne zaxpy_kernel_S10

zaxpy_kernel_L999:

mov w0, wzr
ret

+ 302
- 0
kernel/arm64/zdot.S View File

@@ -0,0 +1,302 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define Y x3 /* Y vector address */
#define INC_Y x4 /* Y stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* NOTE(review): the DSDOT branch promotes the single-precision accumulator
   to d0, but the kernel bodies still operate on s-registers/2s lanes —
   DSDOT appears to be a dead configuration for this complex dot kernel;
   confirm it is never defined for zdot builds. */
#if !defined(DOUBLE)
#if !defined(DSDOT)
#define REG0 wzr
#define DOTF s0 /* real part of the result */
#else // DSDOT
#define REG0 xzr
#define DOTF d0
#endif
#define DOTI s1 /* imaginary part of the result */
#define TMPX s2
#define LD1VX {v2.s}[0]
#define TMPY s3
#define LD1VY {v3.s}[0]
#define TMPVY v3.s[0]
#define SZ 4
#else
#define REG0 xzr
#define DOTF d0 /* real part of the result */
#define DOTI d1 /* imaginary part of the result */
#define TMPX d2
#define LD1VX {v2.d}[0]
#define TMPY d3
#define LD1VY {v3.d}[0]
#define TMPVY v3.d[0]
#define SZ 8
#endif

/******************************************************************************/

/* One complex element (unit stride): accumulate X[i]*Y[i] (or
   conj(X[i])*Y[i] when CONJ is defined) into the scalar pair DOTF/DOTI. */
.macro KERNEL_F1

#if !defined(DOUBLE)
ld1 {v2.2s}, [X], #8 // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2s}, [Y], #8 // V3 = Y[iy+1], Y[iy]; Y += 2
ins v4.s[0], v2.s[1] // V4 = X[ix+1]
#if !defined(CONJ)
fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1]
fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy]
fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
#else
fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1]
fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy]
fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
#endif
#else // DOUBLE
ld1 {v2.2d}, [X], #16 // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2d}, [Y], #16 // V3 = Y[iy+1], Y[iy]; Y += 2
ins v4.d[0], v2.d[1] // V4 = X[ix+1]
#if !defined(CONJ)
fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1]
fmla DOTI, d4, v3.d[0] // dot[1] += X[ix+1] * Y[iy]
fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
#else
fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1]
fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy]
fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
#endif
#endif

.endm


/* Four complex elements (unit stride), accumulated vector-wise.
   Single precision: real parts in v0, imag parts in v1.
   Double precision: v0/v1 plus v20/v21 hold partial sums; all are
   collapsed by KERNEL_F4_FINALIZE. */
.macro KERNEL_F4

#if !defined(DOUBLE)
ld2 {v2.4s, v3.4s}, [X], #32 // v2 = X reals, v3 = X imags; X += 8
ld2 {v4.4s, v5.4s}, [Y], #32 // v4 = Y reals, v5 = Y imags; Y += 8

fmla v0.4s, v2.4s, v4.4s // dot[0] += X[ix] * Y[iy]
fmla v1.4s, v2.4s, v5.4s // dot[1] += X[ix] * Y[iy+1]
PRFM PLDL1KEEP, [X, #1024]
PRFM PLDL1KEEP, [Y, #1024]
#if !defined(CONJ)
fmls v0.4s, v3.4s, v5.4s // dot[0] -= X[ix+1] * Y[iy+1]
fmla v1.4s, v3.4s, v4.4s // dot[1] += X[ix+1] * Y[iy]
#else
fmla v0.4s, v3.4s, v5.4s // dot[0] += X[ix+1] * Y[iy+1]
fmls v1.4s, v3.4s, v4.4s // dot[1] -= X[ix+1] * Y[iy]
#endif
#else // DOUBLE
ld2 {v2.2d, v3.2d}, [X], #32 // v2 = X reals, v3 = X imags; X += 4
ld2 {v16.2d, v17.2d}, [Y], #32 // v16 = Y reals, v17 = Y imags; Y += 4

fmla v0.2d, v2.2d, v16.2d // dot[0] += X[ix] * Y[iy]
fmla v1.2d, v2.2d, v17.2d // dot[1] += X[ix] * Y[iy+1]
ld2 {v4.2d, v5.2d}, [X], #32
ld2 {v18.2d, v19.2d}, [Y], #32
fmla v0.2d, v4.2d, v18.2d // dot[0] += X[ix] * Y[iy] (second pair)
fmla v1.2d, v4.2d, v19.2d // dot[1] += X[ix] * Y[iy+1] (second pair)
PRFM PLDL1KEEP, [X, #1024]
PRFM PLDL1KEEP, [Y, #1024]
#if !defined(CONJ)
fmls v0.2d, v3.2d, v17.2d // dot[0] -= X[ix+1] * Y[iy+1]
fmls v20.2d, v5.2d, v19.2d // dot[0] -= X[ix+1] * Y[iy+1]
fmla v1.2d, v3.2d, v16.2d // dot[1] += X[ix+1] * Y[iy]
fmla v21.2d, v5.2d, v18.2d // dot[1] += X[ix+1] * Y[iy]
#else
fmla v0.2d, v3.2d, v17.2d // dot[0] += X[ix+1] * Y[iy+1]
fmla v20.2d, v5.2d, v19.2d // dot[0] += X[ix+1] * Y[iy+1]
fmls v1.2d, v3.2d, v16.2d // dot[1] -= X[ix+1] * Y[iy]
fmls v21.2d, v5.2d, v18.2d // dot[1] -= X[ix+1] * Y[iy]
#endif
#endif

.endm

/* Collapse the vector partial sums into the scalars DOTF / DOTI. */
.macro KERNEL_F4_FINALIZE
#if !defined(DOUBLE)
ext v2.16b, v0.16b, v0.16b, #8
fadd v0.2s, v0.2s, v2.2s
faddp DOTF, v0.2s
ext v3.16b, v1.16b, v1.16b, #8
fadd v1.2s, v1.2s, v3.2s
faddp DOTI, v1.2s
#else
fadd v0.2d, v0.2d, v20.2d // merge the auxiliary accumulators
faddp DOTF, v0.2d
fadd v1.2d, v1.2d, v21.2d
faddp DOTI, v1.2d
#endif
.endm

/* Convert both strides from elements to bytes (one complex = 2*SZ bytes). */
.macro INIT_S

#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#else
lsl INC_X, INC_X, #4
lsl INC_Y, INC_Y, #4
#endif

.endm

/* One complex element at byte strides INC_X / INC_Y, accumulated into
   the scalar pair DOTF/DOTI (same math as KERNEL_F1). */
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v2.2s}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2s}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += 2
ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1]
#if !defined(CONJ)
fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1]
fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy]
fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
#else
fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy]
fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1]
fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy]
fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1]
#endif
#else // DOUBLE
ld1 {v2.2d}, [X], INC_X // V2 = X[ix+1], X[ix]; X += 2
ld1 {v3.2d}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += 2
ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1]
#if !defined(CONJ)
fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1]
fmla DOTI, d4, v3.d[0] // dot[1] += X[ix+1] * Y[iy]
fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
#else
fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy]
fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1]
fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy]
fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1]
#endif
#endif

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

/* zdot driver: returns the complex dot product in DOTF/DOTI (d0/d1 or
   s0/s1). All accumulators are zeroed first (scalar fmov clears the
   full vector register). */
fmov DOTF, REG0
fmov DOTI, DOTF
#if !defined(DOUBLE)
fmov s20, DOTF
fmov s21, DOTI
#else
fmov d20, DOTF // auxiliary accumulators for the double F4 kernel
fmov d21, DOTI
#endif

cmp N, xzr
ble dot_kernel_L999

cmp INC_X, #1
bne dot_kernel_S_BEGIN // either stride non-unit -> strided path
cmp INC_Y, #1
bne dot_kernel_S_BEGIN

dot_kernel_F_BEGIN: // unit-stride (fast) path

asr I, N, #2 // I = number of 4-element blocks
cmp I, xzr
beq dot_kernel_F1

dot_kernel_F4:

KERNEL_F4

subs I, I, #1
bne dot_kernel_F4

KERNEL_F4_FINALIZE // collapse vector partial sums into DOTF/DOTI

dot_kernel_F1: // remainder of N & 3 elements

ands I, N, #3
ble dot_kernel_L999

dot_kernel_F10:

KERNEL_F1

subs I, I, #1
bne dot_kernel_F10

ret

dot_kernel_S_BEGIN: // strided path

INIT_S

asr I, N, #2
cmp I, xzr
ble dot_kernel_S1

dot_kernel_S4: // 4x unrolled strided loop

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne dot_kernel_S4

dot_kernel_S1: // strided remainder

ands I, N, #3
ble dot_kernel_L999

dot_kernel_S10:

KERNEL_S1

subs I, I, #1
bne dot_kernel_S10

dot_kernel_L999:

ret

EPILOGUE

+ 1617
- 0
kernel/arm64/zgemm_kernel_4x4.S
File diff suppressed because it is too large
View File


+ 514
- 0
kernel/arm64/zgemv_n.S View File

@@ -0,0 +1,514 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define M x0 /* Y vector length */
#define N x1 /* X vector length */
#define A x3 /* A vector address */
#define LDA x4 /* A stride */
#define X x5 /* X vector address */
#define INC_X x6 /* X stride */
#define Y x7 /* Y vector address */
#define INC_Y x2 /* Y stride */
#define A_PTR x9 /* loop A vector address */
#define Y_IPTR x10 /* loop Y vector address */
#define J x11 /* loop variable */
#define I x12 /* loop variable */
#define Y_OPTR x13 /* loop Y vector address */
#define X_PTR x14 /* loop X vector address */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* SHZ = log2(bytes per complex element), used to scale pointer steps. */
#if !defined(DOUBLE)
#define ALPHA_R s0
#define ALPHA_I s1
#define ALPHA_R_COPY s7 /* kept live across the whole kernel for F4 */
#define ALPHA_I_COPY s8
#define SHZ 3
#else
#define ALPHA_R d0
#define ALPHA_I d1
#define ALPHA_R_COPY d7 /* kept live across the whole kernel for F4 */
#define ALPHA_I_COPY d8
#define SHZ 4
#endif

/******************************************************************************/

/* Spill callee-saved registers used by the kernel.
   NOTE(review): this also saves/restores x18 — the AAPCS64 platform
   register — and d16/d17, which are caller-saved; harmless but worth
   confirming against the target platform's ABI. */
.macro SAVE_REGS
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
.endm

/* Restore the registers spilled by SAVE_REGS (exact mirror). */
.macro RESTORE_REGS
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
.endm


/* One-time setup: copy alpha into v7/v8 replicated across lanes for the
   F4 kernel, and build the [R(ALPHA),R(ALPHA)] / [+-I(ALPHA),-+I(ALPHA)]
   constants in v0/v1 for the scalar F1/S1 kernels (sign layout depends
   on XCONJ). */
.macro INIT
/********** INIT FOR F4 LOOP **********/
fmov ALPHA_R_COPY, ALPHA_R
fmov ALPHA_I_COPY, ALPHA_I
#if !defined(DOUBLE)
ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA)
ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA)
ins v7.d[1], v7.d[0] // replicate to all 4 lanes
ins v8.d[1], v8.d[0]
#else
ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA)
ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA)
#endif

/******* INIT FOR F1 AND S1 LOOP ******/
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA)
fneg s2, ALPHA_I
ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA)
#if !defined(XCONJ)
ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA)
#endif
#else
ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA)
fneg d2, ALPHA_I
ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA)
#if !defined(XCONJ)
ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA)
#endif
#endif
.endm

/* Per-column setup: compute TEMP = alpha * X[j] (with CONJ/XCONJ sign
   variants) in two layouts — v11/v12 replicated across lanes for the F4
   kernel, and v2/v3 arranged as the complex-multiply constant pair for
   the scalar F1/S1 kernels.  Reads the current column element via X_PTR. */
.macro INIT_LOOP
/********** INIT_LOOP FOR F4 LOOP **********/
#if !defined(DOUBLE)
ld1 {v9.2s}, [X_PTR] // [I(X), R(X)]
ins v10.s[0], v9.s[1]
ins v9.s[1], v9.s[0] // [R(X), R(X)]
ins v10.s[1], v10.s[0] // [I(X), I(X)]
ins v9.d[1], v9.d[0] // replicate to all 4 lanes
ins v10.d[1], v10.d[0]
#if !defined(CONJ)
#if !defined(XCONJ)
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
#else
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)]
fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)]
fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)]
#endif
#else // CONJ
#if !defined(XCONJ)
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)]
#else
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)]
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)]
fmul v12.4s, v9.4s, v8.4s // [R(X) * I(ALPHA)]
fneg v12.4s, v12.4s // [- R(X) * I(ALPHA)]
fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)]
#endif
#endif // CONJ

/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
ld1 {v2.2s}, [X_PTR] // [I(X), R(X)]
ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)]
fmul v2.2s, v0.2s, v2.2s
fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)]
ins v3.s[0], v2.s[1]
#if !defined(CONJ)
#if !defined(XCONJ)
fneg s4, s3
ins v3.s[1], v4.s[0]
ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)]
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)]
#else
fneg s4, s3
ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)]
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)]
#endif
#else // CONJ
#if !defined(XCONJ)
ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)]
fneg s4, s2
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)]
#else
fneg s3, s3
ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)]
fneg s4, s2
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)]
#endif
#endif // CONJ

#else // DOUBLE

/********** INIT_LOOP FOR F4 LOOP **********/
ld1 {v9.2d}, [X_PTR] // [I(X), R(X)]
ins v10.d[0], v9.d[1]
ins v9.d[1], v9.d[0] // [R(X), R(X)]
ins v10.d[1], v10.d[0] // [I(X), I(X)]
#if !defined(CONJ)
#if !defined(XCONJ)
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
#else
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)]
fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)]
fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)]
#endif
#else // CONJ
#if !defined(XCONJ)
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)]
#else
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)]
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)]
fmul v12.2d, v9.2d, v8.2d // [R(X) * I(ALPHA)]
fneg v12.2d, v12.2d // [- R(X) * I(ALPHA)]
fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)]
#endif
#endif // CONJ

/****** INIT_LOOP FOR F1 AND S1 LOOP ******/
ld1 {v2.2d}, [X_PTR] // [I(X), R(X)]
ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)]
fmul v2.2d, v0.2d, v2.2d
fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)]
ins v3.d[0], v2.d[1] // I(TEMP)
#if !defined(CONJ)
#if !defined(XCONJ)
fneg d4, d3 // -I(TEMP)
ins v3.d[1], v4.d[0]
ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)]
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)]
#else
fneg d4, d3 // -I(TEMP)
ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)]
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)]
#endif
#else // CONJ
#if !defined(XCONJ)
ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)]
fneg d4, d2 // -R(TEMP)
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)]
#else
fneg d3, d3 // -I(TEMP)
ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)]
fneg d4, d2 // -R(TEMP)
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)]
#endif
#endif // CONJ

#endif // DOUBLE
.endm

/* KERNEL_F4: one step of the zgemv_n inner loop, updating consecutive
   complex elements of Y for one column of A:  Y[i] += TEMP * A[i].
   v11 holds R(TEMP) replicated across lanes, v12 holds I(TEMP)
   replicated (TEMP = ALPHA * X[j], with CONJ/XCONJ sign folding
   prepared in INIT_LOOP).  Single precision handles 4 complex
   elements; double precision handles 4 via two 2-wide passes. */
.macro KERNEL_F4
#if !defined(DOUBLE)

ld2 {v13.4s, v14.4s}, [A_PTR], #32 // de-interleave: v13 = A real, v14 = A imag
ld2 {v15.4s, v16.4s}, [Y_IPTR], #32 // v15 = Y real, v16 = Y imag
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
#else
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I]
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I]
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R]
#else
fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R]
fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I]
fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I]
fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R]
#endif
#endif // CONJ
st2 {v15.4s, v16.4s}, [Y_OPTR], #32

#else // DOUBLE

// First pair of complex elements.
ld2 {v13.2d, v14.2d}, [A_PTR], #32
ld2 {v15.2d, v16.2d}, [Y_IPTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I]
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I]
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R]
fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I]
fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I]
fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R]
#endif
#endif // CONJ
st2 {v15.2d, v16.2d}, [Y_OPTR], #32

// Second pair of complex elements (same math, v17-v20).
ld2 {v17.2d, v18.2d}, [A_PTR], #32
ld2 {v19.2d, v20.2d}, [Y_IPTR], #32
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmla v19.2d, v12.2d, v18.2d // [+ I(ALPHA * X) * A_I]
fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I]
fmls v20.2d, v12.2d, v17.2d // [- I(ALPHA * X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmla v19.2d, v12.2d, v18.2d // [+ I(ALPHA * X) * A_I]
fmls v20.2d, v11.2d, v18.2d // [- R(ALPHA * X) * A_I]
fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R]
#else
fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R]
fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I]
fmls v20.2d, v11.2d, v18.2d // [- R(ALPHA * X) * A_I]
fmls v20.2d, v12.2d, v17.2d // [- I(ALPHA * X) * A_R]
#endif
#endif // CONJ
st2 {v19.2d, v20.2d}, [Y_OPTR], #32

#endif

.endm

/* KERNEL_F1: update one complex element of Y for contiguous storage:
   Y[i] += TEMP * A[i].  Uses v2/v3 prepared in INIT_LOOP, where v2/v3
   carry R(TEMP)/I(TEMP) with the CONJ/XCONJ signs already folded in;
   the ext swaps A's real/imag halves so one fmla pair forms the
   complex product. */
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1 {v4.2s}, [A_PTR], #8 // v4 = [A_I, A_R]
ld1 {v5.2s}, [Y_IPTR], #8 // v5 = [Y_I, Y_R]
ext v6.8b, v4.8b, v4.8b, #4 // v6 = [A_R, A_I] (halves swapped)
fmla v5.2s, v2.2s, v4.2s
fmla v5.2s, v3.2s, v6.2s
st1 {v5.2s}, [Y_OPTR], #8
#else // DOUBLE
ld1 {v4.2d}, [A_PTR], #16 // v4 = [A_I, A_R]
ld1 {v5.2d}, [Y_IPTR], #16 // v5 = [Y_I, Y_R]
ext v6.16b, v4.16b, v4.16b, #8 // v6 = [A_R, A_I] (halves swapped)
fmla v5.2d, v2.2d, v4.2d
fmla v5.2d, v3.2d, v6.2d
st1 {v5.2d}, [Y_OPTR], #16
#endif
.endm

/* INIT_S: convert INC_Y from a complex-element count to a byte
   stride (SHZ = 3 for single-complex = 8 bytes, 4 for double = 16). */
.macro INIT_S
lsl INC_Y, INC_Y, #SHZ
.endm

/* KERNEL_S1: same single-element update as KERNEL_F1, but Y is read
   and written with the strided byte increment INC_Y (A is still
   contiguous within a column). */
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v4.2s}, [A_PTR], #8 // v4 = [A_I, A_R]
ld1 {v5.2s}, [Y_IPTR], INC_Y // v5 = [Y_I, Y_R]
ext v6.8b, v4.8b, v4.8b, #4 // v6 = [A_R, A_I]
fmla v5.2s, v2.2s, v4.2s
fmla v5.2s, v3.2s, v6.2s
st1 {v5.2s}, [Y_OPTR], INC_Y
#else // DOUBLE
ld1 {v4.2d}, [A_PTR], #16 // v4 = [A_I, A_R]
ld1 {v5.2d}, [Y_IPTR], INC_Y // v5 = [Y_I, Y_R]
ext v6.16b, v4.16b, v4.16b, #8 // v6 = [A_R, A_I]
fmla v5.2d, v2.2d, v4.2d
fmla v5.2d, v3.2d, v6.2d
st1 {v5.2d}, [Y_OPTR], INC_Y
#endif
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

/* zgemv_n entry point: Y += ALPHA * op(A) * X, column-major A,
   iterating over N columns (J) and M rows (I) per column.
   Fast path (F_*) requires INC_Y == 1; strided path (S_*) handles
   any positive INC_Y.  The F4 loop body is KERNEL_F1 x4 (unrolled). */
PROLOGUE

ldr INC_Y, [sp] // 9th argument (INC_Y) is passed on the stack

SAVE_REGS

// Nothing to do for empty dimensions.
cmp N, xzr
ble zgemv_n_kernel_L999
cmp M, xzr
ble zgemv_n_kernel_L999

lsl LDA, LDA, #SHZ // column stride in bytes
lsl INC_X, INC_X, #SHZ // X stride in bytes
mov J, N // J counts remaining columns

INIT

cmp INC_Y, #1
bne zgemv_n_kernel_S_BEGIN

zgemv_n_kernel_F_LOOP: // per-column loop, contiguous Y
mov A_PTR, A
mov Y_IPTR, Y
mov Y_OPTR, Y
mov X_PTR, X
add X, X, INC_X // advance X to the next element for the next column
INIT_LOOP // compute TEMP = ALPHA * X[j] into v2/v3 (and v11/v12)

asr I, M, #2 // I = M / 4 (unrolled iterations)
cmp I, xzr
beq zgemv_n_kernel_F1

zgemv_n_kernel_F4:

KERNEL_F1
KERNEL_F1
KERNEL_F1
KERNEL_F1

subs I, I, #1
bne zgemv_n_kernel_F4

zgemv_n_kernel_F1:

ands I, M, #3 // remainder rows
ble zgemv_n_kernel_F_END

zgemv_n_kernel_F10:

KERNEL_F1

subs I, I, #1
bne zgemv_n_kernel_F10

zgemv_n_kernel_F_END:

add A, A, LDA // next column of A
subs J, J, #1
bne zgemv_n_kernel_F_LOOP

b zgemv_n_kernel_L999

zgemv_n_kernel_S_BEGIN: // strided-Y variant

INIT_S

zgemv_n_kernel_S_LOOP:
mov A_PTR, A
mov Y_IPTR, Y
mov Y_OPTR, Y
mov X_PTR, X
add X, X, INC_X
INIT_LOOP

asr I, M, #2
cmp I, xzr
ble zgemv_n_kernel_S1

zgemv_n_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne zgemv_n_kernel_S4

zgemv_n_kernel_S1:

ands I, M, #3
ble zgemv_n_kernel_S_END

zgemv_n_kernel_S10:

KERNEL_S1

subs I, I, #1
bne zgemv_n_kernel_S10

zgemv_n_kernel_S_END:

add A, A, LDA
subs J, J, #1
bne zgemv_n_kernel_S_LOOP

zgemv_n_kernel_L999:
RESTORE_REGS

mov w0, wzr // return 0
ret

EPILOGUE

+ 448
- 0
kernel/arm64/zgemv_t.S View File

@@ -0,0 +1,448 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define M x0 /* Y vector length */
#define N x1 /* X vector length */
#define A x3 /* A vector address */
#define LDA x4 /* A stride */
#define X x5 /* X vector address */
#define INC_X x6 /* X stride */
#define Y x7 /* Y vector address */
#define INC_Y x2 /* Y stride */
#define A_PTR x9 /* loop A vector address */
#define X_PTR x10 /* loop Y vector address */
#define J x11 /* loop variable */
#define I x12 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define ALPHA_R s0
#define ALPHA_I s1
#define ALPHA_R_COPY s7
#define ALPHA_I_COPY s8
#define SHZ 3
#else
#define ALPHA_R d0
#define ALPHA_I d1
#define ALPHA_R_COPY d7
#define ALPHA_I_COPY d8
#define SHZ 4
#endif

/******************************************************************************/


/* Save registers on the stack before the kernel body runs.
   NOTE(review): AAPCS64 only requires d8-d15 and x19-x28 to be
   preserved; d16/d17 are caller-saved and x18 is the platform
   register -- saving them is harmless but likely unnecessary.
   Confirm against the other arm64 kernels before changing. */
.macro SAVE_REGS
add sp, sp, #-(11 * 16) // reserve 176 bytes (11 pairs)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
.endm

/* Restore the registers saved by SAVE_REGS and release the frame.
   Offsets and ordering mirror SAVE_REGS exactly. */
.macro RESTORE_REGS
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
.endm

/* INIT (zgemv_t): broadcast ALPHA into v0/v1 so the final
   Y update can be done with two fused multiply-adds:
     !XCONJ: v0 = [ALPHA_R, ALPHA_R], v1 = [ALPHA_I, -ALPHA_I]
      XCONJ: v0 = [-ALPHA_R, ALPHA_R], v1 = [ALPHA_I, ALPHA_I]
   (lane order written high, low in the comments below). */
.macro INIT
#if !defined(XCONJ)
#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R
fneg s2, ALPHA_I
ins v1.s[1], v2.s[0] // v1 = -ALPHA_I, ALPHA_I
ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I
#else
ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R
fneg d2, ALPHA_I
ins v1.d[1], v2.d[0] // v1 = -ALPHA_I, ALPHA_I
ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I
#endif
#else // XCONJ
#if !defined(DOUBLE)
fneg s2, ALPHA_R
ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R
ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I
#else
fneg d2, ALPHA_R
ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R
ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I
#endif
#endif
.endm

/* INIT_LOOP (zgemv_t): zero the per-column dot-product accumulators.
   fmov dN, xzr clears the full 128-bit vector register, so the
   vector accumulators v9/v10 (and v15/v16 in the double unroll)
   start at [0, 0]; v2 is the scalar complex accumulator for the
   KERNEL_F1/KERNEL_S1 tail. */
.macro INIT_LOOP
fmov d9, xzr // TEMP_R = [0, 0]
fmov d10, xzr // TEMP_I = [0, 0]
#if !defined(DOUBLE)
#else
fmov d15, xzr // TEMP_R = [0, 0] (second accumulator pair)
fmov d16, xzr // TEMP_I = [0, 0]
#endif

fmov d2, xzr // TEMP = [0, 0]
.endm

/* KERNEL_F4 (zgemv_t): accumulate the dot product of 4 complex
   elements of X with one column of A into v9 (real) / v10 (imag);
   the double-precision path uses a second accumulator pair
   v15/v16 to break the dependency chain.  The four CONJ/XCONJ
   variants choose fmla/fmls to realize the conjugation signs. */
.macro KERNEL_F4
#if !defined(DOUBLE)

ld2 {v11.4s, v12.4s}, [X_PTR], #32 // v11 = X real, v12 = X imag
ld2 {v13.4s, v14.4s}, [A_PTR], #32 // v13 = A real, v14 = A imag

#if !defined(CONJ)
#if !defined(XCONJ)
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I]
fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I]
fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R]
#else
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I]
fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I]
fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I]
fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I]
fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R]
#else
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R]
fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I]
fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I]
fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R]
#endif
#endif // CONJ

#else // DOUBLE
ld2 {v11.2d, v12.2d}, [X_PTR], #32
ld2 {v13.2d, v14.2d}, [A_PTR], #32
prfm PLDL1STRM, [X_PTR, #512]
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I]
fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I]
fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R]
#else
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I]
fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I]
fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I]
fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I]
fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R]
#else
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R]
fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I]
fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I]
fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R]
#endif
#endif // CONJ
ld2 {v17.2d, v18.2d}, [X_PTR], #32
ld2 {v19.2d, v20.2d}, [A_PTR], #32
prfm PLDL1STRM, [A_PTR, #512]
#if !defined(CONJ)
#if !defined(XCONJ)
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
#else
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I]
fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I]
fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R]
#endif
#else // CONJ
#if !defined(XCONJ)
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I]
fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I]
fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R]
#else
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R]
fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I]
fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I]
fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R]
#endif
#endif // CONJ
#endif //DOUBLE
.endm

/* KERNEL_F4_FINALIZE: horizontally reduce the vector accumulators
   into the scalar complex accumulator v2 = [I(TEMP), R(TEMP)] so the
   scalar tail (KERNEL_F1) can continue adding to it. */
.macro KERNEL_F4_FINALIZE
#if !defined(DOUBLE)
ext v21.16b, v9.16b, v9.16b, #8 // swap upper/lower halves of v9
fadd v9.2s, v9.2s, v21.2s // pairwise-reduce 4 lanes to 2
faddp s9, v9.2s // ... and 2 lanes to 1 (real sum)

ext v21.16b, v10.16b, v10.16b, #8
fadd v10.2s, v10.2s, v21.2s
faddp s10, v10.2s // imaginary sum

ins v2.s[0], v9.s[0]
ins v2.s[1], v10.s[0]
#else
fadd v9.2d, v9.2d, v15.2d // merge the two accumulator pairs
fadd v10.2d, v10.2d, v16.2d

faddp d9, v9.2d // horizontal add: real sum
faddp d10, v10.2d // horizontal add: imaginary sum

ins v2.d[0], v9.d[0]
ins v2.d[1], v10.d[0]
#endif
.endm


/* KERNEL_F1 (zgemv_t): accumulate one complex product A[i] * X[i]
   into v2 = [I(TEMP), R(TEMP)].  v4 = [A_R, A_R]; v5 is built as
   [±A_I, ∓A_I] so that together with the swapped X in v7 the two
   fmla form the complex product with the correct conjugation sign. */
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] (A0 = A_R broadcast)
ld1 {v5.s}[0], [A_PTR], #4 // A1 (= A_I)
ld1 {v6.2s}, [X_PTR], #8 // [X1, X0]
fneg s16, s5
ins v5.s[1], v16.s[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1]
#endif
ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1]
fmla v2.2s, v4.2s, v6.2s
fmla v2.2s, v5.2s, v7.2s
#else // DOUBLE
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0]
ld1 {v5.d}[0], [A_PTR], #8 // A1
ld1 {v6.2d}, [X_PTR], #16 // [X1, X0]
fneg d16, d5
ins v5.d[1], v16.d[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1]
#endif
ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1]
fmla v2.2d, v4.2d, v6.2d
fmla v2.2d, v5.2d, v7.2d
#endif
.endm

/* INIT_S (zgemv_t): convert INC_X from a complex-element count to a
   byte stride (SHZ = 3 single / 4 double). */
.macro INIT_S
lsl INC_X, INC_X, #SHZ
.endm

/* KERNEL_S1 (zgemv_t): same single-element accumulation as KERNEL_F1,
   but X advances by the strided byte increment INC_X. */
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0]
ld1 {v5.s}[0], [A_PTR], #4 // A1
ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0]
fneg s16, s5
ins v5.s[1], v16.s[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1]
#endif
ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1]
fmla v2.2s, v4.2s, v6.2s
fmla v2.2s, v5.2s, v7.2s
#else // DOUBLE
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0]
ld1 {v5.d}[0], [A_PTR], #8 // A1
ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0]
fneg d16, d5
ins v5.d[1], v16.d[0] // [-A1, A1]
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1]
#endif
ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1]
fmla v2.2d, v4.2d, v6.2d
fmla v2.2d, v5.2d, v7.2d
#endif
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

/* zgemv_t entry point: Y[j] += ALPHA * dot(op(A[:,j]), op(X)) for
   each of the N columns.  Fast path (F_*) requires INC_X == 1 and
   vectorizes the dot product 4 complex elements at a time; the
   strided path (S_*) uses the scalar kernel only.  Y is always
   walked with INC_Y (byte stride after the lsl below). */
PROLOGUE

ldr INC_Y, [sp] // 9th argument (INC_Y) is passed on the stack
SAVE_REGS

cmp N, xzr
ble zgemv_t_kernel_L999
cmp M, xzr
ble zgemv_t_kernel_L999

lsl LDA, LDA, #SHZ // column stride in bytes
lsl INC_Y, INC_Y, #SHZ // Y stride in bytes
mov J, N // J counts remaining columns

INIT

cmp INC_X, #1
bne zgemv_t_kernel_S_BEGIN

zgemv_t_kernel_F_LOOP:

mov A_PTR, A
mov X_PTR, X

INIT_LOOP // zero the dot-product accumulators

asr I, M, #2 // I = M / 4 vectorized iterations
cmp I, xzr
beq zgemv_t_kernel_F1

zgemv_t_kernel_F4:

KERNEL_F4

subs I, I, #1
bne zgemv_t_kernel_F4

KERNEL_F4_FINALIZE // fold vector accumulators into v2

zgemv_t_kernel_F1:

ands I, M, #3 // remainder elements, handled scalar
ble zgemv_t_kernel_F_END

zgemv_t_kernel_F10:

KERNEL_F1

subs I, I, #1
bne zgemv_t_kernel_F10

zgemv_t_kernel_F_END:

// Y[j] += ALPHA * TEMP, using v0/v1 prepared in INIT.
#if !defined(DOUBLE)
ld1 {v4.2s}, [Y]
ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I]
fmla v4.2s, v0.2s, v2.2s
fmla v4.2s, v1.2s, v3.2s
st1 {v4.2s}, [Y], INC_Y
#else // DOUBLE
ld1 {v4.2d}, [Y]
ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I]
fmla v4.2d, v0.2d, v2.2d
fmla v4.2d, v1.2d, v3.2d
st1 {v4.2d}, [Y], INC_Y
#endif

add A, A, LDA // next column
subs J, J, #1
bne zgemv_t_kernel_F_LOOP

b zgemv_t_kernel_L999

zgemv_t_kernel_S_BEGIN: // strided-X variant (scalar kernel only)

INIT_S

zgemv_t_kernel_S_LOOP:

mov A_PTR, A
mov X_PTR, X
INIT_LOOP

asr I, M, #2
cmp I, xzr
ble zgemv_t_kernel_S1

zgemv_t_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne zgemv_t_kernel_S4

zgemv_t_kernel_S1:

ands I, M, #3
ble zgemv_t_kernel_S_END

zgemv_t_kernel_S10:

KERNEL_S1

subs I, I, #1
bne zgemv_t_kernel_S10

zgemv_t_kernel_S_END:

// Y[j] += ALPHA * TEMP (same epilogue as the fast path).
#if !defined(DOUBLE)
ld1 {v4.2s}, [Y]
ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I]
fmla v4.2s, v0.2s, v2.2s
fmla v4.2s, v1.2s, v3.2s
st1 {v4.2s}, [Y], INC_Y
#else // DOUBLE
ld1 {v4.2d}, [Y]
ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I]
fmla v4.2d, v0.2d, v2.2d
fmla v4.2d, v1.2d, v3.2d
st1 {v4.2d}, [Y], INC_Y
#endif

add A, A, LDA
subs J, J, #1
bne zgemv_t_kernel_S_LOOP

zgemv_t_kernel_L999:
RESTORE_REGS
mov w0, wzr // return 0
ret

EPILOGUE

+ 228
- 0
kernel/arm64/znrm2.S View File

@@ -0,0 +1,228 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define TMPF s6
#define SSQ s0
#define TMPVF {v6.s}[0]
#define SZ 4
#else
#define TMPF d6
#define SSQ d0
#define TMPVF {v6.d}[0]
#define SZ 8
#endif

/******************************************************************************/

/* KERNEL_F1 (znrm2): accumulate the squared magnitude of one complex
   element (re^2 + im^2) into the scalar SSQ. */
.macro KERNEL_F1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], #8 // one complex: [im, re]
fmul v1.2s, v1.2s, v1.2s // [im^2, re^2]
faddp TMPF, v1.2s // re^2 + im^2
fadd SSQ, SSQ, TMPF
#else
ld1 {v1.2d}, [X], #16
fmul v1.2d, v1.2d, v1.2d
faddp TMPF, v1.2d
fadd SSQ, SSQ, TMPF
#endif
.endm

/* KERNEL_F8 (znrm2): accumulate squared magnitudes of 8 complex
   elements (16 scalars) into the vector accumulators v0 and v5;
   they are merged and reduced in nrm2_kernel_F8_FINALIZE. */
.macro KERNEL_F8
#if !defined(DOUBLE)
ld1 {v1.4s, v2.4s}, [X], #32
fmla v0.4s, v1.4s, v1.4s
fmla v5.4s, v2.4s, v2.4s
ld1 {v3.4s,v4.4s}, [X], #32
fmla v0.4s, v3.4s, v3.4s
fmla v5.4s, v4.4s, v4.4s
PRFM PLDL1KEEP, [X, #1024]
#else // DOUBLE
ld1 {v1.2d, v2.2d}, [X], #32
fmla v0.2d, v1.2d, v1.2d
fmla v5.2d, v2.2d, v2.2d
ld1 {v3.2d, v4.2d}, [X], #32
fmla v0.2d, v3.2d, v3.2d
fmla v5.2d, v4.2d, v4.2d

ld1 {v16.2d, v17.2d}, [X], #32
fmla v0.2d, v16.2d, v16.2d
fmla v5.2d, v17.2d, v17.2d
ld1 {v18.2d, v19.2d}, [X], #32
fmla v0.2d, v18.2d, v18.2d
fmla v5.2d, v19.2d, v19.2d
#endif
.endm

/* Reduce the vector accumulators v0 + v5 horizontally into the
   scalar SSQ so the scalar tail loop can keep adding to it. */
.macro nrm2_kernel_F8_FINALIZE
#if !defined(DOUBLE)
fadd v0.4s, v0.4s, v5.4s // merge accumulators
ext v1.16b, v0.16b, v0.16b, #8 // swap halves
fadd v0.2s, v0.2s, v1.2s // 4 lanes -> 2
faddp SSQ, v0.2s // 2 lanes -> 1
#else
fadd v0.2d, v0.2d, v5.2d
faddp SSQ, v0.2d
#endif
.endm

/* INIT_S (znrm2): convert INC_X to a byte stride (8 bytes per
   single complex, 16 per double complex) and consume the first
   element so the strided loop can run N-1 more times. */
.macro INIT_S
#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
ld1 {v1.2s}, [X], INC_X
fmul v1.2s, v1.2s, v1.2s
faddp SSQ, v1.2s
#else
lsl INC_X, INC_X, #4
ld1 {v1.2d}, [X], INC_X
fmul v1.2d, v1.2d, v1.2d
faddp SSQ, v1.2d
#endif
.endm

/* KERNEL_S1 (znrm2): strided variant of KERNEL_F1 -- one complex
   element's squared magnitude added to SSQ, X advanced by INC_X. */
.macro KERNEL_S1
#if !defined(DOUBLE)
ld1 {v1.2s}, [X], INC_X
fmul v1.2s, v1.2s, v1.2s
faddp TMPF, v1.2s
fadd SSQ, SSQ, TMPF
#else
ld1 {v1.2d}, [X], INC_X
fmul v1.2d, v1.2d, v1.2d
faddp TMPF, v1.2d
fadd SSQ, SSQ, TMPF
#endif
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

/* znrm2 entry point: returns sqrt(sum |x_i|^2) in s0/d0.
   NOTE(review): this is the naive sum-of-squares formulation with no
   scaling pass, so it can overflow/underflow for extreme magnitudes
   (unlike the classic LAPACK-style scaled algorithm) -- confirm this
   is acceptable for this target before relying on it for edge inputs. */
PROLOGUE

// Zero both accumulators (SSQ and the second vector accumulator v5).
#if !defined(DOUBLE)
fmov SSQ, wzr
fmov s5, SSQ
#else
fmov SSQ, xzr
fmov d5, SSQ
#endif

// N <= 0 or INC_X <= 0: return 0 (SSQ already zero).
cmp N, xzr
ble nrm2_kernel_zero
cmp INC_X, xzr
ble nrm2_kernel_zero
cmp INC_X, #1
bne nrm2_kernel_S_BEGIN

nrm2_kernel_F_BEGIN: // contiguous path

asr I, N, #3 // I = N / 8 unrolled iterations
cmp I, xzr
beq nrm2_kernel_F1_INIT

nrm2_kernel_F8:

KERNEL_F8

subs I, I, #1
bne nrm2_kernel_F8

nrm2_kernel_F8_FINALIZE // reduce vector accumulators into SSQ

nrm2_kernel_F1:

ands I, N, #7 // remainder elements
ble nrm2_kernel_L999

nrm2_kernel_F10:

KERNEL_F1

subs I, I, #1
bne nrm2_kernel_F10

b nrm2_kernel_L999

nrm2_kernel_F1_INIT: // N < 8: skip straight to the scalar loop

b nrm2_kernel_F1

nrm2_kernel_S_BEGIN: // strided path

INIT_S // consumes the first element

subs N, N, #1
ble nrm2_kernel_L999

asr I, N, #2
cmp I, xzr
ble nrm2_kernel_S1

nrm2_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne nrm2_kernel_S4

nrm2_kernel_S1:

ands I, N, #3
ble nrm2_kernel_L999

nrm2_kernel_S10:

KERNEL_S1

subs I, I, #1
bne nrm2_kernel_S10

nrm2_kernel_L999:
fsqrt SSQ, SSQ // result = sqrt(sum of squares), returned in s0/d0
ret

nrm2_kernel_zero: // empty/invalid input: return 0.0
ret

EPILOGUE

+ 256
- 0
kernel/arm64/zrot.S View File

@@ -0,0 +1,256 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x1 /* X vector address */
#define INC_X x2 /* X stride */
#define Y x3 /* Y vector address */
#define INC_Y x4 /* Y stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define C s0 /* scale input value */
#define S s1 /* scale input value */
#else
#define C d0 /* scale input value */
#define S d1 /* scale input value */
#endif

/******************************************************************************/

/* INIT (zrot): broadcast the real rotation scalars C and S across
   both lanes so one 2-lane op handles a full complex element. */
.macro INIT

#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // [C, C]
ins v1.s[1], v1.s[0] // [S, S]
#else
ins v0.d[1], v0.d[0] // [C, C]
ins v1.d[1], v1.d[0] // [S, S]
#endif

.endm

/* KERNEL_F1 (zrot): apply the plane rotation to one complex element:
   X' = C*X + S*Y,  Y' = C*Y - S*X  (C, S are real, so the same
   factor applies to both the real and imaginary lanes). */
.macro KERNEL_F1

#if !defined(DOUBLE)
ld1 {v2.2s}, [X]
ld1 {v3.2s}, [Y]
fmul v4.2s, v0.2s, v2.2s // [C*X1, C*X0]
fmla v4.2s, v1.2s, v3.2s // [C*X1 + S*Y1, C*X0 + S*Y0]
fmul v5.2s, v0.2s, v3.2s // [C*Y1, C*Y0]
fmls v5.2s, v1.2s, v2.2s // [C*Y1 - S*X1, C*Y0 - S*X0]
st1 {v4.2s}, [X], #8
st1 {v5.2s}, [Y], #8
#else
ld1 {v2.2d}, [X]
ld1 {v3.2d}, [Y]
fmul v4.2d, v0.2d, v2.2d // [C*X1, C*X0]
fmla v4.2d, v1.2d, v3.2d // [C*X1 + S*Y1, C*X0 + S*Y0]
fmul v5.2d, v0.2d, v3.2d // [C*Y1, C*Y0]
fmls v5.2d, v1.2d, v2.2d // [C*Y1 - S*X1, C*Y0 - S*X0]
st1 {v4.2d}, [X], #16
st1 {v5.2d}, [Y], #16
#endif

.endm

/* KERNEL_INIT_F4 (zrot): widen the C/S broadcasts from 2 to 4 lanes
   for the single-precision vectorized loop (double already uses
   full 2-lane vectors, so nothing is needed there). */
.macro KERNEL_INIT_F4

#if !defined(DOUBLE)
ins v0.d[1], v0.d[0] // [C, C, C, C]
ins v1.d[1], v1.d[0] // [S, S, S, S]
#endif

.endm

/* KERNEL_F4 (zrot): apply the rotation to 4 complex elements of
   contiguous X and Y (8 scalars; the double path does it in two
   4-scalar passes).  Same math as KERNEL_F1, vectorized. */
.macro KERNEL_F4

#if !defined(DOUBLE)
ld1 {v2.4s, v3.4s}, [X]
ld1 {v4.4s, v5.4s}, [Y]
fmul v6.4s, v0.4s, v2.4s // C*X3, C*X2, C*X1, C*X0
fmul v7.4s, v0.4s, v3.4s // C*X7, C*X6, C*X5, C*X4
fmla v6.4s, v1.4s, v4.4s // C*X3+S*Y3, ..., C*X0+S*Y0
fmla v7.4s, v1.4s, v5.4s // C*X7+S*Y7, ..., C*X4+S*Y4
fmul v16.4s, v0.4s, v4.4s // C*Y3, C*Y2, C*Y1, C*Y0
fmul v17.4s, v0.4s, v5.4s // C*Y7, C*Y6, C*Y5, C*Y4
fmls v16.4s, v1.4s, v2.4s // C*Y3-S*X3, ..., C*Y0-S*X0
fmls v17.4s, v1.4s, v3.4s // C*Y7-S*X7, ..., C*Y4-S*X4
st1 {v6.4s,v7.4s}, [X], #32
st1 {v16.4s,v17.4s}, [Y], #32
#else // DOUBLE
ld1 {v2.2d, v3.2d}, [X]
ld1 {v4.2d, v5.2d}, [Y]
fmul v6.2d, v0.2d, v2.2d // C*X3, C*X2, C*X1, C*X0
fmul v7.2d, v0.2d, v3.2d // C*X7, C*X6, C*X5, C*X4
fmla v6.2d, v1.2d, v4.2d // C*X3+S*Y3, ..., C*X0+S*Y0
fmla v7.2d, v1.2d, v5.2d // C*X7+S*Y7, ..., C*X4+S*Y4
fmul v16.2d, v0.2d, v4.2d // C*Y3, C*Y2, C*Y1, C*Y0
fmul v17.2d, v0.2d, v5.2d // C*Y7, C*Y6, C*Y5, C*Y4
fmls v16.2d, v1.2d, v2.2d // C*Y3-S*X3, ..., C*Y0-S*X0
fmls v17.2d, v1.2d, v3.2d // C*Y7-S*X7, ..., C*Y4-S*X4
st1 {v6.2d,v7.2d}, [X], #32
st1 {v16.2d,v17.2d}, [Y], #32
// Second half: next 2 complex elements.
ld1 {v2.2d, v3.2d}, [X]
ld1 {v4.2d, v5.2d}, [Y]
fmul v6.2d, v0.2d, v2.2d // C*X3, C*X2, C*X1, C*X0
fmul v7.2d, v0.2d, v3.2d // C*X7, C*X6, C*X5, C*X4
fmla v6.2d, v1.2d, v4.2d // C*X3+S*Y3, ..., C*X0+S*Y0
fmla v7.2d, v1.2d, v5.2d // C*X7+S*Y7, ..., C*X4+S*Y4
fmul v16.2d, v0.2d, v4.2d // C*Y3, C*Y2, C*Y1, C*Y0
fmul v17.2d, v0.2d, v5.2d // C*Y7, C*Y6, C*Y5, C*Y4
fmls v16.2d, v1.2d, v2.2d // C*Y3-S*X3, ..., C*Y0-S*X0
fmls v17.2d, v1.2d, v3.2d // C*Y7-S*X7, ..., C*Y4-S*X4
st1 {v6.2d,v7.2d}, [X], #32
st1 {v16.2d,v17.2d}, [Y], #32
#endif

.endm

/* INIT_S (zrot): convert both strides from complex-element counts
   to byte strides (8 bytes single / 16 bytes double per complex). */
.macro INIT_S

#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
lsl INC_Y, INC_Y, #3
#else
lsl INC_X, INC_X, #4
lsl INC_Y, INC_Y, #4
#endif

.endm

/* KERNEL_S1 (zrot): strided variant of KERNEL_F1 -- one complex
   element rotated in place, pointers advanced by INC_X/INC_Y. */
.macro KERNEL_S1

#if !defined(DOUBLE)
ld1 {v2.2s}, [X]
ld1 {v3.2s}, [Y]
fmul v4.2s, v0.2s, v2.2s // [C*X1, C*X0]
fmla v4.2s, v1.2s, v3.2s // [C*X1 + S*Y1, C*X0 + S*Y0]
fmul v5.2s, v0.2s, v3.2s // [C*Y1, C*Y0]
fmls v5.2s, v1.2s, v2.2s // [C*Y1 - S*X1, C*Y0 - S*X0]
st1 {v4.2s}, [X], INC_X
st1 {v5.2s}, [Y], INC_Y
#else
ld1 {v2.2d}, [X]
ld1 {v3.2d}, [Y]
fmul v4.2d, v0.2d, v2.2d // [C*X1, C*X0]
fmla v4.2d, v1.2d, v3.2d // [C*X1 + S*Y1, C*X0 + S*Y0]
fmul v5.2d, v0.2d, v3.2d // [C*Y1, C*Y0]
fmls v5.2d, v1.2d, v2.2d // [C*Y1 - S*X1, C*Y0 - S*X0]
st1 {v4.2d}, [X], INC_X
st1 {v5.2d}, [Y], INC_Y
#endif

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

/* zrot entry point: apply the Givens rotation (C, S) to the complex
   vectors X and Y in place.  Fast path (F_*) requires both strides
   == 1; strided path (S_*) handles everything else. */
PROLOGUE

cmp N, xzr
ble rot_kernel_L999 // nothing to do

INIT // broadcast C and S to 2 lanes

cmp INC_X, #1
bne rot_kernel_S_BEGIN
cmp INC_Y, #1
bne rot_kernel_S_BEGIN

rot_kernel_F_BEGIN: // contiguous path

asr I, N, #2 // I = N / 4 vectorized iterations
cmp I, xzr
beq rot_kernel_F1

KERNEL_INIT_F4 // widen broadcasts to 4 lanes (single prec.)

rot_kernel_F4:

KERNEL_F4

subs I, I, #1
bne rot_kernel_F4

rot_kernel_F1:

ands I, N, #3 // remainder elements
ble rot_kernel_L999

rot_kernel_F10:

KERNEL_F1

subs I, I, #1
bne rot_kernel_F10

mov w0, wzr // return 0
ret

rot_kernel_S_BEGIN: // strided path

INIT_S

asr I, N, #2
cmp I, xzr
ble rot_kernel_S1

rot_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne rot_kernel_S4

rot_kernel_S1:

ands I, N, #3
ble rot_kernel_L999

rot_kernel_S10:

KERNEL_S1

subs I, I, #1
bne rot_kernel_S10

rot_kernel_L999:

mov w0, wzr // return 0
ret

+ 274
- 0
kernel/arm64/zscal.S View File

@@ -0,0 +1,274 @@
/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

#define N x0 /* vector length */
#define X x3 /* X vector address */
#define INC_X x4 /* X stride */
#define I x5 /* loop variable */

/*******************************************************************************
* Macro definitions
*******************************************************************************/

#if !defined(DOUBLE)
#define DA_R s0 /* real scale input value */
#define DA_I s1 /* imaginary scale input value */
#else
#define DA_R d0 /* real scale input value */
#define DA_I d1 /* imaginary scale input value */
#endif

/******************************************************************************/

/* INIT (zscal): prepare the complex scale factor for vector use:
   v0 = [DA_R, DA_R], v1 = [DA_I, -DA_I] (lanes written high, low),
   so X' = v0 * [X_I, X_R] + v1 * [X_R, X_I] gives the complex
   product DA * X in one fmul + fmla. */
.macro INIT

#if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
fneg s2, DA_I
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I
ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I
#else
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
fneg d2, DA_I
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I
#endif

.endm

/* KERNEL_F1 (zscal): scale one contiguous complex element in place:
   X' = DA * X, using v0/v1 from INIT and a half-swapped copy of X. */
.macro KERNEL_F1

#if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0
ext v3.8b, v2.8b, v2.8b, #4 // X0, X1
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
st1 {v2.2s}, [X], #8
#else
ld1 {v2.2d}, [X] // X1, X0
ext v3.16b, v2.16b, v2.16b, #8 // X0, X1
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0
fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
st1 {v2.2d}, [X], #16
#endif

.endm

/* KERNEL_INIT_F4 (zscal): widen the scale vectors from 2 to 4 lanes
   for the single-precision vectorized loop (double needs nothing). */
.macro KERNEL_INIT_F4

#if !defined(DOUBLE)
// Replicate the lower 2 floats into the upper 2 slots
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I
#endif

.endm

/* KERNEL_F4 (zscal): scale 4 contiguous complex elements in place.
   Single precision builds the swapped real/imag companion with
   ext + 2 lane moves per 4-float group; double uses one ext per
   complex pair.  Same math as KERNEL_F1, vectorized. */
.macro KERNEL_F4

#if !defined(DOUBLE)
ld1 {v2.4s,v3.4s}, [X] // V2 = X[3], X[2], X[1], X[0]
// V3 = X[7], X[6], X[5], X[4]

ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1]
ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1]
ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1]
fmul v2.4s, v0.4s, v2.4s // X'[ix] += DA_R * X[ix]
// X'[ix+1] += DA_R * X[ix+1]
fmla v2.4s, v1.4s, v6.4s // X'[ix] += -DA_I * X[ix+1]
// X'[ix+1] += DA_I * X[ix]

ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5]
ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5]
ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5]
fmul v3.4s, v0.4s, v3.4s // X'[ix] += DA_R * X[ix]
// X'[ix+1] += DA_R * X[ix+1]
fmla v3.4s, v1.4s, v7.4s // X'[ix] += -DA_I * X[ix+1]
// X'[ix+1] += DA_I * X[ix]

st1 {v2.4s,v3.4s}, [X], #32
#else // DOUBLE
ld1 {v2.2d,v3.2d,v4.2d,v5.2d}, [X] // CX0, CX1, CX2, CX3
ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1]
ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1]
ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1]
ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1]

fmul v2.2d, v0.2d, v2.2d
fmla v2.2d, v1.2d, v20.2d

fmul v3.2d, v0.2d, v3.2d
fmla v3.2d, v1.2d, v21.2d
st1 {v2.2d,v3.2d}, [X], #32

fmul v4.2d, v0.2d, v4.2d
fmla v4.2d, v1.2d, v22.2d

fmul v5.2d, v0.2d, v5.2d
fmla v5.2d, v1.2d, v23.2d
st1 {v4.2d,v5.2d}, [X], #32
#endif
PRFM PLDL1KEEP, [X, #1024]
.endm

.macro INIT_S

#if !defined(DOUBLE)
lsl INC_X, INC_X, #3
#else
lsl INC_X, INC_X, #4
#endif

.endm

.macro KERNEL_S1

#if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0
ext v3.8b, v2.8b, v2.8b, #4 // X0, X1
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
st1 {v2.2s}, [X], INC_X
#else
ld1 {v2.2d}, [X] // X1, X0
ext v3.16b, v2.16b, v2.16b, #8 // X0, X1
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0
fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1
st1 {v2.2d}, [X], INC_X
#endif

.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

PROLOGUE

cmp N, xzr
ble zscal_kernel_L999

fcmp DA_R, #0.0
bne zscal_kernel_1

fcmp DA_I, #0.0
beq zscal_kernel_zero

// TODO: special case DA_R == 0 && DA_I != 0

zscal_kernel_1:

// TODO: special case DA_R != 0 && DA_I == 0

INIT

cmp INC_X, #1
bne zscal_kernel_S_BEGIN

zscal_kernel_F_BEGIN:

asr I, N, #2
cmp I, xzr
beq zscal_kernel_F1

KERNEL_INIT_F4

zscal_kernel_F4:

KERNEL_F4

subs I, I, #1
bne zscal_kernel_F4

zscal_kernel_F1:

ands I, N, #3
ble zscal_kernel_L999

zscal_kernel_F10:

KERNEL_F1

subs I, I, #1
bne zscal_kernel_F10

mov w0, wzr
ret

zscal_kernel_S_BEGIN:

INIT_S

asr I, N, #2
cmp I, xzr
ble zscal_kernel_S1

zscal_kernel_S4:

KERNEL_S1
KERNEL_S1
KERNEL_S1
KERNEL_S1

subs I, I, #1
bne zscal_kernel_S4

zscal_kernel_S1:

ands I, N, #3
ble zscal_kernel_L999

zscal_kernel_S10:

KERNEL_S1

subs I, I, #1
bne zscal_kernel_S10

zscal_kernel_L999:

mov w0, wzr
ret

zscal_kernel_zero:

INIT_S

zscal_kernel_Z1:

stp DA_R, DA_I, [X]
add X, X, INC_X
subs N, N, #1
bne zscal_kernel_Z1

mov w0, wzr
ret

EPILOGUE

+ 1893
- 0
kernel/arm64/ztrmm_kernel_4x4.S
File diff suppressed because it is too large
View File


+ 883
- 0
kernel/generic/ztrmmkernel_4x4.c View File

@@ -0,0 +1,883 @@
#include "common.h"

#define MADD_ALPHA_N_STORE(C, res, alpha) \
C[0] = res ## _r * alpha ## _r - res ## _i * alpha ## _i; \
C[1] = res ## _r * alpha ## _i + res ## _i * alpha ## _r;

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define MADD(res, op1, op2) \
res ## _r += op1 ## _r * op2 ## _r; \
res ## _r -= op1 ## _i * op2 ## _i; \
res ## _i += op1 ## _r * op2 ## _i; \
res ## _i += op1 ## _i * op2 ## _r;
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
#define MADD(res, op1, op2) \
res ## _r += op1 ## _r * op2 ## _r; \
res ## _r += op1 ## _i * op2 ## _i; \
res ## _i -= op1 ## _r * op2 ## _i; \
res ## _i += op1 ## _i * op2 ## _r;
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define MADD(res, op1, op2) \
res ## _r += op1 ## _r * op2 ## _r; \
res ## _r += op1 ## _i * op2 ## _i; \
res ## _i += op1 ## _r * op2 ## _i; \
res ## _i -= op1 ## _i * op2 ## _r;
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define MADD(res, op1, op2) \
res ## _r += op1 ## _r * op2 ## _r; \
res ## _r -= op1 ## _i * op2 ## _i; \
res ## _i -= op1 ## _r * op2 ## _i; \
res ## _i -= op1 ## _i * op2 ## _r;
#endif

int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha_r, FLOAT alpha_i,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
, BLASLONG offset
)
{

BLASLONG i,j,k;
FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
FLOAT res00_r, res01_r, res02_r, res03_r;
FLOAT res00_i, res01_i, res02_i, res03_i;
FLOAT res10_r, res11_r, res12_r, res13_r;
FLOAT res10_i, res11_i, res12_i, res13_i;
FLOAT res20_r, res21_r, res22_r, res23_r;
FLOAT res20_i, res21_i, res22_i, res23_i;
FLOAT res30_r, res31_r, res32_r, res33_r;
FLOAT res30_i, res31_i, res32_i, res33_i;
FLOAT a0_r, a1_r;
FLOAT a0_i, a1_i;
FLOAT b0_r, b1_r, b2_r, b3_r;
FLOAT b0_i, b1_i, b2_i, b3_i;
BLASLONG off, temp;

#if defined(TRMMKERNEL) && !defined(LEFT)
off = -offset;
#endif

for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops
{
C0 = C;
C1 = C0+2*ldc;
C2 = C1+2*ldc;
C3 = C2+2*ldc;


#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif

ptrba = ba;

for (i=0; i<bm/4; i+=1) // do blocks of 4x4
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else

ptrba += off*4*2; // number of values in A
ptrbb = bb + off*4*2; // number of values in B
#endif

res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;
res02_r = 0;
res02_i = 0;
res03_r = 0;
res03_i = 0;

res10_r = 0;
res10_i = 0;
res11_r = 0;
res11_i = 0;
res12_r = 0;
res12_i = 0;
res13_r = 0;
res13_i = 0;

res20_r = 0;
res20_i = 0;
res21_r = 0;
res21_i = 0;
res22_r = 0;
res22_i = 0;
res23_r = 0;
res23_i = 0;

res30_r = 0;
res30_i = 0;
res31_r = 0;
res31_i = 0;
res32_r = 0;
res32_i = 0;
res33_r = 0;
res33_i = 0;

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk - off;
#elif defined(LEFT)
temp = off + 4;
#else
temp = off + 4;
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1];
b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);
MADD(res20, a0, b2);
MADD(res30, a0, b3);

a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);
MADD(res11, a1, b1);
MADD(res21, a1, b2);
MADD(res31, a1, b3);

a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1];
MADD(res02, a0, b0);
MADD(res12, a0, b1);
MADD(res22, a0, b2);
MADD(res32, a0, b3);


a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1];
MADD(res03, a1, b0);
MADD(res13, a1, b1);
MADD(res23, a1, b2);
MADD(res33, a1, b3);

ptrba = ptrba+8;
ptrbb = ptrbb+8;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res02, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res03, alpha);
C0 = C0 + 2;

MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res11, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res12, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res13, alpha);
C1 = C1 + 2;

MADD_ALPHA_N_STORE(C2, res20, alpha);
C2 = C2 + 2;
MADD_ALPHA_N_STORE(C2, res21, alpha);
C2 = C2 + 2;
MADD_ALPHA_N_STORE(C2, res22, alpha);
C2 = C2 + 2;
MADD_ALPHA_N_STORE(C2, res23, alpha);
C2 = C2 + 2;

MADD_ALPHA_N_STORE(C3, res30, alpha);
C3 = C3 + 2;
MADD_ALPHA_N_STORE(C3, res31, alpha);
C3 = C3 + 2;
MADD_ALPHA_N_STORE(C3, res32, alpha);
C3 = C3 + 2;
MADD_ALPHA_N_STORE(C3, res33, alpha);
C3 = C3 + 2;


#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk-off;
#if defined(LEFT)
temp = temp - 4;
#else
temp = temp - 4;
#endif
ptrba += temp*4*2; // number of values in A
ptrbb += temp*4*2; // number of values in B
#endif
#ifdef LEFT
off += 4; // number of values in A
#endif


}

if ( bm & 2 ) // do any 2x4 loop
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2*2;
ptrbb = bb + off*4*2;
#endif


res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;

res10_r = 0;
res10_i = 0;
res11_r = 0;
res11_i = 0;

res20_r = 0;
res20_i = 0;
res21_r = 0;
res21_i = 0;

res30_r = 0;
res30_i = 0;
res31_r = 0;
res31_i = 0;

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+4; // number of values in B
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1];
b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);
MADD(res20, a0, b2);
MADD(res30, a0, b3);

a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);
MADD(res11, a1, b1);
MADD(res21, a1, b2);
MADD(res31, a1, b3);


ptrba = ptrba+4;
ptrbb = ptrbb+8;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;

MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res11, alpha);
C1 = C1 + 2;

MADD_ALPHA_N_STORE(C2, res20, alpha);
C2 = C2 + 2;
MADD_ALPHA_N_STORE(C2, res21, alpha);
C2 = C2 + 2;

MADD_ALPHA_N_STORE(C3, res30, alpha);
C3 = C3 + 2;
MADD_ALPHA_N_STORE(C3, res31, alpha);
C3 = C3 + 2;





#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 4; // number of values in B
#endif
ptrba += temp*2*2;
ptrbb += temp*4*2;
#endif

#ifdef LEFT
off += 2; // number of values in A
#endif


}

if ( bm & 1 ) // do any 1x4 loop
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1*2;
ptrbb = bb + off*4*2;
#endif

res00_r = 0;
res00_i = 0;
res10_r = 0;
res10_i = 0;
res20_r = 0;
res20_i = 0;
res30_r = 0;
res30_i = 0;


#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+4; // number of values in B
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];
b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1];
b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);
MADD(res20, a0, b2);
MADD(res30, a0, b3);


ptrba = ptrba+2;
ptrbb = ptrbb+8;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;

MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;

MADD_ALPHA_N_STORE(C2, res20, alpha);
C2 = C2 + 2;

MADD_ALPHA_N_STORE(C3, res30, alpha);
C3 = C3 + 2;


#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 4; // number of values in B
#endif
ptrba += temp*1*2;
ptrbb += temp*4*2;
#endif

#ifdef LEFT
off += 1; // number of values in A
#endif


}


#if defined(TRMMKERNEL) && !defined(LEFT)
off += 4;
#endif

k = (bk<<3);
bb = bb+k;
i = (ldc<<3);
C = C+i;
}

for (j=0; j<(bn&2); j+=2) // do the Mx2 loops
{
C0 = C;
C1 = C0+ldc*2;

#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif


ptrba = ba;

for (i=0; i<bm/4; i+=1) // do blocks of 4x2
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*4*2;
ptrbb = bb + off*2*2;
#endif

res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;
res02_r = 0;
res02_i = 0;
res03_r = 0;
res03_i = 0;

res10_r = 0;
res10_i = 0;
res11_r = 0;
res11_i = 0;
res12_r = 0;
res12_i = 0;
res13_r = 0;
res13_i = 0;


#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+2; // number of values in B
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);

a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);
MADD(res11, a1, b1);

a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1];
MADD(res02, a0, b0);
MADD(res12, a0, b1);

a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1];
MADD(res03, a1, b0);
MADD(res13, a1, b1);

ptrba = ptrba+8;
ptrbb = ptrbb+4;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res02, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res03, alpha);
C0 = C0 + 2;

MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res11, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res12, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res13, alpha);
C1 = C1 + 2;


#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*4*2;
ptrbb += temp*2*2;
#endif

#ifdef LEFT
off += 4; // number of values in A
#endif

}

if ( bm & 2 ) // do any 2x2 loop
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2*2;
ptrbb = bb + off*2*2;
#endif

res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;

res10_r = 0;
res10_i = 0;
res11_r = 0;
res11_i = 0;


#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+2; // number of values in B
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);

a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);
MADD(res11, a1, b1);


ptrba = ptrba+4;
ptrbb = ptrbb+4;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;

MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;
MADD_ALPHA_N_STORE(C1, res11, alpha);
C1 = C1 + 2;

#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*2*2;
ptrbb += temp*2*2;
#endif

#ifdef LEFT
off += 2; // number of values in A
#endif

}

if ( bm & 1 ) // do any 1x2 loop
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1*2;
ptrbb = bb + off*2*2;
#endif

res00_r = 0;
res00_i = 0;

res10_r = 0;
res10_i = 0;


#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+2; // number of values in B
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];
b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);
MADD(res10, a0, b1);

ptrba = ptrba+2;
ptrbb = ptrbb+4;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;

MADD_ALPHA_N_STORE(C1, res10, alpha);
C1 = C1 + 2;

#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*1*2;
ptrbb += temp*2*2;
#endif

#ifdef LEFT
off += 1; // number of values in A
#endif

}


#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2;
#endif

k = (bk<<2);
bb = bb+k;
i = (ldc<<2);
C = C+i;
}







for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
{
C0 = C;

#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif

ptrba = ba;

for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*4*2;
ptrbb = bb + off*1*2;
#endif

res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;
res02_r = 0;
res02_i = 0;
res03_r = 0;
res03_i = 0;


#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+1; // number of values in B
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);

a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);

a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1];
MADD(res02, a0, b0);

a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1];
MADD(res03, a1, b0);

ptrba = ptrba+8;
ptrbb = ptrbb+2;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res02, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res03, alpha);
C0 = C0 + 2;


#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*4*2;
ptrbb += temp*1*2;
#endif

#ifdef LEFT
off += 4; // number of values in A
#endif

}

if ( bm & 2 ) // do any 2x1 loop
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2*2;
ptrbb = bb + off*1*2;
#endif

res00_r = 0;
res00_i = 0;
res01_r = 0;
res01_i = 0;

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+1; // number of values in B
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);

a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1];
MADD(res01, a1, b0);


ptrba = ptrba+4;
ptrbb = ptrbb+2;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;
MADD_ALPHA_N_STORE(C0, res01, alpha);
C0 = C0 + 2;


#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*2*2;
ptrbb += temp*1*2;
#endif

#ifdef LEFT
off += 2; // number of values in A
#endif

}

if ( bm & 1 ) // do any 1x1 loop
{

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1*2;
ptrbb = bb + off*1*2;
#endif

res00_r = 0;
res00_i = 0;


#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+1; // number of values in B
#endif

for (k=0; k<temp; k++)
{
b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1];

a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1];
MADD(res00, a0, b0);

ptrba = ptrba+2;
ptrbb = ptrbb+2;
}

MADD_ALPHA_N_STORE(C0, res00, alpha);
C0 = C0 + 2;

#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*1*2;
ptrbb += temp*1*2;
#endif

#ifdef LEFT
off += 1; // number of values in A
#endif

}



#if defined(TRMMKERNEL) && !defined(LEFT)
off += 1;
#endif

k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
return 0;
}

+ 40
- 0
param.h View File

@@ -2214,6 +2214,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_R 4096




#define SYMV_P 16
#endif


#if defined(CORTEXA57)
#define SNUMOPT 2
#define DNUMOPT 2

#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL

#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4

#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_N 4

#define CGEMM_DEFAULT_UNROLL_M 4
#define CGEMM_DEFAULT_UNROLL_N 4

#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4

#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 256
#define CGEMM_DEFAULT_P 256
#define ZGEMM_DEFAULT_P 128

#define SGEMM_DEFAULT_Q 240
#define DGEMM_DEFAULT_Q 1024
#define CGEMM_DEFAULT_Q 1024
#define ZGEMM_DEFAULT_Q 512

#define SGEMM_DEFAULT_R 12288
#define DGEMM_DEFAULT_R 4096
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 2048


#define SYMV_P 16
#endif




Loading…
Cancel
Save