| @@ -68,3 +68,4 @@ test/zblat2 | |||||
| test/zblat3 | test/zblat3 | ||||
| build | build | ||||
| build.* | build.* | ||||
| *.swp | |||||
| @@ -4,4 +4,8 @@ CCOMMON_OPT += -march=armv8-a | |||||
| FCOMMON_OPT += -march=armv8-a | FCOMMON_OPT += -march=armv8-a | ||||
| endif | endif | ||||
| ifeq ($(CORE), CORTEXA57) | |||||
| CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 | |||||
| FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 | |||||
| endif | |||||
| @@ -74,3 +74,5 @@ ARMV5 | |||||
| 7.ARM 64-bit CPU: | 7.ARM 64-bit CPU: | ||||
| ARMV8 | ARMV8 | ||||
| CORTEXA57 | |||||
| @@ -172,7 +172,7 @@ int main(int argc, char *argv[]){ | |||||
| srandom(getpid()); | srandom(getpid()); | ||||
| #endif | #endif | ||||
| for(j = 0; j < m; j++){ | |||||
| for(j = 0; j < to; j++){ | |||||
| for(i = 0; i < to * COMPSIZE; i++){ | for(i = 0; i < to * COMPSIZE; i++){ | ||||
| a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | a[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | ||||
| b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | b[i + j * to * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | ||||
| @@ -86,6 +86,7 @@ extern "C" { | |||||
| #if !defined(_MSC_VER) | #if !defined(_MSC_VER) | ||||
| #include <unistd.h> | #include <unistd.h> | ||||
| #endif | #endif | ||||
| #include <time.h> | |||||
| #ifdef OS_LINUX | #ifdef OS_LINUX | ||||
| #include <malloc.h> | #include <malloc.h> | ||||
| @@ -89,8 +89,10 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||||
| #if defined(ASSEMBLER) && !defined(NEEDPARAM) | #if defined(ASSEMBLER) && !defined(NEEDPARAM) | ||||
| #define PROLOGUE \ | #define PROLOGUE \ | ||||
| .text ;\ | |||||
| .align 4 ;\ | |||||
| .global REALNAME ;\ | .global REALNAME ;\ | ||||
| .func REALNAME ;\ | |||||
| .type REALNAME, %function ;\ | |||||
| REALNAME: | REALNAME: | ||||
| #define EPILOGUE | #define EPILOGUE | ||||
| @@ -107,7 +109,11 @@ REALNAME: | |||||
| #endif | #endif | ||||
| #define HUGE_PAGESIZE ( 4 << 20) | #define HUGE_PAGESIZE ( 4 << 20) | ||||
| #if defined(CORTEXA57) | |||||
| #define BUFFER_SIZE (40 << 20) | |||||
| #else | |||||
| #define BUFFER_SIZE (16 << 20) | #define BUFFER_SIZE (16 << 20) | ||||
| #endif | |||||
| #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) | #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) | ||||
| @@ -29,12 +29,19 @@ | |||||
| #define CPU_UNKNOWN 0 | #define CPU_UNKNOWN 0 | ||||
| #define CPU_ARMV8 1 | #define CPU_ARMV8 1 | ||||
| #define CPU_CORTEXA57 2 | |||||
| static char *cpuname[] = { | static char *cpuname[] = { | ||||
| "UNKOWN", | |||||
| "ARMV8" | |||||
| "UNKNOWN", | |||||
| "ARMV8" , | |||||
| "CORTEXA57" | |||||
| }; | }; | ||||
| static char *cpuname_lower[] = { | |||||
| "unknown", | |||||
| "armv8" , | |||||
| "cortexa57" | |||||
| }; | |||||
| int get_feature(char *search) | int get_feature(char *search) | ||||
| { | { | ||||
| @@ -53,13 +60,13 @@ int get_feature(char *search) | |||||
| { | { | ||||
| p = strchr(buffer, ':') + 2; | p = strchr(buffer, ':') + 2; | ||||
| break; | break; | ||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| fclose(infile); | |||||
| fclose(infile); | |||||
| if( p == NULL ) return; | |||||
| if( p == NULL ) return 0; | |||||
| t = strtok(p," "); | t = strtok(p," "); | ||||
| while( t = strtok(NULL," ")) | while( t = strtok(NULL," ")) | ||||
| @@ -82,11 +89,30 @@ int detect(void) | |||||
| p = (char *) NULL ; | p = (char *) NULL ; | ||||
| infile = fopen("/proc/cpuinfo", "r"); | infile = fopen("/proc/cpuinfo", "r"); | ||||
| while (fgets(buffer, sizeof(buffer), infile)) | |||||
| { | |||||
| if (!strncmp("CPU part", buffer, 8)) | |||||
| { | |||||
| p = strchr(buffer, ':') + 2; | |||||
| break; | |||||
| } | |||||
| } | |||||
| fclose(infile); | |||||
| if(p != NULL) { | |||||
| if (strstr(p, "0xd07")) { | |||||
| return CPU_CORTEXA57; | |||||
| } | |||||
| } | |||||
| p = (char *) NULL ; | |||||
| infile = fopen("/proc/cpuinfo", "r"); | |||||
| while (fgets(buffer, sizeof(buffer), infile)) | while (fgets(buffer, sizeof(buffer), infile)) | ||||
| { | { | ||||
| if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9))) | |||||
| if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)) || | |||||
| (!strncmp("CPU architecture", buffer, 16))) | |||||
| { | { | ||||
| p = strchr(buffer, ':') + 2; | p = strchr(buffer, ':') + 2; | ||||
| break; | break; | ||||
| @@ -100,7 +126,7 @@ int detect(void) | |||||
| if (strstr(p, "AArch64")) | if (strstr(p, "AArch64")) | ||||
| { | { | ||||
| return CPU_ARMV8; | |||||
| return CPU_ARMV8; | |||||
| } | } | ||||
| @@ -118,23 +144,13 @@ char *get_corename(void) | |||||
| void get_architecture(void) | void get_architecture(void) | ||||
| { | { | ||||
| printf("ARM"); | |||||
| printf("ARM64"); | |||||
| } | } | ||||
| void get_subarchitecture(void) | void get_subarchitecture(void) | ||||
| { | { | ||||
| int d = detect(); | int d = detect(); | ||||
| switch (d) | |||||
| { | |||||
| case CPU_ARMV8: | |||||
| printf("ARMV8"); | |||||
| break; | |||||
| default: | |||||
| printf("UNKNOWN"); | |||||
| break; | |||||
| } | |||||
| printf("%s", cpuname[d]); | |||||
| } | } | ||||
| void get_subdirname(void) | void get_subdirname(void) | ||||
| @@ -160,26 +176,32 @@ void get_cpuconfig(void) | |||||
| printf("#define L2_ASSOCIATIVE 4\n"); | printf("#define L2_ASSOCIATIVE 4\n"); | ||||
| break; | break; | ||||
| case CPU_CORTEXA57: | |||||
| printf("#define CORTEXA57\n"); | |||||
| printf("#define HAVE_VFP\n"); | |||||
| printf("#define HAVE_VFPV3\n"); | |||||
| printf("#define HAVE_NEON\n"); | |||||
| printf("#define HAVE_VFPV4\n"); | |||||
| printf("#define L1_CODE_SIZE 49152\n"); | |||||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||||
| printf("#define L1_CODE_ASSOCIATIVE 3\n"); | |||||
| printf("#define L1_DATA_SIZE 32768\n"); | |||||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||||
| printf("#define L1_DATA_ASSOCIATIVE 2\n"); | |||||
| printf("#define L2_SIZE 2097152\n"); | |||||
| printf("#define L2_LINESIZE 64\n"); | |||||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||||
| break; | |||||
| } | } | ||||
| } | } | ||||
| void get_libname(void) | void get_libname(void) | ||||
| { | { | ||||
| int d = detect(); | int d = detect(); | ||||
| switch (d) | |||||
| { | |||||
| case CPU_ARMV8: | |||||
| printf("armv8\n"); | |||||
| break; | |||||
| } | |||||
| printf("%s", cpuname_lower[d]); | |||||
| } | } | ||||
| void get_features(void) | void get_features(void) | ||||
| { | { | ||||
| @@ -55,7 +55,7 @@ | |||||
| static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ | static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ | ||||
| FLOAT *a, *x, *y; | FLOAT *a, *x, *y; | ||||
| BLASLONG incx, incy; | |||||
| BLASLONG incx; | |||||
| BLASLONG m_from, m_to, i; | BLASLONG m_from, m_to, i; | ||||
| #ifndef COMPLEX | #ifndef COMPLEX | ||||
| FLOAT result; | FLOAT result; | ||||
| @@ -68,7 +68,6 @@ static int spmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||||
| y = (FLOAT *)args -> c; | y = (FLOAT *)args -> c; | ||||
| incx = args -> ldb; | incx = args -> ldb; | ||||
| incy = args -> ldc; | |||||
| m_from = 0; | m_from = 0; | ||||
| m_to = args -> m; | m_to = args -> m; | ||||
| @@ -43,7 +43,7 @@ | |||||
| static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ | static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ | ||||
| FLOAT *a, *x, *y; | FLOAT *a, *x, *y; | ||||
| BLASLONG lda, incx, incy; | |||||
| BLASLONG incx, incy; | |||||
| BLASLONG i, m_from, m_to; | BLASLONG i, m_from, m_to; | ||||
| FLOAT alpha_r; | FLOAT alpha_r; | ||||
| #ifdef COMPLEX | #ifdef COMPLEX | ||||
| @@ -56,7 +56,6 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL | |||||
| incx = args -> lda; | incx = args -> lda; | ||||
| incy = args -> ldb; | incy = args -> ldb; | ||||
| lda = args -> ldc; | |||||
| alpha_r = *((FLOAT *)args -> alpha + 0); | alpha_r = *((FLOAT *)args -> alpha + 0); | ||||
| #ifdef COMPLEX | #ifdef COMPLEX | ||||
| @@ -46,7 +46,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL | |||||
| BLASLONG incx; | BLASLONG incx; | ||||
| BLASLONG i, m_from, m_to; | BLASLONG i, m_from, m_to; | ||||
| FLOAT alpha_r; | FLOAT alpha_r; | ||||
| #if defined(COMPLEX) && !defined(HER) && !defined(HERREV) | |||||
| #if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV) | |||||
| FLOAT alpha_i; | FLOAT alpha_i; | ||||
| #endif | #endif | ||||
| @@ -56,7 +56,7 @@ static int syr_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FL | |||||
| incx = args -> lda; | incx = args -> lda; | ||||
| alpha_r = *((FLOAT *)args -> alpha + 0); | alpha_r = *((FLOAT *)args -> alpha + 0); | ||||
| #if defined(COMPLEX) && !defined(HER) && !defined(HERREV) | |||||
| #if defined(COMPLEX) && !defined(HEMV) && !defined(HEMVREV) | |||||
| alpha_i = *((FLOAT *)args -> alpha + 1); | alpha_i = *((FLOAT *)args -> alpha + 1); | ||||
| #endif | #endif | ||||
| @@ -55,7 +55,7 @@ | |||||
| static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ | static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *buffer, BLASLONG pos){ | ||||
| FLOAT *a, *x, *y; | FLOAT *a, *x, *y; | ||||
| BLASLONG lda, incx, incy; | |||||
| BLASLONG lda, incx; | |||||
| BLASLONG m_from, m_to; | BLASLONG m_from, m_to; | ||||
| a = (FLOAT *)args -> a; | a = (FLOAT *)args -> a; | ||||
| @@ -64,7 +64,6 @@ static int symv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F | |||||
| lda = args -> lda; | lda = args -> lda; | ||||
| incx = args -> ldb; | incx = args -> ldb; | ||||
| incy = args -> ldc; | |||||
| m_from = 0; | m_from = 0; | ||||
| m_to = args -> m; | m_to = args -> m; | ||||
| @@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.; | |||||
| int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | ||||
| BLASLONG i; | BLASLONG i; | ||||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||||
| FLOAT *B = b; | FLOAT *B = b; | ||||
| BLASLONG length; | BLASLONG length; | ||||
| if (incb != 1) { | if (incb != 1) { | ||||
| B = buffer; | B = buffer; | ||||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); | |||||
| COPY_K(n, b, incb, buffer, 1); | COPY_K(n, b, incb, buffer, 1); | ||||
| } | } | ||||
| @@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.; | |||||
| int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | ||||
| BLASLONG i; | BLASLONG i; | ||||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||||
| FLOAT *B = b; | FLOAT *B = b; | ||||
| BLASLONG length; | BLASLONG length; | ||||
| if (incb != 1) { | if (incb != 1) { | ||||
| B = buffer; | B = buffer; | ||||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); | |||||
| COPY_K(n, b, incb, buffer, 1); | COPY_K(n, b, incb, buffer, 1); | ||||
| } | } | ||||
| @@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.; | |||||
| int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | ||||
| BLASLONG i; | BLASLONG i; | ||||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||||
| FLOAT *B = b; | FLOAT *B = b; | ||||
| BLASLONG length; | BLASLONG length; | ||||
| if (incb != 1) { | if (incb != 1) { | ||||
| B = buffer; | B = buffer; | ||||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); | |||||
| COPY_K(n, b, incb, buffer, 1); | COPY_K(n, b, incb, buffer, 1); | ||||
| } | } | ||||
| @@ -45,13 +45,11 @@ const static FLOAT dp1 = 1.; | |||||
| int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | ||||
| BLASLONG i; | BLASLONG i; | ||||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||||
| FLOAT *B = b; | FLOAT *B = b; | ||||
| BLASLONG length; | BLASLONG length; | ||||
| if (incb != 1) { | if (incb != 1) { | ||||
| B = buffer; | B = buffer; | ||||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) + 4095) & ~4095); | |||||
| COPY_K(n, b, incb, buffer, 1); | COPY_K(n, b, incb, buffer, 1); | ||||
| } | } | ||||
| @@ -43,12 +43,10 @@ | |||||
| int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ | int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ | ||||
| BLASLONG i; | BLASLONG i; | ||||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||||
| FLOAT *B = b; | FLOAT *B = b; | ||||
| if (incb != 1) { | if (incb != 1) { | ||||
| B = buffer; | B = buffer; | ||||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); | |||||
| COPY_K(m, b, incb, buffer, 1); | COPY_K(m, b, incb, buffer, 1); | ||||
| } | } | ||||
| @@ -43,12 +43,10 @@ | |||||
| int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ | int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ | ||||
| BLASLONG i; | BLASLONG i; | ||||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||||
| FLOAT *B = b; | FLOAT *B = b; | ||||
| if (incb != 1) { | if (incb != 1) { | ||||
| B = buffer; | B = buffer; | ||||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) + 4095) & ~4095); | |||||
| COPY_K(m, b, incb, buffer, 1); | COPY_K(m, b, incb, buffer, 1); | ||||
| } | } | ||||
| @@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.; | |||||
| int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | ||||
| BLASLONG i; | BLASLONG i; | ||||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||||
| FLOAT *B = b; | FLOAT *B = b; | ||||
| BLASLONG length; | BLASLONG length; | ||||
| #if (TRANSA == 2) || (TRANSA == 4) | #if (TRANSA == 2) || (TRANSA == 4) | ||||
| @@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc | |||||
| if (incb != 1) { | if (incb != 1) { | ||||
| B = buffer; | B = buffer; | ||||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095); | |||||
| COPY_K(n, b, incb, buffer, 1); | COPY_K(n, b, incb, buffer, 1); | ||||
| } | } | ||||
| @@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.; | |||||
| int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | ||||
| BLASLONG i; | BLASLONG i; | ||||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||||
| FLOAT *B = b; | FLOAT *B = b; | ||||
| BLASLONG length; | BLASLONG length; | ||||
| #if (TRANSA == 2) || (TRANSA == 4) | #if (TRANSA == 2) || (TRANSA == 4) | ||||
| @@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc | |||||
| if (incb != 1) { | if (incb != 1) { | ||||
| B = buffer; | B = buffer; | ||||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); | |||||
| COPY_K(n, b, incb, buffer, 1); | COPY_K(n, b, incb, buffer, 1); | ||||
| } | } | ||||
| @@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.; | |||||
| int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | ||||
| BLASLONG i; | BLASLONG i; | ||||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||||
| FLOAT *B = b; | FLOAT *B = b; | ||||
| BLASLONG length; | BLASLONG length; | ||||
| #if (TRANSA == 2) || (TRANSA == 4) | #if (TRANSA == 2) || (TRANSA == 4) | ||||
| @@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc | |||||
| if (incb != 1) { | if (incb != 1) { | ||||
| B = buffer; | B = buffer; | ||||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE + 4095) & ~4095); | |||||
| COPY_K(n, b, incb, buffer, 1); | COPY_K(n, b, incb, buffer, 1); | ||||
| } | } | ||||
| @@ -45,7 +45,6 @@ const static FLOAT dp1 = 1.; | |||||
| int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, void *buffer){ | ||||
| BLASLONG i; | BLASLONG i; | ||||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||||
| FLOAT *B = b; | FLOAT *B = b; | ||||
| BLASLONG length; | BLASLONG length; | ||||
| #if (TRANSA == 2) || (TRANSA == 4) | #if (TRANSA == 2) || (TRANSA == 4) | ||||
| @@ -57,7 +56,6 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG inc | |||||
| if (incb != 1) { | if (incb != 1) { | ||||
| B = buffer; | B = buffer; | ||||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + n * sizeof(FLOAT) * COMPSIZE+ 4095) & ~4095); | |||||
| COPY_K(n, b, incb, buffer, 1); | COPY_K(n, b, incb, buffer, 1); | ||||
| } | } | ||||
| @@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ | |||||
| #ifndef UNIT | #ifndef UNIT | ||||
| FLOAT atemp1, atemp2, btemp1, btemp2; | FLOAT atemp1, atemp2, btemp1, btemp2; | ||||
| #endif | #endif | ||||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||||
| FLOAT *B = b; | FLOAT *B = b; | ||||
| if (incb != 1) { | if (incb != 1) { | ||||
| B = buffer; | B = buffer; | ||||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); | |||||
| COPY_K(m, b, incb, buffer, 1); | COPY_K(m, b, incb, buffer, 1); | ||||
| } | } | ||||
| @@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ | |||||
| #ifndef UNIT | #ifndef UNIT | ||||
| FLOAT atemp1, atemp2, btemp1, btemp2; | FLOAT atemp1, atemp2, btemp1, btemp2; | ||||
| #endif | #endif | ||||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||||
| FLOAT *B = b; | FLOAT *B = b; | ||||
| if (incb != 1) { | if (incb != 1) { | ||||
| B = buffer; | B = buffer; | ||||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); | |||||
| COPY_K(m, b, incb, buffer, 1); | COPY_K(m, b, incb, buffer, 1); | ||||
| } | } | ||||
| @@ -51,12 +51,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ | |||||
| #ifndef UNIT | #ifndef UNIT | ||||
| FLOAT ar, ai, br, bi, ratio, den; | FLOAT ar, ai, br, bi, ratio, den; | ||||
| #endif | #endif | ||||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||||
| FLOAT *B = b; | FLOAT *B = b; | ||||
| if (incb != 1) { | if (incb != 1) { | ||||
| B = buffer; | B = buffer; | ||||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); | |||||
| COPY_K(m, b, incb, buffer, 1); | COPY_K(m, b, incb, buffer, 1); | ||||
| } | } | ||||
| @@ -49,12 +49,10 @@ int CNAME(BLASLONG m, FLOAT *a, FLOAT *b, BLASLONG incb, void *buffer){ | |||||
| #ifndef UNIT | #ifndef UNIT | ||||
| FLOAT ar, ai, br, bi, ratio, den; | FLOAT ar, ai, br, bi, ratio, den; | ||||
| #endif | #endif | ||||
| FLOAT *gemvbuffer = (FLOAT *)buffer; | |||||
| FLOAT *B = b; | FLOAT *B = b; | ||||
| if (incb != 1) { | if (incb != 1) { | ||||
| B = buffer; | B = buffer; | ||||
| gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095); | |||||
| COPY_K(m, b, incb, buffer, 1); | COPY_K(m, b, incb, buffer, 1); | ||||
| } | } | ||||
| @@ -65,7 +65,7 @@ int CNAME(int mode, blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, int ( | |||||
| blas_queue_t queue[MAX_CPU_NUMBER]; | blas_queue_t queue[MAX_CPU_NUMBER]; | ||||
| BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1]; | BLASLONG range_M[MAX_CPU_NUMBER + 1], range_N[MAX_CPU_NUMBER + 1]; | ||||
| BLASLONG procs, total_procs, num_cpu_m, num_cpu_n; | |||||
| BLASLONG procs, num_cpu_m, num_cpu_n; | |||||
| BLASLONG width, i, j; | BLASLONG width, i, j; | ||||
| BLASLONG divM, divN; | BLASLONG divM, divN; | ||||
| @@ -230,7 +230,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| BLASLONG is, min_i, div_n; | BLASLONG is, min_i, div_n; | ||||
| BLASLONG i, current; | BLASLONG i, current; | ||||
| BLASLONG l1stride, l2size; | |||||
| BLASLONG l1stride; | |||||
| #ifdef TIMING | #ifdef TIMING | ||||
| BLASULONG rpcc_counter; | BLASULONG rpcc_counter; | ||||
| @@ -298,8 +298,6 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||||
| #endif | #endif | ||||
| ) return 0; | ) return 0; | ||||
| l2size = GEMM_P * GEMM_Q; | |||||
| #if 0 | #if 0 | ||||
| fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n", | fprintf(stderr, "Thread[%ld] m_from : %ld m_to : %ld n_from : %ld n_to : %ld N_from : %ld N_to : %ld\n", | ||||
| mypos, m_from, m_to, n_from, n_to, N_from, N_to); | mypos, m_from, m_to, n_from, n_to, N_from, N_to); | ||||
| @@ -706,7 +704,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||||
| n = n_to - n_from; | n = n_to - n_from; | ||||
| } | } | ||||
| if ((args -> m < nthreads * SWITCH_RATIO) || (args -> n < nthreads * SWITCH_RATIO)) { | |||||
| if ((m < nthreads * SWITCH_RATIO) || (n < nthreads * SWITCH_RATIO)) { | |||||
| GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); | GEMM_LOCAL(args, range_m, range_n, sa, sb, 0); | ||||
| return 0; | return 0; | ||||
| } | } | ||||
| @@ -914,7 +914,6 @@ static volatile struct { | |||||
| } memory[NUM_BUFFERS]; | } memory[NUM_BUFFERS]; | ||||
| static int memory_initialized = 0; | static int memory_initialized = 0; | ||||
| static void gotoblas_memory_init(void); | |||||
| /* Memory allocation routine */ | /* Memory allocation routine */ | ||||
| /* procpos ... indicates where it comes from */ | /* procpos ... indicates where it comes from */ | ||||
| @@ -819,10 +819,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | ||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " | "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " | ||||
| #define LIBNAME "armv8" | #define LIBNAME "armv8" | ||||
| #define CORENAME "XGENE1" | |||||
| #else | |||||
| #define CORENAME "ARMV8" | |||||
| #endif | #endif | ||||
| #ifdef FORCE_CORTEXA57 | |||||
| #define FORCE | |||||
| #define ARCHITECTURE "ARM64" | |||||
| #define SUBARCHITECTURE "ARMV8" | |||||
| #define SUBDIRNAME "arm64" | |||||
| #define ARCHCONFIG "-DCORTEXA57 " \ | |||||
| "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ | |||||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=2 " \ | |||||
| "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=16 " \ | |||||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" | |||||
| #define LIBNAME "cortexa57" | |||||
| #define CORENAME "CORTEXA57" | |||||
| #else | |||||
| #endif | |||||
| #ifndef FORCE | #ifndef FORCE | ||||
| @@ -91,6 +91,27 @@ | |||||
| #endif | #endif | ||||
| #endif | #endif | ||||
| #ifdef SMP | |||||
| #ifndef COMPLEX | |||||
| #ifdef XDOUBLE | |||||
| #define MODE (BLAS_XDOUBLE | BLAS_REAL) | |||||
| #elif defined(DOUBLE) | |||||
| #define MODE (BLAS_DOUBLE | BLAS_REAL) | |||||
| #else | |||||
| #define MODE (BLAS_SINGLE | BLAS_REAL) | |||||
| #endif | |||||
| #else | |||||
| #ifdef XDOUBLE | |||||
| #define MODE (BLAS_XDOUBLE | BLAS_COMPLEX) | |||||
| #elif defined(DOUBLE) | |||||
| #define MODE (BLAS_DOUBLE | BLAS_COMPLEX) | |||||
| #else | |||||
| #define MODE (BLAS_SINGLE | BLAS_COMPLEX) | |||||
| #endif | |||||
| #endif | |||||
| #endif | |||||
| static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { | static int (*symm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { | ||||
| #ifndef GEMM3M | #ifndef GEMM3M | ||||
| #ifndef HEMM | #ifndef HEMM | ||||
| @@ -135,26 +156,6 @@ void NAME(char *SIDE, char *UPLO, | |||||
| FLOAT *buffer; | FLOAT *buffer; | ||||
| FLOAT *sa, *sb; | FLOAT *sa, *sb; | ||||
| #ifdef SMP | |||||
| #ifndef COMPLEX | |||||
| #ifdef XDOUBLE | |||||
| int mode = BLAS_XDOUBLE | BLAS_REAL; | |||||
| #elif defined(DOUBLE) | |||||
| int mode = BLAS_DOUBLE | BLAS_REAL; | |||||
| #else | |||||
| int mode = BLAS_SINGLE | BLAS_REAL; | |||||
| #endif | |||||
| #else | |||||
| #ifdef XDOUBLE | |||||
| int mode = BLAS_XDOUBLE | BLAS_COMPLEX; | |||||
| #elif defined(DOUBLE) | |||||
| int mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||||
| #else | |||||
| int mode = BLAS_SINGLE | BLAS_COMPLEX; | |||||
| #endif | |||||
| #endif | |||||
| #endif | |||||
| #if defined(SMP) && !defined(NO_AFFINITY) | #if defined(SMP) && !defined(NO_AFFINITY) | ||||
| int nodes; | int nodes; | ||||
| #endif | #endif | ||||
| @@ -246,26 +247,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, | |||||
| FLOAT *buffer; | FLOAT *buffer; | ||||
| FLOAT *sa, *sb; | FLOAT *sa, *sb; | ||||
| #ifdef SMP | |||||
| #ifndef COMPLEX | |||||
| #ifdef XDOUBLE | |||||
| int mode = BLAS_XDOUBLE | BLAS_REAL; | |||||
| #elif defined(DOUBLE) | |||||
| int mode = BLAS_DOUBLE | BLAS_REAL; | |||||
| #else | |||||
| int mode = BLAS_SINGLE | BLAS_REAL; | |||||
| #endif | |||||
| #else | |||||
| #ifdef XDOUBLE | |||||
| int mode = BLAS_XDOUBLE | BLAS_COMPLEX; | |||||
| #elif defined(DOUBLE) | |||||
| int mode = BLAS_DOUBLE | BLAS_COMPLEX; | |||||
| #else | |||||
| int mode = BLAS_SINGLE | BLAS_COMPLEX; | |||||
| #endif | |||||
| #endif | |||||
| #endif | |||||
| #if defined(SMP) && !defined(NO_AFFINITY) | #if defined(SMP) && !defined(NO_AFFINITY) | ||||
| int nodes; | int nodes; | ||||
| #endif | #endif | ||||
| @@ -407,7 +388,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, | |||||
| args.nthreads /= nodes; | args.nthreads /= nodes; | ||||
| gemm_thread_mn(mode, &args, NULL, NULL, | |||||
| gemm_thread_mn(MODE, &args, NULL, NULL, | |||||
| symm[4 | (side << 1) | uplo ], sa, sb, nodes); | symm[4 | (side << 1) | uplo ], sa, sb, nodes); | ||||
| } else { | } else { | ||||
| @@ -419,7 +400,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, | |||||
| #else | #else | ||||
| GEMM_THREAD(mode, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads); | |||||
| GEMM_THREAD(MODE, &args, NULL, NULL, symm[(side << 1) | uplo ], sa, sb, args.nthreads); | |||||
| #endif | #endif | ||||
| @@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, | |||||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { | void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { | ||||
| FLOAT *buffer; | FLOAT *buffer; | ||||
| int trans, uplo; | |||||
| int uplo; | |||||
| blasint info; | blasint info; | ||||
| #ifdef SMP | #ifdef SMP | ||||
| int nthreads; | int nthreads; | ||||
| @@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, | |||||
| PRINT_DEBUG_CNAME; | PRINT_DEBUG_CNAME; | ||||
| trans = -1; | |||||
| uplo = -1; | uplo = -1; | ||||
| info = 0; | info = 0; | ||||
| @@ -118,7 +118,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, | |||||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { | void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint incy, FLOAT *a, blasint lda) { | ||||
| FLOAT *buffer; | FLOAT *buffer; | ||||
| int trans, uplo; | |||||
| int uplo; | |||||
| blasint info; | blasint info; | ||||
| #ifdef SMP | #ifdef SMP | ||||
| int nthreads; | int nthreads; | ||||
| @@ -126,7 +126,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, | |||||
| PRINT_DEBUG_CNAME; | PRINT_DEBUG_CNAME; | ||||
| trans = -1; | |||||
| uplo = -1; | uplo = -1; | ||||
| info = 0; | info = 0; | ||||
| @@ -117,7 +117,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA | |||||
| FLOAT beta_i = BETA[1]; | FLOAT beta_i = BETA[1]; | ||||
| FLOAT *buffer; | FLOAT *buffer; | ||||
| int trans, uplo; | |||||
| int uplo; | |||||
| blasint info; | blasint info; | ||||
| #ifdef SMP | #ifdef SMP | ||||
| int nthreads; | int nthreads; | ||||
| @@ -135,7 +135,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA | |||||
| PRINT_DEBUG_CNAME; | PRINT_DEBUG_CNAME; | ||||
| trans = -1; | |||||
| uplo = -1; | uplo = -1; | ||||
| info = 0; | info = 0; | ||||
| @@ -116,7 +116,7 @@ void NAME(char *UPLO, blasint *N, FLOAT *ALPHA, | |||||
| void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { | void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *a, blasint lda) { | ||||
| FLOAT *buffer; | FLOAT *buffer; | ||||
| int trans, uplo; | |||||
| int uplo; | |||||
| blasint info; | blasint info; | ||||
| #ifdef SMP | #ifdef SMP | ||||
| int nthreads; | int nthreads; | ||||
| @@ -124,7 +124,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT alpha, | |||||
| PRINT_DEBUG_CNAME; | PRINT_DEBUG_CNAME; | ||||
| trans = -1; | |||||
| uplo = -1; | uplo = -1; | ||||
| info = 0; | info = 0; | ||||
| @@ -121,7 +121,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA | |||||
| FLOAT alpha_r = ALPHA[0]; | FLOAT alpha_r = ALPHA[0]; | ||||
| FLOAT alpha_i = ALPHA[1]; | FLOAT alpha_i = ALPHA[1]; | ||||
| FLOAT *buffer; | FLOAT *buffer; | ||||
| int trans, uplo; | |||||
| int uplo; | |||||
| blasint info; | blasint info; | ||||
| #ifdef SMP | #ifdef SMP | ||||
| int nthreads; | int nthreads; | ||||
| @@ -129,7 +129,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint n, FLOAT *ALPHA | |||||
| PRINT_DEBUG_CNAME; | PRINT_DEBUG_CNAME; | ||||
| trans = -1; | |||||
| uplo = -1; | uplo = -1; | ||||
| info = 0; | info = 0; | ||||
| @@ -637,49 +637,49 @@ $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE | |||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ | $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ | ||||
| $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) | $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | |||||
| $(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPYKERNEL) | $(KDIR)daxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ | |||||
| $(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL) | $(KDIR)qaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QAXPYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ | |||||
| $(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) | $(KDIR)caxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ | $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ | ||||
| $(KDIR)zaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) | $(KDIR)zaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DDOUBLE $< -o $@ | |||||
| $(KDIR)xaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) | $(KDIR)xaxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DXDOUBLE $< -o $@ | |||||
| $(KDIR)caxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) | $(KDIR)caxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -UDOUBLE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -UDOUBLE $< -o $@ | |||||
| $(KDIR)zaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) | $(KDIR)zaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DDOUBLE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -DDOUBLE $< -o $@ | |||||
| $(KDIR)xaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) | $(KDIR)xaxpyc_k$(TSUFFIX).$(SUFFIX) $(KDIR)xaxpyc_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XAXPYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DCONJ -DXDOUBLE $< -o $@ | |||||
| $(KDIR)scopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)scopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOPYKERNEL) | $(KDIR)scopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)scopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOPYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DC_INTERFACE $< -o $@ | |||||
| $(KDIR)dcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)dcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOPYKERNEL) | $(KDIR)dcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)dcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOPYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE -DC_INTERFACE $< -o $@ | |||||
| $(KDIR)qcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QCOPYKERNEL) | $(KDIR)qcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)qcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QCOPYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UCOMPLEX -DC_INTERFACE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -DC_INTERFACE $< -o $@ | |||||
| $(KDIR)ccopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)ccopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOPYKERNEL) | $(KDIR)ccopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)ccopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOPYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE -DC_INTERFACE $< -o $@ | |||||
| $(KDIR)zcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOPYKERNEL) | $(KDIR)zcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)zcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOPYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE -DC_INTERFACE $< -o $@ | |||||
| $(KDIR)xcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XCOPYKERNEL) | $(KDIR)xcopy_k$(TSUFFIX).$(SUFFIX) $(KDIR)xcopy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XCOPYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DCOMPLEX -DC_INTERFACE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DC_INTERFACE $< -o $@ | |||||
| $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DDOTKERNEL) | $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DDOTKERNEL) | ||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ | $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ | ||||
| @@ -799,15 +799,15 @@ $(KDIR)xswap_k$(TSUFFIX).$(SUFFIX) $(KDIR)xswap_k$(TPSUFFIX).$(PSUFFIX) : $(KE | |||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ | $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ | ||||
| $(KDIR)saxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPBYKERNEL) | $(KDIR)saxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPBYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | |||||
| $(KDIR)daxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPBYKERNEL) | $(KDIR)daxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)daxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DAXPBYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ | |||||
| $(KDIR)caxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPBYKERNEL) | $(KDIR)caxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)caxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CAXPBYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -UDOUBLE $< -o $@ | |||||
| $(KDIR)zaxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPBYKERNEL) | $(KDIR)zaxpby_k$(TSUFFIX).$(SUFFIX) $(KDIR)zaxpby_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZAXPBYKERNEL) | ||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UCONJ -DDOUBLE $< -o $@ | |||||
| $(CC) -c $(CFLAGS) -DCOMPLEX -UCONJ -DDOUBLE $< -o $@ | |||||
| @@ -54,13 +54,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| BLASLONG ix=0; | BLASLONG ix=0; | ||||
| FLOAT maxf=0.0; | FLOAT maxf=0.0; | ||||
| if (n < 0 || inc_x < 1 ) return(maxf); | |||||
| if (n <= 0 || inc_x <= 0) return(maxf); | |||||
| maxf=ABS(x[0]); | maxf=ABS(x[0]); | ||||
| ix += inc_x; | |||||
| i++; | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| if( ABS(x[ix]) > ABS(maxf) ) | |||||
| if( ABS(x[ix]) > maxf ) | |||||
| { | { | ||||
| maxf = ABS(x[ix]); | maxf = ABS(x[ix]); | ||||
| } | } | ||||
| @@ -54,13 +54,15 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| BLASLONG ix=0; | BLASLONG ix=0; | ||||
| FLOAT minf=0.0; | FLOAT minf=0.0; | ||||
| if (n < 0 || inc_x < 1 ) return(minf); | |||||
| if (n <= 0 || inc_x <= 0) return(minf); | |||||
| minf=ABS(x[0]); | minf=ABS(x[0]); | ||||
| ix += inc_x; | |||||
| i++; | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| if( ABS(x[ix]) < ABS(minf) ) | |||||
| if( ABS(x[ix]) < minf ) | |||||
| { | { | ||||
| minf = ABS(x[ix]); | minf = ABS(x[ix]); | ||||
| } | } | ||||
| @@ -53,7 +53,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | { | ||||
| BLASLONG i=0; | BLASLONG i=0; | ||||
| FLOAT sumf = 0.0; | FLOAT sumf = 0.0; | ||||
| if (n < 0 || inc_x < 1 ) return(sumf); | |||||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||||
| n *= inc_x; | n *= inc_x; | ||||
| while(i < n) | while(i < n) | ||||
| @@ -55,13 +55,15 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| FLOAT maxf=0.0; | FLOAT maxf=0.0; | ||||
| BLASLONG max=0; | BLASLONG max=0; | ||||
| if (n < 0 || inc_x < 1 ) return(max); | |||||
| if (n <= 0 || inc_x <= 0) return(max); | |||||
| maxf=ABS(x[0]); | maxf=ABS(x[0]); | ||||
| ix += inc_x; | |||||
| i++; | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| if( ABS(x[ix]) > ABS(maxf) ) | |||||
| if( ABS(x[ix]) > maxf ) | |||||
| { | { | ||||
| max = i; | max = i; | ||||
| maxf = ABS(x[ix]); | maxf = ABS(x[ix]); | ||||
| @@ -55,9 +55,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| FLOAT minf=0.0; | FLOAT minf=0.0; | ||||
| BLASLONG min=0; | BLASLONG min=0; | ||||
| if (n < 0 || inc_x < 1 ) return(min); | |||||
| if (n <= 0 || inc_x <= 0) return(min); | |||||
| minf=ABS(x[0]); | minf=ABS(x[0]); | ||||
| ix += inc_x; | |||||
| i++; | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| @@ -47,9 +47,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| FLOAT maxf=0.0; | FLOAT maxf=0.0; | ||||
| BLASLONG max=0; | BLASLONG max=0; | ||||
| if (n < 0 || inc_x < 1 ) return(max); | |||||
| if (n <= 0 || inc_x <= 0) return(max); | |||||
| maxf=x[0]; | maxf=x[0]; | ||||
| ix += inc_x; | |||||
| i++; | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| @@ -45,9 +45,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| FLOAT minf=0.0; | FLOAT minf=0.0; | ||||
| BLASLONG min=0; | BLASLONG min=0; | ||||
| if (n < 0 || inc_x < 1 ) return(min); | |||||
| if (n <= 0 || inc_x <= 0) return(min); | |||||
| minf=x[0]; | minf=x[0]; | ||||
| ix += inc_x; | |||||
| i++; | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| @@ -53,24 +53,24 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | { | ||||
| BLASLONG i=0; | BLASLONG i=0; | ||||
| BLASLONG ix=0; | BLASLONG ix=0; | ||||
| FLOAT maxf[2]; | |||||
| FLOAT maxf; | |||||
| BLASLONG max=0; | BLASLONG max=0; | ||||
| BLASLONG inc_x2; | BLASLONG inc_x2; | ||||
| if (n < 0 || inc_x < 1 ) return(max); | |||||
| if (n <= 0 || inc_x <= 0) return(max); | |||||
| inc_x2 = 2 * inc_x; | inc_x2 = 2 * inc_x; | ||||
| maxf[0] = ABS(x[ix]); | |||||
| maxf[1] = ABS(x[ix+1]); | |||||
| maxf = CABS1(x,0); | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| if( CABS1(x,ix) > CABS1(maxf,0) ) | |||||
| if( CABS1(x,ix) > maxf ) | |||||
| { | { | ||||
| max = i; | max = i; | ||||
| maxf[0] = ABS(x[ix]); | |||||
| maxf[1] = ABS(x[ix+1]); | |||||
| maxf = CABS1(x,ix); | |||||
| } | } | ||||
| ix += inc_x2; | ix += inc_x2; | ||||
| i++; | i++; | ||||
| @@ -53,24 +53,24 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | { | ||||
| BLASLONG i=0; | BLASLONG i=0; | ||||
| BLASLONG ix=0; | BLASLONG ix=0; | ||||
| FLOAT minf[2]; | |||||
| FLOAT minf; | |||||
| BLASLONG min=0; | BLASLONG min=0; | ||||
| BLASLONG inc_x2; | BLASLONG inc_x2; | ||||
| if (n < 0 || inc_x < 1 ) return(min); | |||||
| if (n <= 0 || inc_x <= 0) return(min); | |||||
| inc_x2 = 2 * inc_x; | inc_x2 = 2 * inc_x; | ||||
| minf[0] = ABS(x[ix]); | |||||
| minf[1] = ABS(x[ix+1]); | |||||
| minf = CABS1(x,0); | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| if( CABS1(x,ix) < CABS1(minf,0) ) | |||||
| if( CABS1(x,ix) < minf ) | |||||
| { | { | ||||
| min = i; | min = i; | ||||
| minf[0] = ABS(x[ix]); | |||||
| minf[1] = ABS(x[ix+1]); | |||||
| minf = CABS1(x,ix); | |||||
| } | } | ||||
| ix += inc_x2; | ix += inc_x2; | ||||
| i++; | i++; | ||||
| @@ -44,9 +44,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| BLASLONG ix=0; | BLASLONG ix=0; | ||||
| FLOAT maxf=0.0; | FLOAT maxf=0.0; | ||||
| if (n < 0 || inc_x < 1 ) return(maxf); | |||||
| if (n <= 0 || inc_x <= 0) return(maxf); | |||||
| maxf=x[0]; | maxf=x[0]; | ||||
| ix += inc_x; | |||||
| i++; | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| @@ -44,9 +44,11 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| BLASLONG ix=0; | BLASLONG ix=0; | ||||
| FLOAT minf=0.0; | FLOAT minf=0.0; | ||||
| if (n < 0 || inc_x < 1 ) return(minf); | |||||
| if (n <= 0 || inc_x <= 0) return(minf); | |||||
| minf=x[0]; | minf=x[0]; | ||||
| ix += inc_x; | |||||
| i++; | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| FLOAT absxi = 0.0; | FLOAT absxi = 0.0; | ||||
| if (n < 0 || inc_x < 1 ) return(0.0); | |||||
| if (n <= 0 || inc_x <= 0) return(0.0); | |||||
| if ( n == 1 ) return( ABS(x[0]) ); | if ( n == 1 ) return( ABS(x[0]) ); | ||||
| n *= inc_x; | n *= inc_x; | ||||
| @@ -53,29 +53,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | { | ||||
| BLASLONG i=0; | BLASLONG i=0; | ||||
| BLASLONG ix=0; | BLASLONG ix=0; | ||||
| FLOAT maxf[2]; | |||||
| BLASLONG max=0; | |||||
| FLOAT maxf; | |||||
| BLASLONG inc_x2; | BLASLONG inc_x2; | ||||
| if (n < 0 || inc_x < 1 ) return(0.0); | |||||
| if (n <= 0 || inc_x <= 0) return(0.0); | |||||
| inc_x2 = 2 * inc_x; | inc_x2 = 2 * inc_x; | ||||
| maxf[0] = ABS(x[ix]); | |||||
| maxf[1] = ABS(x[ix+1]); | |||||
| maxf = CABS1(x,0); | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| if( CABS1(x,ix) > CABS1(maxf,0) ) | |||||
| if( CABS1(x,ix) > maxf ) | |||||
| { | { | ||||
| max = i; | |||||
| maxf[0] = ABS(x[ix]); | |||||
| maxf[1] = ABS(x[ix+1]); | |||||
| maxf = CABS1(x,ix); | |||||
| } | } | ||||
| ix += inc_x2; | ix += inc_x2; | ||||
| i++; | i++; | ||||
| } | } | ||||
| return(CABS1(maxf,0)); | |||||
| return(maxf); | |||||
| } | } | ||||
| @@ -53,29 +53,27 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| { | { | ||||
| BLASLONG i=0; | BLASLONG i=0; | ||||
| BLASLONG ix=0; | BLASLONG ix=0; | ||||
| FLOAT minf[2]; | |||||
| BLASLONG min=0; | |||||
| FLOAT minf; | |||||
| BLASLONG inc_x2; | BLASLONG inc_x2; | ||||
| if (n < 0 || inc_x < 1 ) return(0.0); | |||||
| if (n <= 0 || inc_x <= 0) return(0.0); | |||||
| inc_x2 = 2 * inc_x; | inc_x2 = 2 * inc_x; | ||||
| minf[0] = ABS(x[ix]); | |||||
| minf[1] = ABS(x[ix+1]); | |||||
| minf = CABS1(x,0); | |||||
| ix += inc_x2; | |||||
| i++; | |||||
| while(i < n) | while(i < n) | ||||
| { | { | ||||
| if( CABS1(x,ix) < CABS1(minf,0) ) | |||||
| if( CABS1(x,ix) < minf ) | |||||
| { | { | ||||
| min = i; | |||||
| minf[0] = ABS(x[ix]); | |||||
| minf[1] = ABS(x[ix+1]); | |||||
| minf = CABS1(x,ix); | |||||
| } | } | ||||
| ix += inc_x2; | ix += inc_x2; | ||||
| i++; | i++; | ||||
| } | } | ||||
| return(CABS1(minf,0)); | |||||
| return(minf); | |||||
| } | } | ||||
| @@ -55,7 +55,8 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| BLASLONG i=0; | BLASLONG i=0; | ||||
| FLOAT sumf = 0.0; | FLOAT sumf = 0.0; | ||||
| BLASLONG inc_x2; | BLASLONG inc_x2; | ||||
| if (n < 0 || inc_x < 1 ) return(sumf); | |||||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||||
| inc_x2 = 2 * inc_x; | inc_x2 = 2 * inc_x; | ||||
| @@ -37,11 +37,9 @@ int CNAME(BLASLONG n, FLOAT alpha_r, FLOAT alpha_i, FLOAT *x, BLASLONG inc_x, FL | |||||
| BLASLONG i=0; | BLASLONG i=0; | ||||
| BLASLONG ix,iy; | BLASLONG ix,iy; | ||||
| FLOAT temp; | FLOAT temp; | ||||
| BLASLONG inc_x2, inc_y2; | |||||
| BLASLONG inc_x2; | |||||
| BLASLONG inc_y2; | |||||
| if ( n < 0 ) return(0); | |||||
| if ( n <= 0 ) return(0); | |||||
| ix = 0; | ix = 0; | ||||
| iy = 0; | iy = 0; | ||||
| @@ -57,7 +57,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||||
| BLASLONG inc_x2; | BLASLONG inc_x2; | ||||
| FLOAT temp; | FLOAT temp; | ||||
| if (n < 0 || inc_x < 1 ) return(0.0); | |||||
| if (n <= 0 || inc_x <= 0) return(0.0); | |||||
| inc_x2 = 2 * inc_x; | inc_x2 = 2 * inc_x; | ||||
| @@ -0,0 +1,91 @@ | |||||
| include $(KERNELDIR)/KERNEL.ARMV8 | |||||
| SAMAXKERNEL = amax.S | |||||
| DAMAXKERNEL = amax.S | |||||
| CAMAXKERNEL = zamax.S | |||||
| ZAMAXKERNEL = zamax.S | |||||
| ISAMAXKERNEL = isamax.S | |||||
| IDAMAXKERNEL = idamax.S | |||||
| ICAMAXKERNEL = izamax.S | |||||
| IZAMAXKERNEL = izamax.S | |||||
| SASUMKERNEL = asum.S | |||||
| DASUMKERNEL = asum.S | |||||
| CASUMKERNEL = casum.S | |||||
| ZASUMKERNEL = zasum.S | |||||
| SAXPYKERNEL = axpy.S | |||||
| DAXPYKERNEL = axpy.S | |||||
| CAXPYKERNEL = zaxpy.S | |||||
| ZAXPYKERNEL = zaxpy.S | |||||
| SCOPYKERNEL = copy.S | |||||
| DCOPYKERNEL = copy.S | |||||
| CCOPYKERNEL = copy.S | |||||
| ZCOPYKERNEL = copy.S | |||||
| # Dot-product kernels, one variable per precision (S/D/C/Z) like every other group in this list. | |||||
| # The single-precision entry must be spelled SDOTKERNEL; a bare DOTKERNEL does not follow the | |||||
| # S/D/C/Z naming and would presumably leave the single-precision kernel at the included default. | |||||
| SDOTKERNEL = dot.S | |||||
| DDOTKERNEL = dot.S | |||||
| CDOTKERNEL = zdot.S | |||||
| ZDOTKERNEL = zdot.S | |||||
| SNRM2KERNEL = snrm2.S | |||||
| DNRM2KERNEL = dnrm2.S | |||||
| CNRM2KERNEL = znrm2.S | |||||
| ZNRM2KERNEL = znrm2.S | |||||
| SROTKERNEL = rot.S | |||||
| DROTKERNEL = rot.S | |||||
| CROTKERNEL = zrot.S | |||||
| ZROTKERNEL = zrot.S | |||||
| # Vector scaling kernels, one variable per precision (S/D/C/Z) like every other group in this list. | |||||
| # The single-precision entry must be spelled SSCALKERNEL; SCALKERNEL alone does not follow the | |||||
| # S/D/C/Z naming and would presumably leave the single-precision kernel at the included default. | |||||
| SSCALKERNEL = scal.S | |||||
| DSCALKERNEL = scal.S | |||||
| CSCALKERNEL = zscal.S | |||||
| ZSCALKERNEL = zscal.S | |||||
| SSWAPKERNEL = swap.S | |||||
| DSWAPKERNEL = swap.S | |||||
| CSWAPKERNEL = swap.S | |||||
| ZSWAPKERNEL = swap.S | |||||
| SGEMVNKERNEL = gemv_n.S | |||||
| DGEMVNKERNEL = gemv_n.S | |||||
| CGEMVNKERNEL = zgemv_n.S | |||||
| ZGEMVNKERNEL = zgemv_n.S | |||||
| SGEMVTKERNEL = gemv_t.S | |||||
| DGEMVTKERNEL = gemv_t.S | |||||
| CGEMVTKERNEL = zgemv_t.S | |||||
| ZGEMVTKERNEL = zgemv_t.S | |||||
| STRMMKERNEL = strmm_kernel_4x4.S | |||||
| DTRMMKERNEL = dtrmm_kernel_4x4.S | |||||
| CTRMMKERNEL = ctrmm_kernel_4x4.S | |||||
| ZTRMMKERNEL = ztrmm_kernel_4x4.S | |||||
| SGEMMKERNEL = sgemm_kernel_4x4.S | |||||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
| DGEMMKERNEL = dgemm_kernel_4x4.S | |||||
| DGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||||
| CGEMMKERNEL = cgemm_kernel_4x4.S | |||||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||||
| ZGEMMKERNEL = zgemm_kernel_4x4.S | |||||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||||
| @@ -0,0 +1,249 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if defined(USE_MIN) | |||||
| #define COND le | |||||
| #else | |||||
| #define COND ge | |||||
| #endif | |||||
| #if !defined(DOUBLE) | |||||
| #define REG0 wzr | |||||
| #define MAXF s0 | |||||
| #define TMPF s1 | |||||
| #define TMPVF {v1.s}[0] | |||||
| #define SZ 4 | |||||
| #else | |||||
| #define REG0 xzr | |||||
| #define MAXF d0 | |||||
| #define TMPF d1 | |||||
| #define TMPVF {v1.d}[0] | |||||
| #define SZ 8 | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| .macro INIT_F1 | |||||
| ldr MAXF, [X], #SZ | |||||
| #if defined(USE_ABS) | |||||
| fabs MAXF, MAXF | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F1 | |||||
| ldr TMPF, [X], #SZ | |||||
| #if defined(USE_ABS) | |||||
| fabs TMPF, TMPF | |||||
| #endif | |||||
| fcmp MAXF, TMPF | |||||
| fcsel MAXF, MAXF, TMPF, COND | |||||
| .endm | |||||
| .macro INIT_F4 | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v0.4s}, [X], #16 | |||||
| #if defined(USE_ABS) | |||||
| fabs v0.4s, v0.4s | |||||
| #endif | |||||
| #if defined(USE_MIN) | |||||
| fminv MAXF, v0.4s | |||||
| #else | |||||
| fmaxv MAXF, v0.4s | |||||
| #endif | |||||
| #else // DOUBLE | |||||
| ld2 {v0.2d,v1.2d}, [X], #32 | |||||
| #if defined(USE_ABS) | |||||
| fabs v0.2d, v0.2d | |||||
| fabs v1.2d, v1.2d | |||||
| #endif | |||||
| #if defined(USE_MIN) | |||||
| fmin v0.2d, v0.2d, v1.2d | |||||
| fminp MAXF, v0.2d | |||||
| #else | |||||
| fmax v0.2d, v0.2d, v1.2d | |||||
| fmaxp MAXF, v0.2d | |||||
| #endif | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F4 | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v1.4s}, [X], #16 | |||||
| #if defined(USE_ABS) | |||||
| fabs v1.4s, v1.4s | |||||
| #endif | |||||
| #if defined(USE_MIN) | |||||
| fminv TMPF, v1.4s | |||||
| #else | |||||
| fmaxv TMPF, v1.4s | |||||
| #endif | |||||
| #else // DOUBLE | |||||
| ld2 {v1.2d,v2.2d}, [X], #32 | |||||
| #if defined(USE_ABS) | |||||
| fabs v1.2d, v1.2d | |||||
| fabs v2.2d, v2.2d | |||||
| #endif | |||||
| #if defined(USE_MIN) | |||||
| fmin v1.2d, v1.2d, v2.2d | |||||
| fminp TMPF, v1.2d | |||||
| #else | |||||
| fmax v1.2d, v1.2d, v2.2d | |||||
| fmaxp TMPF, v1.2d | |||||
| #endif | |||||
| #endif | |||||
| fcmp MAXF, TMPF | |||||
| fcsel MAXF, MAXF, TMPF, COND | |||||
| .endm | |||||
| .macro INIT_S | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #2 | |||||
| ld1 {v0.s}[0], [X], INC_X | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 | |||||
| ld1 {v0.d}[0], [X], INC_X | |||||
| #endif | |||||
| #if defined(USE_ABS) | |||||
| fabs MAXF, MAXF | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_S1 | |||||
| ld1 TMPVF, [X], INC_X | |||||
| #if defined(USE_ABS) | |||||
| fabs TMPF, TMPF | |||||
| #endif | |||||
| fcmp MAXF, TMPF | |||||
| fcsel MAXF, MAXF, TMPF, COND | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE | |||||
| cmp N, xzr | |||||
| ble amax_kernel_zero | |||||
| cmp INC_X, xzr | |||||
| ble amax_kernel_zero | |||||
| cmp INC_X, #1 | |||||
| bne amax_kernel_S_BEGIN | |||||
| amax_kernel_F_BEGIN: | |||||
| asr I, N, #2 | |||||
| cmp I, xzr | |||||
| beq amax_kernel_F1_INIT | |||||
| INIT_F4 | |||||
| subs I, I, #1 | |||||
| beq amax_kernel_F1 | |||||
| amax_kernel_F4: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne amax_kernel_F4 | |||||
| amax_kernel_F1: | |||||
| ands I, N, #3 | |||||
| ble amax_kernel_L999 | |||||
| amax_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne amax_kernel_F10 | |||||
| ret | |||||
| amax_kernel_F1_INIT: | |||||
| INIT_F1 | |||||
| subs N, N, #1 | |||||
| b amax_kernel_F1 | |||||
| amax_kernel_S_BEGIN: | |||||
| INIT_S | |||||
| subs N, N, #1 | |||||
| ble amax_kernel_L999 | |||||
| asr I, N, #2 | |||||
| cmp I, xzr | |||||
| ble amax_kernel_S1 | |||||
| amax_kernel_S4: | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne amax_kernel_S4 | |||||
| amax_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble amax_kernel_L999 | |||||
| amax_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne amax_kernel_S10 | |||||
| amax_kernel_L999: | |||||
| ret | |||||
| amax_kernel_zero: | |||||
| fmov MAXF, REG0 | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,194 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if !defined(DOUBLE) | |||||
| #define REG0 wzr | |||||
| #define SUMF s0 | |||||
| #define TMPF s1 | |||||
| #define TMPVF {v1.s}[0] | |||||
| #define SZ 4 | |||||
| #else | |||||
| #define REG0 xzr | |||||
| #define SUMF d0 | |||||
| #define TMPF d1 | |||||
| #define TMPVF {v1.d}[0] | |||||
| #define SZ 8 | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| .macro KERNEL_F1 | |||||
| ldr TMPF, [X], #SZ | |||||
| fabs TMPF, TMPF | |||||
| fadd SUMF, SUMF, TMPF | |||||
| .endm | |||||
| .macro KERNEL_F8 | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v1.4s, v2.4s}, [X], #32 // Load 8 floats: v1 = [X3, X2, X1, X0], v2 = [X7, X6, X5, X4] | |||||
| fabs v1.4s, v1.4s // ABS() each value | |||||
| fabs v2.4s, v2.4s // ABS() each value | |||||
| fadd v1.4s, v1.4s, v2.4s // v1 = [X7+X3, X6+X2, X5+X1, X4+X0] (absolute values) | |||||
| fadd v0.4s, v0.4s, v1.4s // accumulate into the four partial sums held in v0 | |||||
| PRFM PLDL1KEEP, [X, #1024] | |||||
| #else // DOUBLE | |||||
| ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X] | |||||
| add X, X, #64 | |||||
| fabs v2.2d, v2.2d | |||||
| fabs v3.2d, v3.2d | |||||
| fabs v4.2d, v4.2d | |||||
| fabs v5.2d, v5.2d | |||||
| PRFM PLDL1KEEP, [X, #1024] | |||||
| fadd v2.2d, v2.2d, v3.2d | |||||
| fadd v4.2d, v4.2d, v5.2d | |||||
| fadd v0.2d, v0.2d, v2.2d | |||||
| fadd v0.2d, v0.2d, v4.2d | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F8_FINALIZE | |||||
| #if !defined(DOUBLE) | |||||
| ext v1.16b, v0.16b, v0.16b, #8 | |||||
| fadd v0.2s, v0.2s, v1.2s | |||||
| faddp SUMF, v0.2s | |||||
| #else | |||||
| faddp SUMF, v0.2d | |||||
| #endif | |||||
| .endm | |||||
| .macro INIT_S | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_S1 | |||||
| ld1 TMPVF, [X], INC_X | |||||
| fabs TMPF, TMPF | |||||
| fadd SUMF, SUMF, TMPF | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE | |||||
| fmov SUMF, REG0 | |||||
| #if !defined(DOUBLE) | |||||
| fmov s1, SUMF | |||||
| #else | |||||
| fmov d1, SUMF | |||||
| #endif | |||||
| cmp N, xzr | |||||
| ble asum_kernel_L999 | |||||
| cmp INC_X, xzr | |||||
| ble asum_kernel_L999 | |||||
| cmp INC_X, #1 | |||||
| bne asum_kernel_S_BEGIN | |||||
| asum_kernel_F_BEGIN: | |||||
| asr I, N, #3 | |||||
| cmp I, xzr | |||||
| beq asum_kernel_F1 | |||||
| asum_kernel_F8: | |||||
| KERNEL_F8 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_F8 | |||||
| KERNEL_F8_FINALIZE | |||||
| asum_kernel_F1: | |||||
| ands I, N, #7 | |||||
| ble asum_kernel_L999 | |||||
| asum_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_F10 | |||||
| asum_kernel_L999: | |||||
| ret | |||||
| asum_kernel_S_BEGIN: | |||||
| INIT_S | |||||
| asr I, N, #2 | |||||
| cmp I, xzr | |||||
| ble asum_kernel_S1 | |||||
| asum_kernel_S4: | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_S4 | |||||
| asum_kernel_S1: | |||||
| ands I, N, #3 | |||||
| ble asum_kernel_L999 | |||||
| asum_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_S10 | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,209 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length (element count) */ | |||||
| #define X x3 /* X vector address (only read by this kernel) */ | |||||
| #define INC_X x4 /* X stride; INIT_S shifts it into a byte offset */ | |||||
| #define Y x5 /* Y vector address (read and written back in place) */ | |||||
| #define INC_Y x6 /* Y stride; INIT_S shifts it into a byte offset */ | |||||
| #define I x1 /* loop variable (down-counter for the block and tail loops) */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if !defined(DOUBLE) /* single precision: 4-byte elements */ | |||||
| #define DA s0 /* scale input value */ | |||||
| #define TMPX s1 /* scalar view of the X element (lane 0 of v1) */ | |||||
| #define TMPVX {v1.s}[0] /* same register as TMPX, in the lane form ld1 needs */ | |||||
| #define TMPY s2 /* scalar view of the Y element (lane 0 of v2) */ | |||||
| #define TMPVY {v2.s}[0] /* same register as TMPY, in the lane form st1 needs */ | |||||
| #define SZ 4 /* element size in bytes */ | |||||
| #else /* double precision: 8-byte elements */ | |||||
| #define DA d0 /* scale input value */ | |||||
| #define TMPX d1 /* scalar view of the X element (lane 0 of v1) */ | |||||
| #define TMPVX {v1.d}[0] /* same register as TMPX, in the lane form ld1 needs */ | |||||
| #define TMPY d2 /* scalar view of the Y element (lane 0 of v2) */ | |||||
| #define TMPVY {v2.d}[0] /* same register as TMPY, in the lane form st1 needs */ | |||||
| #define SZ 8 /* element size in bytes */ | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| .macro KERNEL_F1 /* one contiguous element: *Y = *X * DA + *Y, then X and Y advance by SZ */ | |||||
| ldr TMPX, [X], #SZ /* load x, post-increment X */ | |||||
| ldr TMPY, [Y] /* load y; Y is advanced by the store below */ | |||||
| fmadd TMPY, TMPX, DA, TMPY /* TMPY = TMPX * DA + TMPY */ | |||||
| str TMPY, [Y], #SZ /* write the result back, post-increment Y */ | |||||
| .endm | |||||
| .macro KERNEL_F4 /* four contiguous elements: y += DA * x; the kernel body below never invokes this macro */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v1.4s}, [X], #16 /* 4 floats of x, X += 16 bytes */ | |||||
| ld1 {v2.4s}, [Y] /* 4 floats of y, Y not advanced yet */ | |||||
| fmla v2.4s, v1.4s, v0.s[0] /* v2 += v1 * DA (DA is lane 0 of v0) */ | |||||
| st1 {v2.4s}, [Y], #16 /* store and advance Y */ | |||||
| #else // DOUBLE | |||||
| ld1 {v1.2d, v2.2d}, [X], #32 /* 4 doubles of x, X += 32 bytes */ | |||||
| ld1 {v3.2d, v4.2d}, [Y] /* 4 doubles of y, Y not advanced yet */ | |||||
| fmla v3.2d, v1.2d, v0.d[0] /* v3 += v1 * DA */ | |||||
| fmla v4.2d, v2.2d, v0.d[0] /* v4 += v2 * DA */ | |||||
| st1 {v3.2d, v4.2d}, [Y], #32 /* store and advance Y */ | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F8 /* eight contiguous elements: y += DA * x, then prefetch ahead on both vectors */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v1.4s, v2.4s}, [X], #32 /* 8 floats of x, X += 32 bytes */ | |||||
| ld1 {v3.4s, v4.4s}, [Y] /* 8 floats of y, Y not advanced yet */ | |||||
| fmla v3.4s, v1.4s, v0.s[0] /* v3 += v1 * DA (DA is lane 0 of v0) */ | |||||
| fmla v4.4s, v2.4s, v0.s[0] /* v4 += v2 * DA */ | |||||
| st1 {v3.4s, v4.4s}, [Y], #32 /* store and advance Y */ | |||||
| #else // DOUBLE | |||||
| ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 /* 8 doubles of x, X += 64 bytes */ | |||||
| ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [Y] /* 8 doubles of y, Y not advanced yet */ | |||||
| fmla v16.2d, v1.2d, v0.d[0] /* v16..v19 += v1..v4 * DA */ | |||||
| fmla v17.2d, v2.2d, v0.d[0] | |||||
| fmla v18.2d, v3.2d, v0.d[0] | |||||
| fmla v19.2d, v4.2d, v0.d[0] | |||||
| st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [Y], #64 /* store and advance Y */ | |||||
| #endif | |||||
| PRFM PLDL1KEEP, [X, #512] /* prefetch 512 bytes past the already advanced X */ | |||||
| PRFM PLDL1KEEP, [Y, #512] /* prefetch 512 bytes past the already advanced Y */ | |||||
| .endm | |||||
| .macro INIT_S /* strided-path setup: turn both strides into byte offsets (stride * SZ) */ | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #2 /* * 4 bytes per float */ | |||||
| lsl INC_Y, INC_Y, #2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 /* * 8 bytes per double */ | |||||
| lsl INC_Y, INC_Y, #3 | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_S1 /* one strided element: *Y = *X * DA + *Y; expects INC_X/INC_Y already scaled by INIT_S */ | |||||
| ld1 TMPVX, [X], INC_X /* load x into lane 0 of v1, X += INC_X bytes */ | |||||
| ldr TMPY, [Y] /* load y; Y is advanced by the store below */ | |||||
| fmadd TMPY, TMPX, DA, TMPY /* TMPY = TMPX * DA + TMPY */ | |||||
| st1 TMPVY, [Y], INC_Y /* store lane 0 of v2, Y += INC_Y bytes */ | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* function entry; the PROLOGUE macro comes in through common.h */ | |||||
| cmp N, xzr | |||||
| ble axpy_kernel_L999 /* N <= 0: nothing to do */ | |||||
| fcmp DA, #0.0 | |||||
| beq axpy_kernel_L999 /* DA == 0: Y is left untouched */ | |||||
| cmp INC_X, #1 | |||||
| bne axpy_kernel_S_BEGIN /* any non-unit stride takes the strided path */ | |||||
| cmp INC_Y, #1 | |||||
| bne axpy_kernel_S_BEGIN | |||||
| axpy_kernel_F_BEGIN: /* contiguous path: both strides are 1 */ | |||||
| asr I, N, #3 /* I = number of 8-element blocks */ | |||||
| cmp I, xzr | |||||
| beq axpy_kernel_F1 /* fewer than 8 elements: tail loop only */ | |||||
| axpy_kernel_F8: | |||||
| KERNEL_F8 | |||||
| subs I, I, #1 | |||||
| bne axpy_kernel_F8 | |||||
| axpy_kernel_F1: | |||||
| ands I, N, #7 /* I = remaining 0..7 elements */ | |||||
| ble axpy_kernel_L999 /* taken when the remainder is zero */ | |||||
| axpy_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne axpy_kernel_F10 | |||||
| mov w0, wzr /* return 0 */ | |||||
| ret | |||||
| axpy_kernel_S_BEGIN: /* strided path */ | |||||
| INIT_S | |||||
| asr I, N, #2 /* I = number of 4-element groups */ | |||||
| cmp I, xzr | |||||
| ble axpy_kernel_S1 | |||||
| axpy_kernel_S4: /* four strided elements per iteration */ | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne axpy_kernel_S4 | |||||
| axpy_kernel_S1: | |||||
| ands I, N, #3 /* I = remaining 0..3 elements */ | |||||
| ble axpy_kernel_L999 | |||||
| axpy_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne axpy_kernel_S10 | |||||
| axpy_kernel_L999: /* common exit */ | |||||
| mov w0, wzr /* return 0 */ | |||||
| ret /* NOTE(review): no EPILOGUE follows here, unlike the sibling kernels - confirm that is intended */ | |||||
| @@ -0,0 +1,170 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length (element count) */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride; INIT_S shifts it into a byte offset (8 bytes per element) */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #define REG0 wzr /* zero source used to clear SUMF */ | |||||
| #define SUMF s0 /* running sum; also the value left in s0 at return */ | |||||
| #define TMPF s1 /* per-element scratch (lane 0 of v1) */ | |||||
| #define TMPVF {v1.s}[0] /* lane form of TMPF; not referenced by the macros or body of this kernel */ | |||||
| #define SZ 4 /* bytes per float; not referenced by the macros or body of this kernel */ | |||||
| /******************************************************************************/ | |||||
| .macro KERNEL_F1 /* one contiguous two-float element (presumably re/im - confirm): SUMF += |a| + |b| */ | |||||
| ld1 {v1.2s}, [X], #8 /* load both floats, X += 8 bytes */ | |||||
| fabs v1.2s, v1.2s /* absolute value of each float */ | |||||
| ext v2.8b, v1.8b, v1.8b, #4 /* rotate by 4 bytes so s2 holds the second float */ | |||||
| fadd TMPF, TMPF, s2 /* s1 = |first| + |second| */ | |||||
| fadd SUMF, SUMF, TMPF /* accumulate into the running sum */ | |||||
| .endm | |||||
| .macro KERNEL_F8 /* 16 contiguous floats (64 bytes): add their absolute values lane-wise into v0 */ | |||||
| ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X] /* load 64 bytes without write-back */ | |||||
| add X, X, #64 /* advance X separately */ | |||||
| fabs v1.4s, v1.4s | |||||
| fabs v2.4s, v2.4s | |||||
| fabs v3.4s, v3.4s | |||||
| fabs v4.4s, v4.4s | |||||
| PRFM PLDL1KEEP, [X, #1024] /* prefetch 1024 bytes past the advanced X */ | |||||
| fadd v1.4s, v1.4s, v2.4s /* pairwise combine the four vectors ... */ | |||||
| fadd v3.4s, v3.4s, v4.4s | |||||
| fadd v0.4s, v0.4s, v1.4s /* ... and fold both partial sums into the v0 accumulator */ | |||||
| fadd v0.4s, v0.4s, v3.4s | |||||
| .endm | |||||
| .macro KERNEL_F8_FINALIZE /* horizontal reduction: collapse the four lanes of v0 into the scalar SUMF */ | |||||
| ext v1.16b, v0.16b, v0.16b, #8 /* low half of v1 = high half of v0 (lanes 2,3) */ | |||||
| fadd v0.2s, v0.2s, v1.2s /* lanes 0,1 += lanes 2,3 */ | |||||
| faddp SUMF, v0.2s /* SUMF = lane 0 + lane 1 */ | |||||
| .endm | |||||
| .macro INIT_S /* strided-path setup: turn the stride into a byte offset */ | |||||
| lsl INC_X, INC_X, #3 /* * 8 bytes, i.e. two floats per element */ | |||||
| .endm | |||||
| .macro KERNEL_S1 /* one strided two-float element: SUMF += |a| + |b|; expects INC_X already scaled by INIT_S */ | |||||
| ld1 {v1.2s}, [X], INC_X /* load both floats, X += INC_X bytes */ | |||||
| fabs v1.2s, v1.2s /* absolute value of each float */ | |||||
| ext v2.8b, v1.8b, v1.8b, #4 /* rotate by 4 bytes so s2 holds the second float */ | |||||
| fadd TMPF, TMPF, s2 /* s1 = |first| + |second| */ | |||||
| fadd SUMF, SUMF, TMPF /* accumulate into the running sum */ | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* function entry; the PROLOGUE macro comes in through common.h */ | |||||
| fmov SUMF, REG0 /* SUMF = 0.0; KERNEL_F8 accumulates into all of v0 and relies on it starting at zero */ | |||||
| fmov s1, SUMF /* clear s1 (TMPF) as well */ | |||||
| cmp N, xzr | |||||
| ble asum_kernel_L999 /* N <= 0: return 0.0 */ | |||||
| cmp INC_X, xzr | |||||
| ble asum_kernel_L999 /* INC_X <= 0: return 0.0 */ | |||||
| cmp INC_X, #1 | |||||
| bne asum_kernel_S_BEGIN /* non-unit stride takes the strided path */ | |||||
| asum_kernel_F_BEGIN: /* contiguous path */ | |||||
| asr I, N, #3 /* I = number of 8-element blocks */ | |||||
| cmp I, xzr | |||||
| beq asum_kernel_F1 /* fewer than 8 elements: tail loop only */ | |||||
| asum_kernel_F8: | |||||
| KERNEL_F8 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_F8 | |||||
| KERNEL_F8_FINALIZE /* fold the vector accumulator into SUMF before the scalar tail */ | |||||
| asum_kernel_F1: | |||||
| ands I, N, #7 /* I = remaining 0..7 elements */ | |||||
| ble asum_kernel_L999 /* taken when the remainder is zero */ | |||||
| asum_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_F10 | |||||
| asum_kernel_L999: /* common exit; the result is in SUMF (s0) */ | |||||
| ret | |||||
| asum_kernel_S_BEGIN: /* strided path */ | |||||
| INIT_S | |||||
| asr I, N, #2 /* I = number of 4-element groups */ | |||||
| cmp I, xzr | |||||
| ble asum_kernel_S1 | |||||
| asum_kernel_S4: /* four strided elements per iteration */ | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_S4 | |||||
| asum_kernel_S1: | |||||
| ands I, N, #3 /* I = remaining 0..3 elements */ | |||||
| ble asum_kernel_L999 | |||||
| asum_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_S10 | |||||
| ret /* result is in SUMF (s0) */ | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,232 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length (element count) */ | |||||
| #define X x1 /* X vector address (source) */ | |||||
| #define INC_X x2 /* X stride; INIT_S shifts it into a byte offset */ | |||||
| #define Y x3 /* Y vector address (destination) */ | |||||
| #define INC_Y x4 /* Y stride; INIT_S shifts it into a byte offset */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if !defined(DOUBLE) /* 4-byte scalar */ | |||||
| #define TMPF s0 /* scratch for the real-valued KERNEL_F1 */ | |||||
| #define TMPVF {v0.s}[0] /* lane form of TMPF; not referenced by the macros or body of this kernel */ | |||||
| #define SZ 4 /* scalar size in bytes */ | |||||
| #else /* 8-byte scalar */ | |||||
| #define TMPF d0 /* scratch for the real-valued KERNEL_F1 */ | |||||
| #define TMPVF {v0.d}[0] /* lane form of TMPF; not referenced by the macros or body of this kernel */ | |||||
| #define SZ 8 /* scalar size in bytes */ | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| .macro KERNEL_F1 /* copy one contiguous element from X to Y and advance both pointers */ | |||||
| #if !defined(COMPLEX) | |||||
| ldr TMPF, [X], #SZ /* one scalar, X += SZ */ | |||||
| str TMPF, [Y], #SZ | |||||
| #else | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v0.2s}, [X], #8 /* two floats (8 bytes) per element */ | |||||
| st1 {v0.2s}, [Y], #8 | |||||
| #else | |||||
| ld1 {v0.2d}, [X], #16 /* two doubles (16 bytes) per element */ | |||||
| st1 {v0.2d}, [Y], #16 | |||||
| #endif | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F4 /* copy four contiguous elements in 16-byte chunks; .4s only sets the transfer width here */ | |||||
| #if !defined(COMPLEX) | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v0.4s}, [X], #16 /* 4 floats = 16 bytes */ | |||||
| st1 {v0.4s}, [Y], #16 | |||||
| #else // DOUBLE | |||||
| ld1 {v0.4s}, [X], #16 /* 4 doubles = 32 bytes, moved as two 16-byte chunks */ | |||||
| ld1 {v1.4s}, [X], #16 | |||||
| st1 {v0.4s}, [Y], #16 | |||||
| st1 {v1.4s}, [Y], #16 | |||||
| #endif | |||||
| #else // COMPLEX | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v0.4s}, [X], #16 /* 4 float pairs = 32 bytes, moved as two 16-byte chunks */ | |||||
| ld1 {v1.4s}, [X], #16 | |||||
| st1 {v0.4s}, [Y], #16 | |||||
| st1 {v1.4s}, [Y], #16 | |||||
| #else // DOUBLE | |||||
| ld1 {v0.4s}, [X], #16 /* 4 double pairs = 64 bytes, moved as four 16-byte chunks */ | |||||
| ld1 {v1.4s}, [X], #16 | |||||
| ld1 {v2.4s}, [X], #16 | |||||
| ld1 {v3.4s}, [X], #16 | |||||
| st1 {v0.4s}, [Y], #16 | |||||
| st1 {v1.4s}, [Y], #16 | |||||
| st1 {v2.4s}, [Y], #16 | |||||
| st1 {v3.4s}, [Y], #16 | |||||
| #endif | |||||
| #endif | |||||
| .endm | |||||
| .macro INIT_S /* strided-path setup: turn both strides into byte offsets (stride * element size) */ | |||||
| #if !defined(COMPLEX) | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #2 /* 4-byte elements */ | |||||
| lsl INC_Y, INC_Y, #2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 /* 8-byte elements */ | |||||
| lsl INC_Y, INC_Y, #3 | |||||
| #endif | |||||
| #else | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 /* 8-byte elements (two floats) */ | |||||
| lsl INC_Y, INC_Y, #3 | |||||
| #else | |||||
| lsl INC_X, INC_X, #4 /* 16-byte elements (two doubles) */ | |||||
| lsl INC_Y, INC_Y, #4 | |||||
| #endif | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_S1 /* copy one strided element; expects INC_X/INC_Y already scaled by INIT_S */ | |||||
| #if !defined(COMPLEX) | |||||
| #if !defined(DOUBLE) | |||||
| ldr w10, [X] /* real case moves the bits through general-purpose w10 (4 bytes) */ | |||||
| add X, X, INC_X | |||||
| str w10, [Y] | |||||
| add Y, Y, INC_Y | |||||
| #else | |||||
| ldr x10, [X] /* 8 bytes through general-purpose x10 */ | |||||
| add X, X, INC_X | |||||
| str x10, [Y] | |||||
| add Y, Y, INC_Y | |||||
| #endif | |||||
| #else | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v0.2s}, [X] /* two-float element (8 bytes) through v0 */ | |||||
| add X, X, INC_X | |||||
| st1 {v0.2s}, [Y] | |||||
| add Y, Y, INC_Y | |||||
| #else | |||||
| ld1 {v0.2d}, [X] /* two-double element (16 bytes) through v0 */ | |||||
| add X, X, INC_X | |||||
| st1 {v0.2d}, [Y] | |||||
| add Y, Y, INC_Y | |||||
| #endif | |||||
| #endif | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* function entry; the PROLOGUE macro comes in through common.h */ | |||||
| cmp N, xzr | |||||
| ble copy_kernel_L999 /* N <= 0: nothing to copy */ | |||||
| cmp INC_X, #1 | |||||
| bne copy_kernel_S_BEGIN /* any non-unit stride takes the strided path */ | |||||
| cmp INC_Y, #1 | |||||
| bne copy_kernel_S_BEGIN | |||||
| copy_kernel_F_BEGIN: /* contiguous path: both strides are 1 */ | |||||
| asr I, N, #2 /* I = number of 4-element blocks */ | |||||
| cmp I, xzr | |||||
| beq copy_kernel_F1 /* fewer than 4 elements: tail loop only */ | |||||
| copy_kernel_F4: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne copy_kernel_F4 | |||||
| copy_kernel_F1: | |||||
| ands I, N, #3 /* I = remaining 0..3 elements */ | |||||
| ble copy_kernel_L999 /* taken when the remainder is zero */ | |||||
| copy_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne copy_kernel_F10 | |||||
| mov w0, wzr /* return 0 */ | |||||
| ret | |||||
| copy_kernel_S_BEGIN: /* strided path */ | |||||
| INIT_S | |||||
| asr I, N, #2 /* I = number of 4-element groups */ | |||||
| cmp I, xzr | |||||
| ble copy_kernel_S1 | |||||
| copy_kernel_S4: /* four strided elements per iteration */ | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne copy_kernel_S4 | |||||
| copy_kernel_S1: | |||||
| ands I, N, #3 /* I = remaining 0..3 elements */ | |||||
| ble copy_kernel_L999 | |||||
| copy_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne copy_kernel_S10 | |||||
| copy_kernel_L999: /* common exit */ | |||||
| mov w0, wzr /* return 0 */ | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,169 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length (element count); decremented once on the strided path */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride; INIT_S shifts it into a byte offset */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #define TMPF d6 /* scratch for one element and its square (lane 0 of v6) */ | |||||
| #define SSQ d0 /* running sum of squares; holds the square root at return */ | |||||
| #define TMPVF {v6.d}[0] /* same register as TMPF, in the lane form ld1 needs */ | |||||
| #define SZ 8 /* element size in bytes (double) */ | |||||
| /******************************************************************************/ | |||||
| .macro KERNEL_F1 /* one contiguous element: SSQ += x * x */ | |||||
| ldr TMPF, [X], #SZ /* load x, X += SZ */ | |||||
| fmul TMPF, TMPF, TMPF /* square it */ | |||||
| fadd SSQ, SSQ, TMPF | |||||
| .endm | |||||
| .macro KERNEL_F8 /* eight contiguous doubles: add their squares into two vector accumulators, v0 and v5 */ | |||||
| ld1 {v1.2d, v2.2d}, [X], #32 /* doubles 0..3, X += 32 bytes */ | |||||
| fmla v0.2d, v1.2d, v1.2d /* v0 += v1 * v1 */ | |||||
| fmla v5.2d, v2.2d, v2.2d /* v5 += v2 * v2 */ | |||||
| ld1 {v3.2d, v4.2d}, [X], #32 /* doubles 4..7, X += 32 bytes */ | |||||
| fmla v0.2d, v3.2d, v3.2d | |||||
| fmla v5.2d, v4.2d, v4.2d | |||||
| PRFM PLDL1KEEP, [X, #1024] /* prefetch 1024 bytes past the advanced X */ | |||||
| .endm | |||||
| .macro nrm2_kernel_F8_FINALIZE /* collapse the v0 and v5 accumulators into the scalar SSQ */ | |||||
| fadd v0.2d, v0.2d, v5.2d /* merge the two accumulators */ | |||||
| faddp SSQ, v0.2d /* SSQ = lane 0 + lane 1 */ | |||||
| .endm | |||||
| .macro INIT_S /* strided-path setup: scale the stride and seed SSQ from the first element */ | |||||
| lsl INC_X, INC_X, #3 /* stride * 8 bytes per double */ | |||||
| ld1 TMPVF, [X], INC_X /* consume the first element, X += INC_X bytes */ | |||||
| fmul SSQ, TMPF, TMPF /* SSQ = x0 * x0; the body decrements N by one afterwards to account for it */ | |||||
| .endm | |||||
| .macro KERNEL_S1 /* one strided element: SSQ += x * x; expects INC_X already scaled by INIT_S */ | |||||
| ld1 TMPVF, [X], INC_X /* load x into lane 0 of v6, X += INC_X bytes */ | |||||
| fmul TMPF, TMPF, TMPF /* square it */ | |||||
| fadd SSQ, SSQ, TMPF | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* function entry; NOTE(review): squares are summed unscaled, so x * x can overflow or underflow for extreme inputs - confirm acceptable */ | |||||
| fmov SSQ, xzr /* SSQ = 0.0; KERNEL_F8 accumulates into both lanes of v0 */ | |||||
| fmov d5, SSQ /* clear the second accumulator used by KERNEL_F8 */ | |||||
| cmp N, xzr | |||||
| ble nrm2_kernel_zero /* N <= 0: return 0.0 */ | |||||
| cmp INC_X, xzr | |||||
| ble nrm2_kernel_zero /* INC_X <= 0: return 0.0 */ | |||||
| cmp INC_X, #1 | |||||
| bne nrm2_kernel_S_BEGIN /* non-unit stride takes the strided path */ | |||||
| nrm2_kernel_F_BEGIN: /* contiguous path */ | |||||
| asr I, N, #3 /* I = number of 8-element blocks */ | |||||
| cmp I, xzr | |||||
| beq nrm2_kernel_F1_INIT /* fewer than 8 elements: reaches the tail loop via the trampoline below */ | |||||
| nrm2_kernel_F8: | |||||
| KERNEL_F8 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_F8 | |||||
| nrm2_kernel_F8_FINALIZE /* fold the vector accumulators into SSQ before the scalar tail */ | |||||
| nrm2_kernel_F1: | |||||
| ands I, N, #7 /* I = remaining 0..7 elements */ | |||||
| ble nrm2_kernel_L999 /* taken when the remainder is zero */ | |||||
| nrm2_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_F10 | |||||
| b nrm2_kernel_L999 | |||||
| nrm2_kernel_F1_INIT: /* NOTE(review): only forwards to nrm2_kernel_F1; looks like a leftover hook */ | |||||
| b nrm2_kernel_F1 | |||||
| nrm2_kernel_S_BEGIN: /* strided path */ | |||||
| INIT_S /* also consumes the first element */ | |||||
| subs N, N, #1 /* one element already handled by INIT_S */ | |||||
| ble nrm2_kernel_L999 /* it was the only element */ | |||||
| asr I, N, #2 /* I = number of 4-element groups */ | |||||
| cmp I, xzr | |||||
| ble nrm2_kernel_S1 | |||||
| nrm2_kernel_S4: /* four strided elements per iteration */ | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_S4 | |||||
| nrm2_kernel_S1: | |||||
| ands I, N, #3 /* I = remaining 0..3 elements */ | |||||
| ble nrm2_kernel_L999 | |||||
| nrm2_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_S10 | |||||
| nrm2_kernel_L999: /* common exit */ | |||||
| fsqrt SSQ, SSQ /* result = sqrt(sum of squares), left in d0 */ | |||||
| ret | |||||
| nrm2_kernel_zero: /* early exit: d0 still holds the 0.0 set at entry */ | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,227 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length (element count) */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride; INIT_S shifts it into a byte offset */ | |||||
| #define Y x3 /* Y vector address */ | |||||
| #define INC_Y x4 /* Y stride; INIT_S shifts it into a byte offset */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if !defined(DOUBLE) /* single-precision inputs */ | |||||
| #if !defined(DSDOT) /* accumulate in single precision */ | |||||
| #define REG0 wzr | |||||
| #define DOTF s0 | |||||
| #else // DSDOT | |||||
| #define REG0 xzr | |||||
| #define DOTF d0 | |||||
| #endif | |||||
| #define DOTI s1 /* not referenced by the macros or body of this kernel */ | |||||
| #define TMPX s2 /* scalar view of the X element (lane 0 of v2) */ | |||||
| #define LD1VX {v2.s}[0] /* same register as TMPX, in the lane form ld1 needs */ | |||||
| #define TMPY s3 /* scalar view of the Y element (lane 0 of v3) */ | |||||
| #define LD1VY {v3.s}[0] /* same register as TMPY, in the lane form ld1 needs */ | |||||
| #define TMPVY v3.s[0] /* not referenced by the macros or body of this kernel */ | |||||
| #define SZ 4 /* element size in bytes */ | |||||
| #else /* double-precision inputs and accumulator */ | |||||
| #define REG0 xzr | |||||
| #define DOTF d0 | |||||
| #define DOTI d1 /* not referenced by the macros or body of this kernel */ | |||||
| #define TMPX d2 /* scalar view of the X element (lane 0 of v2) */ | |||||
| #define LD1VX {v2.d}[0] /* same register as TMPX, in the lane form ld1 needs */ | |||||
| #define TMPY d3 /* scalar view of the Y element (lane 0 of v3) */ | |||||
| #define LD1VY {v3.d}[0] /* same register as TMPY, in the lane form ld1 needs */ | |||||
| #define TMPVY v3.d[0] /* not referenced by the macros or body of this kernel */ | |||||
| #define SZ 8 /* element size in bytes */ | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| .macro KERNEL_F1 /* one contiguous element pair: DOTF += x * y */ | |||||
| ldr TMPX, [X], #SZ /* load x, X += SZ */ | |||||
| ldr TMPY, [Y], #SZ /* load y, Y += SZ */ | |||||
| #if !defined(DSDOT) | |||||
| fmadd DOTF, TMPX, TMPY, DOTF /* fused multiply-add in the input precision */ | |||||
| #else // DSDOT | |||||
| fmul TMPX, TMPX, TMPY /* product is formed in single precision ... */ | |||||
| fcvt d2, TMPX /* ... then widened to double ... */ | |||||
| fadd DOTF, DOTF, d2 /* ... and added to the double accumulator */ | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F4 /* four contiguous element pairs: add x * y lane-wise into the v0 accumulator */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.4s}, [X], #16 /* 4 floats of x */ | |||||
| ld1 {v3.4s}, [Y], #16 /* 4 floats of y */ | |||||
| #if !defined(DSDOT) | |||||
| fmla v0.4s, v2.4s, v3.4s /* v0 += x * y, four single-precision lanes */ | |||||
| #else | |||||
| fmul v2.4s, v2.4s, v3.4s /* products in single precision */ | |||||
| ext v3.16b, v2.16b, v2.16b, #8 /* low half of v3 = products 2,3 */ | |||||
| fcvtl v2.2d, v2.2s /* widen products 0,1 to double */ | |||||
| fcvtl v3.2d, v3.2s /* widen products 2,3 to double */ | |||||
| fadd v0.2d, v0.2d, v2.2d /* accumulate in two double lanes */ | |||||
| fadd v0.2d, v0.2d, v3.2d | |||||
| #endif | |||||
| #else //DOUBLE | |||||
| ld1 {v2.2d, v3.2d}, [X], #32 /* 4 doubles of x */ | |||||
| ld1 {v4.2d, v5.2d}, [Y], #32 /* 4 doubles of y */ | |||||
| fmul v2.2d, v2.2d, v4.2d /* separate multiply and add here, not a fused fmla */ | |||||
| fmul v3.2d, v3.2d, v5.2d | |||||
| fadd v0.2d, v0.2d, v2.2d | |||||
| fadd v0.2d, v0.2d, v3.2d | |||||
| #endif | |||||
| PRFM PLDL1KEEP, [X, #1024] /* prefetch 1024 bytes past the advanced X */ | |||||
| PRFM PLDL1KEEP, [Y, #1024] /* prefetch 1024 bytes past the advanced Y */ | |||||
| .endm | |||||
| .macro KERNEL_F4_FINALIZE /* horizontal reduction: collapse the lanes of v0 into the scalar DOTF */ | |||||
| #if !defined(DOUBLE) | |||||
| #if !defined(DSDOT) | |||||
| ext v1.16b, v0.16b, v0.16b, #8 /* low half of v1 = lanes 2,3 of v0 */ | |||||
| fadd v0.2s, v0.2s, v1.2s /* lanes 0,1 += lanes 2,3 */ | |||||
| faddp DOTF, v0.2s /* DOTF = lane 0 + lane 1 */ | |||||
| #else | |||||
| faddp DOTF, v0.2d /* two double lanes -> scalar */ | |||||
| #endif | |||||
| #else //DOUBLE | |||||
| faddp DOTF, v0.2d /* two double lanes -> scalar */ | |||||
| #endif | |||||
| .endm | |||||
| .macro INIT_S /* strided-path setup: turn both strides into byte offsets (stride * SZ) */ | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #2 /* * 4 bytes per float */ | |||||
| lsl INC_Y, INC_Y, #2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 /* * 8 bytes per double */ | |||||
| lsl INC_Y, INC_Y, #3 | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_S1 /* one strided element pair: DOTF += x * y; expects INC_X/INC_Y already scaled by INIT_S */ | |||||
| ld1 LD1VX, [X], INC_X /* load x into lane 0 of v2, X += INC_X bytes */ | |||||
| ld1 LD1VY, [Y], INC_Y /* load y into lane 0 of v3, Y += INC_Y bytes */ | |||||
| #if !defined(DSDOT) | |||||
| fmadd DOTF, TMPX, TMPY, DOTF /* fused multiply-add in the input precision */ | |||||
| #else // DSDOT | |||||
| fmul TMPX, TMPX, TMPY /* product is formed in single precision ... */ | |||||
| fcvt d2, TMPX /* ... then widened to double ... */ | |||||
| fadd DOTF, DOTF, d2 /* ... and added to the double accumulator */ | |||||
| #endif | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* function entry; the PROLOGUE macro comes in through common.h */ | |||||
| fmov DOTF, REG0 /* DOTF = 0.0; KERNEL_F4 accumulates into all of v0 and relies on it starting at zero */ | |||||
| #if defined(DOUBLE) | |||||
| fmov d6, DOTF /* NOTE(review): d6 is not read anywhere else in this kernel - possibly a leftover */ | |||||
| #endif | |||||
| cmp N, xzr | |||||
| ble dot_kernel_L999 /* N <= 0: return 0.0 */ | |||||
| cmp INC_X, #1 | |||||
| bne dot_kernel_S_BEGIN /* any non-unit stride takes the strided path */ | |||||
| cmp INC_Y, #1 | |||||
| bne dot_kernel_S_BEGIN | |||||
| dot_kernel_F_BEGIN: /* contiguous path: both strides are 1 */ | |||||
| asr I, N, #2 /* I = number of 4-element blocks */ | |||||
| cmp I, xzr | |||||
| beq dot_kernel_F1 /* fewer than 4 elements: tail loop only */ | |||||
| dot_kernel_F4: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne dot_kernel_F4 | |||||
| KERNEL_F4_FINALIZE /* fold the vector accumulator into DOTF before the scalar tail */ | |||||
| dot_kernel_F1: | |||||
| ands I, N, #3 /* I = remaining 0..3 elements */ | |||||
| ble dot_kernel_L999 /* taken when the remainder is zero */ | |||||
| dot_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne dot_kernel_F10 | |||||
| ret /* result is in DOTF */ | |||||
| dot_kernel_S_BEGIN: /* strided path */ | |||||
| INIT_S | |||||
| asr I, N, #2 /* I = number of 4-element groups */ | |||||
| cmp I, xzr | |||||
| ble dot_kernel_S1 | |||||
| dot_kernel_S4: /* four strided element pairs per iteration */ | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne dot_kernel_S4 | |||||
| dot_kernel_S1: | |||||
| ands I, N, #3 /* I = remaining 0..3 elements */ | |||||
| ble dot_kernel_L999 | |||||
| dot_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne dot_kernel_S10 | |||||
| dot_kernel_L999: /* common exit; the result is in DOTF */ | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,320 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define M x0 /* Y vector length */ | |||||
| #define N x1 /* X vector length */ | |||||
| #define A x3 /* A address (presumably the matrix base - confirm) */ | |||||
| #define LDA x4 /* A stride (presumably the leading dimension - confirm) */ | |||||
| #define X x5 /* X vector address */ | |||||
| #define INC_X x6 /* X stride */ | |||||
| #define Y x7 /* Y vector address */ | |||||
| #define INC_Y x2 /* Y stride; NOTE(review): where x2 gets loaded is not visible here - confirm */ | |||||
| #define A_PTR x9 /* loop A vector address (read cursor; KERNEL_F16 loads through it) */ | |||||
| #define Y_IPTR x10 /* loop Y vector address for reads (KERNEL_F16 loads through it) */ | |||||
| #define J x11 /* loop variable */ | |||||
| #define I x12 /* loop variable */ | |||||
| #define Y_OPTR x13 /* loop Y vector address for writes (KERNEL_F16 stores through it) */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if !defined(DOUBLE) /* single precision: 4-byte elements */ | |||||
| #define ALPHA s0 | |||||
| #define TEMP s1 /* scalar view of v1 */ | |||||
| #define TEMPV {v1.s}[0] /* lane form of TEMP */ | |||||
| #define TMP1 s2 /* scalar view of v2 */ | |||||
| #define TMPV1 {v2.s}[0] /* lane form of TMP1 */ | |||||
| #define TMP2 s3 /* scalar view of v3 */ | |||||
| #define TMPV2 {v3.s}[0] /* lane form of TMP2 */ | |||||
| #define SZ 4 /* element size in bytes */ | |||||
| #define SHZ 2 /* log2(SZ); presumably a shift count for element-to-byte scaling - confirm */ | |||||
| #else /* double precision: 8-byte elements */ | |||||
| #define ALPHA d0 | |||||
| #define TEMP d1 /* scalar view of v1 */ | |||||
| #define TEMPV {v1.d}[0] /* lane form of TEMP */ | |||||
| #define TMP1 d2 /* scalar view of v2 */ | |||||
| #define TMPV1 {v2.d}[0] /* lane form of TMP1 */ | |||||
| #define TMP2 d3 /* scalar view of v3 */ | |||||
| #define TMPV2 {v3.d}[0] /* lane form of TMP2 */ | |||||
| #define SZ 8 /* element size in bytes */ | |||||
| #define SHZ 3 /* log2(SZ); presumably a shift count for element-to-byte scaling - confirm */ | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| .macro SAVE_REGS /* reserve 176 bytes of stack and spill d8-d17 and x18-x28; undone by RESTORE_REGS */ | |||||
| add sp, sp, #-(11 * 16) /* sp -= 176: eleven 16-byte slots */ | |||||
| stp d8, d9, [sp, #(0 * 16)] | |||||
| stp d10, d11, [sp, #(1 * 16)] | |||||
| stp d12, d13, [sp, #(2 * 16)] | |||||
| stp d14, d15, [sp, #(3 * 16)] | |||||
| stp d16, d17, [sp, #(4 * 16)] | |||||
| stp x18, x19, [sp, #(5 * 16)] | |||||
| stp x20, x21, [sp, #(6 * 16)] | |||||
| stp x22, x23, [sp, #(7 * 16)] | |||||
| stp x24, x25, [sp, #(8 * 16)] | |||||
| stp x26, x27, [sp, #(9 * 16)] | |||||
| str x28, [sp, #(10 * 16)] /* last slot holds only x28; its upper 8 bytes stay unused */ | |||||
| .endm | |||||
| /* RESTORE_REGS: reload d8-d17 and x18-x28 from the slots written by SAVE_REGS, */ | |||||
| /* then release the 11 * 16 byte frame. */ | |||||
| .macro RESTORE_REGS | |||||
| ldp d8, d9, [sp, #(0 * 16)] | |||||
| ldp d10, d11, [sp, #(1 * 16)] | |||||
| ldp d12, d13, [sp, #(2 * 16)] | |||||
| ldp d14, d15, [sp, #(3 * 16)] | |||||
| ldp d16, d17, [sp, #(4 * 16)] | |||||
| ldp x18, x19, [sp, #(5 * 16)] | |||||
| ldp x20, x21, [sp, #(6 * 16)] | |||||
| ldp x22, x23, [sp, #(7 * 16)] | |||||
| ldp x24, x25, [sp, #(8 * 16)] | |||||
| ldp x26, x27, [sp, #(9 * 16)] | |||||
| ldr x28, [sp, #(10 * 16)] | |||||
| add sp, sp, #(11*16) | |||||
| .endm | |||||
| /* KERNEL_F16: y[0..15] += v1 * a[0..15] for 16 contiguous elements. */ | |||||
| /* v1 is used as a full vector, so the caller must have replicated the scalar into every lane. */ | |||||
| /* A is read through A_PTR, Y is read through Y_IPTR and written through Y_OPTR; */ | |||||
| /* each access post-increments its pointer by 32 bytes. */ | |||||
| .macro KERNEL_F16 | |||||
| #if !defined(DOUBLE) | |||||
| /* single precision: 2 groups of 8 floats */ | |||||
| ld1 {v2.4s, v3.4s}, [A_PTR], #32 | |||||
| ld1 {v4.4s, v5.4s}, [Y_IPTR], #32 | |||||
| fmla v4.4s, v1.4s, v2.4s | |||||
| fmla v5.4s, v1.4s, v3.4s | |||||
| st1 {v4.4s, v5.4s}, [Y_OPTR], #32 | |||||
| ld1 {v6.4s, v7.4s}, [A_PTR], #32 | |||||
| ld1 {v8.4s, v9.4s}, [Y_IPTR], #32 | |||||
| fmla v8.4s, v1.4s, v6.4s | |||||
| fmla v9.4s, v1.4s, v7.4s | |||||
| st1 {v8.4s, v9.4s}, [Y_OPTR], #32 | |||||
| #else //DOUBLE | |||||
| /* double precision: 4 groups of 4 doubles */ | |||||
| ld1 {v2.2d, v3.2d}, [A_PTR], #32 | |||||
| ld1 {v4.2d, v5.2d}, [Y_IPTR], #32 | |||||
| fmla v4.2d, v1.2d, v2.2d | |||||
| fmla v5.2d, v1.2d, v3.2d | |||||
| st1 {v4.2d, v5.2d}, [Y_OPTR], #32 | |||||
| ld1 {v6.2d, v7.2d}, [A_PTR], #32 | |||||
| ld1 {v8.2d, v9.2d}, [Y_IPTR], #32 | |||||
| fmla v8.2d, v1.2d, v6.2d | |||||
| fmla v9.2d, v1.2d, v7.2d | |||||
| st1 {v8.2d, v9.2d}, [Y_OPTR], #32 | |||||
| ld1 {v10.2d, v11.2d}, [A_PTR], #32 | |||||
| ld1 {v12.2d, v13.2d}, [Y_IPTR], #32 | |||||
| fmla v12.2d, v1.2d, v10.2d | |||||
| fmla v13.2d, v1.2d, v11.2d | |||||
| st1 {v12.2d, v13.2d}, [Y_OPTR], #32 | |||||
| ld1 {v14.2d, v15.2d}, [A_PTR], #32 | |||||
| ld1 {v16.2d, v17.2d}, [Y_IPTR], #32 | |||||
| fmla v16.2d, v1.2d, v14.2d | |||||
| fmla v17.2d, v1.2d, v15.2d | |||||
| st1 {v16.2d, v17.2d}, [Y_OPTR], #32 | |||||
| #endif | |||||
| .endm | |||||
| /* KERNEL_F4: y[0..3] += v1 * a[0..3] for 4 contiguous elements. */ | |||||
| /* Single precision uses one 4-lane vector; double precision uses two 2-lane vectors. */ | |||||
| /* Loads go through A_PTR and Y_IPTR, stores through Y_OPTR, all post-incremented. */ | |||||
| .macro KERNEL_F4 | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.4s}, [A_PTR], #16 | |||||
| ld1 {v3.4s}, [Y_IPTR], #16 | |||||
| fmla v3.4s, v1.4s, v2.4s | |||||
| st1 {v3.4s}, [Y_OPTR], #16 | |||||
| #else | |||||
| ld1 {v2.2d}, [A_PTR], #16 | |||||
| ld1 {v3.2d}, [Y_IPTR], #16 | |||||
| fmla v3.2d, v1.2d, v2.2d | |||||
| st1 {v3.2d}, [Y_OPTR], #16 | |||||
| ld1 {v4.2d}, [A_PTR], #16 | |||||
| ld1 {v5.2d}, [Y_IPTR], #16 | |||||
| fmla v5.2d, v1.2d, v4.2d | |||||
| st1 {v5.2d}, [Y_OPTR], #16 | |||||
| #endif | |||||
| .endm | |||||
| /* KERNEL_F1: y[0] += TEMP * a[0] for a single contiguous element. */ | |||||
| /* Y is both read and written through Y_IPTR; Y_OPTR is not touched here. */ | |||||
| .macro KERNEL_F1 | |||||
| ld1 TMPV1, [A_PTR], #SZ | |||||
| ld1 TMPV2, [Y_IPTR] | |||||
| fmadd TMP2, TEMP, TMP1, TMP2 | |||||
| st1 TMPV2, [Y_IPTR], #SZ | |||||
| .endm | |||||
| /* INIT_S: scale INC_Y by the element size (shift by SHZ) so it can be used as a byte post-increment. */ | |||||
| .macro INIT_S | |||||
| lsl INC_Y, INC_Y, #SHZ | |||||
| .endm | |||||
| /* KERNEL_S1: y[0] += TEMP * a[0] for one element, stepping Y by INC_Y bytes. */ | |||||
| /* A is still walked contiguously (post-increment by SZ). */ | |||||
| .macro KERNEL_S1 | |||||
| ld1 TMPV1, [A_PTR], #SZ | |||||
| ld1 TMPV2, [Y_IPTR] | |||||
| fmadd TMP2, TEMP, TMP1, TMP2 | |||||
| st1 TMPV2, [Y_IPTR], INC_Y | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| /* Entry point: for each of the N elements of X, computes TEMP = ALPHA * x[j] and */ | |||||
| /* adds TEMP * a[0..M-1] into y[0..M-1], then advances A by LDA. Returns 0 in w0. */ | |||||
| PROLOGUE | |||||
| /* INC_Y is read from the stack before SAVE_REGS moves sp. */ | |||||
| ldr INC_Y, [sp] | |||||
| SAVE_REGS | |||||
| /* Nothing to do when N <= 0 or M <= 0. */ | |||||
| cmp N, xzr | |||||
| ble gemv_n_kernel_L999 | |||||
| cmp M, xzr | |||||
| ble gemv_n_kernel_L999 | |||||
| /* Convert LDA and INC_X from element counts to byte offsets. */ | |||||
| lsl LDA, LDA, #SHZ | |||||
| lsl INC_X, INC_X, #SHZ | |||||
| mov J, N | |||||
| /* INC_Y == 1 takes the vectorised path below; any other stride takes the scalar path. */ | |||||
| cmp INC_Y, #1 | |||||
| bne gemv_n_kernel_S_BEGIN | |||||
| gemv_n_kernel_F_LOOP: | |||||
| /* TEMP = ALPHA * x[j]; X advances by INC_X bytes. */ | |||||
| ld1 TEMPV, [X], INC_X | |||||
| fmul TEMP, ALPHA, TEMP | |||||
| /* Replicate lane 0 of v1 into the remaining lanes for the vector kernels. */ | |||||
| #if !defined(DOUBLE) | |||||
| ins v1.s[1], v1.s[0] | |||||
| ins v1.s[2], v1.s[0] | |||||
| ins v1.s[3], v1.s[0] | |||||
| #else | |||||
| ins v1.d[1], v1.d[0] | |||||
| #endif | |||||
| /* Restart the per-column pointers; Y is read and written from its start each time. */ | |||||
| mov A_PTR, A | |||||
| mov Y_IPTR, Y | |||||
| mov Y_OPTR, Y | |||||
| gemv_n_kernel_F32: | |||||
| /* I = M / 32: each iteration runs KERNEL_F16 twice (32 elements). */ | |||||
| asr I, M, #5 | |||||
| cmp I, xzr | |||||
| beq gemv_n_kernel_F4 | |||||
| gemv_n_kernel_F320: | |||||
| KERNEL_F16 | |||||
| KERNEL_F16 | |||||
| subs I, I, #1 | |||||
| bne gemv_n_kernel_F320 | |||||
| gemv_n_kernel_F4: | |||||
| /* I = (M mod 32) / 4: remaining groups of 4 elements. */ | |||||
| ands I, M, #31 | |||||
| asr I, I, #2 | |||||
| cmp I, xzr | |||||
| beq gemv_n_kernel_F1 | |||||
| gemv_n_kernel_F40: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne gemv_n_kernel_F40 | |||||
| gemv_n_kernel_F1: | |||||
| /* I = M mod 4: remaining single elements. */ | |||||
| ands I, M, #3 | |||||
| ble gemv_n_kernel_F_END | |||||
| gemv_n_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne gemv_n_kernel_F10 | |||||
| gemv_n_kernel_F_END: | |||||
| /* Step A by LDA bytes and repeat until J reaches zero. */ | |||||
| add A, A, LDA | |||||
| subs J, J, #1 | |||||
| bne gemv_n_kernel_F_LOOP | |||||
| b gemv_n_kernel_L999 | |||||
| gemv_n_kernel_S_BEGIN: | |||||
| /* Strided path: INC_Y is converted to bytes once, then Y is walked element by element. */ | |||||
| INIT_S | |||||
| gemv_n_kernel_S_LOOP: | |||||
| /* TEMP = ALPHA * x[j]; only the scalar lane is needed on this path. */ | |||||
| ld1 TEMPV, [X], INC_X | |||||
| fmul TEMP, ALPHA, TEMP | |||||
| mov A_PTR, A | |||||
| mov Y_IPTR, Y | |||||
| /* I = M / 4: loop unrolled four times. */ | |||||
| asr I, M, #2 | |||||
| cmp I, xzr | |||||
| ble gemv_n_kernel_S1 | |||||
| gemv_n_kernel_S4: | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne gemv_n_kernel_S4 | |||||
| gemv_n_kernel_S1: | |||||
| /* I = M mod 4: remaining single elements. */ | |||||
| ands I, M, #3 | |||||
| ble gemv_n_kernel_S_END | |||||
| gemv_n_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne gemv_n_kernel_S10 | |||||
| gemv_n_kernel_S_END: | |||||
| add A, A, LDA | |||||
| subs J, J, #1 | |||||
| bne gemv_n_kernel_S_LOOP | |||||
| gemv_n_kernel_L999: | |||||
| /* Common exit: return value 0, restore the saved registers. */ | |||||
| mov w0, wzr | |||||
| RESTORE_REGS | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,347 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Integer register aliases used by this kernel. */ | |||||
| /* In this kernel the inner loop walks X over M elements and the outer loop updates one Y element per N. */ | |||||
| #define M x0 /* X vector length (inner-loop element count) */ | |||||
| #define N x1 /* Y vector length (outer-loop count, copied to J) */ | |||||
| #define A x3 /* A address; advanced by LDA after every outer iteration */ | |||||
| #define LDA x4 /* A stride; scaled to bytes in the prologue */ | |||||
| #define X x5 /* X vector address */ | |||||
| #define INC_X x6 /* X stride; scaled to bytes only on the strided path (INIT_S) */ | |||||
| #define Y x7 /* Y vector address */ | |||||
| #define INC_Y x2 /* Y stride; loaded from [sp] at entry, scaled to bytes in the prologue */ | |||||
| #define A_PTR x9 /* loop A vector address */ | |||||
| #define X_PTR x10 /* loop X vector address */ | |||||
| #define J x11 /* outer loop variable (counts down from N) */ | |||||
| #define I x12 /* inner loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| /* Precision-dependent aliases: s-registers and 4-byte elements unless DOUBLE is defined. */ | |||||
| /* REG0 is the zero register of matching width, used with fmov to clear accumulators. */ | |||||
| /* TEMP is the scalar view of v1; TEMP1-TEMP3 are the scalar views of v2-v4. */ | |||||
| /* Note that TMP1/TMP2 name the same registers as TEMP1/TEMP2. */ | |||||
| /* SZ is the element size in bytes and SHZ the shift that multiplies by SZ. */ | |||||
| #if !defined(DOUBLE) | |||||
| #define REG0 wzr | |||||
| #define ALPHA s0 | |||||
| #define TEMP s1 | |||||
| #define TEMP1 s2 | |||||
| #define TEMP2 s3 | |||||
| #define TEMP3 s4 | |||||
| #define TEMPV {v1.s}[0] | |||||
| #define TMP1 s2 | |||||
| #define TMPV1 {v2.s}[0] | |||||
| #define TMP2 s3 | |||||
| #define TMPV2 {v3.s}[0] | |||||
| #define SZ 4 | |||||
| #define SHZ 2 | |||||
| #else | |||||
| #define REG0 xzr | |||||
| #define ALPHA d0 | |||||
| #define TEMP d1 | |||||
| #define TEMP1 d2 | |||||
| #define TEMP2 d3 | |||||
| #define TEMP3 d4 | |||||
| #define TEMPV {v1.d}[0] | |||||
| #define TMP1 d2 | |||||
| #define TMPV1 {v2.d}[0] | |||||
| #define TMP2 d3 | |||||
| #define TMPV2 {v3.d}[0] | |||||
| #define SZ 8 | |||||
| #define SHZ 3 | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| /* SAVE_REGS: reserve 11 * 16 = 176 bytes of stack and spill d8-d17 and x18-x28 into it. */ | |||||
| /* Must be paired with RESTORE_REGS, which uses the same slot layout. */ | |||||
| /* After this macro, anything that was at [sp] on entry is at [sp, #(11 * 16)]. */ | |||||
| .macro SAVE_REGS | |||||
| add sp, sp, #-(11 * 16) | |||||
| stp d8, d9, [sp, #(0 * 16)] | |||||
| stp d10, d11, [sp, #(1 * 16)] | |||||
| stp d12, d13, [sp, #(2 * 16)] | |||||
| stp d14, d15, [sp, #(3 * 16)] | |||||
| stp d16, d17, [sp, #(4 * 16)] | |||||
| stp x18, x19, [sp, #(5 * 16)] | |||||
| stp x20, x21, [sp, #(6 * 16)] | |||||
| stp x22, x23, [sp, #(7 * 16)] | |||||
| stp x24, x25, [sp, #(8 * 16)] | |||||
| stp x26, x27, [sp, #(9 * 16)] | |||||
| str x28, [sp, #(10 * 16)] | |||||
| .endm | |||||
| /* RESTORE_REGS: reload d8-d17 and x18-x28 from the slots written by SAVE_REGS, */ | |||||
| /* then release the 11 * 16 byte frame. */ | |||||
| .macro RESTORE_REGS | |||||
| ldp d8, d9, [sp, #(0 * 16)] | |||||
| ldp d10, d11, [sp, #(1 * 16)] | |||||
| ldp d12, d13, [sp, #(2 * 16)] | |||||
| ldp d14, d15, [sp, #(3 * 16)] | |||||
| ldp d16, d17, [sp, #(4 * 16)] | |||||
| ldp x18, x19, [sp, #(5 * 16)] | |||||
| ldp x20, x21, [sp, #(6 * 16)] | |||||
| ldp x22, x23, [sp, #(7 * 16)] | |||||
| ldp x24, x25, [sp, #(8 * 16)] | |||||
| ldp x26, x27, [sp, #(9 * 16)] | |||||
| ldr x28, [sp, #(10 * 16)] | |||||
| add sp, sp, #(11*16) | |||||
| .endm | |||||
| /* KERNEL_F32: accumulate a[i] * x[i] for 32 contiguous elements into the four */ | |||||
| /* vector accumulators v1-v4. A_PTR and X_PTR are post-incremented by 64 bytes per load. */ | |||||
| /* v5-v20 are used as scratch for the loaded data. */ | |||||
| .macro KERNEL_F32 | |||||
| #if !defined(DOUBLE) | |||||
| /* single precision: 2 groups of 16 floats */ | |||||
| ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64 | |||||
| ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64 | |||||
| fmla v1.4s, v5.4s, v9.4s | |||||
| fmla v2.4s, v6.4s, v10.4s | |||||
| fmla v3.4s, v7.4s, v11.4s | |||||
| fmla v4.4s, v8.4s, v12.4s | |||||
| ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64 | |||||
| ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64 | |||||
| fmla v1.4s, v13.4s, v17.4s | |||||
| fmla v2.4s, v14.4s, v18.4s | |||||
| fmla v3.4s, v15.4s, v19.4s | |||||
| fmla v4.4s, v16.4s, v20.4s | |||||
| #else | |||||
| /* double precision: 4 groups of 8 doubles */ | |||||
| ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 | |||||
| ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 | |||||
| fmla v1.2d, v5.2d, v9.2d | |||||
| fmla v2.2d, v6.2d, v10.2d | |||||
| fmla v3.2d, v7.2d, v11.2d | |||||
| fmla v4.2d, v8.2d, v12.2d | |||||
| ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 | |||||
| ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 | |||||
| fmla v1.2d, v13.2d, v17.2d | |||||
| fmla v2.2d, v14.2d, v18.2d | |||||
| fmla v3.2d, v15.2d, v19.2d | |||||
| fmla v4.2d, v16.2d, v20.2d | |||||
| ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 | |||||
| ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 | |||||
| fmla v1.2d, v5.2d, v9.2d | |||||
| fmla v2.2d, v6.2d, v10.2d | |||||
| fmla v3.2d, v7.2d, v11.2d | |||||
| fmla v4.2d, v8.2d, v12.2d | |||||
| ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 | |||||
| ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 | |||||
| fmla v1.2d, v13.2d, v17.2d | |||||
| fmla v2.2d, v14.2d, v18.2d | |||||
| fmla v3.2d, v15.2d, v19.2d | |||||
| fmla v4.2d, v16.2d, v20.2d | |||||
| #endif | |||||
| .endm | |||||
| /* KERNEL_F32_FINALIZE: fold the four vector accumulators lane-wise into v1 (v1 += v2 + v3 + v4). */ | |||||
| /* The result is still a vector; KERNEL_F4_FINALIZE performs the horizontal reduction. */ | |||||
| .macro KERNEL_F32_FINALIZE | |||||
| #if !defined(DOUBLE) | |||||
| fadd v1.4s, v1.4s, v2.4s | |||||
| fadd v1.4s, v1.4s, v3.4s | |||||
| fadd v1.4s, v1.4s, v4.4s | |||||
| #else | |||||
| fadd v1.2d, v1.2d, v2.2d | |||||
| fadd v1.2d, v1.2d, v3.2d | |||||
| fadd v1.2d, v1.2d, v4.2d | |||||
| #endif | |||||
| .endm | |||||
| /* KERNEL_F4: accumulate a[i] * x[i] for 4 contiguous elements into the vector accumulator v1. */ | |||||
| /* Overwrites v2 and v3 (and v4, v5 when DOUBLE) with the loaded data, so any partial sums */ | |||||
| /* held there must already have been folded into v1. */ | |||||
| .macro KERNEL_F4 | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.4s}, [A_PTR], #16 | |||||
| ld1 {v3.4s}, [X_PTR], #16 | |||||
| fmla v1.4s, v2.4s, v3.4s | |||||
| #else | |||||
| ld1 {v2.2d}, [A_PTR], #16 | |||||
| ld1 {v3.2d}, [X_PTR], #16 | |||||
| fmla v1.2d, v2.2d, v3.2d | |||||
| ld1 {v4.2d}, [A_PTR], #16 | |||||
| ld1 {v5.2d}, [X_PTR], #16 | |||||
| fmla v1.2d, v4.2d, v5.2d | |||||
| #endif | |||||
| .endm | |||||
| /* KERNEL_F4_FINALIZE: horizontal sum of all lanes of v1 into the scalar TEMP. */ | |||||
| /* Single precision adds the upper half onto the lower half (via v2) and then the remaining pair; */ | |||||
| /* double precision needs only one pairwise add. */ | |||||
| .macro KERNEL_F4_FINALIZE | |||||
| #if !defined(DOUBLE) | |||||
| ext v2.16b, v1.16b, v1.16b, #8 | |||||
| fadd v1.2s, v1.2s, v2.2s | |||||
| faddp TEMP, v1.2s | |||||
| #else | |||||
| faddp TEMP, v1.2d | |||||
| #endif | |||||
| .endm | |||||
| /* KERNEL_F1: TEMP += a[0] * x[0] for one contiguous element of each vector. */ | |||||
| .macro KERNEL_F1 | |||||
| ld1 TMPV1, [A_PTR], #SZ | |||||
| ld1 TMPV2, [X_PTR], #SZ | |||||
| fmadd TEMP, TMP1, TMP2, TEMP | |||||
| .endm | |||||
| /* INIT_S: scale INC_X by the element size (shift by SHZ) so it can be used as a byte post-increment. */ | |||||
| .macro INIT_S | |||||
| lsl INC_X, INC_X, #SHZ | |||||
| .endm | |||||
| /* KERNEL_S1: TEMP += a[0] * x[0] for one element, stepping X by INC_X bytes. */ | |||||
| /* A is still walked contiguously (post-increment by SZ). */ | |||||
| .macro KERNEL_S1 | |||||
| ld1 TMPV1, [A_PTR], #SZ | |||||
| ld1 TMPV2, [X_PTR], INC_X | |||||
| fmadd TEMP, TMP1, TMP2, TEMP | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| /* Entry point: for each of the N outer iterations, computes TEMP = sum over M elements of */ | |||||
| /* a[i] * x[i], then y[j] += ALPHA * TEMP, and advances A by LDA and Y by INC_Y. Returns 0 in w0. */ | |||||
| PROLOGUE | |||||
| /* INC_Y is read from the stack before SAVE_REGS moves sp. */ | |||||
| ldr INC_Y, [sp] | |||||
| SAVE_REGS | |||||
| /* Nothing to do when N <= 0 or M <= 0. */ | |||||
| cmp N, xzr | |||||
| ble gemv_t_kernel_L999 | |||||
| cmp M, xzr | |||||
| ble gemv_t_kernel_L999 | |||||
| /* Convert LDA and INC_Y from element counts to byte offsets. */ | |||||
| lsl LDA, LDA, #SHZ | |||||
| lsl INC_Y, INC_Y, #SHZ | |||||
| mov J, N | |||||
| /* INC_X == 1 takes the vectorised path below; any other stride takes the scalar path. */ | |||||
| cmp INC_X, #1 | |||||
| bne gemv_t_kernel_S_BEGIN | |||||
| gemv_t_kernel_F_LOOP: | |||||
| /* Clear the four accumulators (v1-v4) for this dot product. */ | |||||
| fmov TEMP, REG0 | |||||
| fmov TEMP1, REG0 | |||||
| fmov TEMP2, REG0 | |||||
| fmov TEMP3, REG0 | |||||
| mov A_PTR, A | |||||
| mov X_PTR, X | |||||
| gemv_t_kernel_F32: | |||||
| /* I = M / 32: number of 32-element chunks. */ | |||||
| asr I, M, #5 | |||||
| cmp I, xzr | |||||
| beq gemv_t_kernel_F4 | |||||
| gemv_t_kernel_F320: | |||||
| KERNEL_F32 | |||||
| subs I, I, #1 | |||||
| bne gemv_t_kernel_F320 | |||||
| /* Fold v2-v4 into v1 before KERNEL_F4 reuses v2/v3 as scratch. */ | |||||
| KERNEL_F32_FINALIZE | |||||
| gemv_t_kernel_F4: | |||||
| /* I = (M mod 32) / 4: remaining groups of 4 elements. */ | |||||
| ands I, M, #31 | |||||
| asr I, I, #2 | |||||
| cmp I, xzr | |||||
| beq gemv_t_kernel_F1 | |||||
| gemv_t_kernel_F40: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne gemv_t_kernel_F40 | |||||
| gemv_t_kernel_F1: | |||||
| /* Reduce the vector accumulator to the scalar TEMP, then handle the last M mod 4 elements. */ | |||||
| KERNEL_F4_FINALIZE | |||||
| ands I, M, #3 | |||||
| ble gemv_t_kernel_F_END | |||||
| gemv_t_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne gemv_t_kernel_F10 | |||||
| gemv_t_kernel_F_END: | |||||
| /* y[j] += ALPHA * TEMP. The bne below tests the flags produced by subs J. */ | |||||
| ld1 TMPV1, [Y] | |||||
| add A, A, LDA | |||||
| subs J, J, #1 | |||||
| fmadd TMP1, ALPHA, TEMP, TMP1 | |||||
| st1 TMPV1, [Y], INC_Y | |||||
| bne gemv_t_kernel_F_LOOP | |||||
| b gemv_t_kernel_L999 | |||||
| gemv_t_kernel_S_BEGIN: | |||||
| /* Strided path: INC_X is converted to bytes once, then X is walked element by element. */ | |||||
| INIT_S | |||||
| gemv_t_kernel_S_LOOP: | |||||
| /* Only the scalar accumulator TEMP is used on this path. */ | |||||
| fmov TEMP, REG0 | |||||
| mov A_PTR, A | |||||
| mov X_PTR, X | |||||
| /* I = M / 4: loop unrolled four times. */ | |||||
| asr I, M, #2 | |||||
| cmp I, xzr | |||||
| ble gemv_t_kernel_S1 | |||||
| gemv_t_kernel_S4: | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne gemv_t_kernel_S4 | |||||
| gemv_t_kernel_S1: | |||||
| /* I = M mod 4: remaining single elements. */ | |||||
| ands I, M, #3 | |||||
| ble gemv_t_kernel_S_END | |||||
| gemv_t_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne gemv_t_kernel_S10 | |||||
| gemv_t_kernel_S_END: | |||||
| /* y[j] += ALPHA * TEMP. The bne below tests the flags produced by subs J. */ | |||||
| ld1 TMPV1, [Y] | |||||
| add A, A, LDA | |||||
| subs J, J, #1 | |||||
| fmadd TMP1, ALPHA, TEMP, TMP1 | |||||
| st1 TMPV1, [Y], INC_Y | |||||
| bne gemv_t_kernel_S_LOOP | |||||
| gemv_t_kernel_L999: | |||||
| /* Common exit: restore the saved registers, return value 0. */ | |||||
| RESTORE_REGS | |||||
| mov w0, wzr | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,124 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Integer register aliases used by this kernel. */ | |||||
| #define N x0 /* vector length */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride; scaled to bytes in INIT_S */ | |||||
| #define INDEX x3 /* 1-based index of max/min value; copied to x0 on return */ | |||||
| #define Z x4 /* 1-based index of the element most recently loaded */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| /* COND is the condition under which the current best (MAXF/INDEX) is KEPT after */ | |||||
| /* "fcmp MAXF, TMPF": le when searching for the minimum, ge when searching for the maximum. */ | |||||
| #if defined(USE_MIN) | |||||
| #define COND le | |||||
| #else | |||||
| #define COND ge | |||||
| #endif | |||||
| /* This file is double precision only: d-registers and 8-byte elements. */ | |||||
| #define MAXF d0 | |||||
| #define TMPF d1 | |||||
| #define TMPVF {v1.d}[0] | |||||
| /* SZ is not referenced by the code visible in this file. */ | |||||
| #define SZ 8 | |||||
| /******************************************************************************/ | |||||
| /* INIT_S: scale INC_X to bytes (x8), load the first element into MAXF (d0 is lane 0 of v0), */ | |||||
| /* take its absolute value and start both Z and INDEX at 1. */ | |||||
| .macro INIT_S | |||||
| lsl INC_X, INC_X, #3 | |||||
| ld1 {v0.d}[0], [X], INC_X | |||||
| mov Z, #1 | |||||
| mov INDEX, Z | |||||
| fabs MAXF, MAXF | |||||
| .endm | |||||
| /* KERNEL_S1: load the next element, bump Z to its 1-based index, and compare |x| with MAXF. */ | |||||
| /* When COND holds the current MAXF/INDEX are kept (so ties keep the earlier index); */ | |||||
| /* otherwise they are replaced by TMPF/Z. */ | |||||
| .macro KERNEL_S1 | |||||
| ld1 TMPVF, [X], INC_X | |||||
| add Z, Z, #1 | |||||
| fabs TMPF, TMPF | |||||
| fcmp MAXF, TMPF | |||||
| fcsel MAXF, MAXF, TMPF, COND | |||||
| csel INDEX, INDEX, Z, COND | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| /* Entry point: returns in x0 the 1-based index selected by KERNEL_S1 over N strided elements, */ | |||||
| /* or 0 when N <= 0 or INC_X <= 0. */ | |||||
| PROLOGUE | |||||
| cmp N, xzr | |||||
| ble iamax_kernel_zero | |||||
| cmp INC_X, xzr | |||||
| ble iamax_kernel_zero | |||||
| /* The first element seeds MAXF/INDEX; N then counts the elements still to examine. */ | |||||
| INIT_S | |||||
| subs N, N, #1 | |||||
| ble iamax_kernel_L999 | |||||
| /* I = N / 4: loop unrolled four times. */ | |||||
| asr I, N, #2 | |||||
| cmp I, xzr | |||||
| ble iamax_kernel_S1 | |||||
| iamax_kernel_S4: | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_S4 | |||||
| iamax_kernel_S1: | |||||
| /* I = N mod 4: remaining single elements. */ | |||||
| ands I, N, #3 | |||||
| ble iamax_kernel_L999 | |||||
| iamax_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_S10 | |||||
| iamax_kernel_L999: | |||||
| mov x0, INDEX | |||||
| ret | |||||
| iamax_kernel_zero: | |||||
| mov x0, xzr | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,213 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Integer register aliases used by this kernel. */ | |||||
| #define N x0 /* vector length */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride; scaled to bytes in INIT_S */ | |||||
| #define INDEX x3 /* 1-based index of max value; copied to x0 on return */ | |||||
| #define Z x4 /* running 1-based element index (see each macro for its exact meaning) */ | |||||
| #define I x5 /* loop variable */ | |||||
| #define X_COPY x6 /* copy of X address, used to re-scan the winning 4-element group */ | |||||
| #define MAXF_Z x7 /* 1-based index of the first element of the 4-element group holding MAXF */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| /* Single precision only: MAXF is the running maximum |x|, TMPF the candidate. */ | |||||
| /* TMPVF is the lane-0 view of the register behind TMPF; SZ is the element size in bytes. */ | |||||
| #define MAXF s5 | |||||
| #define TMPF s6 | |||||
| #define TMPVF {v6.s}[0] | |||||
| #define SZ 4 | |||||
| /******************************************************************************/ | |||||
| /* INIT_F1: seed the scalar search with the first contiguous element: MAXF = |x[0]|, */ | |||||
| /* Z = INDEX = 1. Here Z is the index of the element just consumed. */ | |||||
| .macro INIT_F1 | |||||
| ldr MAXF, [X], #SZ | |||||
| mov Z, #1 | |||||
| mov INDEX, Z | |||||
| fabs MAXF, MAXF | |||||
| .endm | |||||
| /* KERNEL_F1: examine one contiguous element. Z is incremented BEFORE it is recorded, */ | |||||
| /* so on entry Z must hold the index of the previously examined element. */ | |||||
| /* MAXF/INDEX are kept when |x| <= MAXF (ties keep the earlier index), else replaced by TMPF/Z. */ | |||||
| .macro KERNEL_F1 | |||||
| ldr TMPF, [X], #SZ | |||||
| add Z, Z, #1 | |||||
| fabs TMPF, TMPF | |||||
| fcmp TMPF, MAXF | |||||
| fcsel MAXF, MAXF, TMPF, le | |||||
| csel INDEX, INDEX, Z, le | |||||
| .endm | |||||
| /* INIT_F4: seed the 4-wide search with the first four elements: MAXF = max |x[0..3]|. */ | |||||
| /* MAXF_Z = 1 marks the group starting at element 1 as the current winner; */ | |||||
| /* Z = 5 is the index of the first element of the NEXT group (not yet loaded). */ | |||||
| .macro INIT_F4 | |||||
| ld1 {v0.4s}, [X], #16 | |||||
| fabs v0.4s, v0.4s | |||||
| fmaxv MAXF, v0.4s | |||||
| mov Z, #5 | |||||
| mov MAXF_Z, #1 | |||||
| .endm | |||||
| /* KERNEL_F4: examine the next four contiguous elements as a group. */ | |||||
| /* TMPF = max |x| of the group; if it exceeds MAXF, MAXF is replaced and MAXF_Z records Z, */ | |||||
| /* the index of the group's first element. Z is advanced by 4 AFTER use, so it always */ | |||||
| /* names the first element of the next unread group. Also prefetches 512 bytes ahead of X. */ | |||||
| .macro KERNEL_F4 | |||||
| ld1 {v0.4s}, [X], #16 | |||||
| fabs v0.4s, v0.4s | |||||
| fmaxv TMPF, v0.4s | |||||
| PRFM PLDL1KEEP, [X, #512] | |||||
| fcmp TMPF, MAXF | |||||
| fcsel MAXF, MAXF, TMPF, le | |||||
| csel MAXF_Z, MAXF_Z, Z, le | |||||
| add Z, Z, #4 | |||||
| .endm | |||||
| /* KERNEL_F4_FINALIZE: turn the winning group (MAXF_Z) into an element index in INDEX. */ | |||||
| /* Re-reads the group from X_COPY and stops at the first element whose |x| equals MAXF; */ | |||||
| /* if none of the first three matches, INDEX is left on the fourth. */ | |||||
| /* Clobbers MAXF_Z, X_COPY and TMPF. The macro defines a fixed label, so it can only be */ | |||||
| /* expanded once per file. */ | |||||
| .macro KERNEL_F4_FINALIZE | |||||
| mov INDEX, MAXF_Z | |||||
| /* byte offset of the group start: (MAXF_Z - 1) * 4 */ | |||||
| sub MAXF_Z, MAXF_Z, #1 | |||||
| lsl MAXF_Z, MAXF_Z, #2 | |||||
| add X_COPY, X_COPY, MAXF_Z | |||||
| ldr TMPF, [X_COPY], #SZ | |||||
| fabs TMPF, TMPF | |||||
| fcmp TMPF, MAXF | |||||
| beq KERNEL_F4_FINALIZE_DONE | |||||
| add INDEX, INDEX, #1 | |||||
| ldr TMPF, [X_COPY], #SZ | |||||
| fabs TMPF, TMPF | |||||
| fcmp TMPF, MAXF | |||||
| beq KERNEL_F4_FINALIZE_DONE | |||||
| add INDEX, INDEX, #1 | |||||
| ldr TMPF, [X_COPY], #SZ | |||||
| fabs TMPF, TMPF | |||||
| fcmp TMPF, MAXF | |||||
| beq KERNEL_F4_FINALIZE_DONE | |||||
| add INDEX, INDEX, #1 | |||||
| KERNEL_F4_FINALIZE_DONE: | |||||
| .endm | |||||
| /* INIT_S: scale INC_X to bytes (x4), load the first strided element, */ | |||||
| /* set MAXF = |x[0]| and start both Z and INDEX at 1. */ | |||||
| .macro INIT_S | |||||
| lsl INC_X, INC_X, #2 | |||||
| ld1 TMPVF, [X], INC_X | |||||
| mov Z, #1 | |||||
| mov INDEX, Z | |||||
| fabs MAXF, TMPF | |||||
| .endm | |||||
| /* KERNEL_S1: examine one strided element. Z is incremented before it is recorded. */ | |||||
| /* MAXF/INDEX are kept when |x| <= MAXF (ties keep the earlier index), else replaced by TMPF/Z. */ | |||||
| .macro KERNEL_S1 | |||||
| ld1 TMPVF, [X], INC_X | |||||
| add Z, Z, #1 | |||||
| fabs TMPF, TMPF | |||||
| fcmp TMPF, MAXF | |||||
| fcsel MAXF, MAXF, TMPF, le | |||||
| csel INDEX, INDEX, Z, le | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| /* Entry point: returns in x0 the 1-based index of the first element with the largest |x|, */ | |||||
| /* or 0 when N <= 0 or INC_X <= 0. Unit stride uses the 4-wide path; other strides use the */ | |||||
| /* scalar strided path. */ | |||||
| PROLOGUE | |||||
| cmp N, xzr | |||||
| ble iamax_kernel_zero | |||||
| cmp INC_X, xzr | |||||
| ble iamax_kernel_zero | |||||
| PRFM PLDL1KEEP, [X] | |||||
| /* Keep the original X so KERNEL_F4_FINALIZE can re-scan the winning group. */ | |||||
| mov X_COPY, X | |||||
| cmp INC_X, #1 | |||||
| bne iamax_kernel_S_BEGIN | |||||
| iamax_kernel_F_BEGIN: | |||||
| /* I = N / 4: number of full 4-element groups; fewer than 4 elements go straight to the scalar path. */ | |||||
| asr I, N, #2 | |||||
| cmp I, xzr | |||||
| beq iamax_kernel_F1_INIT | |||||
| /* The first group seeds MAXF; the loop handles the remaining I - 1 groups. */ | |||||
| INIT_F4 | |||||
| subs I, I, #1 | |||||
| beq iamax_kernel_F4_FINALIZE | |||||
| iamax_kernel_F4: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_F4 | |||||
| iamax_kernel_F4_FINALIZE: | |||||
| KERNEL_F4_FINALIZE | |||||
| /* The 4-wide path leaves Z at the index of the next unread element, whereas KERNEL_F1 */ | |||||
| /* increments Z before recording it. Step Z back by one so a maximum found in the tail */ | |||||
| /* is reported at its own index instead of one past it. */ | |||||
| sub Z, Z, #1 | |||||
| iamax_kernel_F1: | |||||
| /* I = N mod 4: remaining single elements (N was already decremented on the F1_INIT path). */ | |||||
| ands I, N, #3 | |||||
| ble iamax_kernel_L999 | |||||
| iamax_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_F10 | |||||
| b iamax_kernel_L999 | |||||
| iamax_kernel_F1_INIT: | |||||
| /* Fewer than 4 elements: seed with the first one (Z = 1) and scan the remaining N - 1. */ | |||||
| INIT_F1 | |||||
| subs N, N, #1 | |||||
| b iamax_kernel_F1 | |||||
| iamax_kernel_S_BEGIN: | |||||
| /* Strided path: the first element seeds MAXF/INDEX; N then counts the elements still to examine. */ | |||||
| INIT_S | |||||
| subs N, N, #1 | |||||
| ble iamax_kernel_L999 | |||||
| /* I = N / 4: loop unrolled four times. */ | |||||
| asr I, N, #2 | |||||
| cmp I, xzr | |||||
| ble iamax_kernel_S1 | |||||
| iamax_kernel_S4: | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_S4 | |||||
| iamax_kernel_S1: | |||||
| /* I = N mod 4: remaining single elements. */ | |||||
| ands I, N, #3 | |||||
| ble iamax_kernel_L999 | |||||
| iamax_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_S10 | |||||
| iamax_kernel_L999: | |||||
| mov x0, INDEX | |||||
| ret | |||||
| iamax_kernel_zero: | |||||
| mov x0, xzr | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,151 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| /* Integer register aliases used by this kernel. */ | |||||
| #define N x0 /* vector length */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride; scaled to bytes in INIT_S */ | |||||
| #define INDEX x3 /* 1-based index of max/min value; copied to x0 on return */ | |||||
| #define Z x4 /* 1-based index of the element most recently loaded */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| /* COND is the condition under which the current best (MAXF/INDEX) is KEPT after */ | |||||
| /* "fcmp MAXF, TMPF": le when searching for the minimum, ge when searching for the maximum. */ | |||||
| #if defined(USE_MIN) | |||||
| #define COND le | |||||
| #else | |||||
| #define COND ge | |||||
| #endif | |||||
| /* Precision-dependent scalar registers. TMPVF and SZ are not referenced by the code */ | |||||
| /* visible in this file; the macros below hard-code their own vector operands and strides. */ | |||||
| #if !defined(DOUBLE) | |||||
| #define MAXF s0 | |||||
| #define TMPF s1 | |||||
| #define TMPVF {v1.s}[0] | |||||
| #define SZ 4 | |||||
| #else | |||||
| #define MAXF d0 | |||||
| #define TMPF d1 | |||||
| #define TMPVF {v1.d}[0] | |||||
| #define SZ 8 | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| /* INIT_S: scale INC_X to bytes for two-component elements (x8 single, x16 double), */ | |||||
| /* load the first element's two components, and set MAXF to the sum of their absolute values. */ | |||||
| /* Z and INDEX both start at 1. */ | |||||
| .macro INIT_S | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 | |||||
| ld1 {v0.2s}, [X], INC_X | |||||
| mov Z, #1 | |||||
| mov INDEX, Z | |||||
| fabs v0.2s, v0.2s | |||||
| /* move the second component into lane 0 of v1 so the two can be added as scalars */ | |||||
| ext v1.8b, v0.8b, v0.8b, #4 | |||||
| fadd MAXF, s0, s1 | |||||
| #else | |||||
| lsl INC_X, INC_X, #4 | |||||
| ld1 {v0.2d}, [X], INC_X | |||||
| mov Z, #1 | |||||
| mov INDEX, Z | |||||
| fabs v0.2d, v0.2d | |||||
| faddp MAXF, v0.2d | |||||
| #endif | |||||
| .endm | |||||
| /* KERNEL_S1: load the next two-component element, bump Z to its 1-based index, and set */ | |||||
| /* TMPF to the sum of the absolute values of its components. When COND holds after */ | |||||
| /* "fcmp MAXF, TMPF" the current MAXF/INDEX are kept (ties keep the earlier index); */ | |||||
| /* otherwise they are replaced by TMPF/Z. */ | |||||
| .macro KERNEL_S1 | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v1.2s}, [X], INC_X | |||||
| add Z, Z, #1 | |||||
| fabs v1.2s, v1.2s | |||||
| ext v2.8b, v1.8b, v1.8b, #4 | |||||
| fadd TMPF, s1, s2 | |||||
| #else | |||||
| ld1 {v1.2d}, [X], INC_X | |||||
| add Z, Z, #1 | |||||
| fabs v1.2d, v1.2d | |||||
| faddp TMPF, v1.2d | |||||
| #endif | |||||
| fcmp MAXF, TMPF | |||||
| fcsel MAXF, MAXF, TMPF, COND | |||||
| csel INDEX, INDEX, Z, COND | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| /* Entry point: returns in x0 the 1-based index selected by KERNEL_S1 over N strided elements, */ | |||||
| /* or 0 when N <= 0 or INC_X <= 0. */ | |||||
| PROLOGUE | |||||
| cmp N, xzr | |||||
| ble iamax_kernel_zero | |||||
| cmp INC_X, xzr | |||||
| ble iamax_kernel_zero | |||||
| /* The first element seeds MAXF/INDEX; N then counts the elements still to examine. */ | |||||
| INIT_S | |||||
| subs N, N, #1 | |||||
| ble iamax_kernel_L999 | |||||
| /* I = N / 4: loop unrolled four times. */ | |||||
| asr I, N, #2 | |||||
| cmp I, xzr | |||||
| ble iamax_kernel_S1 | |||||
| iamax_kernel_S4: | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_S4 | |||||
| iamax_kernel_S1: | |||||
| /* I = N mod 4: remaining single elements. */ | |||||
| ands I, N, #3 | |||||
| ble iamax_kernel_L999 | |||||
| iamax_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne iamax_kernel_S10 | |||||
| iamax_kernel_L999: | |||||
| mov x0, INDEX | |||||
| ret | |||||
| iamax_kernel_zero: | |||||
| mov x0, xzr | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,243 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride */ | |||||
| #define Y x3 /* Y vector address */ | |||||
| #define INC_Y x4 /* Y stride */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if !defined(DOUBLE) | |||||
| #define C s0 /* C coefficient: multiplies X in C*X + S*Y and Y in C*Y - S*X */ | |||||
| #define S s1 /* S coefficient: multiplies Y in C*X + S*Y and X in C*Y - S*X */ | |||||
| #else | |||||
| #define C d0 /* C coefficient: multiplies X in C*X + S*Y and Y in C*Y - S*X */ | |||||
| #define S d1 /* S coefficient: multiplies Y in C*X + S*Y and X in C*Y - S*X */ | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| .macro INIT /* Copy C from lane 0 of v0 into lane 1, giving v0 = [C, C]. */ | |||||
| #if !defined(DOUBLE) | |||||
| ins v0.s[1], v0.s[0] // [C, C] | |||||
| #else | |||||
| ins v0.d[1], v0.d[0] // [C, C] | |||||
| #endif | |||||
| .endm | |||||
| .macro INIT_F1 /* Build v1 = [-S, S] (lane 1 = -S, lane 0 = S); clobbers v2. */ | |||||
| #if !defined(DOUBLE) | |||||
| fneg s2, S /* s2 = -S */ | |||||
| ins v1.s[1], v2.s[0] // [-S, S] | |||||
| #else | |||||
| fneg d2, S /* d2 = -S */ | |||||
| ins v1.d[1], v2.d[0] // [-S, S] | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F1 /* Rotate one X/Y pair, contiguous case; expects v0 = [C, C] and v1 = [-S, S]. */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.s}[0], [X] /* lane 0 = X element */ | |||||
| ld1 {v2.s}[1], [Y] // [Y, X] | |||||
| ext v3.8b, v2.8b, v2.8b, #4 // [X, Y] | |||||
| fmul v4.2s, v2.2s, v0.2s // [C*Y, C*X] | |||||
| fmla v4.2s, v3.2s, v1.2s // [C*Y - S*X, C*X + S*Y] | |||||
| st1 {v4.s}[0], [X], #4 /* store new X, advance by one float */ | |||||
| st1 {v4.s}[1], [Y], #4 /* store new Y, advance by one float */ | |||||
| #else | |||||
| ld1 {v2.d}[0], [X] /* lane 0 = X element */ | |||||
| ld1 {v2.d}[1], [Y] // [Y, X] | |||||
| ext v3.16b, v2.16b, v2.16b, #8 // [X, Y] | |||||
| fmul v4.2d, v2.2d, v0.2d // [C*Y, C*X] | |||||
| fmla v4.2d, v3.2d, v1.2d // [C*Y - S*X, C*X + S*Y] | |||||
| st1 {v4.d}[0], [X], #8 /* store new X, advance by one double */ | |||||
| st1 {v4.d}[1], [Y], #8 /* store new Y, advance by one double */ | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_INIT_F4 /* Broadcast C across v0 and S across v1 for KERNEL_F4; relies on INIT having set v0 lanes 0-1 to C. */ | |||||
| #if !defined(DOUBLE) | |||||
| ins v0.d[1], v0.d[0] // [C, C, C, C] | |||||
| ins v1.s[1], v1.s[0] | |||||
| ins v1.d[1], v1.d[0] // [S, S, S, S] | |||||
| #else | |||||
| ins v1.d[1], v1.d[0] // [S, S] | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F4 /* Rotate four contiguous X/Y pairs: X' = C*X + S*Y, Y' = C*Y - S*X; advances X and Y by four elements. */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.4s}, [X] | |||||
| fmul v4.4s, v0.4s, v2.4s // C*X3, C*X2, C*X1, C*X0 | |||||
| ld1 {v3.4s}, [Y] | |||||
| fmla v4.4s, v1.4s, v3.4s // C*X3+S*Y3, ..., C*X0+S*Y0 | |||||
| st1 {v4.4s}, [X], #16 | |||||
| fmul v5.4s, v0.4s, v3.4s // C*Y3, C*Y2, C*Y1, C*Y0 | |||||
| fmls v5.4s, v1.4s, v2.4s // C*Y3-S*X3, ..., C*Y0-S*X0 | |||||
| st1 {v5.4s}, [Y], #16 | |||||
| #else // DOUBLE | |||||
| ld1 {v2.2d, v3.2d}, [X] | |||||
| fmul v6.2d, v0.2d, v2.2d // C*X1, C*X0 | |||||
| fmul v7.2d, v0.2d, v3.2d // C*X3, C*X2 | |||||
| ld1 {v4.2d, v5.2d}, [Y] | |||||
| fmla v6.2d, v1.2d, v4.2d // C*X1+S*Y1, C*X0+S*Y0 | |||||
| fmla v7.2d, v1.2d, v5.2d // C*X3+S*Y3, C*X2+S*Y2 | |||||
| st1 {v6.2d, v7.2d}, [X], #32 | |||||
| fmul v16.2d, v0.2d, v4.2d // C*Y1, C*Y0 | |||||
| fmul v17.2d, v0.2d, v5.2d // C*Y3, C*Y2 | |||||
| fmls v16.2d, v1.2d, v2.2d // C*Y1-S*X1, C*Y0-S*X0 | |||||
| fmls v17.2d, v1.2d, v3.2d // C*Y3-S*X3, C*Y2-S*X2 | |||||
| st1 {v16.2d, v17.2d}, [Y], #32 | |||||
| PRFM PLDL1KEEP, [X, #512] /* prefetch 512 bytes ahead; only the DOUBLE path prefetches */ | |||||
| PRFM PLDL1KEEP, [Y, #512] | |||||
| #endif | |||||
| .endm | |||||
| .macro INIT_S /* Scale INC_X and INC_Y from element counts to byte offsets. */ | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #2 /* x4 bytes per float */ | |||||
| lsl INC_Y, INC_Y, #2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 /* x8 bytes per double */ | |||||
| lsl INC_Y, INC_Y, #3 | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_S1 /* Rotate one X/Y pair, strided case; same arithmetic as KERNEL_F1 but advances by INC_X / INC_Y bytes. */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.s}[0], [X] | |||||
| ld1 {v2.s}[1], [Y] // [Y, X] | |||||
| ext v3.8b, v2.8b, v2.8b, #4 // [X, Y] | |||||
| fmul v4.2s, v2.2s, v0.2s // [C*Y, C*X] | |||||
| fmla v4.2s, v3.2s, v1.2s // [C*Y - S*X, C*X + S*Y] | |||||
| st1 {v4.s}[0], [X], INC_X | |||||
| st1 {v4.s}[1], [Y], INC_Y | |||||
| #else | |||||
| ld1 {v2.d}[0], [X] | |||||
| ld1 {v2.d}[1], [Y] // [Y, X] | |||||
| ext v3.16b, v2.16b, v2.16b, #8 // [X, Y] | |||||
| fmul v4.2d, v2.2d, v0.2d // [C*Y, C*X] | |||||
| fmla v4.2d, v3.2d, v1.2d // [C*Y - S*X, C*X + S*Y] | |||||
| st1 {v4.d}[0], [X], INC_X | |||||
| st1 {v4.d}[1], [Y], INC_Y | |||||
| #endif | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* Entry point: applies X' = C*X + S*Y, Y' = C*Y - S*X to N element pairs; returns 0 in w0. */ | |||||
| cmp N, xzr | |||||
| ble rot_kernel_L999 /* N <= 0: nothing to do */ | |||||
| INIT /* v0 = [C, C] */ | |||||
| cmp INC_X, #1 | |||||
| bne rot_kernel_S_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne rot_kernel_S_BEGIN /* contiguous path needs both strides equal to 1 */ | |||||
| rot_kernel_F_BEGIN: /* contiguous path */ | |||||
| asr I, N, #2 /* I = number of groups of four */ | |||||
| cmp I, xzr | |||||
| beq rot_kernel_F1 | |||||
| KERNEL_INIT_F4 | |||||
| rot_kernel_F4: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne rot_kernel_F4 | |||||
| rot_kernel_F1: /* contiguous remainder: N & 3 pairs */ | |||||
| ands I, N, #3 | |||||
| ble rot_kernel_L999 /* ands result is never negative, so this branches only on zero */ | |||||
| INIT_F1 /* switch v1 from broadcast S to [-S, S] for KERNEL_F1 */ | |||||
| rot_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne rot_kernel_F10 | |||||
| mov w0, wzr | |||||
| ret | |||||
| rot_kernel_S_BEGIN: /* strided path */ | |||||
| INIT_S /* strides to bytes */ | |||||
| INIT_F1 /* v1 = [-S, S] */ | |||||
| asr I, N, #2 | |||||
| cmp I, xzr | |||||
| ble rot_kernel_S1 | |||||
| rot_kernel_S4: /* KERNEL_S1 unrolled four times */ | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne rot_kernel_S4 | |||||
| rot_kernel_S1: /* strided remainder: N & 3 pairs */ | |||||
| ands I, N, #3 | |||||
| ble rot_kernel_L999 | |||||
| rot_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne rot_kernel_S10 | |||||
| rot_kernel_L999: /* common exit */ | |||||
| mov w0, wzr | |||||
| ret | |||||
| EPILOGUE /* closes PROLOGUE, matching the other kernels */ | |||||
| @@ -0,0 +1,253 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length */ | |||||
| #define X x3 /* X vector address */ | |||||
| #define X_COPY x5 /* second pointer into X: store address used by KERNEL_S4 */ | |||||
| #define INC_X x4 /* X stride */ | |||||
| #define I x1 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if !defined(DOUBLE) | |||||
| #define DA s0 /* scale input value */ | |||||
| #define DAV {v0.s}[0] | |||||
| #define TMPF s1 | |||||
| #define TMPVF {v1.s}[0] | |||||
| #define SZ 4 | |||||
| #else | |||||
| #define DA d0 /* scale input value */ | |||||
| #define DAV {v0.d}[0] | |||||
| #define TMPF d1 | |||||
| #define TMPVF {v1.d}[0] | |||||
| #define SZ 8 | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| .macro KERNEL_F1 /* Scale one contiguous element in place: X[0] *= DA, then advance X by SZ bytes. */ | |||||
| ldr TMPF, [X] | |||||
| fmul TMPF, TMPF, DA | |||||
| str TMPF, [X], #SZ | |||||
| .endm | |||||
| .macro KERNEL_INIT_F8 /* Broadcast DA (lane 0 of v0) to every lane of v0 for the vector multiply in KERNEL_F8. */ | |||||
| #if !defined(DOUBLE) | |||||
| ins v0.s[1], v0.s[0] | |||||
| ins v0.s[2], v0.s[0] | |||||
| ins v0.s[3], v0.s[0] | |||||
| #else | |||||
| ins v0.d[1], v0.d[0] | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F8 /* Scale eight contiguous elements in place by the broadcast v0; advances X by eight elements. */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v1.4s, v2.4s}, [X] /* 8 floats = 32 bytes */ | |||||
| fmul v1.4s, v1.4s, v0.4s | |||||
| fmul v2.4s, v2.4s, v0.4s | |||||
| st1 {v1.4s, v2.4s}, [X], #32 | |||||
| #else // DOUBLE | |||||
| ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X] /* 8 doubles = 64 bytes */ | |||||
| fmul v1.2d, v1.2d, v0.2d | |||||
| fmul v2.2d, v2.2d, v0.2d | |||||
| fmul v3.2d, v3.2d, v0.2d | |||||
| fmul v4.2d, v4.2d, v0.2d | |||||
| st1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 | |||||
| #endif | |||||
| PRFM PLDL1KEEP, [X, #1024] /* prefetch 1024 bytes ahead, both precisions */ | |||||
| .endm | |||||
| .macro INIT_S /* Scale INC_X from an element count to a byte offset. */ | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #2 /* x4 bytes per float */ | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 /* x8 bytes per double */ | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_S1 /* Scale one strided element in place: X[0] *= DA, then advance X by INC_X bytes. */ | |||||
| ldr TMPF, [X] | |||||
| fmul TMPF, TMPF, DA | |||||
| st1 TMPVF, [X], INC_X /* TMPVF is lane 0 of v1, i.e. the same register as TMPF */ | |||||
| .endm | |||||
| .macro KERNEL_S4 /* Scale four strided elements: loads go through X, stores through X_COPY; both advance by INC_X bytes per element. */ | |||||
| #if !defined(DOUBLE) | |||||
| ldr s1, [X] | |||||
| add X, X, INC_X | |||||
| fmul s1, s1, s0 /* s0 is DA */ | |||||
| str s1, [X_COPY] | |||||
| add X_COPY, X_COPY, INC_X | |||||
| ldr s2, [X] | |||||
| add X, X, INC_X | |||||
| fmul s2, s2, s0 | |||||
| str s2, [X_COPY] | |||||
| add X_COPY, X_COPY, INC_X | |||||
| ldr s3, [X] | |||||
| add X, X, INC_X | |||||
| fmul s3, s3, s0 | |||||
| str s3, [X_COPY] | |||||
| add X_COPY, X_COPY, INC_X | |||||
| ldr s4, [X] | |||||
| add X, X, INC_X | |||||
| fmul s4, s4, s0 | |||||
| str s4, [X_COPY] | |||||
| add X_COPY, X_COPY, INC_X | |||||
| #else | |||||
| ldr d1, [X] | |||||
| add X, X, INC_X | |||||
| fmul d1, d1, d0 /* d0 is DA */ | |||||
| str d1, [X_COPY] | |||||
| add X_COPY, X_COPY, INC_X | |||||
| ldr d2, [X] | |||||
| add X, X, INC_X | |||||
| fmul d2, d2, d0 | |||||
| str d2, [X_COPY] | |||||
| add X_COPY, X_COPY, INC_X | |||||
| ldr d3, [X] | |||||
| add X, X, INC_X | |||||
| fmul d3, d3, d0 | |||||
| str d3, [X_COPY] | |||||
| add X_COPY, X_COPY, INC_X | |||||
| ldr d4, [X] | |||||
| add X, X, INC_X | |||||
| fmul d4, d4, d0 | |||||
| str d4, [X_COPY] | |||||
| add X_COPY, X_COPY, INC_X | |||||
| #endif | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* Entry point: multiplies N elements of X by DA in place; returns 0 in w0. */ | |||||
| cmp N, xzr | |||||
| ble scal_kernel_L999 /* N <= 0: nothing to do */ | |||||
| fcmp DA, #0.0 | |||||
| beq scal_kernel_zero /* DA compares equal to zero: store DA instead of multiplying */ | |||||
| cmp INC_X, #1 | |||||
| bne scal_kernel_S_BEGIN | |||||
| scal_kernel_F_BEGIN: /* contiguous path */ | |||||
| asr I, N, #3 /* I = number of groups of eight */ | |||||
| cmp I, xzr | |||||
| beq scal_kernel_F1 | |||||
| KERNEL_INIT_F8 | |||||
| scal_kernel_F8: | |||||
| KERNEL_F8 | |||||
| subs I, I, #1 | |||||
| bne scal_kernel_F8 | |||||
| scal_kernel_F1: /* contiguous remainder: N & 7 elements */ | |||||
| ands I, N, #7 | |||||
| ble scal_kernel_L999 /* ands result is never negative, so this branches only on zero */ | |||||
| scal_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne scal_kernel_F10 | |||||
| mov w0, wzr | |||||
| ret | |||||
| scal_kernel_S_BEGIN: /* strided path */ | |||||
| INIT_S | |||||
| mov X_COPY, X /* KERNEL_S4 stores through X_COPY */ | |||||
| asr I, N, #2 /* I = number of groups of four */ | |||||
| cmp I, xzr | |||||
| ble scal_kernel_S1 | |||||
| scal_kernel_S4: | |||||
| KERNEL_S4 | |||||
| subs I, I, #1 | |||||
| bne scal_kernel_S4 | |||||
| scal_kernel_S1: /* strided remainder: N & 3 elements, using X only */ | |||||
| ands I, N, #3 | |||||
| ble scal_kernel_L999 | |||||
| scal_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne scal_kernel_S10 | |||||
| scal_kernel_L999: /* common exit */ | |||||
| mov w0, wzr | |||||
| ret | |||||
| scal_kernel_zero: /* DA == 0: overwrite each element with DA, for any stride */ | |||||
| INIT_S | |||||
| scal_kernel_Z1: | |||||
| st1 DAV, [X], INC_X | |||||
| subs N, N, #1 | |||||
| bne scal_kernel_Z1 | |||||
| mov w0, wzr | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,178 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #define TMPF s6 | |||||
| #define SSQ s0 | |||||
| #define TMPVF {v6.s}[0] | |||||
| #define SZ 4 | |||||
| /******************************************************************************/ | |||||
| .macro INIT_F1 /* Start the accumulator from one contiguous element: SSQ = x*x; advances X by SZ bytes. */ | |||||
| ldr TMPF, [X], #SZ | |||||
| fmul SSQ, TMPF, TMPF | |||||
| .endm | |||||
| .macro KERNEL_F1 /* Accumulate one contiguous element: SSQ += x*x; advances X by SZ bytes. */ | |||||
| ldr TMPF, [X], #SZ | |||||
| fmul TMPF, TMPF, TMPF | |||||
| fadd SSQ, SSQ, TMPF | |||||
| .endm | |||||
| .macro INIT_F4 /* Start the accumulator from four contiguous floats: SSQ = sum of their squares. */ | |||||
| ld1 {v1.4s}, [X], #16 | |||||
| fmul v1.4s, v1.4s, v1.4s /* square each lane */ | |||||
| ext v2.16b, v1.16b, v1.16b, #8 /* bring the upper two lanes down */ | |||||
| fadd v2.2s, v1.2s, v2.2s /* lanes 0+2 and 1+3 */ | |||||
| faddp SSQ, v2.2s /* final pairwise add */ | |||||
| .endm | |||||
| .macro KERNEL_F4 /* Accumulate four contiguous floats: SSQ += sum of their squares. */ | |||||
| ld1 {v1.4s}, [X], #16 | |||||
| fmul v1.4s, v1.4s, v1.4s /* square each lane */ | |||||
| ext v2.16b, v1.16b, v1.16b, #8 /* bring the upper two lanes down */ | |||||
| fadd v2.2s, v1.2s, v2.2s /* lanes 0+2 and 1+3 */ | |||||
| faddp TMPF, v2.2s /* horizontal sum of the group */ | |||||
| fadd SSQ, SSQ, TMPF | |||||
| .endm | |||||
| .macro INIT_S /* Strided setup: scale INC_X to bytes, then start SSQ from the first element. */ | |||||
| lsl INC_X, INC_X, #2 /* x4 bytes per float */ | |||||
| ld1 TMPVF, [X], INC_X /* loads lane 0 of v6, which is TMPF */ | |||||
| fmul SSQ, TMPF, TMPF | |||||
| .endm | |||||
| .macro KERNEL_S1 /* Accumulate one strided element: SSQ += x*x; advances X by INC_X bytes. */ | |||||
| ld1 TMPVF, [X], INC_X | |||||
| fmul TMPF, TMPF, TMPF | |||||
| fadd SSQ, SSQ, TMPF | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* Entry point: returns sqrt(sum of x*x) in SSQ (s0); returns 0 when N <= 0 or INC_X <= 0. */ | |||||
| cmp N, xzr | |||||
| ble nrm2_kernel_zero | |||||
| cmp INC_X, xzr | |||||
| ble nrm2_kernel_zero | |||||
| cmp INC_X, #1 | |||||
| bne nrm2_kernel_S_BEGIN | |||||
| nrm2_kernel_F_BEGIN: /* contiguous path */ | |||||
| asr I, N, #2 /* I = number of groups of four */ | |||||
| cmp I, xzr | |||||
| beq nrm2_kernel_F1_INIT /* fewer than four elements: seed from a single element */ | |||||
| INIT_F4 /* first group seeds SSQ */ | |||||
| subs I, I, #1 | |||||
| beq nrm2_kernel_F1 | |||||
| nrm2_kernel_F4: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_F4 | |||||
| nrm2_kernel_F1: /* contiguous remainder: N & 3 elements */ | |||||
| ands I, N, #3 | |||||
| ble nrm2_kernel_L999 /* ands result is never negative, so this branches only on zero */ | |||||
| nrm2_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_F10 | |||||
| b nrm2_kernel_L999 | |||||
| nrm2_kernel_F1_INIT: | |||||
| INIT_F1 | |||||
| subs N, N, #1 /* one element already consumed by INIT_F1 */ | |||||
| b nrm2_kernel_F1 | |||||
| nrm2_kernel_S_BEGIN: /* strided path */ | |||||
| INIT_S | |||||
| subs N, N, #1 /* one element already consumed by INIT_S */ | |||||
| ble nrm2_kernel_L999 | |||||
| asr I, N, #2 | |||||
| cmp I, xzr | |||||
| ble nrm2_kernel_S1 | |||||
| nrm2_kernel_S4: /* KERNEL_S1 unrolled four times */ | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_S4 | |||||
| nrm2_kernel_S1: /* strided remainder: N & 3 elements */ | |||||
| ands I, N, #3 | |||||
| ble nrm2_kernel_L999 | |||||
| nrm2_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_S10 | |||||
| nrm2_kernel_L999: /* common exit: take the square root of the accumulated sum */ | |||||
| fsqrt SSQ, SSQ | |||||
| ret | |||||
| nrm2_kernel_zero: /* degenerate arguments: return 0 */ | |||||
| fmov SSQ, wzr | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,266 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length */ | |||||
| #define X x3 /* X vector address */ | |||||
| #define INC_X x4 /* X stride */ | |||||
| #define Y x5 /* Y vector address */ | |||||
| #define INC_Y x6 /* Y stride */ | |||||
| #define I x1 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if !defined(DOUBLE) | |||||
| #define TMP0 s0 | |||||
| #define TMPV0 {v0.s}[0] | |||||
| #define TMP1 s1 | |||||
| #define TMPV1 {v1.s}[0] | |||||
| #define SZ 4 | |||||
| #else | |||||
| #define TMP0 d0 | |||||
| #define TMPV0 {v0.d}[0] | |||||
| #define TMP1 d1 | |||||
| #define TMPV1 {v1.d}[0] | |||||
| #define SZ 8 | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| .macro KERNEL_F1 /* Swap one contiguous element between X and Y and advance both pointers by one element. */ | |||||
| #if !defined(COMPLEX) | |||||
| ldr TMP0, [X] | |||||
| ldr TMP1, [Y] | |||||
| str TMP0, [Y], #SZ | |||||
| str TMP1, [X], #SZ | |||||
| #else | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v0.2s}, [X] /* COMPLEX single: 8 bytes per element */ | |||||
| ld1 {v1.2s}, [Y] | |||||
| st1 {v0.2s}, [Y], #8 | |||||
| st1 {v1.2s}, [X], #8 | |||||
| #else | |||||
| ld1 {v0.2d}, [X] /* COMPLEX double: 16 bytes per element */ | |||||
| ld1 {v1.2d}, [Y] | |||||
| st1 {v0.2d}, [Y], #16 | |||||
| st1 {v1.2d}, [X], #16 | |||||
| #endif | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F8 /* Swap eight contiguous elements between X and Y in 32-byte chunks; the chunk count depends on element size. */ | |||||
| #if !defined(COMPLEX) | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v0.4s, v1.4s}, [X] /* 8 floats = one 32-byte chunk */ | |||||
| ld1 {v2.4s, v3.4s}, [Y] | |||||
| st1 {v0.4s, v1.4s}, [Y], #32 | |||||
| st1 {v2.4s, v3.4s}, [X], #32 | |||||
| #else // DOUBLE | |||||
| ld1 {v0.4s, v1.4s}, [X] /* 8 doubles = two 32-byte chunks; .4s is used as a plain byte copy */ | |||||
| ld1 {v2.4s, v3.4s}, [Y] | |||||
| st1 {v0.4s, v1.4s}, [Y], #32 | |||||
| st1 {v2.4s, v3.4s}, [X], #32 | |||||
| ld1 {v0.4s, v1.4s}, [X] | |||||
| ld1 {v2.4s, v3.4s}, [Y] | |||||
| st1 {v0.4s, v1.4s}, [Y], #32 | |||||
| st1 {v2.4s, v3.4s}, [X], #32 | |||||
| #endif | |||||
| #else // COMPLEX | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v0.4s, v1.4s}, [X] /* 8 elements of 8 bytes = two 32-byte chunks */ | |||||
| ld1 {v2.4s, v3.4s}, [Y] | |||||
| st1 {v0.4s, v1.4s}, [Y], #32 | |||||
| st1 {v2.4s, v3.4s}, [X], #32 | |||||
| ld1 {v0.4s, v1.4s}, [X] | |||||
| ld1 {v2.4s, v3.4s}, [Y] | |||||
| st1 {v0.4s, v1.4s}, [Y], #32 | |||||
| st1 {v2.4s, v3.4s}, [X], #32 | |||||
| #else // DOUBLE | |||||
| ld1 {v0.4s, v1.4s}, [X] /* 8 elements of 16 bytes = four 32-byte chunks */ | |||||
| ld1 {v2.4s, v3.4s}, [Y] | |||||
| st1 {v0.4s, v1.4s}, [Y], #32 | |||||
| st1 {v2.4s, v3.4s}, [X], #32 | |||||
| ld1 {v0.4s, v1.4s}, [X] | |||||
| ld1 {v2.4s, v3.4s}, [Y] | |||||
| st1 {v0.4s, v1.4s}, [Y], #32 | |||||
| st1 {v2.4s, v3.4s}, [X], #32 | |||||
| ld1 {v0.4s, v1.4s}, [X] | |||||
| ld1 {v2.4s, v3.4s}, [Y] | |||||
| st1 {v0.4s, v1.4s}, [Y], #32 | |||||
| st1 {v2.4s, v3.4s}, [X], #32 | |||||
| ld1 {v0.4s, v1.4s}, [X] | |||||
| ld1 {v2.4s, v3.4s}, [Y] | |||||
| st1 {v0.4s, v1.4s}, [Y], #32 | |||||
| st1 {v2.4s, v3.4s}, [X], #32 | |||||
| #endif | |||||
| #endif | |||||
| .endm | |||||
| .macro INIT_S /* Scale INC_X and INC_Y from element counts to byte offsets for the configured element size. */ | |||||
| #if !defined(COMPLEX) | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #2 /* 4 bytes per element */ | |||||
| lsl INC_Y, INC_Y, #2 | |||||
| #else | |||||
| lsl INC_X, INC_X, #3 /* 8 bytes per element */ | |||||
| lsl INC_Y, INC_Y, #3 | |||||
| #endif | |||||
| #else | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 /* 8 bytes per element */ | |||||
| lsl INC_Y, INC_Y, #3 | |||||
| #else | |||||
| lsl INC_X, INC_X, #4 /* 16 bytes per element */ | |||||
| lsl INC_Y, INC_Y, #4 | |||||
| #endif | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_S1 /* Swap one strided element through general-purpose registers w10-w11 / x10-x13, then advance X and Y by their byte strides. */ | |||||
| #if !defined(COMPLEX) | |||||
| #if !defined(DOUBLE) | |||||
| ldr w10, [X] /* 4-byte element */ | |||||
| ldr w11, [Y] | |||||
| str w10, [Y] | |||||
| str w11, [X] | |||||
| #else | |||||
| ldr x10, [X] /* 8-byte element */ | |||||
| ldr x11, [Y] | |||||
| str x10, [Y] | |||||
| str x11, [X] | |||||
| #endif | |||||
| #else | |||||
| #if !defined(DOUBLE) | |||||
| ldr x10, [X] /* 8-byte element moved as one 64-bit word */ | |||||
| ldr x11, [Y] | |||||
| str x10, [Y] | |||||
| str x11, [X] | |||||
| #else | |||||
| ldr x10, [X] /* 16-byte element moved as two 64-bit words */ | |||||
| ldr x11, [Y] | |||||
| str x10, [Y] | |||||
| str x11, [X] | |||||
| ldr x12, [X, #8] | |||||
| ldr x13, [Y, #8] | |||||
| str x12, [Y, #8] | |||||
| str x13, [X, #8] | |||||
| #endif | |||||
| #endif | |||||
| add Y, Y, INC_Y | |||||
| add X, X, INC_X | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* Entry point: exchanges N elements between X and Y; returns 0 in w0. */ | |||||
| cmp N, xzr | |||||
| ble swap_kernel_L999 /* N <= 0: nothing to do */ | |||||
| cmp INC_X, #1 | |||||
| bne swap_kernel_S_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne swap_kernel_S_BEGIN /* contiguous path needs both strides equal to 1 */ | |||||
| swap_kernel_F_BEGIN: /* contiguous path */ | |||||
| asr I, N, #3 /* I = number of groups of eight */ | |||||
| cmp I, xzr | |||||
| beq swap_kernel_F1 | |||||
| swap_kernel_F8: | |||||
| KERNEL_F8 | |||||
| subs I, I, #1 | |||||
| bne swap_kernel_F8 | |||||
| swap_kernel_F1: /* contiguous remainder: N & 7 elements */ | |||||
| ands I, N, #7 | |||||
| ble swap_kernel_L999 /* ands result is never negative, so this branches only on zero */ | |||||
| swap_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne swap_kernel_F10 | |||||
| b swap_kernel_L999 | |||||
| swap_kernel_S_BEGIN: /* strided path */ | |||||
| INIT_S | |||||
| asr I, N, #2 /* I = number of groups of four */ | |||||
| cmp I, xzr | |||||
| ble swap_kernel_S1 | |||||
| swap_kernel_S4: /* KERNEL_S1 unrolled four times */ | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne swap_kernel_S4 | |||||
| swap_kernel_S1: /* strided remainder: N & 3 elements */ | |||||
| ands I, N, #3 | |||||
| ble swap_kernel_L999 | |||||
| swap_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne swap_kernel_S10 | |||||
| swap_kernel_L999: /* common exit */ | |||||
| mov w0, wzr | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,273 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if defined(USE_MIN) | |||||
| #define COND le | |||||
| #else | |||||
| #define COND ge | |||||
| #endif | |||||
| #if !defined(DOUBLE) | |||||
| #define REG0 wzr | |||||
| #define MAXF s0 | |||||
| #define TMPF s1 | |||||
| #define TMPVF {v1.s}[0] | |||||
| #define SZ 4 | |||||
| #else | |||||
| #define REG0 xzr | |||||
| #define MAXF d0 | |||||
| #define TMPF d1 | |||||
| #define TMPVF {v1.d}[0] | |||||
| #define SZ 8 | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| .macro INIT_F1 /* Seed MAXF from the first contiguous element: sum of the absolute values of its two components (presumably re/im -- confirm). */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v0.2s}, [X], #8 | |||||
| fabs v0.2s, v0.2s | |||||
| ext v1.8b, v0.8b, v0.8b, #4 /* v1 lane 0 = second component */ | |||||
| fadd MAXF, s0, s1 | |||||
| #else | |||||
| ld1 {v0.2d}, [X], #16 | |||||
| fabs v0.2d, v0.2d | |||||
| faddp MAXF, v0.2d /* pairwise add of the two lanes */ | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F1 /* Fold one contiguous element into MAXF: TMPF = sum of absolute component values, then keep MAXF or TMPF per COND. */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v1.2s}, [X], #8 | |||||
| fabs v1.2s, v1.2s | |||||
| ext v2.8b, v1.8b, v1.8b, #4 /* v2 lane 0 = second component */ | |||||
| fadd TMPF, s1, s2 | |||||
| #else | |||||
| ld1 {v1.2d}, [X], #16 | |||||
| fabs v1.2d, v1.2d | |||||
| faddp TMPF, v1.2d | |||||
| #endif | |||||
| fcmp MAXF, TMPF | |||||
| fcsel MAXF, MAXF, TMPF, COND /* COND is le with USE_MIN, ge otherwise */ | |||||
| .endm | |||||
| .macro INIT_F4 /* Seed MAXF from the first four contiguous elements: extreme of the per-element sums of absolute component values. */ | |||||
| #if !defined(DOUBLE) | |||||
| ld2 {v0.4s,v1.4s}, [X], #32 /* de-interleave even and odd values */ | |||||
| fabs v0.4s, v0.4s // [X6, X4, X2, X0] | |||||
| fabs v1.4s, v1.4s // [X7, X5, X3, X1] | |||||
| fadd v0.4s, v0.4s, v1.4s // [X7+X6, X5+X4, X3+X2, X1+X0] | |||||
| #if defined(USE_MIN) | |||||
| fminv MAXF, v0.4s | |||||
| #else | |||||
| fmaxv MAXF, v0.4s | |||||
| #endif | |||||
| #else // DOUBLE | |||||
| ld4 {v0.2d,v1.2d,v2.2d,v3.2d}, [X], #64 /* v0/v1 hold the components of elements 0 and 2, v2/v3 of elements 1 and 3 */ | |||||
| fabs v0.2d, v0.2d | |||||
| fabs v1.2d, v1.2d | |||||
| fabs v2.2d, v2.2d | |||||
| fabs v3.2d, v3.2d | |||||
| fadd v0.2d, v0.2d, v1.2d /* sums for elements 0 and 2 */ | |||||
| fadd v2.2d, v2.2d, v3.2d /* sums for elements 1 and 3 */ | |||||
| #if defined(USE_MIN) | |||||
| fmin v0.2d, v0.2d, v2.2d | |||||
| fminp MAXF, v0.2d | |||||
| #else | |||||
| fmax v0.2d, v0.2d, v2.2d | |||||
| fmaxp MAXF, v0.2d | |||||
| #endif | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F4 /* Fold the next four contiguous elements into MAXF: TMPF = extreme of the group, then keep MAXF or TMPF per COND. */ | |||||
| #if !defined(DOUBLE) | |||||
| ld2 {v1.4s,v2.4s}, [X], #32 /* de-interleave even and odd values */ | |||||
| fabs v1.4s, v1.4s // [X6, X4, X2, X0] | |||||
| fabs v2.4s, v2.4s // [X7, X5, X3, X1] | |||||
| fadd v1.4s, v1.4s, v2.4s // [X7+X6, X5+X4, X3+X2, X1+X0] | |||||
| #if defined(USE_MIN) | |||||
| fminv TMPF, v1.4s | |||||
| #else | |||||
| fmaxv TMPF, v1.4s | |||||
| #endif | |||||
| #else // DOUBLE | |||||
| ld4 {v1.2d,v2.2d,v3.2d,v4.2d}, [X], #64 /* v1/v2 hold the components of elements 0 and 2, v3/v4 of elements 1 and 3 */ | |||||
| fabs v1.2d, v1.2d | |||||
| fabs v2.2d, v2.2d | |||||
| fabs v3.2d, v3.2d | |||||
| fabs v4.2d, v4.2d | |||||
| fadd v1.2d, v1.2d, v2.2d /* sums for elements 0 and 2 */ | |||||
| fadd v3.2d, v3.2d, v4.2d /* sums for elements 1 and 3 */ | |||||
| #if defined(USE_MIN) | |||||
| fmin v1.2d, v1.2d, v3.2d | |||||
| fminp TMPF, v1.2d /* group result goes to TMPF so the running MAXF is not overwritten */ | |||||
| #else | |||||
| fmax v1.2d, v1.2d, v3.2d | |||||
| fmaxp TMPF, v1.2d /* group result goes to TMPF so the running MAXF is not overwritten */ | |||||
| #endif | |||||
| #endif | |||||
| fcmp MAXF, TMPF | |||||
| fcsel MAXF, MAXF, TMPF, COND /* COND is le with USE_MIN, ge otherwise */ | |||||
| .endm | |||||
| .macro INIT_S /* Strided setup: scale INC_X from elements to bytes, then initialise MAXF with the sum of absolute values of the first pair at X. X advances by one (scaled) stride. Clobbers v0 (and v1 in single precision). */ | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 /* x8: two 4-byte floats per element */ | |||||
| ld1 {v0.2s}, [X], INC_X | |||||
| fabs v0.2s, v0.2s | |||||
| ext v1.8b, v0.8b, v0.8b, #4 /* rotate by 4 bytes so lane 0 of v1 holds the second value of the pair */ | |||||
| fadd MAXF, s0, s1 /* abs(first) + abs(second) */ | |||||
| #else | |||||
| lsl INC_X, INC_X, #4 /* x16: two 8-byte doubles per element */ | |||||
| ld1 {v0.2d}, [X], INC_X | |||||
| fabs v0.2d, v0.2d | |||||
| faddp MAXF, v0.2d /* abs(first) + abs(second) */ | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_S1 /* Strided step: load one pair at X (post-incrementing by INC_X, which INIT_S has already scaled to bytes), form the sum of its absolute values in TMPF, then keep MAXF when COND holds and take TMPF otherwise. Clobbers v1 (and v2 in single precision). */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v1.2s}, [X], INC_X | |||||
| fabs v1.2s, v1.2s | |||||
| ext v2.8b, v1.8b, v1.8b, #4 /* lane 0 of v2 = second value of the pair */ | |||||
| fadd TMPF, s1, s2 | |||||
| #else | |||||
| ld1 {v1.2d}, [X], INC_X | |||||
| fabs v1.2d, v1.2d | |||||
| faddp TMPF, v1.2d | |||||
| #endif | |||||
| fcmp MAXF, TMPF | |||||
| fcsel MAXF, MAXF, TMPF, COND | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* Entry point. Result is left in MAXF. NOTE(review): MAXF, TMPF, COND, REG0, INIT_F1 and KERNEL_F1 are defined above this excerpt. */ | |||||
| cmp N, xzr | |||||
| ble amax_kernel_zero /* N <= 0: return REG0 */ | |||||
| cmp INC_X, xzr | |||||
| ble amax_kernel_zero /* INC_X <= 0: return REG0 */ | |||||
| cmp INC_X, #1 | |||||
| bne amax_kernel_S_BEGIN /* non-unit stride takes the strided path */ | |||||
| amax_kernel_F_BEGIN: /* unit stride */ | |||||
| asr I, N, #2 /* I = N / 4 full chunks */ | |||||
| cmp I, xzr | |||||
| beq amax_kernel_F1_INIT /* fewer than 4 elements: seed from a single element instead */ | |||||
| INIT_F4 /* first chunk seeds MAXF */ | |||||
| subs I, I, #1 | |||||
| beq amax_kernel_F1 /* that was the only full chunk */ | |||||
| amax_kernel_F4: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne amax_kernel_F4 | |||||
| amax_kernel_F1: | |||||
| ands I, N, #3 /* leftover element count */ | |||||
| ble amax_kernel_L999 | |||||
| amax_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne amax_kernel_F10 | |||||
| ret | |||||
| amax_kernel_F1_INIT: | |||||
| INIT_F1 | |||||
| subs N, N, #1 /* one element consumed by INIT_F1; N was below 4, so N & 3 at F1 is the remaining count */ | |||||
| b amax_kernel_F1 | |||||
| amax_kernel_S_BEGIN: /* strided path */ | |||||
| INIT_S /* scales INC_X and seeds MAXF from the first element */ | |||||
| subs N, N, #1 /* one element consumed by INIT_S */ | |||||
| ble amax_kernel_L999 | |||||
| asr I, N, #2 /* groups of 4 among the remaining elements */ | |||||
| cmp I, xzr | |||||
| ble amax_kernel_S1 | |||||
| amax_kernel_S4: /* 4 elements per iteration */ | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne amax_kernel_S4 | |||||
| amax_kernel_S1: | |||||
| ands I, N, #3 /* leftover element count */ | |||||
| ble amax_kernel_L999 | |||||
| amax_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne amax_kernel_S10 | |||||
| amax_kernel_L999: | |||||
| ret | |||||
| amax_kernel_zero: /* degenerate arguments */ | |||||
| fmov MAXF, REG0 /* REG0 is defined outside this excerpt; presumably a zero register - confirm */ | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,164 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length */ | |||||
| #define X x1 /* X vector address; post-incremented by every load */ | |||||
| #define INC_X x2 /* X stride in elements; INIT_S scales it to bytes for the strided path */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #define REG0 xzr /* zero source used to clear SUMF on entry */ | |||||
| #define SUMF d0 /* running sum; the entry code returns with the result here */ | |||||
| #define TMPF d1 /* per-element scratch: sum of the two absolute values of one pair */ | |||||
| #define TMPVF {v1.d}[0] /* not referenced by the macros or code of this kernel */ | |||||
| #define SZ 8 /* not referenced by the macros or code of this kernel; presumably bytes per double */ | |||||
| /******************************************************************************/ | |||||
| .macro KERNEL_F1 /* Unit-stride step for one pair of doubles: SUMF += abs(first) + abs(second). X advances 16 bytes. Clobbers v1. */ | |||||
| ld1 {v1.2d}, [X], #16 | |||||
| fabs v1.2d, v1.2d | |||||
| faddp TMPF, v1.2d /* TMPF = sum of the two lanes */ | |||||
| fadd SUMF, SUMF, TMPF | |||||
| .endm | |||||
| .macro KERNEL_F4 /* Unit-stride step for 8 doubles (4 pairs): their absolute values are added into the two lanes of v0. The lanes are only combined into SUMF by KERNEL_F4_FINALIZE. X advances 64 bytes. Clobbers v1-v4. */ | |||||
| ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 | |||||
| fabs v1.2d, v1.2d | |||||
| fabs v2.2d, v2.2d | |||||
| fabs v3.2d, v3.2d | |||||
| fabs v4.2d, v4.2d | |||||
| fadd v1.2d, v1.2d, v2.2d /* two independent partial sums ... */ | |||||
| fadd v3.2d, v3.2d, v4.2d | |||||
| fadd v0.2d, v0.2d, v1.2d /* ... then both are added to the vector accumulator */ | |||||
| fadd v0.2d, v0.2d, v3.2d | |||||
| PRFM PLDL1KEEP, [X, #1024] /* prefetch hint for the address 1024 bytes past the updated X */ | |||||
| .endm | |||||
| .macro KERNEL_F4_FINALIZE /* Collapse the two-lane accumulator built by KERNEL_F4 into the scalar SUMF. */ | |||||
| faddp SUMF, v0.2d /* SUMF = v0 lane 0 + v0 lane 1 */ | |||||
| .endm | |||||
| .macro INIT_S /* Strided setup: scale INC_X from elements to bytes. */ | |||||
| lsl INC_X, INC_X, #4 /* x16: two 8-byte doubles per element */ | |||||
| .endm | |||||
| .macro KERNEL_S1 /* Strided step for one pair of doubles: SUMF += abs(first) + abs(second). X advances by INC_X, which INIT_S has already scaled to bytes. Clobbers v1. */ | |||||
| ld1 {v1.2d}, [X], INC_X | |||||
| fabs v1.2d, v1.2d | |||||
| faddp TMPF, v1.2d /* TMPF = sum of the two lanes */ | |||||
| fadd SUMF, SUMF, TMPF | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* Entry point: sums abs(first) + abs(second) over N pairs of doubles read from X with stride INC_X. Result is left in SUMF; it is zero when N <= 0 or INC_X <= 0. */ | |||||
| fmov SUMF, REG0 /* clear the sum; the vector loop also accumulates in v0, the register SUMF lives in */ | |||||
| cmp N, xzr | |||||
| ble asum_kernel_L999 | |||||
| cmp INC_X, xzr | |||||
| ble asum_kernel_L999 | |||||
| cmp INC_X, #1 | |||||
| bne asum_kernel_S_BEGIN /* non-unit stride takes the strided path */ | |||||
| asum_kernel_F_BEGIN: /* unit stride */ | |||||
| asr I, N, #2 /* I = N / 4 full chunks */ | |||||
| cmp I, xzr | |||||
| beq asum_kernel_F1 | |||||
| asum_kernel_F4: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_F4 | |||||
| KERNEL_F4_FINALIZE /* fold the vector accumulator into SUMF before the scalar tail */ | |||||
| asum_kernel_F1: | |||||
| ands I, N, #3 /* leftover element count */ | |||||
| ble asum_kernel_L999 | |||||
| asum_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_F10 | |||||
| asum_kernel_L999: | |||||
| ret | |||||
| asum_kernel_S_BEGIN: /* strided path */ | |||||
| INIT_S | |||||
| asr I, N, #2 /* I = N / 4 */ | |||||
| cmp I, xzr | |||||
| ble asum_kernel_S1 | |||||
| asum_kernel_S4: /* 4 elements per iteration */ | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_S4 | |||||
| asum_kernel_S1: | |||||
| ands I, N, #3 /* leftover element count */ | |||||
| ble asum_kernel_L999 | |||||
| asum_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne asum_kernel_S10 | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,301 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length */ | |||||
| #define X x3 /* X vector address */ | |||||
| #define INC_X x4 /* X stride in elements; INIT_S scales it to bytes */ | |||||
| #define Y x5 /* Y vector address; all stores go through this pointer */ | |||||
| #define INC_Y x6 /* Y stride in elements; INIT_S scales it to bytes */ | |||||
| #define I x1 /* loop variable */ | |||||
| #define Y_COPY x7 /* second Y pointer: load cursor used by the DOUBLE branch of KERNEL_F4 */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if !defined(DOUBLE) | |||||
| #define DA_R s0 /* scale input value */ | |||||
| #define DA_I s1 /* scale input value */ | |||||
| #define TMPX v2.2s /* not referenced by name in this kernel; the macros spell out v2 directly */ | |||||
| #define TMPY v3.2s /* not referenced by name in this kernel; the macros spell out v3 directly */ | |||||
| #define SZ 4 /* not referenced in this kernel */ | |||||
| #else | |||||
| #define DA_R d0 /* scale input value */ | |||||
| #define DA_I d1 /* scale input value */ | |||||
| #define TMPX v2.2d /* not referenced by name in this kernel; the macros spell out v2 directly */ | |||||
| #define TMPY v3.2d /* not referenced by name in this kernel; the macros spell out v3 directly */ | |||||
| #define SZ 8 /* not referenced in this kernel */ | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| .macro INIT /* Turn the scalars DA_R (lane 0 of v0) and DA_I (lane 0 of v1) into the two-lane multiplier vectors consumed by KERNEL_F1 and KERNEL_S1. Without CONJ the sign flip is applied to DA_I, with CONJ to DA_R. Lane listings in the comments below are written high lane first. Clobbers lane 0 of v2. */ | |||||
| #if !defined(CONJ) | |||||
| #if !defined(DOUBLE) | |||||
| ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R | |||||
| fneg s2, DA_I | |||||
| ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I | |||||
| ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I | |||||
| #else | |||||
| ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R | |||||
| fneg d2, DA_I | |||||
| ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I | |||||
| ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I | |||||
| #endif | |||||
| #else | |||||
| #if !defined(DOUBLE) | |||||
| fneg s2, DA_R | |||||
| ins v0.s[1], v2.s[0] // v0 = -DA_R, DA_R | |||||
| ins v1.s[1], v1.s[0] // v1 = DA_I, DA_I | |||||
| #else | |||||
| fneg d2, DA_R | |||||
| ins v0.d[1], v2.d[0] // v0 = -DA_R, DA_R | |||||
| ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I | |||||
| #endif | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F1 /* Unit-stride update of one two-value element of Y in place: Y += v0 * X + v1 * swap(X), where swap exchanges the two values of the X element and v0/v1 are the multiplier vectors prepared by INIT. X and Y each advance by one element. Clobbers v2-v4. */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.2s}, [X], #8 // V2 = X[ix+1], X[ix]; X += 2 | |||||
| ld1 {v3.2s}, [Y] // V3 = Y[iy+1], Y[iy] | |||||
| ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1] | |||||
| fmla v3.2s, v0.2s, v2.2s // Y[iy] += DA_R * X[ix] | |||||
| // Y[iy+1] += +-DA_R * X[ix+1] | |||||
| fmla v3.2s, v1.2s, v4.2s // Y[iy] += +-DA_I * X[ix+1] | |||||
| // Y[iy+1] += DA_I * X[ix] | |||||
| st1 {v3.2s}, [Y], #8 /* write back and advance Y by 2 floats */ | |||||
| #else | |||||
| ld1 {v2.2d}, [X], #16 // V2 = X[ix+1], X[ix]; X += 2 | |||||
| ld1 {v3.2d}, [Y] // V3 = Y[iy+1], Y[iy] | |||||
| ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1] | |||||
| fmla v3.2d, v0.2d, v2.2d // Y[iy] += DA_R * X[ix] | |||||
| // Y[iy+1] += +-DA_R * X[ix+1] | |||||
| fmla v3.2d, v1.2d, v4.2d // Y[iy] += +-DA_I * X[ix+1] | |||||
| // Y[iy+1] += DA_I * X[ix] | |||||
| st1 {v3.2d}, [Y], #16 /* write back and advance Y by 2 doubles */ | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_INIT_F4 /* One-time widening before the KERNEL_F4 loop. In single precision the low 64 bits of v0 and v1 (the lane pairs built by INIT) are copied into the high 64 bits so the .4s multiplies cover two elements at once. Nothing is emitted when DOUBLE is defined. */ | |||||
| #if !defined(DOUBLE) | |||||
| // Replicate the lower 2 floats into the upper 2 slots | |||||
| ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R | |||||
| ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F4 /* Unit-stride update of 4 two-value elements of Y per invocation, using the multiplier vectors in v0/v1. Single precision loads and stores Y through Y. DOUBLE loads Y through Y_COPY and stores through Y, so both pointers advance 64 bytes. Clobbers v2-v7 (single) or v2-v5 and v16-v23 (DOUBLE). */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.4s,v3.4s}, [X], #32 // V2 = X[3], X[2], X[1], X[0] | |||||
| // V3 = X[7], X[6], X[5], X[4] | |||||
| ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1] | |||||
| ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1] | |||||
| ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1] | |||||
| ld1 {v4.4s,v5.4s}, [Y] // V4 = Y[3], Y[2], Y[1], Y[0] | |||||
| // V5 = Y[7], Y[6], Y[5], Y[4] | |||||
| ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5] | |||||
| ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5] | |||||
| ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5] | |||||
| fmla v4.4s, v0.4s, v2.4s // Y[iy] += DA_R * X[ix] | |||||
| // Y[iy+1] += +-DA_R * X[ix+1] | |||||
| fmla v4.4s, v1.4s, v6.4s // Y[iy] += +-DA_I * X[ix+1] | |||||
| // Y[iy+1] += DA_I * X[ix] | |||||
| st1 {v4.4s}, [Y], #16 /* first two elements written back */ | |||||
| fmla v5.4s, v0.4s, v3.4s // Y[iy] += DA_R * X[ix] | |||||
| fmla v5.4s, v1.4s, v7.4s // Y[iy] += +-DA_I * X[ix+1] | |||||
| // Y[iy+1] += +-DA_R * X[ix+1] | |||||
| // Y[iy+1] += DA_I * X[ix] | |||||
| st1 {v5.4s}, [Y], #16 /* last two elements written back */ | |||||
| #else // DOUBLE | |||||
| ld1 {v2.2d,v3.2d}, [X], #32 // first 4 doubles of X (elements 0 and 1) | |||||
| ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1] | |||||
| ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1] | |||||
| ld1 {v4.2d,v5.2d}, [X], #32 // next 4 doubles of X (elements 2 and 3) | |||||
| ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1] | |||||
| ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1] | |||||
| ld1 {v16.2d,v17.2d}, [Y_COPY], #32 // first 4 doubles of Y (elements 0 and 1) | |||||
| fmla v16.2d, v0.2d, v2.2d /* v0 times X as loaded */ | |||||
| fmla v17.2d, v0.2d, v3.2d | |||||
| ld1 {v18.2d,v19.2d}, [Y_COPY], #32 // next 4 doubles of Y (elements 2 and 3) | |||||
| fmla v16.2d, v1.2d, v20.2d /* v1 times X with its two values swapped */ | |||||
| fmla v17.2d, v1.2d, v21.2d | |||||
| st1 {v16.2d,v17.2d}, [Y], #32 | |||||
| fmla v18.2d, v0.2d, v4.2d | |||||
| fmla v19.2d, v0.2d, v5.2d | |||||
| fmla v18.2d, v1.2d, v22.2d | |||||
| fmla v19.2d, v1.2d, v23.2d | |||||
| st1 {v18.2d,v19.2d}, [Y], #32 | |||||
| #endif | |||||
| PRFM PLDL1KEEP, [X, #512] /* prefetch hints 512 bytes past the updated X and Y */ | |||||
| PRFM PLDL1KEEP, [Y, #512] | |||||
| .endm | |||||
| .macro INIT_S /* Strided setup: scale INC_X and INC_Y from elements to bytes. */ | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 /* x8: two 4-byte floats per element */ | |||||
| lsl INC_Y, INC_Y, #3 | |||||
| #else | |||||
| lsl INC_X, INC_X, #4 /* x16: two 8-byte doubles per element */ | |||||
| lsl INC_Y, INC_Y, #4 | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_S1 /* Strided update of one two-value element of Y in place: Y += v0 * X + v1 * swap(X), with v0/v1 as prepared by INIT. X and Y advance by INC_X and INC_Y, which INIT_S has already scaled to bytes. Clobbers v2-v4. */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.2s}, [X], INC_X // V2 = X[ix+1], X[ix]; X += INC_X bytes | |||||
| ld1 {v3.2s}, [Y] // V3 = Y[iy+1], Y[iy] | |||||
| ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1] | |||||
| fmla v3.2s, v0.2s, v2.2s // Y[iy] += DA_R * X[ix] | |||||
| // Y[iy+1] += +-DA_R * X[ix+1] | |||||
| fmla v3.2s, v1.2s, v4.2s // Y[iy] += +-DA_I * X[ix+1] | |||||
| // Y[iy+1] += DA_I * X[ix] | |||||
| st1 {v3.2s}, [Y], INC_Y /* write back and advance Y by one stride */ | |||||
| #else | |||||
| ld1 {v2.2d}, [X], INC_X // V2 = X[ix+1], X[ix]; X += INC_X bytes | |||||
| ld1 {v3.2d}, [Y] // V3 = Y[iy+1], Y[iy] | |||||
| ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1] | |||||
| fmla v3.2d, v0.2d, v2.2d // Y[iy] += DA_R * X[ix] | |||||
| // Y[iy+1] += +-DA_R * X[ix+1] | |||||
| fmla v3.2d, v1.2d, v4.2d // Y[iy] += +-DA_I * X[ix+1] | |||||
| // Y[iy+1] += DA_I * X[ix] | |||||
| st1 {v3.2d}, [Y], INC_Y /* write back and advance Y by one stride */ | |||||
| #endif | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* Entry point: updates N two-value elements of Y in place from X and the scalars DA_R/DA_I (see INIT and the KERNEL_* macros). Every exit path returns 0 in w0. */ | |||||
| cmp N, xzr | |||||
| ble zaxpy_kernel_L999 /* nothing to do for N <= 0 */ | |||||
| mov Y_COPY, Y /* load cursor for the DOUBLE KERNEL_F4 */ | |||||
| fcmp DA_R, #0.0 | |||||
| bne .L1 | |||||
| fcmp DA_I, #0.0 | |||||
| beq zaxpy_kernel_L999 /* both scale values compare equal to zero: Y is left untouched */ | |||||
| .L1: | |||||
| INIT | |||||
| cmp INC_X, #1 | |||||
| bne zaxpy_kernel_S_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne zaxpy_kernel_S_BEGIN /* the contiguous path needs unit stride on both vectors */ | |||||
| zaxpy_kernel_F_BEGIN: | |||||
| asr I, N, #2 /* I = N / 4 full chunks */ | |||||
| cmp I, xzr | |||||
| beq zaxpy_kernel_F1 | |||||
| KERNEL_INIT_F4 | |||||
| zaxpy_kernel_F4: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne zaxpy_kernel_F4 | |||||
| zaxpy_kernel_F1: | |||||
| ands I, N, #3 /* leftover element count */ | |||||
| ble zaxpy_kernel_L999 | |||||
| zaxpy_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne zaxpy_kernel_F10 | |||||
| mov w0, wzr | |||||
| ret | |||||
| zaxpy_kernel_S_BEGIN: /* strided path */ | |||||
| INIT_S | |||||
| asr I, N, #2 /* I = N / 4 */ | |||||
| cmp I, xzr | |||||
| ble zaxpy_kernel_S1 | |||||
| zaxpy_kernel_S4: /* 4 elements per iteration */ | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne zaxpy_kernel_S4 | |||||
| zaxpy_kernel_S1: | |||||
| ands I, N, #3 /* leftover element count */ | |||||
| ble zaxpy_kernel_L999 | |||||
| zaxpy_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne zaxpy_kernel_S10 | |||||
| zaxpy_kernel_L999: | |||||
| mov w0, wzr | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,302 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride in elements; INIT_S scales it to bytes */ | |||||
| #define Y x3 /* Y vector address */ | |||||
| #define INC_Y x4 /* Y stride in elements; INIT_S scales it to bytes */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if !defined(DOUBLE) | |||||
| #if !defined(DSDOT) | |||||
| #define REG0 wzr /* zero source used to clear the accumulators on entry */ | |||||
| #define DOTF s0 /* dot[0] accumulator */ | |||||
| #else // DSDOT | |||||
| #define REG0 xzr | |||||
| #define DOTF d0 | |||||
| #endif | |||||
| #define DOTI s1 /* dot[1] accumulator */ | |||||
| #define TMPX s2 /* TMPX, LD1VX, TMPY, LD1VY, TMPVY and SZ are not referenced by name in this kernel */ | |||||
| #define LD1VX {v2.s}[0] | |||||
| #define TMPY s3 | |||||
| #define LD1VY {v3.s}[0] | |||||
| #define TMPVY v3.s[0] | |||||
| #define SZ 4 | |||||
| #else | |||||
| #define REG0 xzr /* zero source used to clear the accumulators on entry */ | |||||
| #define DOTF d0 /* dot[0] accumulator */ | |||||
| #define DOTI d1 /* dot[1] accumulator */ | |||||
| #define TMPX d2 /* TMPX, LD1VX, TMPY, LD1VY, TMPVY and SZ are not referenced by name in this kernel */ | |||||
| #define LD1VX {v2.d}[0] | |||||
| #define TMPY d3 | |||||
| #define LD1VY {v3.d}[0] | |||||
| #define TMPVY v3.d[0] | |||||
| #define SZ 8 | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| .macro KERNEL_F1 /* Unit-stride step for one two-value element of X and Y: the four cross products are accumulated into the scalars DOTF (dot[0]) and DOTI (dot[1]); CONJ moves the subtraction from dot[0] to dot[1]. X and Y each advance by one element. Clobbers v2-v4. */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.2s}, [X], #8 // V2 = X[ix+1], X[ix]; X += 2 | |||||
| ld1 {v3.2s}, [Y], #8 // V3 = Y[iy+1], Y[iy]; Y += 2 | |||||
| ins v4.s[0], v2.s[1] // V4 = X[ix+1] | |||||
| #if !defined(CONJ) | |||||
| fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy] | |||||
| fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1] | |||||
| fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy] | |||||
| fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1] | |||||
| #else | |||||
| fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy] | |||||
| fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1] | |||||
| fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy] | |||||
| fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1] | |||||
| #endif | |||||
| #else // DOUBLE | |||||
| ld1 {v2.2d}, [X], #16 // V2 = X[ix+1], X[ix]; X += 2 | |||||
| ld1 {v3.2d}, [Y], #16 // V3 = Y[iy+1], Y[iy]; Y += 2 | |||||
| ins v4.d[0], v2.d[1] // V4 = X[ix+1] | |||||
| #if !defined(CONJ) | |||||
| fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy] | |||||
| fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1] | |||||
| fmla DOTI, d4, v3.d[0] // dot[1] += X[ix+1] * Y[iy] | |||||
| fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1] | |||||
| #else | |||||
| fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy] | |||||
| fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1] | |||||
| fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy] | |||||
| fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1] | |||||
| #endif | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F4 /* Unit-stride step for 4 two-value elements of X and Y. Products are accumulated lane-wise: dot[0] terms in v0 and dot[1] terms in v1, plus v20/v21 for part of the DOUBLE terms. KERNEL_F4_FINALIZE reduces them to DOTF/DOTI. The accumulators must be zero before the first invocation. X and Y each advance 4 elements. Clobbers v2-v5 (and v16-v19 when DOUBLE). */ | |||||
| #if !defined(DOUBLE) | |||||
| ld2 {v2.4s, v3.4s}, [X], #32 // de-interleave: v2 = X[ix] of 4 elements, v3 = X[ix+1] | |||||
| ld2 {v4.4s, v5.4s}, [Y], #32 // de-interleave: v4 = Y[iy] of 4 elements, v5 = Y[iy+1] | |||||
| fmla v0.4s, v2.4s, v4.4s // dot[0] += X[ix] * Y[iy] | |||||
| fmla v1.4s, v2.4s, v5.4s // dot[1] += X[ix] * Y[iy+1] | |||||
| PRFM PLDL1KEEP, [X, #1024] /* prefetch hints 1024 bytes past the updated X and Y */ | |||||
| PRFM PLDL1KEEP, [Y, #1024] | |||||
| #if !defined(CONJ) | |||||
| fmls v0.4s, v3.4s, v5.4s // dot[0] -= X[ix+1] * Y[iy+1] | |||||
| fmla v1.4s, v3.4s, v4.4s // dot[1] += X[ix+1] * Y[iy] | |||||
| #else | |||||
| fmla v0.4s, v3.4s, v5.4s // dot[0] += X[ix+1] * Y[iy+1] | |||||
| fmls v1.4s, v3.4s, v4.4s // dot[1] -= X[ix+1] * Y[iy] | |||||
| #endif | |||||
| #else // DOUBLE | |||||
| ld2 {v2.2d, v3.2d}, [X], #32 // elements 0-1: v2 = X[ix], v3 = X[ix+1] | |||||
| ld2 {v16.2d, v17.2d}, [Y], #32 /* elements 0-1: v16 = Y[iy], v17 = Y[iy+1] */ | |||||
| fmla v0.2d, v2.2d, v16.2d // dot[0] += X[ix] * Y[iy] | |||||
| fmla v1.2d, v2.2d, v17.2d // dot[1] += X[ix] * Y[iy+1] | |||||
| ld2 {v4.2d, v5.2d}, [X], #32 /* elements 2-3: v4 = X[ix], v5 = X[ix+1] */ | |||||
| ld2 {v18.2d, v19.2d}, [Y], #32 /* elements 2-3: v18 = Y[iy], v19 = Y[iy+1] */ | |||||
| fmla v0.2d, v4.2d, v18.2d // dot[0] += X[ix] * Y[iy] | |||||
| fmla v1.2d, v4.2d, v19.2d // dot[1] += X[ix] * Y[iy+1] | |||||
| PRFM PLDL1KEEP, [X, #1024] /* prefetch hints 1024 bytes past the updated X and Y */ | |||||
| PRFM PLDL1KEEP, [Y, #1024] | |||||
| #if !defined(CONJ) | |||||
| fmls v0.2d, v3.2d, v17.2d // dot[0] -= X[ix+1] * Y[iy+1] | |||||
| fmls v20.2d, v5.2d, v19.2d // dot[0] -= X[ix+1] * Y[iy+1] | |||||
| fmla v1.2d, v3.2d, v16.2d // dot[1] += X[ix+1] * Y[iy] | |||||
| fmla v21.2d, v5.2d, v18.2d // dot[1] += X[ix+1] * Y[iy] | |||||
| #else | |||||
| fmla v0.2d, v3.2d, v17.2d // dot[0] += X[ix+1] * Y[iy+1] | |||||
| fmla v20.2d, v5.2d, v19.2d // dot[0] += X[ix+1] * Y[iy+1] | |||||
| fmls v1.2d, v3.2d, v16.2d // dot[1] -= X[ix+1] * Y[iy] | |||||
| fmls v21.2d, v5.2d, v18.2d // dot[1] -= X[ix+1] * Y[iy] | |||||
| #endif | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F4_FINALIZE /* Reduce the lane-wise accumulators left by KERNEL_F4 to the scalars DOTF and DOTI. Clobbers v2 and v3 in single precision. */ | |||||
| #if !defined(DOUBLE) | |||||
| ext v2.16b, v0.16b, v0.16b, #8 /* bring the upper two lanes of v0 down */ | |||||
| fadd v0.2s, v0.2s, v2.2s /* 4 lanes -> 2 */ | |||||
| faddp DOTF, v0.2s /* 2 lanes -> scalar */ | |||||
| ext v3.16b, v1.16b, v1.16b, #8 /* same reduction for the dot[1] accumulator */ | |||||
| fadd v1.2s, v1.2s, v3.2s | |||||
| faddp DOTI, v1.2s | |||||
| #else | |||||
| fadd v0.2d, v0.2d, v20.2d /* merge the second dot[0] accumulator */ | |||||
| faddp DOTF, v0.2d | |||||
| fadd v1.2d, v1.2d, v21.2d /* merge the second dot[1] accumulator */ | |||||
| faddp DOTI, v1.2d | |||||
| #endif | |||||
| .endm | |||||
| .macro INIT_S /* Strided setup: scale INC_X and INC_Y from elements to bytes. */ | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 /* x8: two 4-byte floats per element */ | |||||
| lsl INC_Y, INC_Y, #3 | |||||
| #else | |||||
| lsl INC_X, INC_X, #4 /* x16: two 8-byte doubles per element */ | |||||
| lsl INC_Y, INC_Y, #4 | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_S1 /* Strided step for one two-value element of X and Y; same accumulation into DOTF/DOTI as KERNEL_F1. X and Y advance by INC_X and INC_Y, which INIT_S has already scaled to bytes. Clobbers v2-v4. */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.2s}, [X], INC_X // V2 = X[ix+1], X[ix]; X += INC_X bytes | |||||
| ld1 {v3.2s}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += INC_Y bytes | |||||
| ext v4.8b, v2.8b, v2.8b, #4 // V4 = X[ix], X[ix+1] | |||||
| #if !defined(CONJ) | |||||
| fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy] | |||||
| fmls DOTF, s4, v3.s[1] // dot[0] -= X[ix+1] * Y[iy+1] | |||||
| fmla DOTI, s4, v3.s[0] // dot[1] += X[ix+1] * Y[iy] | |||||
| fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1] | |||||
| #else | |||||
| fmla DOTF, s2, v3.s[0] // dot[0] += X[ix] * Y[iy] | |||||
| fmla DOTF, s4, v3.s[1] // dot[0] += X[ix+1] * Y[iy+1] | |||||
| fmls DOTI, s4, v3.s[0] // dot[1] -= X[ix+1] * Y[iy] | |||||
| fmla DOTI, s2, v3.s[1] // dot[1] += X[ix] * Y[iy+1] | |||||
| #endif | |||||
| #else // DOUBLE | |||||
| ld1 {v2.2d}, [X], INC_X // V2 = X[ix+1], X[ix]; X += INC_X bytes | |||||
| ld1 {v3.2d}, [Y], INC_Y // V3 = Y[iy+1], Y[iy]; Y += INC_Y bytes | |||||
| ext v4.16b, v2.16b, v2.16b, #8 // V4 = X[ix], X[ix+1] | |||||
| #if !defined(CONJ) | |||||
| fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy] | |||||
| fmls DOTF, d4, v3.d[1] // dot[0] -= X[ix+1] * Y[iy+1] | |||||
| fmla DOTI, d4, v3.d[0] // dot[1] += X[ix+1] * Y[iy] | |||||
| fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1] | |||||
| #else | |||||
| fmla DOTF, d2, v3.d[0] // dot[0] += X[ix] * Y[iy] | |||||
| fmla DOTF, d4, v3.d[1] // dot[0] += X[ix+1] * Y[iy+1] | |||||
| fmls DOTI, d4, v3.d[0] // dot[1] -= X[ix+1] * Y[iy] | |||||
| fmla DOTI, d2, v3.d[1] // dot[1] += X[ix] * Y[iy+1] | |||||
| #endif | |||||
| #endif | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* Entry point: accumulates the two-component dot product of N elements of X and Y. Results are left in DOTF (dot[0]) and DOTI (dot[1]); both are zero when N <= 0. */ | |||||
| fmov DOTF, REG0 /* clear the accumulators; KERNEL_F4 also accumulates in the vector registers these scalars live in */ | |||||
| fmov DOTI, DOTF | |||||
| #if !defined(DOUBLE) | |||||
| fmov s20, DOTF /* clear the secondary accumulators as well (KERNEL_F4 only uses v20/v21 when DOUBLE) */ | |||||
| fmov s21, DOTI | |||||
| #else | |||||
| fmov d20, DOTF /* clear the secondary accumulators used by the DOUBLE KERNEL_F4 */ | |||||
| fmov d21, DOTI | |||||
| #endif | |||||
| cmp N, xzr | |||||
| ble dot_kernel_L999 | |||||
| cmp INC_X, #1 | |||||
| bne dot_kernel_S_BEGIN | |||||
| cmp INC_Y, #1 | |||||
| bne dot_kernel_S_BEGIN /* the contiguous path needs unit stride on both vectors */ | |||||
| dot_kernel_F_BEGIN: | |||||
| asr I, N, #2 /* I = N / 4 full chunks */ | |||||
| cmp I, xzr | |||||
| beq dot_kernel_F1 | |||||
| dot_kernel_F4: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne dot_kernel_F4 | |||||
| KERNEL_F4_FINALIZE /* fold the vector accumulators into DOTF/DOTI before the scalar tail */ | |||||
| dot_kernel_F1: | |||||
| ands I, N, #3 /* leftover element count */ | |||||
| ble dot_kernel_L999 | |||||
| dot_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne dot_kernel_F10 | |||||
| ret | |||||
| dot_kernel_S_BEGIN: /* strided path */ | |||||
| INIT_S | |||||
| asr I, N, #2 /* I = N / 4 */ | |||||
| cmp I, xzr | |||||
| ble dot_kernel_S1 | |||||
| dot_kernel_S4: /* 4 elements per iteration */ | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne dot_kernel_S4 | |||||
| dot_kernel_S1: | |||||
| ands I, N, #3 /* leftover element count */ | |||||
| ble dot_kernel_L999 | |||||
| dot_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne dot_kernel_S10 | |||||
| dot_kernel_L999: | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,514 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define M x0 /* Y vector length */ | |||||
| #define N x1 /* X vector length */ | |||||
| #define A x3 /* A vector address */ | |||||
| #define LDA x4 /* A stride */ | |||||
| #define X x5 /* X vector address */ | |||||
| #define INC_X x6 /* X stride */ | |||||
| #define Y x7 /* Y vector address */ | |||||
| #define INC_Y x2 /* Y stride; NOTE(review): how x2 receives this value is not visible in this excerpt */ | |||||
| #define A_PTR x9 /* loop A vector address */ | |||||
| #define Y_IPTR x10 /* loop Y vector address */ | |||||
| #define J x11 /* loop variable */ | |||||
| #define I x12 /* loop variable */ | |||||
| #define Y_OPTR x13 /* loop Y vector address */ | |||||
| #define X_PTR x14 /* loop X vector address */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if !defined(DOUBLE) | |||||
| #define ALPHA_R s0 /* first scalar of alpha; INIT labels it R(ALPHA) */ | |||||
| #define ALPHA_I s1 /* second scalar of alpha; INIT labels it I(ALPHA) */ | |||||
| #define ALPHA_R_COPY s7 /* copy of ALPHA_R that INIT replicates across v7 */ | |||||
| #define ALPHA_I_COPY s8 /* copy of ALPHA_I that INIT replicates across v8 */ | |||||
| #define SHZ 3 /* not used in this excerpt; presumably log2 of the element size in bytes - confirm */ | |||||
| #else | |||||
| #define ALPHA_R d0 /* first scalar of alpha; INIT labels it R(ALPHA) */ | |||||
| #define ALPHA_I d1 /* second scalar of alpha; INIT labels it I(ALPHA) */ | |||||
| #define ALPHA_R_COPY d7 /* copy of ALPHA_R that INIT replicates across v7 */ | |||||
| #define ALPHA_I_COPY d8 /* copy of ALPHA_I that INIT replicates across v8 */ | |||||
| #define SHZ 4 /* not used in this excerpt; presumably log2 of the element size in bytes - confirm */ | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| .macro SAVE_REGS /* Reserve an 11 x 16 = 176 byte stack frame and spill d8-d17 and x18-x28 into it. RESTORE_REGS is the exact inverse. */ | |||||
| add sp, sp, #-(11 * 16) /* sp stays a multiple of 16 */ | |||||
| stp d8, d9, [sp, #(0 * 16)] | |||||
| stp d10, d11, [sp, #(1 * 16)] | |||||
| stp d12, d13, [sp, #(2 * 16)] | |||||
| stp d14, d15, [sp, #(3 * 16)] | |||||
| stp d16, d17, [sp, #(4 * 16)] | |||||
| stp x18, x19, [sp, #(5 * 16)] | |||||
| stp x20, x21, [sp, #(6 * 16)] | |||||
| stp x22, x23, [sp, #(7 * 16)] | |||||
| stp x24, x25, [sp, #(8 * 16)] | |||||
| stp x26, x27, [sp, #(9 * 16)] | |||||
| str x28, [sp, #(10 * 16)] /* single register: only the low half of the last slot is used */ | |||||
| .endm | |||||
| .macro RESTORE_REGS /* Reload d8-d17 and x18-x28 from the frame written by SAVE_REGS, then release the 176 byte frame. */ | |||||
| ldp d8, d9, [sp, #(0 * 16)] | |||||
| ldp d10, d11, [sp, #(1 * 16)] | |||||
| ldp d12, d13, [sp, #(2 * 16)] | |||||
| ldp d14, d15, [sp, #(3 * 16)] | |||||
| ldp d16, d17, [sp, #(4 * 16)] | |||||
| ldp x18, x19, [sp, #(5 * 16)] | |||||
| ldp x20, x21, [sp, #(6 * 16)] | |||||
| ldp x22, x23, [sp, #(7 * 16)] | |||||
| ldp x24, x25, [sp, #(8 * 16)] | |||||
| ldp x26, x27, [sp, #(9 * 16)] | |||||
| ldr x28, [sp, #(10 * 16)] | |||||
| add sp, sp, #(11*16) /* pop the frame */ | |||||
| .endm | |||||
| .macro INIT /* one-time setup: spread alpha across v7/v8 (F4 form) and v0/v1 (F1/S1 form); v2 is scratch */ | |||||
| /********** INIT FOR F4 LOOP **********/ | |||||
| fmov ALPHA_R_COPY, ALPHA_R /* lane 0 of v7 = R(ALPHA) */ | |||||
| fmov ALPHA_I_COPY, ALPHA_I /* lane 0 of v8 = I(ALPHA) */ | |||||
| #if !defined(DOUBLE) | |||||
| ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA) | |||||
| ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA) | |||||
| ins v7.d[1], v7.d[0] /* copy the low pair up: all four lanes of v7 = R(ALPHA) */ | |||||
| ins v8.d[1], v8.d[0] /* all four lanes of v8 = I(ALPHA) */ | |||||
| #else | |||||
| ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA) | |||||
| ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA) | |||||
| #endif | |||||
| /******* INIT FOR F1 AND S1 LOOP ******/ | |||||
| #if !defined(DOUBLE) | |||||
| ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) | |||||
| fneg s2, ALPHA_I /* s2 = -I(ALPHA) */ | |||||
| ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA) (lane 1, lane 0) | |||||
| #if !defined(XCONJ) | |||||
| ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA) (lanes swapped) | |||||
| #endif | |||||
| #else | |||||
| ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA) | |||||
| fneg d2, ALPHA_I /* d2 = -I(ALPHA) */ | |||||
| ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA) (lane 1, lane 0) | |||||
| #if !defined(XCONJ) | |||||
| ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA) (lanes swapped) | |||||
| #endif | |||||
| #endif | |||||
| .endm | |||||
| .macro INIT_LOOP /* per-column setup: load the X element at X_PTR and form the ALPHA * X values read by KERNEL_F4 (v11/v12) and KERNEL_F1/KERNEL_S1 (v2/v3) */ | |||||
| /********** INIT_LOOP FOR F4 LOOP **********/ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v9.2s}, [X_PTR] // [I(X), R(X)] | |||||
| ins v10.s[0], v9.s[1] /* lane 0 of v10 = I(X) */ | |||||
| ins v9.s[1], v9.s[0] // [R(X), R(X)] | |||||
| ins v10.s[1], v10.s[0] // [I(X), I(X)] | |||||
| ins v9.d[1], v9.d[0] /* all four lanes of v9 = R(X) */ | |||||
| ins v10.d[1], v10.d[0] /* all four lanes of v10 = I(X) */ | |||||
| #if !defined(CONJ) | |||||
| #if !defined(XCONJ) | |||||
| fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | |||||
| fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] | |||||
| fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] | |||||
| fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] | |||||
| #else | |||||
| fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | |||||
| fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] | |||||
| fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] | |||||
| fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] | |||||
| #endif | |||||
| #else // CONJ | |||||
| #if !defined(XCONJ) | |||||
| fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | |||||
| fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] | |||||
| fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] | |||||
| fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] | |||||
| #else | |||||
| fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | |||||
| fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] | |||||
| fmul v12.4s, v9.4s, v8.4s // [R(X) * I(ALPHA)] | |||||
| fneg v12.4s, v12.4s // [- R(X) * I(ALPHA)] | |||||
| fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] | |||||
| #endif | |||||
| #endif // CONJ | |||||
| /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ | |||||
| ld1 {v2.2s}, [X_PTR] // [I(X), R(X)] | |||||
| ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)] | |||||
| fmul v2.2s, v0.2s, v2.2s /* R(ALPHA) * [I(X), R(X)] */ | |||||
| fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)] | |||||
| ins v3.s[0], v2.s[1] /* s3 = I(TEMP) */ | |||||
| #if !defined(CONJ) | |||||
| #if !defined(XCONJ) | |||||
| fneg s4, s3 /* s4 = -I(TEMP) */ | |||||
| ins v3.s[1], v4.s[0] | |||||
| ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)] | |||||
| ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] | |||||
| #else | |||||
| fneg s4, s3 /* s4 = -I(TEMP) */ | |||||
| ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)] | |||||
| ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] | |||||
| #endif | |||||
| #else // CONJ | |||||
| #if !defined(XCONJ) | |||||
| ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)] | |||||
| fneg s4, s2 /* s4 = -R(TEMP) */ | |||||
| ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] | |||||
| #else | |||||
| fneg s3, s3 /* s3 = -I(TEMP) */ | |||||
| ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)] | |||||
| fneg s4, s2 /* s4 = -R(TEMP) */ | |||||
| ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] | |||||
| #endif | |||||
| #endif // CONJ | |||||
| #else // DOUBLE | |||||
| /********** INIT_LOOP FOR F4 LOOP **********/ | |||||
| ld1 {v9.2d}, [X_PTR] // [I(X), R(X)] | |||||
| ins v10.d[0], v9.d[1] /* lane 0 of v10 = I(X) */ | |||||
| ins v9.d[1], v9.d[0] // [R(X), R(X)] | |||||
| ins v10.d[1], v10.d[0] // [I(X), I(X)] | |||||
| #if !defined(CONJ) | |||||
| #if !defined(XCONJ) | |||||
| fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | |||||
| fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] | |||||
| fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] | |||||
| fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] | |||||
| #else | |||||
| fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | |||||
| fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] | |||||
| fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] | |||||
| fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] | |||||
| #endif | |||||
| #else // CONJ | |||||
| #if !defined(XCONJ) | |||||
| fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | |||||
| fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] | |||||
| fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] | |||||
| fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] | |||||
| #else | |||||
| fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | |||||
| fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] | |||||
| fmul v12.2d, v9.2d, v8.2d // [R(X) * I(ALPHA)] | |||||
| fneg v12.2d, v12.2d // [- R(X) * I(ALPHA)] | |||||
| fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] | |||||
| #endif | |||||
| #endif // CONJ | |||||
| /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ | |||||
| ld1 {v2.2d}, [X_PTR] // [I(X), R(X)] | |||||
| ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)] | |||||
| fmul v2.2d, v0.2d, v2.2d /* R(ALPHA) * [I(X), R(X)] */ | |||||
| fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)] | |||||
| ins v3.d[0], v2.d[1] // I(TEMP) | |||||
| #if !defined(CONJ) | |||||
| #if !defined(XCONJ) | |||||
| fneg d4, d3 // -I(TEMP) | |||||
| ins v3.d[1], v4.d[0] | |||||
| ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)] | |||||
| ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] | |||||
| #else | |||||
| fneg d4, d3 // -I(TEMP) | |||||
| ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)] | |||||
| ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] | |||||
| #endif | |||||
| #else // CONJ | |||||
| #if !defined(XCONJ) | |||||
| ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)] | |||||
| fneg d4, d2 // -R(TEMP) | |||||
| ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] | |||||
| #else | |||||
| fneg d3, d3 // -I(TEMP) | |||||
| ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)] | |||||
| fneg d4, d2 // -R(TEMP) | |||||
| ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] | |||||
| #endif | |||||
| #endif // CONJ | |||||
| #endif // DOUBLE | |||||
| .endm | |||||
| .macro KERNEL_F4 /* update four consecutive Y elements with the v11/v12 products from INIT_LOOP times four A elements */ | |||||
| #if !defined(DOUBLE) | |||||
| ld2 {v13.4s, v14.4s}, [A_PTR], #32 /* de-interleave: v13 = A_R, v14 = A_I for four elements */ | |||||
| ld2 {v15.4s, v16.4s}, [Y_IPTR], #32 /* v15 = real parts of Y, v16 = imaginary parts of Y */ | |||||
| #if !defined(CONJ) | |||||
| #if !defined(XCONJ) | |||||
| fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] | |||||
| fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] | |||||
| fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] | |||||
| fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] | |||||
| #else | |||||
| fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] | |||||
| fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] | |||||
| fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] | |||||
| fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] | |||||
| #endif | |||||
| #else // CONJ | |||||
| #if !defined(XCONJ) | |||||
| fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] | |||||
| fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] | |||||
| fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] | |||||
| fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] | |||||
| #else | |||||
| fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] | |||||
| fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] | |||||
| fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] | |||||
| fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] | |||||
| #endif | |||||
| #endif // CONJ | |||||
| st2 {v15.4s, v16.4s}, [Y_OPTR], #32 /* re-interleave and store the four updated Y elements */ | |||||
| #else // DOUBLE | |||||
| ld2 {v13.2d, v14.2d}, [A_PTR], #32 /* first pair of elements: v13 = A_R, v14 = A_I */ | |||||
| ld2 {v15.2d, v16.2d}, [Y_IPTR], #32 /* v15 = real parts of Y, v16 = imaginary parts of Y */ | |||||
| #if !defined(CONJ) | |||||
| #if !defined(XCONJ) | |||||
| fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] | |||||
| fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] | |||||
| fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] | |||||
| fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] | |||||
| #else | |||||
| fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] | |||||
| fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] | |||||
| fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] | |||||
| fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] | |||||
| #endif | |||||
| #else // CONJ | |||||
| #if !defined(XCONJ) | |||||
| fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] | |||||
| fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] | |||||
| fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] | |||||
| fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] | |||||
| #else | |||||
| fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] | |||||
| fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] | |||||
| fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] | |||||
| fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] | |||||
| #endif | |||||
| #endif // CONJ | |||||
| st2 {v15.2d, v16.2d}, [Y_OPTR], #32 /* store the first pair of updated Y elements */ | |||||
| ld2 {v17.2d, v18.2d}, [A_PTR], #32 /* second pair of elements: v17 = A_R, v18 = A_I */ | |||||
| ld2 {v19.2d, v20.2d}, [Y_IPTR], #32 /* v19 = real parts of Y, v20 = imaginary parts of Y */ | |||||
| #if !defined(CONJ) | |||||
| #if !defined(XCONJ) | |||||
| fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] | |||||
| fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] | |||||
| fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] | |||||
| fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] | |||||
| #else | |||||
| fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] | |||||
| fmla v19.2d, v12.2d, v18.2d // [+ I(ALPHA * X) * A_I] | |||||
| fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] | |||||
| fmls v20.2d, v12.2d, v17.2d // [- I(ALPHA * X) * A_R] | |||||
| #endif | |||||
| #else // CONJ | |||||
| #if !defined(XCONJ) | |||||
| fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] | |||||
| fmla v19.2d, v12.2d, v18.2d // [+ I(ALPHA * X) * A_I] | |||||
| fmls v20.2d, v11.2d, v18.2d // [- R(ALPHA * X) * A_I] | |||||
| fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] | |||||
| #else | |||||
| fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] | |||||
| fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] | |||||
| fmls v20.2d, v11.2d, v18.2d // [- R(ALPHA * X) * A_I] | |||||
| fmls v20.2d, v12.2d, v17.2d // [- I(ALPHA * X) * A_R] | |||||
| #endif | |||||
| #endif // CONJ | |||||
| st2 {v19.2d, v20.2d}, [Y_OPTR], #32 /* store the second pair of updated Y elements */ | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F1 /* update one contiguous Y element: Y += v2 * A + v3 * swapped(A), with v2/v3 prepared by INIT_LOOP */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v4.2s}, [A_PTR], #8 /* v4 = one A element (real, imaginary); advance A_PTR by 8 bytes */ | |||||
| ld1 {v5.2s}, [Y_IPTR], #8 /* v5 = one Y element; advance the Y read pointer */ | |||||
| ext v6.8b, v4.8b, v4.8b, #4 /* v6 = A with its two lanes swapped */ | |||||
| fmla v5.2s, v2.2s, v4.2s | |||||
| fmla v5.2s, v3.2s, v6.2s | |||||
| st1 {v5.2s}, [Y_OPTR], #8 /* write Y back; advance the Y write pointer */ | |||||
| #else // DOUBLE | |||||
| ld1 {v4.2d}, [A_PTR], #16 /* v4 = one A element (real, imaginary); advance A_PTR by 16 bytes */ | |||||
| ld1 {v5.2d}, [Y_IPTR], #16 /* v5 = one Y element; advance the Y read pointer */ | |||||
| ext v6.16b, v4.16b, v4.16b, #8 /* v6 = A with its two lanes swapped */ | |||||
| fmla v5.2d, v2.2d, v4.2d | |||||
| fmla v5.2d, v3.2d, v6.2d | |||||
| st1 {v5.2d}, [Y_OPTR], #16 /* write Y back; advance the Y write pointer */ | |||||
| #endif | |||||
| .endm | |||||
| .macro INIT_S /* strided-Y path setup */ | |||||
| lsl INC_Y, INC_Y, #SHZ /* scale INC_Y by 1 << SHZ so KERNEL_S1 can use it as a post-index byte increment */ | |||||
| .endm | |||||
| .macro KERNEL_S1 /* same update as KERNEL_F1, but the Y pointers step by INC_Y instead of one element */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v4.2s}, [A_PTR], #8 /* v4 = one A element; A is still walked contiguously */ | |||||
| ld1 {v5.2s}, [Y_IPTR], INC_Y /* v5 = one Y element; advance the Y read pointer by INC_Y */ | |||||
| ext v6.8b, v4.8b, v4.8b, #4 /* v6 = A with its two lanes swapped */ | |||||
| fmla v5.2s, v2.2s, v4.2s | |||||
| fmla v5.2s, v3.2s, v6.2s | |||||
| st1 {v5.2s}, [Y_OPTR], INC_Y /* write Y back; advance the Y write pointer by INC_Y */ | |||||
| #else // DOUBLE | |||||
| ld1 {v4.2d}, [A_PTR], #16 /* v4 = one A element; A is still walked contiguously */ | |||||
| ld1 {v5.2d}, [Y_IPTR], INC_Y /* v5 = one Y element; advance the Y read pointer by INC_Y */ | |||||
| ext v6.16b, v4.16b, v4.16b, #8 /* v6 = A with its two lanes swapped */ | |||||
| fmla v5.2d, v2.2d, v4.2d | |||||
| fmla v5.2d, v3.2d, v6.2d | |||||
| st1 {v5.2d}, [Y_OPTR], INC_Y /* write Y back; advance the Y write pointer by INC_Y */ | |||||
| #endif | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* entry: for each of N columns, add (ALPHA * X[j]) * A[:, j] into the M-element vector Y; returns 0 in w0 */ | |||||
| ldr INC_Y, [sp] /* INC_Y is read from the incoming stack, before SAVE_REGS moves sp - presumably the first stack-passed argument, confirm against the caller */ | |||||
| SAVE_REGS | |||||
| cmp N, xzr | |||||
| ble zgemv_n_kernel_L999 /* nothing to do when N <= 0 */ | |||||
| cmp M, xzr | |||||
| ble zgemv_n_kernel_L999 /* nothing to do when M <= 0 */ | |||||
| lsl LDA, LDA, #SHZ /* LDA: elements -> bytes */ | |||||
| lsl INC_X, INC_X, #SHZ /* INC_X: elements -> bytes */ | |||||
| mov J, N /* J counts the remaining columns */ | |||||
| INIT | |||||
| cmp INC_Y, #1 | |||||
| bne zgemv_n_kernel_S_BEGIN /* INC_Y != 1 takes the strided-Y path */ | |||||
| zgemv_n_kernel_F_LOOP: /* contiguous-Y path: one iteration per column */ | |||||
| mov A_PTR, A | |||||
| mov Y_IPTR, Y | |||||
| mov Y_OPTR, Y | |||||
| mov X_PTR, X | |||||
| add X, X, INC_X /* step X to the next column's element */ | |||||
| INIT_LOOP | |||||
| asr I, M, #2 /* I = M / 4 groups of four rows */ | |||||
| cmp I, xzr | |||||
| beq zgemv_n_kernel_F1 | |||||
| zgemv_n_kernel_F4: /* NOTE(review): four rows per pass via four KERNEL_F1 expansions; the KERNEL_F4 macro is not invoked here - confirm this is intended */ | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne zgemv_n_kernel_F4 | |||||
| zgemv_n_kernel_F1: /* remaining M % 4 rows */ | |||||
| ands I, M, #3 | |||||
| ble zgemv_n_kernel_F_END | |||||
| zgemv_n_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne zgemv_n_kernel_F10 | |||||
| zgemv_n_kernel_F_END: | |||||
| add A, A, LDA /* next column of A */ | |||||
| subs J, J, #1 | |||||
| bne zgemv_n_kernel_F_LOOP | |||||
| b zgemv_n_kernel_L999 | |||||
| zgemv_n_kernel_S_BEGIN: /* strided-Y path */ | |||||
| INIT_S | |||||
| zgemv_n_kernel_S_LOOP: /* one iteration per column */ | |||||
| mov A_PTR, A | |||||
| mov Y_IPTR, Y | |||||
| mov Y_OPTR, Y | |||||
| mov X_PTR, X | |||||
| add X, X, INC_X /* step X to the next column's element */ | |||||
| INIT_LOOP | |||||
| asr I, M, #2 /* I = M / 4 groups of four rows */ | |||||
| cmp I, xzr | |||||
| ble zgemv_n_kernel_S1 | |||||
| zgemv_n_kernel_S4: /* four rows per pass */ | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne zgemv_n_kernel_S4 | |||||
| zgemv_n_kernel_S1: /* remaining M % 4 rows */ | |||||
| ands I, M, #3 | |||||
| ble zgemv_n_kernel_S_END | |||||
| zgemv_n_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne zgemv_n_kernel_S10 | |||||
| zgemv_n_kernel_S_END: | |||||
| add A, A, LDA /* next column of A */ | |||||
| subs J, J, #1 | |||||
| bne zgemv_n_kernel_S_LOOP | |||||
| zgemv_n_kernel_L999: /* common exit */ | |||||
| RESTORE_REGS | |||||
| mov w0, wzr /* return value 0 */ | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,448 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define M x0 /* inner loop length: elements read from X and from each A column */ | |||||
| #define N x1 /* outer loop length: number of Y elements updated */ | |||||
| #define A x3 /* A base address, advanced by LDA per outer iteration */ | |||||
| #define LDA x4 /* A stride */ | |||||
| #define X x5 /* X vector address */ | |||||
| #define INC_X x6 /* X stride */ | |||||
| #define Y x7 /* Y vector address */ | |||||
| #define INC_Y x2 /* Y stride */ | |||||
| #define A_PTR x9 /* loop A vector address */ | |||||
| #define X_PTR x10 /* loop X vector address */ | |||||
| #define J x11 /* loop variable */ | |||||
| #define I x12 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if !defined(DOUBLE) /* single-precision complex build */ | |||||
| #define ALPHA_R s0 /* real part of alpha (read by INIT) */ | |||||
| #define ALPHA_I s1 /* imaginary part of alpha (read by INIT) */ | |||||
| #define ALPHA_R_COPY s7 /* not referenced by the macros or entry code of this kernel */ | |||||
| #define ALPHA_I_COPY s8 /* not referenced by the macros or entry code of this kernel */ | |||||
| #define SHZ 3 /* shift applied to LDA, INC_X and INC_Y: 1 << 3 = 8 bytes per complex element */ | |||||
| #else /* DOUBLE: double-precision complex build */ | |||||
| #define ALPHA_R d0 /* real part of alpha (read by INIT) */ | |||||
| #define ALPHA_I d1 /* imaginary part of alpha (read by INIT) */ | |||||
| #define ALPHA_R_COPY d7 /* not referenced by the macros or entry code of this kernel */ | |||||
| #define ALPHA_I_COPY d8 /* not referenced by the macros or entry code of this kernel */ | |||||
| #define SHZ 4 /* shift applied to LDA, INC_X and INC_Y: 1 << 4 = 16 bytes per complex element */ | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| .macro SAVE_REGS /* store d8-d17 and x18-x28 in a new 11 * 16 byte stack frame */ | |||||
| add sp, sp, #-(11 * 16) /* allocate the frame; its size is a multiple of 16 bytes */ | |||||
| stp d8, d9, [sp, #(0 * 16)] | |||||
| stp d10, d11, [sp, #(1 * 16)] | |||||
| stp d12, d13, [sp, #(2 * 16)] | |||||
| stp d14, d15, [sp, #(3 * 16)] | |||||
| stp d16, d17, [sp, #(4 * 16)] /* NOTE(review): AAPCS64 only requires d8-d15 to be preserved - confirm this pair is needed */ | |||||
| stp x18, x19, [sp, #(5 * 16)] | |||||
| stp x20, x21, [sp, #(6 * 16)] | |||||
| stp x22, x23, [sp, #(7 * 16)] | |||||
| stp x24, x25, [sp, #(8 * 16)] | |||||
| stp x26, x27, [sp, #(9 * 16)] | |||||
| str x28, [sp, #(10 * 16)] /* single register in the last 16-byte slot */ | |||||
| .endm | |||||
| .macro RESTORE_REGS /* reload d8-d17 and x18-x28 from the frame laid out by SAVE_REGS, then release it */ | |||||
| ldp d8, d9, [sp, #(0 * 16)] | |||||
| ldp d10, d11, [sp, #(1 * 16)] | |||||
| ldp d12, d13, [sp, #(2 * 16)] | |||||
| ldp d14, d15, [sp, #(3 * 16)] | |||||
| ldp d16, d17, [sp, #(4 * 16)] | |||||
| ldp x18, x19, [sp, #(5 * 16)] | |||||
| ldp x20, x21, [sp, #(6 * 16)] | |||||
| ldp x22, x23, [sp, #(7 * 16)] | |||||
| ldp x24, x25, [sp, #(8 * 16)] | |||||
| ldp x26, x27, [sp, #(9 * 16)] | |||||
| ldr x28, [sp, #(10 * 16)] | |||||
| add sp, sp, #(11*16) /* pop the 11 * 16 byte frame */ | |||||
| .endm | |||||
| .macro INIT /* one-time setup: arrange alpha in v0/v1 for the final Y update; v2 is scratch */ | |||||
| #if !defined(XCONJ) | |||||
| #if !defined(DOUBLE) | |||||
| ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R | |||||
| fneg s2, ALPHA_I /* s2 = -ALPHA_I */ | |||||
| ins v1.s[1], v2.s[0] /* v1 = -ALPHA_I, ALPHA_I (lane 1, lane 0) */ | |||||
| ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I | |||||
| #else | |||||
| ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R | |||||
| fneg d2, ALPHA_I /* d2 = -ALPHA_I */ | |||||
| ins v1.d[1], v2.d[0] /* v1 = -ALPHA_I, ALPHA_I (lane 1, lane 0) */ | |||||
| ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I | |||||
| #endif | |||||
| #else // XCONJ | |||||
| #if !defined(DOUBLE) | |||||
| fneg s2, ALPHA_R /* s2 = -ALPHA_R */ | |||||
| ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R | |||||
| ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I | |||||
| #else | |||||
| fneg d2, ALPHA_R /* d2 = -ALPHA_R */ | |||||
| ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R | |||||
| ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I | |||||
| #endif | |||||
| #endif | |||||
| .endm | |||||
| .macro INIT_LOOP /* clear the accumulators before each outer iteration */ | |||||
| fmov d9, xzr // TEMP_R = [0, 0] | |||||
| fmov d10, xzr // TEMP_I = [0, 0] | |||||
| #if !defined(DOUBLE) /* single precision uses only v9/v10 in KERNEL_F4, nothing more to clear */ | |||||
| #else | |||||
| fmov d15, xzr // TEMP_R = [0, 0] | |||||
| fmov d16, xzr // TEMP_I = [0, 0] | |||||
| #endif | |||||
| fmov d2, xzr // TEMP = [0, 0] | |||||
| .endm | |||||
| .macro KERNEL_F4 /* accumulate four X * A products into v9/v10 (plus v15/v16 for the second pair in DOUBLE) */ | |||||
| #if !defined(DOUBLE) | |||||
| ld2 {v11.4s, v12.4s}, [X_PTR], #32 /* de-interleave: v11 = R(X), v12 = I(X) for four elements */ | |||||
| ld2 {v13.4s, v14.4s}, [A_PTR], #32 /* v13 = A_R, v14 = A_I for four elements */ | |||||
| #if !defined(CONJ) | |||||
| #if !defined(XCONJ) | |||||
| fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||||
| fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] | |||||
| fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] | |||||
| fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] | |||||
| #else | |||||
| fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||||
| fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] | |||||
| fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] | |||||
| fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R] | |||||
| #endif | |||||
| #else // CONJ | |||||
| #if !defined(XCONJ) | |||||
| fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||||
| fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] | |||||
| fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] | |||||
| fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] | |||||
| #else | |||||
| fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||||
| fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] | |||||
| fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] | |||||
| fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R] | |||||
| #endif | |||||
| #endif // CONJ | |||||
| #else // DOUBLE | |||||
| ld2 {v11.2d, v12.2d}, [X_PTR], #32 /* first pair of elements: v11 = R(X), v12 = I(X) */ | |||||
| ld2 {v13.2d, v14.2d}, [A_PTR], #32 /* v13 = A_R, v14 = A_I */ | |||||
| prfm PLDL1STRM, [X_PTR, #512] /* prefetch X 512 bytes ahead */ | |||||
| #if !defined(CONJ) | |||||
| #if !defined(XCONJ) | |||||
| fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||||
| fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] | |||||
| fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] | |||||
| fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] | |||||
| #else | |||||
| fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||||
| fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] | |||||
| fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] | |||||
| fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R] | |||||
| #endif | |||||
| #else // CONJ | |||||
| #if !defined(XCONJ) | |||||
| fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||||
| fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] | |||||
| fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] | |||||
| fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] | |||||
| #else | |||||
| fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||||
| fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] | |||||
| fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] | |||||
| fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R] | |||||
| #endif | |||||
| #endif // CONJ | |||||
| ld2 {v17.2d, v18.2d}, [X_PTR], #32 /* second pair of elements: v17 = R(X), v18 = I(X) */ | |||||
| ld2 {v19.2d, v20.2d}, [A_PTR], #32 /* v19 = A_R, v20 = A_I */ | |||||
| prfm PLDL1STRM, [A_PTR, #512] /* prefetch A 512 bytes ahead */ | |||||
| #if !defined(CONJ) | |||||
| #if !defined(XCONJ) | |||||
| fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||||
| fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] | |||||
| fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] | |||||
| fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | |||||
| #else | |||||
| fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||||
| fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I] | |||||
| fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] | |||||
| fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R] | |||||
| #endif | |||||
| #else // CONJ | |||||
| #if !defined(XCONJ) | |||||
| fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||||
| fmla v15.2d, v18.2d, v20.2d // [+ I(X) * A_I] | |||||
| fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I] | |||||
| fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | |||||
| #else | |||||
| fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||||
| fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] | |||||
| fmls v16.2d, v17.2d, v20.2d // [- R(X) * A_I] | |||||
| fmls v16.2d, v18.2d, v19.2d // [- I(X) * A_R] | |||||
| #endif | |||||
| #endif // CONJ | |||||
| #endif //DOUBLE | |||||
| .endm | |||||
| .macro KERNEL_F4_FINALIZE /* reduce the KERNEL_F4 lane accumulators to one (real, imaginary) pair in v2 */ | |||||
| #if !defined(DOUBLE) | |||||
| ext v21.16b, v9.16b, v9.16b, #8 /* v21 low half = upper two lanes of v9 */ | |||||
| fadd v9.2s, v9.2s, v21.2s /* fold four real partial sums into two */ | |||||
| faddp s9, v9.2s /* s9 = total real sum */ | |||||
| ext v21.16b, v10.16b, v10.16b, #8 /* v21 low half = upper two lanes of v10 */ | |||||
| fadd v10.2s, v10.2s, v21.2s /* fold four imaginary partial sums into two */ | |||||
| faddp s10, v10.2s /* s10 = total imaginary sum */ | |||||
| ins v2.s[0], v9.s[0] /* v2 lane 0 = real sum */ | |||||
| ins v2.s[1], v10.s[0] /* v2 lane 1 = imaginary sum */ | |||||
| #else | |||||
| fadd v9.2d, v9.2d, v15.2d /* merge the two real accumulators */ | |||||
| fadd v10.2d, v10.2d, v16.2d /* merge the two imaginary accumulators */ | |||||
| faddp d9, v9.2d /* d9 = total real sum */ | |||||
| faddp d10, v10.2d /* d10 = total imaginary sum */ | |||||
| ins v2.d[0], v9.d[0] /* v2 lane 0 = real sum */ | |||||
| ins v2.d[1], v10.d[0] /* v2 lane 1 = imaginary sum */ | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F1 /* accumulate one contiguous X * A product into v2; overwrites v4-v7 and v16 */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] | |||||
| ld1 {v5.s}[0], [A_PTR], #4 // A1 | |||||
| ld1 {v6.2s}, [X_PTR], #8 // [X1, X0] | |||||
| fneg s16, s5 /* s16 = -A1 */ | |||||
| ins v5.s[1], v16.s[0] // [-A1, A1] | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] | |||||
| #endif | |||||
| ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1] | |||||
| fmla v2.2s, v4.2s, v6.2s /* v2 += A0 * X */ | |||||
| fmla v2.2s, v5.2s, v7.2s /* v2 += signed A1 * swapped X */ | |||||
| #else // DOUBLE | |||||
| ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] | |||||
| ld1 {v5.d}[0], [A_PTR], #8 // A1 | |||||
| ld1 {v6.2d}, [X_PTR], #16 // [X1, X0] | |||||
| fneg d16, d5 /* d16 = -A1 */ | |||||
| ins v5.d[1], v16.d[0] // [-A1, A1] | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] | |||||
| #endif | |||||
| ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1] | |||||
| fmla v2.2d, v4.2d, v6.2d /* v2 += A0 * X */ | |||||
| fmla v2.2d, v5.2d, v7.2d /* v2 += signed A1 * swapped X */ | |||||
| #endif | |||||
| .endm | |||||
| .macro INIT_S /* strided-X path setup */ | |||||
| lsl INC_X, INC_X, #SHZ /* scale INC_X by 1 << SHZ so KERNEL_S1 can use it as a post-index byte increment */ | |||||
| .endm | |||||
| .macro KERNEL_S1 /* same accumulation as KERNEL_F1, but X_PTR steps by INC_X; overwrites v4-v7 and v16 */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] | |||||
| ld1 {v5.s}[0], [A_PTR], #4 // A1 | |||||
| ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0] | |||||
| fneg s16, s5 /* s16 = -A1 */ | |||||
| ins v5.s[1], v16.s[0] // [-A1, A1] | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] | |||||
| #endif | |||||
| ext v7.8b, v6.8b, v6.8b, #4 // [X0, X1] | |||||
| fmla v2.2s, v4.2s, v6.2s /* v2 += A0 * X */ | |||||
| fmla v2.2s, v5.2s, v7.2s /* v2 += signed A1 * swapped X */ | |||||
| #else // DOUBLE | |||||
| ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] | |||||
| ld1 {v5.d}[0], [A_PTR], #8 // A1 | |||||
| ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0] | |||||
| fneg d16, d5 /* d16 = -A1 */ | |||||
| ins v5.d[1], v16.d[0] // [-A1, A1] | |||||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||||
| ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] | |||||
| #endif | |||||
| ext v7.16b, v6.16b, v6.16b, #8 // [X0, X1] | |||||
| fmla v2.2d, v4.2d, v6.2d /* v2 += A0 * X */ | |||||
| fmla v2.2d, v5.2d, v7.2d /* v2 += signed A1 * swapped X */ | |||||
| #endif | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* entry: for each of N outer iterations, sum M products of X and one A column into TEMP, then update one Y element with ALPHA and TEMP; returns 0 in w0 */ | |||||
| ldr INC_Y, [sp] /* INC_Y is read from the incoming stack, before SAVE_REGS moves sp - presumably the first stack-passed argument, confirm against the caller */ | |||||
| SAVE_REGS | |||||
| cmp N, xzr | |||||
| ble zgemv_t_kernel_L999 /* nothing to do when N <= 0 */ | |||||
| cmp M, xzr | |||||
| ble zgemv_t_kernel_L999 /* nothing to do when M <= 0 */ | |||||
| lsl LDA, LDA, #SHZ /* LDA: elements -> bytes */ | |||||
| lsl INC_Y, INC_Y, #SHZ /* INC_Y: elements -> bytes */ | |||||
| mov J, N /* J counts the remaining outer iterations */ | |||||
| INIT | |||||
| cmp INC_X, #1 | |||||
| bne zgemv_t_kernel_S_BEGIN /* INC_X != 1 takes the strided-X path */ | |||||
| zgemv_t_kernel_F_LOOP: /* contiguous-X path: one iteration per Y element */ | |||||
| mov A_PTR, A | |||||
| mov X_PTR, X | |||||
| INIT_LOOP | |||||
| asr I, M, #2 /* I = M / 4 groups of four elements */ | |||||
| cmp I, xzr | |||||
| beq zgemv_t_kernel_F1 /* no full group: skip KERNEL_F4 and its finalize, v2 stays zero */ | |||||
| zgemv_t_kernel_F4: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne zgemv_t_kernel_F4 | |||||
| KERNEL_F4_FINALIZE | |||||
| zgemv_t_kernel_F1: /* remaining M % 4 elements accumulate directly into v2 */ | |||||
| ands I, M, #3 | |||||
| ble zgemv_t_kernel_F_END | |||||
| zgemv_t_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne zgemv_t_kernel_F10 | |||||
| zgemv_t_kernel_F_END: /* fold TEMP (v2) into the current Y element using v0/v1 from INIT */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v4.2s}, [Y] | |||||
| ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I] | |||||
| fmla v4.2s, v0.2s, v2.2s | |||||
| fmla v4.2s, v1.2s, v3.2s | |||||
| st1 {v4.2s}, [Y], INC_Y /* store and step Y by INC_Y */ | |||||
| #else // DOUBLE | |||||
| ld1 {v4.2d}, [Y] | |||||
| ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I] | |||||
| fmla v4.2d, v0.2d, v2.2d | |||||
| fmla v4.2d, v1.2d, v3.2d | |||||
| st1 {v4.2d}, [Y], INC_Y /* store and step Y by INC_Y */ | |||||
| #endif | |||||
| add A, A, LDA /* next column of A */ | |||||
| subs J, J, #1 | |||||
| bne zgemv_t_kernel_F_LOOP | |||||
| b zgemv_t_kernel_L999 | |||||
| zgemv_t_kernel_S_BEGIN: /* strided-X path */ | |||||
| INIT_S | |||||
| zgemv_t_kernel_S_LOOP: /* one iteration per Y element */ | |||||
| mov A_PTR, A | |||||
| mov X_PTR, X | |||||
| INIT_LOOP | |||||
| asr I, M, #2 /* I = M / 4 groups of four elements */ | |||||
| cmp I, xzr | |||||
| ble zgemv_t_kernel_S1 | |||||
| zgemv_t_kernel_S4: /* four elements per pass */ | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne zgemv_t_kernel_S4 | |||||
| zgemv_t_kernel_S1: /* remaining M % 4 elements */ | |||||
| ands I, M, #3 | |||||
| ble zgemv_t_kernel_S_END | |||||
| zgemv_t_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne zgemv_t_kernel_S10 | |||||
| zgemv_t_kernel_S_END: /* fold TEMP (v2) into the current Y element using v0/v1 from INIT */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v4.2s}, [Y] | |||||
| ext v3.8b, v2.8b, v2.8b, #4 // [TEMP_R, TEMP_I] | |||||
| fmla v4.2s, v0.2s, v2.2s | |||||
| fmla v4.2s, v1.2s, v3.2s | |||||
| st1 {v4.2s}, [Y], INC_Y /* store and step Y by INC_Y */ | |||||
| #else // DOUBLE | |||||
| ld1 {v4.2d}, [Y] | |||||
| ext v3.16b, v2.16b, v2.16b, #8 // [TEMP_R, TEMP_I] | |||||
| fmla v4.2d, v0.2d, v2.2d | |||||
| fmla v4.2d, v1.2d, v3.2d | |||||
| st1 {v4.2d}, [Y], INC_Y /* store and step Y by INC_Y */ | |||||
| #endif | |||||
| add A, A, LDA /* next column of A */ | |||||
| subs J, J, #1 | |||||
| bne zgemv_t_kernel_S_LOOP | |||||
| zgemv_t_kernel_L999: /* common exit */ | |||||
| RESTORE_REGS | |||||
| mov w0, wzr /* return value 0 */ | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,228 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride; INIT_S scales it to bytes */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if !defined(DOUBLE) /* single-precision complex build */ | |||||
| #define TMPF s6 /* scratch scalar for one element's squared magnitude */ | |||||
| #define SSQ s0 /* running sum of squares (lane 0 of v0) */ | |||||
| #define TMPVF {v6.s}[0] /* lane-0 register-list form of v6 */ | |||||
| #define SZ 4 /* presumably bytes per real value - not referenced in the macros shown here */ | |||||
| #else /* DOUBLE: double-precision complex build */ | |||||
| #define TMPF d6 /* scratch scalar for one element's squared magnitude */ | |||||
| #define SSQ d0 /* running sum of squares (lane 0 of v0) */ | |||||
| #define TMPVF {v6.d}[0] /* lane-0 register-list form of v6 */ | |||||
| #define SZ 8 /* presumably bytes per real value - not referenced in the macros shown here */ | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| .macro KERNEL_F1 /* add the squared real and imaginary parts of one contiguous element to SSQ */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v1.2s}, [X], #8 /* v1 = one complex element; advance X by 8 bytes */ | |||||
| fmul v1.2s, v1.2s, v1.2s /* square both parts */ | |||||
| faddp TMPF, v1.2s /* TMPF = sum of the two squares */ | |||||
| fadd SSQ, SSQ, TMPF | |||||
| #else | |||||
| ld1 {v1.2d}, [X], #16 /* v1 = one complex element; advance X by 16 bytes */ | |||||
| fmul v1.2d, v1.2d, v1.2d /* square both parts */ | |||||
| faddp TMPF, v1.2d /* TMPF = sum of the two squares */ | |||||
| fadd SSQ, SSQ, TMPF | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F8 /* Unit-stride block step: accumulates the squares of the next 16 values into the vector accumulators v0 and v5. */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v1.4s, v2.4s}, [X], #32 /* values 0..7 */ | |||||
| fmla v0.4s, v1.4s, v1.4s /* v0 += v1 * v1 */ | |||||
| fmla v5.4s, v2.4s, v2.4s /* v5 += v2 * v2 (second, independent accumulator) */ | |||||
| ld1 {v3.4s,v4.4s}, [X], #32 /* values 8..15 */ | |||||
| fmla v0.4s, v3.4s, v3.4s | |||||
| fmla v5.4s, v4.4s, v4.4s | |||||
| PRFM PLDL1KEEP, [X, #1024] /* prefetch 1024 bytes ahead of X (single-precision path only) */ | |||||
| #else // DOUBLE | |||||
| ld1 {v1.2d, v2.2d}, [X], #32 /* values 0..3 */ | |||||
| fmla v0.2d, v1.2d, v1.2d /* v0 += v1 * v1 */ | |||||
| fmla v5.2d, v2.2d, v2.2d /* v5 += v2 * v2 (second, independent accumulator) */ | |||||
| ld1 {v3.2d, v4.2d}, [X], #32 /* values 4..7 */ | |||||
| fmla v0.2d, v3.2d, v3.2d | |||||
| fmla v5.2d, v4.2d, v4.2d | |||||
| ld1 {v16.2d, v17.2d}, [X], #32 /* values 8..11 */ | |||||
| fmla v0.2d, v16.2d, v16.2d | |||||
| fmla v5.2d, v17.2d, v17.2d | |||||
| ld1 {v18.2d, v19.2d}, [X], #32 /* values 12..15 */ | |||||
| fmla v0.2d, v18.2d, v18.2d | |||||
| fmla v5.2d, v19.2d, v19.2d | |||||
| #endif | |||||
| .endm | |||||
| .macro nrm2_kernel_F8_FINALIZE /* Reduces the vector accumulators v0 and v5 filled by KERNEL_F8 to the single scalar SSQ. */ | |||||
| #if !defined(DOUBLE) | |||||
| fadd v0.4s, v0.4s, v5.4s /* merge the two accumulators lane by lane */ | |||||
| ext v1.16b, v0.16b, v0.16b, #8 /* v1 low half = v0 high half */ | |||||
| fadd v0.2s, v0.2s, v1.2s /* fold the high half onto the low half */ | |||||
| faddp SSQ, v0.2s /* SSQ = sum of the remaining two lanes */ | |||||
| #else | |||||
| fadd v0.2d, v0.2d, v5.2d /* merge the two accumulators lane by lane */ | |||||
| faddp SSQ, v0.2d /* SSQ = lane0 + lane1 */ | |||||
| #endif | |||||
| .endm | |||||
| .macro INIT_S /* Strided-path setup: converts INC_X to a byte stride and initialises SSQ from the first element (overwrites SSQ, does not accumulate). */ | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 /* INC_X *= 8: byte stride for elements of two floats */ | |||||
| ld1 {v1.2s}, [X], INC_X /* load the first element, then step X by the byte stride */ | |||||
| fmul v1.2s, v1.2s, v1.2s /* square both lanes */ | |||||
| faddp SSQ, v1.2s /* SSQ = lane0 + lane1 */ | |||||
| #else | |||||
| lsl INC_X, INC_X, #4 /* INC_X *= 16: byte stride for elements of two doubles */ | |||||
| ld1 {v1.2d}, [X], INC_X /* load the first element, then step X by the byte stride */ | |||||
| fmul v1.2d, v1.2d, v1.2d /* square both lanes */ | |||||
| faddp SSQ, v1.2d /* SSQ = lane0 + lane1 */ | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_S1 /* Strided step: adds the squares of the two values at X to SSQ, then steps X by INC_X bytes (INC_X already scaled by INIT_S). */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v1.2s}, [X], INC_X /* load two floats, post-increment X by the byte stride */ | |||||
| fmul v1.2s, v1.2s, v1.2s /* square both lanes */ | |||||
| faddp TMPF, v1.2s /* TMPF = lane0 + lane1 */ | |||||
| fadd SSQ, SSQ, TMPF /* accumulate into the running sum */ | |||||
| #else | |||||
| ld1 {v1.2d}, [X], INC_X /* load two doubles, post-increment X by the byte stride */ | |||||
| fmul v1.2d, v1.2d, v1.2d /* square both lanes */ | |||||
| faddp TMPF, v1.2d /* TMPF = lane0 + lane1 */ | |||||
| fadd SSQ, SSQ, TMPF /* accumulate into the running sum */ | |||||
| #endif | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* Entry: returns in SSQ the square root of the plain sum of squares over N two-value elements of X (no scaling is applied to the squares); returns 0 when N <= 0 or INC_X <= 0. */ | |||||
| #if !defined(DOUBLE) | |||||
| fmov SSQ, wzr /* SSQ = 0 */ | |||||
| fmov s5, SSQ /* clear the second accumulator used by KERNEL_F8 */ | |||||
| #else | |||||
| fmov SSQ, xzr /* SSQ = 0 */ | |||||
| fmov d5, SSQ /* clear the second accumulator used by KERNEL_F8 */ | |||||
| #endif | |||||
| cmp N, xzr | |||||
| ble nrm2_kernel_zero /* N <= 0: return the zero already in SSQ */ | |||||
| cmp INC_X, xzr | |||||
| ble nrm2_kernel_zero /* INC_X <= 0: return the zero already in SSQ */ | |||||
| cmp INC_X, #1 | |||||
| bne nrm2_kernel_S_BEGIN /* non-unit stride: take the strided path */ | |||||
| nrm2_kernel_F_BEGIN: /* unit-stride path */ | |||||
| asr I, N, #3 /* I = N / 8 full blocks */ | |||||
| cmp I, xzr | |||||
| beq nrm2_kernel_F1_INIT /* no full block: skip the vector loop and its reduction */ | |||||
| nrm2_kernel_F8: | |||||
| KERNEL_F8 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_F8 | |||||
| nrm2_kernel_F8_FINALIZE /* collapse v0/v5 into SSQ before the scalar tail */ | |||||
| nrm2_kernel_F1: | |||||
| ands I, N, #7 /* I = N % 8 leftover elements */ | |||||
| ble nrm2_kernel_L999 | |||||
| nrm2_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_F10 | |||||
| b nrm2_kernel_L999 | |||||
| nrm2_kernel_F1_INIT: /* reached only when N < 8; SSQ is still the initial zero */ | |||||
| b nrm2_kernel_F1 | |||||
| nrm2_kernel_S_BEGIN: /* strided path */ | |||||
| INIT_S /* scales INC_X to bytes and consumes the first element */ | |||||
| subs N, N, #1 /* one element already handled by INIT_S */ | |||||
| ble nrm2_kernel_L999 | |||||
| asr I, N, #2 /* I = remaining / 4 */ | |||||
| cmp I, xzr | |||||
| ble nrm2_kernel_S1 | |||||
| nrm2_kernel_S4: /* strided loop unrolled by four elements */ | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_S4 | |||||
| nrm2_kernel_S1: | |||||
| ands I, N, #3 /* I = remaining % 4 */ | |||||
| ble nrm2_kernel_L999 | |||||
| nrm2_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne nrm2_kernel_S10 | |||||
| nrm2_kernel_L999: | |||||
| fsqrt SSQ, SSQ /* result = sqrt(sum of squares) */ | |||||
| ret | |||||
| nrm2_kernel_zero: /* early exit: SSQ still holds 0 */ | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,256 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length */ | |||||
| #define X x1 /* X vector address */ | |||||
| #define INC_X x2 /* X stride */ | |||||
| #define Y x3 /* Y vector address */ | |||||
| #define INC_Y x4 /* Y stride */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if !defined(DOUBLE) | |||||
| #define C s0 /* rotation coefficient multiplying X in X' and Y in Y' (presumably the cosine) */ | |||||
| #define S s1 /* rotation coefficient multiplying Y in X' and X in Y' (presumably the sine) */ | |||||
| #else | |||||
| #define C d0 /* rotation coefficient multiplying X in X' and Y in Y' (presumably the cosine) */ | |||||
| #define S d1 /* rotation coefficient multiplying Y in X' and X in Y' (presumably the sine) */ | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| .macro INIT /* Duplicates the scalar coefficients into two lanes so one vector op covers both values of an element. */ | |||||
| #if !defined(DOUBLE) | |||||
| ins v0.s[1], v0.s[0] // v0 = [C, C] | |||||
| ins v1.s[1], v1.s[0] // v1 = [S, S] | |||||
| #else | |||||
| ins v0.d[1], v0.d[0] // v0 = [C, C] | |||||
| ins v1.d[1], v1.d[0] // v1 = [S, S] | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F1 /* Unit-stride step: rotates one two-value element of X and Y in place (X' = C*X + S*Y, Y' = C*Y - S*X) and advances both pointers. */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.2s}, [X] /* v2 = [X1, X0] */ | |||||
| ld1 {v3.2s}, [Y] /* v3 = [Y1, Y0] */ | |||||
| fmul v4.2s, v0.2s, v2.2s // [C*X1, C*X0] | |||||
| fmla v4.2s, v1.2s, v3.2s // [C*X1 + S*Y1, C*X0 + S*Y0] | |||||
| fmul v5.2s, v0.2s, v3.2s // [C*Y1, C*Y0] | |||||
| fmls v5.2s, v1.2s, v2.2s // [C*Y1 - S*X1, C*Y0 - S*X0] | |||||
| st1 {v4.2s}, [X], #8 /* store X', advance X by 8 bytes */ | |||||
| st1 {v5.2s}, [Y], #8 /* store Y', advance Y by 8 bytes */ | |||||
| #else | |||||
| ld1 {v2.2d}, [X] /* v2 = [X1, X0] */ | |||||
| ld1 {v3.2d}, [Y] /* v3 = [Y1, Y0] */ | |||||
| fmul v4.2d, v0.2d, v2.2d // [C*X1, C*X0] | |||||
| fmla v4.2d, v1.2d, v3.2d // [C*X1 + S*Y1, C*X0 + S*Y0] | |||||
| fmul v5.2d, v0.2d, v3.2d // [C*Y1, C*Y0] | |||||
| fmls v5.2d, v1.2d, v2.2d // [C*Y1 - S*X1, C*Y0 - S*X0] | |||||
| st1 {v4.2d}, [X], #16 /* store X', advance X by 16 bytes */ | |||||
| st1 {v5.2d}, [Y], #16 /* store Y', advance Y by 16 bytes */ | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_INIT_F4 /* Single precision only: widens the two-lane coefficients set up by INIT to all four lanes for KERNEL_F4; expands to nothing for DOUBLE. */ | |||||
| #if !defined(DOUBLE) | |||||
| ins v0.d[1], v0.d[0] // v0 = [C, C, C, C] | |||||
| ins v1.d[1], v1.d[0] // v1 = [S, S, S, S] | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F4 /* Unit-stride block step: rotates eight consecutive values of X and Y in place (four two-value elements) and advances both pointers by 32 bytes (single) or 64 bytes (double). */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.4s, v3.4s}, [X] /* v2 = X3..X0, v3 = X7..X4 */ | |||||
| ld1 {v4.4s, v5.4s}, [Y] /* v4 = Y3..Y0, v5 = Y7..Y4 */ | |||||
| fmul v6.4s, v0.4s, v2.4s // C*X3, C*X2, C*X1, C*X0 | |||||
| fmul v7.4s, v0.4s, v3.4s // C*X7, C*X6, C*X5, C*X4 | |||||
| fmla v6.4s, v1.4s, v4.4s // C*X3+S*Y3, ..., C*X0+S*Y0 | |||||
| fmla v7.4s, v1.4s, v5.4s // C*X7+S*Y7, ..., C*X4+S*Y4 | |||||
| fmul v16.4s, v0.4s, v4.4s // C*Y3, C*Y2, C*Y1, C*Y0 | |||||
| fmul v17.4s, v0.4s, v5.4s // C*Y7, C*Y6, C*Y5, C*Y4 | |||||
| fmls v16.4s, v1.4s, v2.4s // C*Y3-S*X3, ..., C*Y0-S*X0 | |||||
| fmls v17.4s, v1.4s, v3.4s // C*Y7-S*X7, ..., C*Y4-S*X4 | |||||
| st1 {v6.4s,v7.4s}, [X], #32 /* store X', advance X */ | |||||
| st1 {v16.4s,v17.4s}, [Y], #32 /* store Y', advance Y */ | |||||
| #else // DOUBLE | |||||
| ld1 {v2.2d, v3.2d}, [X] /* first half: v2 = X1,X0  v3 = X3,X2 */ | |||||
| ld1 {v4.2d, v5.2d}, [Y] /* first half: v4 = Y1,Y0  v5 = Y3,Y2 */ | |||||
| fmul v6.2d, v0.2d, v2.2d // C*X1, C*X0 | |||||
| fmul v7.2d, v0.2d, v3.2d // C*X3, C*X2 | |||||
| fmla v6.2d, v1.2d, v4.2d // C*X1+S*Y1, C*X0+S*Y0 | |||||
| fmla v7.2d, v1.2d, v5.2d // C*X3+S*Y3, C*X2+S*Y2 | |||||
| fmul v16.2d, v0.2d, v4.2d // C*Y1, C*Y0 | |||||
| fmul v17.2d, v0.2d, v5.2d // C*Y3, C*Y2 | |||||
| fmls v16.2d, v1.2d, v2.2d // C*Y1-S*X1, C*Y0-S*X0 | |||||
| fmls v17.2d, v1.2d, v3.2d // C*Y3-S*X3, C*Y2-S*X2 | |||||
| st1 {v6.2d,v7.2d}, [X], #32 /* store X', advance X */ | |||||
| st1 {v16.2d,v17.2d}, [Y], #32 /* store Y', advance Y */ | |||||
| ld1 {v2.2d, v3.2d}, [X] /* second half: v2 = X5,X4  v3 = X7,X6 */ | |||||
| ld1 {v4.2d, v5.2d}, [Y] /* second half: v4 = Y5,Y4  v5 = Y7,Y6 */ | |||||
| fmul v6.2d, v0.2d, v2.2d // C*X5, C*X4 | |||||
| fmul v7.2d, v0.2d, v3.2d // C*X7, C*X6 | |||||
| fmla v6.2d, v1.2d, v4.2d // C*X5+S*Y5, C*X4+S*Y4 | |||||
| fmla v7.2d, v1.2d, v5.2d // C*X7+S*Y7, C*X6+S*Y6 | |||||
| fmul v16.2d, v0.2d, v4.2d // C*Y5, C*Y4 | |||||
| fmul v17.2d, v0.2d, v5.2d // C*Y7, C*Y6 | |||||
| fmls v16.2d, v1.2d, v2.2d // C*Y5-S*X5, C*Y4-S*X4 | |||||
| fmls v17.2d, v1.2d, v3.2d // C*Y7-S*X7, C*Y6-S*X6 | |||||
| st1 {v6.2d,v7.2d}, [X], #32 /* store X', advance X */ | |||||
| st1 {v16.2d,v17.2d}, [Y], #32 /* store Y', advance Y */ | |||||
| #endif | |||||
| .endm | |||||
| .macro INIT_S /* Strided-path setup: converts INC_X and INC_Y from element counts to byte strides. */ | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 /* INC_X *= 8 (two floats per element) */ | |||||
| lsl INC_Y, INC_Y, #3 /* INC_Y *= 8 */ | |||||
| #else | |||||
| lsl INC_X, INC_X, #4 /* INC_X *= 16 (two doubles per element) */ | |||||
| lsl INC_Y, INC_Y, #4 /* INC_Y *= 16 */ | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_S1 /* Strided step: rotates one two-value element of X and Y in place, then steps X and Y by their byte strides (scaled by INIT_S). */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.2s}, [X] /* v2 = [X1, X0] */ | |||||
| ld1 {v3.2s}, [Y] /* v3 = [Y1, Y0] */ | |||||
| fmul v4.2s, v0.2s, v2.2s // [C*X1, C*X0] | |||||
| fmla v4.2s, v1.2s, v3.2s // [C*X1 + S*Y1, C*X0 + S*Y0] | |||||
| fmul v5.2s, v0.2s, v3.2s // [C*Y1, C*Y0] | |||||
| fmls v5.2s, v1.2s, v2.2s // [C*Y1 - S*X1, C*Y0 - S*X0] | |||||
| st1 {v4.2s}, [X], INC_X /* store X', step X by INC_X bytes */ | |||||
| st1 {v5.2s}, [Y], INC_Y /* store Y', step Y by INC_Y bytes */ | |||||
| #else | |||||
| ld1 {v2.2d}, [X] /* v2 = [X1, X0] */ | |||||
| ld1 {v3.2d}, [Y] /* v3 = [Y1, Y0] */ | |||||
| fmul v4.2d, v0.2d, v2.2d // [C*X1, C*X0] | |||||
| fmla v4.2d, v1.2d, v3.2d // [C*X1 + S*Y1, C*X0 + S*Y0] | |||||
| fmul v5.2d, v0.2d, v3.2d // [C*Y1, C*Y0] | |||||
| fmls v5.2d, v1.2d, v2.2d // [C*Y1 - S*X1, C*Y0 - S*X0] | |||||
| st1 {v4.2d}, [X], INC_X /* store X', step X by INC_X bytes */ | |||||
| st1 {v5.2d}, [Y], INC_Y /* store Y', step Y by INC_Y bytes */ | |||||
| #endif | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* Entry: rotates N two-value elements of X and Y in place (X' = C*X + S*Y, Y' = C*Y - S*X); w0 is 0 on return. */ | |||||
| cmp N, xzr | |||||
| ble rot_kernel_L999 /* N <= 0: nothing to do */ | |||||
| INIT /* duplicate C and S into two lanes */ | |||||
| cmp INC_X, #1 | |||||
| bne rot_kernel_S_BEGIN /* non-unit X stride: strided path */ | |||||
| cmp INC_Y, #1 | |||||
| bne rot_kernel_S_BEGIN /* non-unit Y stride: strided path */ | |||||
| rot_kernel_F_BEGIN: /* unit-stride path */ | |||||
| asr I, N, #2 /* I = N / 4 full blocks */ | |||||
| cmp I, xzr | |||||
| beq rot_kernel_F1 | |||||
| KERNEL_INIT_F4 /* widen C/S to four lanes (single precision only) */ | |||||
| rot_kernel_F4: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne rot_kernel_F4 | |||||
| rot_kernel_F1: | |||||
| ands I, N, #3 /* I = N % 4 leftover elements */ | |||||
| ble rot_kernel_L999 | |||||
| rot_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne rot_kernel_F10 | |||||
| mov w0, wzr /* return 0 */ | |||||
| ret | |||||
| rot_kernel_S_BEGIN: /* strided path */ | |||||
| INIT_S /* scale INC_X / INC_Y to byte strides */ | |||||
| asr I, N, #2 /* I = N / 4 */ | |||||
| cmp I, xzr | |||||
| ble rot_kernel_S1 | |||||
| rot_kernel_S4: /* strided loop unrolled by four elements */ | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne rot_kernel_S4 | |||||
| rot_kernel_S1: | |||||
| ands I, N, #3 /* I = N % 4 */ | |||||
| ble rot_kernel_L999 | |||||
| rot_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne rot_kernel_S10 | |||||
| rot_kernel_L999: | |||||
| mov w0, wzr /* return 0 */ | |||||
| ret | |||||
| EPILOGUE /* closes the routine like the other kernels in this patch; EPILOGUE is defined empty for this target */ | |||||
| @@ -0,0 +1,274 @@ | |||||
| /******************************************************************************* | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *******************************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N x0 /* vector length */ | |||||
| #define X x3 /* X vector address */ | |||||
| #define INC_X x4 /* X stride */ | |||||
| #define I x5 /* loop variable */ | |||||
| /******************************************************************************* | |||||
| * Macro definitions | |||||
| *******************************************************************************/ | |||||
| #if !defined(DOUBLE) | |||||
| #define DA_R s0 /* real part of the complex scale factor */ | |||||
| #define DA_I s1 /* imaginary part of the complex scale factor */ | |||||
| #else | |||||
| #define DA_R d0 /* real part of the complex scale factor */ | |||||
| #define DA_I d1 /* imaginary part of the complex scale factor */ | |||||
| #endif | |||||
| /******************************************************************************/ | |||||
| .macro INIT /* Builds the two coefficient vectors used by every kernel: v0 = (DA_R, DA_R) and v1 = (DA_I, -DA_I), written high lane first. */ | |||||
| #if !defined(DOUBLE) | |||||
| ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R | |||||
| fneg s2, DA_I /* s2 = -DA_I */ | |||||
| ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I | |||||
| ext v1.8b, v1.8b, v1.8b, #4 // swap lanes: v1 = DA_I, -DA_I | |||||
| #else | |||||
| ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R | |||||
| fneg d2, DA_I /* d2 = -DA_I */ | |||||
| ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I | |||||
| ext v1.16b, v1.16b, v1.16b, #8 // swap lanes: v1 = DA_I, -DA_I | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F1 /* Unit-stride step: multiplies one complex element at X by (DA_R + i*DA_I) in place and advances X. X0 = real part, X1 = imaginary part. */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.2s}, [X] // X1, X0 | |||||
| ext v3.8b, v2.8b, v2.8b, #4 // X0, X1 | |||||
| fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 | |||||
| fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
| st1 {v2.2s}, [X], #8 /* store the product, advance X by 8 bytes */ | |||||
| #else | |||||
| ld1 {v2.2d}, [X] // X1, X0 | |||||
| ext v3.16b, v2.16b, v2.16b, #8 // X0, X1 | |||||
| fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 | |||||
| fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
| st1 {v2.2d}, [X], #16 /* store the product, advance X by 16 bytes */ | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_INIT_F4 /* Single precision only: widens the two-lane coefficients built by INIT to four lanes for KERNEL_F4; expands to nothing for DOUBLE. */ | |||||
| #if !defined(DOUBLE) | |||||
| // Replicate the lower 2 floats into the upper 2 slots | |||||
| ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R | |||||
| ins v1.d[1], v1.d[0] // v1 = DA_I, -DA_I, DA_I, -DA_I | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_F4 /* Unit-stride block step: multiplies four complex elements at X by (DA_R + i*DA_I) in place, advances X by 32 bytes (single) or 64 bytes (double), then prefetches ahead. */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.4s,v3.4s}, [X] // V2 = X[3], X[2], X[1], X[0] | |||||
| // V3 = X[7], X[6], X[5], X[4] | |||||
| ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1] | |||||
| ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1] | |||||
| ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1] | |||||
| fmul v2.4s, v0.4s, v2.4s // X'[ix] = DA_R * X[ix] | |||||
| // X'[ix+1] = DA_R * X[ix+1] | |||||
| fmla v2.4s, v1.4s, v6.4s // X'[ix] += -DA_I * X[ix+1] | |||||
| // X'[ix+1] += DA_I * X[ix] | |||||
| ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5] | |||||
| ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5] | |||||
| ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5] | |||||
| fmul v3.4s, v0.4s, v3.4s // X'[ix] = DA_R * X[ix] | |||||
| // X'[ix+1] = DA_R * X[ix+1] | |||||
| fmla v3.4s, v1.4s, v7.4s // X'[ix] += -DA_I * X[ix+1] | |||||
| // X'[ix+1] += DA_I * X[ix] | |||||
| st1 {v2.4s,v3.4s}, [X], #32 /* store the four products, advance X */ | |||||
| #else // DOUBLE | |||||
| ld1 {v2.2d,v3.2d,v4.2d,v5.2d}, [X] // CX0, CX1, CX2, CX3 | |||||
| ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1] | |||||
| ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1] | |||||
| ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1] | |||||
| ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1] | |||||
| fmul v2.2d, v0.2d, v2.2d /* DA_R * (im, re) of CX0 */ | |||||
| fmla v2.2d, v1.2d, v20.2d /* += (DA_I * re, -DA_I * im) of CX0 */ | |||||
| fmul v3.2d, v0.2d, v3.2d /* same for CX1 */ | |||||
| fmla v3.2d, v1.2d, v21.2d | |||||
| st1 {v2.2d,v3.2d}, [X], #32 /* store CX0', CX1', advance X */ | |||||
| fmul v4.2d, v0.2d, v4.2d /* same for CX2 */ | |||||
| fmla v4.2d, v1.2d, v22.2d | |||||
| fmul v5.2d, v0.2d, v5.2d /* same for CX3 */ | |||||
| fmla v5.2d, v1.2d, v23.2d | |||||
| st1 {v4.2d,v5.2d}, [X], #32 /* store CX2', CX3', advance X */ | |||||
| #endif | |||||
| PRFM PLDL1KEEP, [X, #1024] /* prefetch 1024 bytes ahead of the updated X */ | |||||
| .endm | |||||
| .macro INIT_S /* Converts INC_X from an element count to a byte stride. */ | |||||
| #if !defined(DOUBLE) | |||||
| lsl INC_X, INC_X, #3 /* INC_X *= 8 (two floats per complex element) */ | |||||
| #else | |||||
| lsl INC_X, INC_X, #4 /* INC_X *= 16 (two doubles per complex element) */ | |||||
| #endif | |||||
| .endm | |||||
| .macro KERNEL_S1 /* Strided step: multiplies one complex element at X by (DA_R + i*DA_I) in place, then steps X by INC_X bytes (scaled by INIT_S). */ | |||||
| #if !defined(DOUBLE) | |||||
| ld1 {v2.2s}, [X] // X1, X0 | |||||
| ext v3.8b, v2.8b, v2.8b, #4 // X0, X1 | |||||
| fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 | |||||
| fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
| st1 {v2.2s}, [X], INC_X /* store the product, step X by the byte stride */ | |||||
| #else | |||||
| ld1 {v2.2d}, [X] // X1, X0 | |||||
| ext v3.16b, v2.16b, v2.16b, #8 // X0, X1 | |||||
| fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 | |||||
| fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
| st1 {v2.2d}, [X], INC_X /* store the product, step X by the byte stride */ | |||||
| #endif | |||||
| .endm | |||||
| /******************************************************************************* | |||||
| * End of macro definitions | |||||
| *******************************************************************************/ | |||||
| PROLOGUE /* Entry: scales N complex elements of X in place by (DA_R + i*DA_I); when both parts compare equal to zero the elements are overwritten with DA_R/DA_I directly. w0 is 0 on return. */ | |||||
| cmp N, xzr | |||||
| ble zscal_kernel_L999 /* N <= 0: nothing to do */ | |||||
| fcmp DA_R, #0.0 | |||||
| bne zscal_kernel_1 /* real part non-zero: general multiply */ | |||||
| fcmp DA_I, #0.0 | |||||
| beq zscal_kernel_zero /* both parts zero: store-only path */ | |||||
| // TODO: special case DA_R == 0 && DA_I != 0 | |||||
| zscal_kernel_1: | |||||
| // TODO: special case DA_R != 0 && DA_I == 0 | |||||
| INIT /* v0 = (DA_R, DA_R), v1 = (DA_I, -DA_I) */ | |||||
| cmp INC_X, #1 | |||||
| bne zscal_kernel_S_BEGIN /* non-unit stride: strided path */ | |||||
| zscal_kernel_F_BEGIN: /* unit-stride path */ | |||||
| asr I, N, #2 /* I = N / 4 full blocks */ | |||||
| cmp I, xzr | |||||
| beq zscal_kernel_F1 | |||||
| KERNEL_INIT_F4 /* widen coefficients to four lanes (single precision only) */ | |||||
| zscal_kernel_F4: | |||||
| KERNEL_F4 | |||||
| subs I, I, #1 | |||||
| bne zscal_kernel_F4 | |||||
| zscal_kernel_F1: | |||||
| ands I, N, #3 /* I = N % 4 leftover elements */ | |||||
| ble zscal_kernel_L999 | |||||
| zscal_kernel_F10: | |||||
| KERNEL_F1 | |||||
| subs I, I, #1 | |||||
| bne zscal_kernel_F10 | |||||
| mov w0, wzr /* return 0 */ | |||||
| ret | |||||
| zscal_kernel_S_BEGIN: /* strided path */ | |||||
| INIT_S /* scale INC_X to a byte stride */ | |||||
| asr I, N, #2 /* I = N / 4 */ | |||||
| cmp I, xzr | |||||
| ble zscal_kernel_S1 | |||||
| zscal_kernel_S4: /* strided loop unrolled by four elements */ | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne zscal_kernel_S4 | |||||
| zscal_kernel_S1: | |||||
| ands I, N, #3 /* I = N % 4 */ | |||||
| ble zscal_kernel_L999 | |||||
| zscal_kernel_S10: | |||||
| KERNEL_S1 | |||||
| subs I, I, #1 | |||||
| bne zscal_kernel_S10 | |||||
| zscal_kernel_L999: | |||||
| mov w0, wzr /* return 0 */ | |||||
| ret | |||||
| zscal_kernel_zero: /* scale factor is zero: X is not read, only overwritten */ | |||||
| INIT_S /* scale INC_X to a byte stride (used for any stride, including 1) */ | |||||
| zscal_kernel_Z1: | |||||
| stp DA_R, DA_I, [X] /* store the (zero-valued) real and imaginary parts */ | |||||
| add X, X, INC_X | |||||
| subs N, N, #1 | |||||
| bne zscal_kernel_Z1 | |||||
| mov w0, wzr /* return 0 */ | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,883 @@ | |||||
| #include "common.h" | |||||
| #define MADD_ALPHA_N_STORE(C, res, alpha) /* C[0..1] = res * alpha as a complex product; overwrites C, does not accumulate. Expands to two statements, so use only where a plain statement sequence is valid. */ \ | |||||
| C[0] = res ## _r * alpha ## _r - res ## _i * alpha ## _i; /* real part */ \ | |||||
| C[1] = res ## _r * alpha ## _i + res ## _i * alpha ## _r; /* imaginary part */ | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||||
| #define MADD(res, op1, op2) /* res += op1 * op2 (no conjugation) */ \ | |||||
| res ## _r += op1 ## _r * op2 ## _r; \ | |||||
| res ## _r -= op1 ## _i * op2 ## _i; \ | |||||
| res ## _i += op1 ## _r * op2 ## _i; \ | |||||
| res ## _i += op1 ## _i * op2 ## _r; | |||||
| #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) | |||||
| #define MADD(res, op1, op2) /* res += op1 * conj(op2) */ \ | |||||
| res ## _r += op1 ## _r * op2 ## _r; \ | |||||
| res ## _r += op1 ## _i * op2 ## _i; \ | |||||
| res ## _i -= op1 ## _r * op2 ## _i; \ | |||||
| res ## _i += op1 ## _i * op2 ## _r; | |||||
| #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) | |||||
| #define MADD(res, op1, op2) /* res += conj(op1) * op2 */ \ | |||||
| res ## _r += op1 ## _r * op2 ## _r; \ | |||||
| res ## _r += op1 ## _i * op2 ## _i; \ | |||||
| res ## _i += op1 ## _r * op2 ## _i; \ | |||||
| res ## _i -= op1 ## _i * op2 ## _r; | |||||
| #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| #define MADD(res, op1, op2) /* res += conj(op1) * conj(op2) */ \ | |||||
| res ## _r += op1 ## _r * op2 ## _r; \ | |||||
| res ## _r -= op1 ## _i * op2 ## _i; \ | |||||
| res ## _i -= op1 ## _r * op2 ## _i; \ | |||||
| res ## _i -= op1 ## _i * op2 ## _r; | |||||
| #endif | |||||
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha_r, FLOAT alpha_i,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc | |||||
| , BLASLONG offset | |||||
| ) | |||||
| { | |||||
| BLASLONG i,j,k; | |||||
| FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; | |||||
| FLOAT res00_r, res01_r, res02_r, res03_r; | |||||
| FLOAT res00_i, res01_i, res02_i, res03_i; | |||||
| FLOAT res10_r, res11_r, res12_r, res13_r; | |||||
| FLOAT res10_i, res11_i, res12_i, res13_i; | |||||
| FLOAT res20_r, res21_r, res22_r, res23_r; | |||||
| FLOAT res20_i, res21_i, res22_i, res23_i; | |||||
| FLOAT res30_r, res31_r, res32_r, res33_r; | |||||
| FLOAT res30_i, res31_i, res32_i, res33_i; | |||||
| FLOAT a0_r, a1_r; | |||||
| FLOAT a0_i, a1_i; | |||||
| FLOAT b0_r, b1_r, b2_r, b3_r; | |||||
| FLOAT b0_i, b1_i, b2_i, b3_i; | |||||
| BLASLONG off, temp; | |||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||||
| off = -offset; | |||||
| #endif | |||||
| for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops | |||||
| { | |||||
| C0 = C; | |||||
| C1 = C0+2*ldc; | |||||
| C2 = C1+2*ldc; | |||||
| C3 = C2+2*ldc; | |||||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||||
| off = offset; | |||||
| #endif | |||||
| ptrba = ba; | |||||
| for (i=0; i<bm/4; i+=1) // do blocks of 4x4 | |||||
| { | |||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| ptrbb = bb; | |||||
| #else | |||||
| ptrba += off*4*2; // number of values in A | |||||
| ptrbb = bb + off*4*2; // number of values in B | |||||
| #endif | |||||
| res00_r = 0; | |||||
| res00_i = 0; | |||||
| res01_r = 0; | |||||
| res01_i = 0; | |||||
| res02_r = 0; | |||||
| res02_i = 0; | |||||
| res03_r = 0; | |||||
| res03_i = 0; | |||||
| res10_r = 0; | |||||
| res10_i = 0; | |||||
| res11_r = 0; | |||||
| res11_i = 0; | |||||
| res12_r = 0; | |||||
| res12_i = 0; | |||||
| res13_r = 0; | |||||
| res13_i = 0; | |||||
| res20_r = 0; | |||||
| res20_i = 0; | |||||
| res21_r = 0; | |||||
| res21_i = 0; | |||||
| res22_r = 0; | |||||
| res22_i = 0; | |||||
| res23_r = 0; | |||||
| res23_i = 0; | |||||
| res30_r = 0; | |||||
| res30_i = 0; | |||||
| res31_r = 0; | |||||
| res31_i = 0; | |||||
| res32_r = 0; | |||||
| res32_i = 0; | |||||
| res33_r = 0; | |||||
| res33_i = 0; | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||||
| temp = bk - off; | |||||
| #elif defined(LEFT) | |||||
| temp = off + 4; | |||||
| #else | |||||
| temp = off + 4; | |||||
| #endif | |||||
| for (k=0; k<temp; k++) | |||||
| { | |||||
| b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1]; | |||||
| b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1]; | |||||
| b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1]; | |||||
| b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1]; | |||||
| a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1]; | |||||
| MADD(res00, a0, b0); | |||||
| MADD(res10, a0, b1); | |||||
| MADD(res20, a0, b2); | |||||
| MADD(res30, a0, b3); | |||||
| a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1]; | |||||
| MADD(res01, a1, b0); | |||||
| MADD(res11, a1, b1); | |||||
| MADD(res21, a1, b2); | |||||
| MADD(res31, a1, b3); | |||||
| a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1]; | |||||
| MADD(res02, a0, b0); | |||||
| MADD(res12, a0, b1); | |||||
| MADD(res22, a0, b2); | |||||
| MADD(res32, a0, b3); | |||||
| a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1]; | |||||
| MADD(res03, a1, b0); | |||||
| MADD(res13, a1, b1); | |||||
| MADD(res23, a1, b2); | |||||
| MADD(res33, a1, b3); | |||||
| ptrba = ptrba+8; | |||||
| ptrbb = ptrbb+8; | |||||
| } | |||||
| MADD_ALPHA_N_STORE(C0, res00, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C0, res01, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C0, res02, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C0, res03, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C1, res10, alpha); | |||||
| C1 = C1 + 2; | |||||
| MADD_ALPHA_N_STORE(C1, res11, alpha); | |||||
| C1 = C1 + 2; | |||||
| MADD_ALPHA_N_STORE(C1, res12, alpha); | |||||
| C1 = C1 + 2; | |||||
| MADD_ALPHA_N_STORE(C1, res13, alpha); | |||||
| C1 = C1 + 2; | |||||
| MADD_ALPHA_N_STORE(C2, res20, alpha); | |||||
| C2 = C2 + 2; | |||||
| MADD_ALPHA_N_STORE(C2, res21, alpha); | |||||
| C2 = C2 + 2; | |||||
| MADD_ALPHA_N_STORE(C2, res22, alpha); | |||||
| C2 = C2 + 2; | |||||
| MADD_ALPHA_N_STORE(C2, res23, alpha); | |||||
| C2 = C2 + 2; | |||||
| MADD_ALPHA_N_STORE(C3, res30, alpha); | |||||
| C3 = C3 + 2; | |||||
| MADD_ALPHA_N_STORE(C3, res31, alpha); | |||||
| C3 = C3 + 2; | |||||
| MADD_ALPHA_N_STORE(C3, res32, alpha); | |||||
| C3 = C3 + 2; | |||||
| MADD_ALPHA_N_STORE(C3, res33, alpha); | |||||
| C3 = C3 + 2; | |||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = bk-off; | |||||
| #if defined(LEFT) | |||||
| temp = temp - 4; | |||||
| #else | |||||
| temp = temp - 4; | |||||
| #endif | |||||
| ptrba += temp*4*2; // number of values in A | |||||
| ptrbb += temp*4*2; // number of values in B | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 4; // number of values in A | |||||
| #endif | |||||
| } | |||||
| if ( bm & 2 ) // do any 2x4 loop | |||||
| { | |||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| ptrbb = bb; | |||||
| #else | |||||
| ptrba += off*2*2; | |||||
| ptrbb = bb + off*4*2; | |||||
| #endif | |||||
| res00_r = 0; | |||||
| res00_i = 0; | |||||
| res01_r = 0; | |||||
| res01_i = 0; | |||||
| res10_r = 0; | |||||
| res10_i = 0; | |||||
| res11_r = 0; | |||||
| res11_i = 0; | |||||
| res20_r = 0; | |||||
| res20_i = 0; | |||||
| res21_r = 0; | |||||
| res21_i = 0; | |||||
| res30_r = 0; | |||||
| res30_i = 0; | |||||
| res31_r = 0; | |||||
| res31_i = 0; | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||||
| temp = bk-off; | |||||
| #elif defined(LEFT) | |||||
| temp = off+2; // number of values in A | |||||
| #else | |||||
| temp = off+4; // number of values in B | |||||
| #endif | |||||
| for (k=0; k<temp; k++) | |||||
| { | |||||
| b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1]; | |||||
| b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1]; | |||||
| b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1]; | |||||
| b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1]; | |||||
| a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1]; | |||||
| MADD(res00, a0, b0); | |||||
| MADD(res10, a0, b1); | |||||
| MADD(res20, a0, b2); | |||||
| MADD(res30, a0, b3); | |||||
| a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1]; | |||||
| MADD(res01, a1, b0); | |||||
| MADD(res11, a1, b1); | |||||
| MADD(res21, a1, b2); | |||||
| MADD(res31, a1, b3); | |||||
| ptrba = ptrba+4; | |||||
| ptrbb = ptrbb+8; | |||||
| } | |||||
| MADD_ALPHA_N_STORE(C0, res00, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C0, res01, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C1, res10, alpha); | |||||
| C1 = C1 + 2; | |||||
| MADD_ALPHA_N_STORE(C1, res11, alpha); | |||||
| C1 = C1 + 2; | |||||
| MADD_ALPHA_N_STORE(C2, res20, alpha); | |||||
| C2 = C2 + 2; | |||||
| MADD_ALPHA_N_STORE(C2, res21, alpha); | |||||
| C2 = C2 + 2; | |||||
| MADD_ALPHA_N_STORE(C3, res30, alpha); | |||||
| C3 = C3 + 2; | |||||
| MADD_ALPHA_N_STORE(C3, res31, alpha); | |||||
| C3 = C3 + 2; | |||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = bk - off; | |||||
| #ifdef LEFT | |||||
| temp -= 2; // number of values in A | |||||
| #else | |||||
| temp -= 4; // number of values in B | |||||
| #endif | |||||
| ptrba += temp*2*2; | |||||
| ptrbb += temp*4*2; | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 2; // number of values in A | |||||
| #endif | |||||
| } | |||||
| if ( bm & 1 ) // do any 1x4 loop | |||||
| { | |||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| ptrbb = bb; | |||||
| #else | |||||
| ptrba += off*1*2; | |||||
| ptrbb = bb + off*4*2; | |||||
| #endif | |||||
| res00_r = 0; | |||||
| res00_i = 0; | |||||
| res10_r = 0; | |||||
| res10_i = 0; | |||||
| res20_r = 0; | |||||
| res20_i = 0; | |||||
| res30_r = 0; | |||||
| res30_i = 0; | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||||
| temp = bk-off; | |||||
| #elif defined(LEFT) | |||||
| temp = off+1; // number of values in A | |||||
| #else | |||||
| temp = off+4; // number of values in B | |||||
| #endif | |||||
| for (k=0; k<temp; k++) | |||||
| { | |||||
| b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1]; | |||||
| b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1]; | |||||
| b2_r = ptrbb[2*2+0]; b2_i = ptrbb[2*2+1]; | |||||
| b3_r = ptrbb[2*3+0]; b3_i = ptrbb[2*3+1]; | |||||
| a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1]; | |||||
| MADD(res00, a0, b0); | |||||
| MADD(res10, a0, b1); | |||||
| MADD(res20, a0, b2); | |||||
| MADD(res30, a0, b3); | |||||
| ptrba = ptrba+2; | |||||
| ptrbb = ptrbb+8; | |||||
| } | |||||
| MADD_ALPHA_N_STORE(C0, res00, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C1, res10, alpha); | |||||
| C1 = C1 + 2; | |||||
| MADD_ALPHA_N_STORE(C2, res20, alpha); | |||||
| C2 = C2 + 2; | |||||
| MADD_ALPHA_N_STORE(C3, res30, alpha); | |||||
| C3 = C3 + 2; | |||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = bk - off; | |||||
| #ifdef LEFT | |||||
| temp -= 1; // number of values in A | |||||
| #else | |||||
| temp -= 4; // number of values in B | |||||
| #endif | |||||
| ptrba += temp*1*2; | |||||
| ptrbb += temp*4*2; | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 1; // number of values in A | |||||
| #endif | |||||
| } | |||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||||
| off += 4; | |||||
| #endif | |||||
| k = (bk<<3); | |||||
| bb = bb+k; | |||||
| i = (ldc<<3); | |||||
| C = C+i; | |||||
| } | |||||
| for (j=0; j<(bn&2); j+=2) // do the Mx2 loops | |||||
| { | |||||
| C0 = C; | |||||
| C1 = C0+ldc*2; | |||||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||||
| off = offset; | |||||
| #endif | |||||
| ptrba = ba; | |||||
| for (i=0; i<bm/4; i+=1) // do blocks of 4x2 | |||||
| { | |||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| ptrbb = bb; | |||||
| #else | |||||
| ptrba += off*4*2; | |||||
| ptrbb = bb + off*2*2; | |||||
| #endif | |||||
| res00_r = 0; | |||||
| res00_i = 0; | |||||
| res01_r = 0; | |||||
| res01_i = 0; | |||||
| res02_r = 0; | |||||
| res02_i = 0; | |||||
| res03_r = 0; | |||||
| res03_i = 0; | |||||
| res10_r = 0; | |||||
| res10_i = 0; | |||||
| res11_r = 0; | |||||
| res11_i = 0; | |||||
| res12_r = 0; | |||||
| res12_i = 0; | |||||
| res13_r = 0; | |||||
| res13_i = 0; | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||||
| temp = bk-off; | |||||
| #elif defined(LEFT) | |||||
| temp = off+4; // number of values in A | |||||
| #else | |||||
| temp = off+2; // number of values in B | |||||
| #endif | |||||
| for (k=0; k<temp; k++) | |||||
| { | |||||
| b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1]; | |||||
| b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1]; | |||||
| a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1]; | |||||
| MADD(res00, a0, b0); | |||||
| MADD(res10, a0, b1); | |||||
| a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1]; | |||||
| MADD(res01, a1, b0); | |||||
| MADD(res11, a1, b1); | |||||
| a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1]; | |||||
| MADD(res02, a0, b0); | |||||
| MADD(res12, a0, b1); | |||||
| a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1]; | |||||
| MADD(res03, a1, b0); | |||||
| MADD(res13, a1, b1); | |||||
| ptrba = ptrba+8; | |||||
| ptrbb = ptrbb+4; | |||||
| } | |||||
| MADD_ALPHA_N_STORE(C0, res00, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C0, res01, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C0, res02, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C0, res03, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C1, res10, alpha); | |||||
| C1 = C1 + 2; | |||||
| MADD_ALPHA_N_STORE(C1, res11, alpha); | |||||
| C1 = C1 + 2; | |||||
| MADD_ALPHA_N_STORE(C1, res12, alpha); | |||||
| C1 = C1 + 2; | |||||
| MADD_ALPHA_N_STORE(C1, res13, alpha); | |||||
| C1 = C1 + 2; | |||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = bk - off; | |||||
| #ifdef LEFT | |||||
| temp -= 4; // number of values in A | |||||
| #else | |||||
| temp -= 2; // number of values in B | |||||
| #endif | |||||
| ptrba += temp*4*2; | |||||
| ptrbb += temp*2*2; | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 4; // number of values in A | |||||
| #endif | |||||
| } | |||||
| if ( bm & 2 ) // do any 2x2 loop | |||||
| { | |||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| ptrbb = bb; | |||||
| #else | |||||
| ptrba += off*2*2; | |||||
| ptrbb = bb + off*2*2; | |||||
| #endif | |||||
| res00_r = 0; | |||||
| res00_i = 0; | |||||
| res01_r = 0; | |||||
| res01_i = 0; | |||||
| res10_r = 0; | |||||
| res10_i = 0; | |||||
| res11_r = 0; | |||||
| res11_i = 0; | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||||
| temp = bk-off; | |||||
| #elif defined(LEFT) | |||||
| temp = off+2; // number of values in A | |||||
| #else | |||||
| temp = off+2; // number of values in B | |||||
| #endif | |||||
| for (k=0; k<temp; k++) | |||||
| { | |||||
| b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1]; | |||||
| b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1]; | |||||
| a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1]; | |||||
| MADD(res00, a0, b0); | |||||
| MADD(res10, a0, b1); | |||||
| a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1]; | |||||
| MADD(res01, a1, b0); | |||||
| MADD(res11, a1, b1); | |||||
| ptrba = ptrba+4; | |||||
| ptrbb = ptrbb+4; | |||||
| } | |||||
| MADD_ALPHA_N_STORE(C0, res00, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C0, res01, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C1, res10, alpha); | |||||
| C1 = C1 + 2; | |||||
| MADD_ALPHA_N_STORE(C1, res11, alpha); | |||||
| C1 = C1 + 2; | |||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = bk - off; | |||||
| #ifdef LEFT | |||||
| temp -= 2; // number of values in A | |||||
| #else | |||||
| temp -= 2; // number of values in B | |||||
| #endif | |||||
| ptrba += temp*2*2; | |||||
| ptrbb += temp*2*2; | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 2; // number of values in A | |||||
| #endif | |||||
| } | |||||
| if ( bm & 1 ) // do any 1x2 loop | |||||
| { | |||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| ptrbb = bb; | |||||
| #else | |||||
| ptrba += off*1*2; | |||||
| ptrbb = bb + off*2*2; | |||||
| #endif | |||||
| res00_r = 0; | |||||
| res00_i = 0; | |||||
| res10_r = 0; | |||||
| res10_i = 0; | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||||
| temp = bk-off; | |||||
| #elif defined(LEFT) | |||||
| temp = off+1; // number of values in A | |||||
| #else | |||||
| temp = off+2; // number of values in B | |||||
| #endif | |||||
| for (k=0; k<temp; k++) | |||||
| { | |||||
| b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1]; | |||||
| b1_r = ptrbb[2*1+0]; b1_i = ptrbb[2*1+1]; | |||||
| a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1]; | |||||
| MADD(res00, a0, b0); | |||||
| MADD(res10, a0, b1); | |||||
| ptrba = ptrba+2; | |||||
| ptrbb = ptrbb+4; | |||||
| } | |||||
| MADD_ALPHA_N_STORE(C0, res00, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C1, res10, alpha); | |||||
| C1 = C1 + 2; | |||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = bk - off; | |||||
| #ifdef LEFT | |||||
| temp -= 1; // number of values in A | |||||
| #else | |||||
| temp -= 2; // number of values in B | |||||
| #endif | |||||
| ptrba += temp*1*2; | |||||
| ptrbb += temp*2*2; | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 1; // number of values in A | |||||
| #endif | |||||
| } | |||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||||
| off += 2; | |||||
| #endif | |||||
| k = (bk<<2); | |||||
| bb = bb+k; | |||||
| i = (ldc<<2); | |||||
| C = C+i; | |||||
| } | |||||
| for (j=0; j<(bn&1); j+=1) // do the Mx1 loops | |||||
| { | |||||
| C0 = C; | |||||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||||
| off = offset; | |||||
| #endif | |||||
| ptrba = ba; | |||||
| for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops | |||||
| { | |||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| ptrbb = bb; | |||||
| #else | |||||
| ptrba += off*4*2; | |||||
| ptrbb = bb + off*1*2; | |||||
| #endif | |||||
| res00_r = 0; | |||||
| res00_i = 0; | |||||
| res01_r = 0; | |||||
| res01_i = 0; | |||||
| res02_r = 0; | |||||
| res02_i = 0; | |||||
| res03_r = 0; | |||||
| res03_i = 0; | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||||
| temp = bk-off; | |||||
| #elif defined(LEFT) | |||||
| temp = off+4; // number of values in A | |||||
| #else | |||||
| temp = off+1; // number of values in B | |||||
| #endif | |||||
| for (k=0; k<temp; k++) | |||||
| { | |||||
| b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1]; | |||||
| a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1]; | |||||
| MADD(res00, a0, b0); | |||||
| a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1]; | |||||
| MADD(res01, a1, b0); | |||||
| a0_r = ptrba[2*2+0]; a0_i = ptrba[2*2+1]; | |||||
| MADD(res02, a0, b0); | |||||
| a1_r = ptrba[2*3+0]; a1_i = ptrba[2*3+1]; | |||||
| MADD(res03, a1, b0); | |||||
| ptrba = ptrba+8; | |||||
| ptrbb = ptrbb+2; | |||||
| } | |||||
| MADD_ALPHA_N_STORE(C0, res00, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C0, res01, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C0, res02, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C0, res03, alpha); | |||||
| C0 = C0 + 2; | |||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = bk - off; | |||||
| #ifdef LEFT | |||||
| temp -= 4; // number of values in A | |||||
| #else | |||||
| temp -= 1; // number of values in B | |||||
| #endif | |||||
| ptrba += temp*4*2; | |||||
| ptrbb += temp*1*2; | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 4; // number of values in A | |||||
| #endif | |||||
| } | |||||
| if ( bm & 2 ) // do any 2x1 loop | |||||
| { | |||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| ptrbb = bb; | |||||
| #else | |||||
| ptrba += off*2*2; | |||||
| ptrbb = bb + off*1*2; | |||||
| #endif | |||||
| res00_r = 0; | |||||
| res00_i = 0; | |||||
| res01_r = 0; | |||||
| res01_i = 0; | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||||
| temp = bk-off; | |||||
| #elif defined(LEFT) | |||||
| temp = off+2; // number of values in A | |||||
| #else | |||||
| temp = off+1; // number of values in B | |||||
| #endif | |||||
| for (k=0; k<temp; k++) | |||||
| { | |||||
| b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1]; | |||||
| a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1]; | |||||
| MADD(res00, a0, b0); | |||||
| a1_r = ptrba[2*1+0]; a1_i = ptrba[2*1+1]; | |||||
| MADD(res01, a1, b0); | |||||
| ptrba = ptrba+4; | |||||
| ptrbb = ptrbb+2; | |||||
| } | |||||
| MADD_ALPHA_N_STORE(C0, res00, alpha); | |||||
| C0 = C0 + 2; | |||||
| MADD_ALPHA_N_STORE(C0, res01, alpha); | |||||
| C0 = C0 + 2; | |||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = bk - off; | |||||
| #ifdef LEFT | |||||
| temp -= 2; // number of values in A | |||||
| #else | |||||
| temp -= 1; // number of values in B | |||||
| #endif | |||||
| ptrba += temp*2*2; | |||||
| ptrbb += temp*1*2; | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 2; // number of values in A | |||||
| #endif | |||||
| } | |||||
| if ( bm & 1 ) // do any 1x1 loop | |||||
| { | |||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| ptrbb = bb; | |||||
| #else | |||||
| ptrba += off*1*2; | |||||
| ptrbb = bb + off*1*2; | |||||
| #endif | |||||
| res00_r = 0; | |||||
| res00_i = 0; | |||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||||
| temp = bk-off; | |||||
| #elif defined(LEFT) | |||||
| temp = off+1; // number of values in A | |||||
| #else | |||||
| temp = off+1; // number of values in B | |||||
| #endif | |||||
| for (k=0; k<temp; k++) | |||||
| { | |||||
| b0_r = ptrbb[2*0+0]; b0_i = ptrbb[2*0+1]; | |||||
| a0_r = ptrba[2*0+0]; a0_i = ptrba[2*0+1]; | |||||
| MADD(res00, a0, b0); | |||||
| ptrba = ptrba+2; | |||||
| ptrbb = ptrbb+2; | |||||
| } | |||||
| MADD_ALPHA_N_STORE(C0, res00, alpha); | |||||
| C0 = C0 + 2; | |||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||||
| temp = bk - off; | |||||
| #ifdef LEFT | |||||
| temp -= 1; // number of values in A | |||||
| #else | |||||
| temp -= 1; // number of values in B | |||||
| #endif | |||||
| ptrba += temp*1*2; | |||||
| ptrbb += temp*1*2; | |||||
| #endif | |||||
| #ifdef LEFT | |||||
| off += 1; // number of values in A | |||||
| #endif | |||||
| } | |||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||||
| off += 1; | |||||
| #endif | |||||
| k = (bk<<1); | |||||
| bb = bb+k; | |||||
| i = (ldc<<1); | |||||
| C = C+i; | |||||
| } | |||||
| return 0; | |||||
| } | |||||
| @@ -2214,6 +2214,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define ZGEMM_DEFAULT_R 4096 | #define ZGEMM_DEFAULT_R 4096 | ||||
| #define SYMV_P 16 | |||||
| #endif | |||||
| #if defined(CORTEXA57) /* Tuning parameters compiled in only when CORTEXA57 is defined. */ | |||||
| #define SNUMOPT 2 | |||||
| #define DNUMOPT 2 | |||||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL /* Low 14 bits set. NOTE(review): presumably a buffer alignment mask - confirm against its users. */ | |||||
| #define SGEMM_DEFAULT_UNROLL_M 4 /* The S/D/C/Z UNROLL_M and UNROLL_N values in this block are all 4. */ | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||||
| #define SGEMM_DEFAULT_P 128 /* NOTE(review): the P/Q/R groups look like per-precision blocking sizes - confirm where they are consumed. */ | |||||
| #define DGEMM_DEFAULT_P 256 | |||||
| #define CGEMM_DEFAULT_P 256 | |||||
| #define ZGEMM_DEFAULT_P 128 | |||||
| #define SGEMM_DEFAULT_Q 240 | |||||
| #define DGEMM_DEFAULT_Q 1024 | |||||
| #define CGEMM_DEFAULT_Q 1024 | |||||
| #define ZGEMM_DEFAULT_Q 512 | |||||
| #define SGEMM_DEFAULT_R 12288 | |||||
| #define DGEMM_DEFAULT_R 4096 | |||||
| #define CGEMM_DEFAULT_R 4096 | |||||
| #define ZGEMM_DEFAULT_R 2048 | |||||
| #define SYMV_P 16 | #define SYMV_P 16 | ||||
| #endif | #endif | ||||