Add new targets for ARM64tags/v0.2.20^2
@@ -9,3 +9,17 @@ CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 | |||||
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 | FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 | ||||
endif | endif | ||||
ifeq ($(CORE), VULCAN) | |||||
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||||
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||||
endif | |||||
ifeq ($(CORE), THUNDERX) | |||||
CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx | |||||
FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx | |||||
endif | |||||
ifeq ($(CORE), THUNDERX2T99) | |||||
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||||
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||||
endif |
@@ -80,4 +80,7 @@ ARMV5 | |||||
8.ARM 64-bit CPU: | 8.ARM 64-bit CPU: | ||||
ARMV8 | ARMV8 | ||||
CORTEXA57 | CORTEXA57 | ||||
VULCAN | |||||
THUNDERX | |||||
THUNDERX2T99 | |||||
@@ -2193,7 +2193,7 @@ | |||||
#endif | #endif | ||||
#ifndef ASSEMBLER | #ifndef ASSEMBLER | ||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) | |||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) | |||||
extern BLASLONG gemm_offset_a; | extern BLASLONG gemm_offset_a; | ||||
extern BLASLONG gemm_offset_b; | extern BLASLONG gemm_offset_b; | ||||
extern BLASLONG sgemm_p; | extern BLASLONG sgemm_p; | ||||
@@ -30,17 +30,26 @@ | |||||
#define CPU_UNKNOWN 0 | #define CPU_UNKNOWN 0 | ||||
#define CPU_ARMV8 1 | #define CPU_ARMV8 1 | ||||
#define CPU_CORTEXA57 2 | #define CPU_CORTEXA57 2 | ||||
#define CPU_VULCAN 3 | |||||
#define CPU_THUNDERX 4 | |||||
#define CPU_THUNDERX2T99 5 | |||||
static char *cpuname[] = { | static char *cpuname[] = { | ||||
"UNKNOWN", | "UNKNOWN", | ||||
"ARMV8" , | "ARMV8" , | ||||
"CORTEXA57" | |||||
"CORTEXA57", | |||||
"VULCAN", | |||||
"THUNDERX", | |||||
"THUNDERX2T99" | |||||
}; | }; | ||||
static char *cpuname_lower[] = { | static char *cpuname_lower[] = { | ||||
"unknown", | "unknown", | ||||
"armv8" , | "armv8" , | ||||
"cortexa57" | |||||
"cortexa57", | |||||
"vulcan", | |||||
"thunderx", | |||||
"thunderx2t99" | |||||
}; | }; | ||||
int get_feature(char *search) | int get_feature(char *search) | ||||
@@ -85,25 +94,34 @@ int detect(void) | |||||
#ifdef linux | #ifdef linux | ||||
FILE *infile; | FILE *infile; | ||||
char buffer[512], *p; | |||||
p = (char *) NULL ; | |||||
infile = fopen("/proc/cpuinfo", "r"); | |||||
while (fgets(buffer, sizeof(buffer), infile)) | |||||
{ | |||||
char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; | |||||
p = (char *) NULL ; | |||||
if (!strncmp("CPU part", buffer, 8)) | |||||
{ | |||||
p = strchr(buffer, ':') + 2; | |||||
infile = fopen("/proc/cpuinfo", "r"); | |||||
while (fgets(buffer, sizeof(buffer), infile)) { | |||||
if ((cpu_part != NULL) && (cpu_implementer != NULL)) { | |||||
break; | break; | ||||
} | } | ||||
if ((cpu_part == NULL) && !strncmp("CPU part", buffer, 8)) { | |||||
cpu_part = strchr(buffer, ':') + 2; | |||||
cpu_part = strdup(cpu_part); | |||||
} else if ((cpu_implementer == NULL) && !strncmp("CPU implementer", buffer, 15)) { | |||||
cpu_implementer = strchr(buffer, ':') + 2; | |||||
cpu_implementer = strdup(cpu_implementer); | |||||
} | |||||
} | } | ||||
fclose(infile); | fclose(infile); | ||||
if(p != NULL) { | |||||
if (strstr(p, "0xd07")) { | |||||
return CPU_CORTEXA57; | |||||
} | |||||
if(cpu_part != NULL && cpu_implementer != NULL) { | |||||
if (strstr(cpu_part, "0xd07") && strstr(cpu_implementer, "0x41")) | |||||
return CPU_CORTEXA57; | |||||
else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42")) | |||||
return CPU_VULCAN; | |||||
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) | |||||
return CPU_THUNDERX; | |||||
else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */ | |||||
return CPU_THUNDERX2T99; | |||||
} | } | ||||
p = (char *) NULL ; | p = (char *) NULL ; | ||||
@@ -176,6 +194,28 @@ void get_cpuconfig(void) | |||||
printf("#define L2_ASSOCIATIVE 4\n"); | printf("#define L2_ASSOCIATIVE 4\n"); | ||||
break; | break; | ||||
case CPU_VULCAN: | |||||
printf("#define VULCAN \n"); | |||||
printf("#define HAVE_VFP \n"); | |||||
printf("#define HAVE_VFPV3 \n"); | |||||
printf("#define HAVE_NEON \n"); | |||||
printf("#define HAVE_VFPV4 \n"); | |||||
printf("#define L1_CODE_SIZE 32768 \n"); | |||||
printf("#define L1_CODE_LINESIZE 64 \n"); | |||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||||
printf("#define L1_DATA_SIZE 32768 \n"); | |||||
printf("#define L1_DATA_LINESIZE 64 \n"); | |||||
printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||||
printf("#define L2_SIZE 262144 \n"); | |||||
printf("#define L2_LINESIZE 64 \n"); | |||||
printf("#define L2_ASSOCIATIVE 8 \n"); | |||||
printf("#define L3_SIZE 33554432 \n"); | |||||
printf("#define L3_LINESIZE 64 \n"); | |||||
printf("#define L3_ASSOCIATIVE 32 \n"); | |||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||||
printf("#define DTB_SIZE 4096 \n"); | |||||
break; | |||||
case CPU_CORTEXA57: | case CPU_CORTEXA57: | ||||
printf("#define CORTEXA57\n"); | printf("#define CORTEXA57\n"); | ||||
printf("#define HAVE_VFP\n"); | printf("#define HAVE_VFP\n"); | ||||
@@ -191,8 +231,42 @@ void get_cpuconfig(void) | |||||
printf("#define L2_SIZE 2097152\n"); | printf("#define L2_SIZE 2097152\n"); | ||||
printf("#define L2_LINESIZE 64\n"); | printf("#define L2_LINESIZE 64\n"); | ||||
printf("#define L2_ASSOCIATIVE 16\n"); | printf("#define L2_ASSOCIATIVE 16\n"); | ||||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
printf("#define DTB_SIZE 4096\n"); | |||||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
printf("#define DTB_SIZE 4096\n"); | |||||
break; | |||||
case CPU_THUNDERX: | |||||
printf("#define ARMV8\n"); | |||||
printf("#define THUNDERX\n"); | |||||
printf("#define L1_DATA_SIZE 32768\n"); | |||||
printf("#define L1_DATA_LINESIZE 128\n"); | |||||
printf("#define L2_SIZE 16777216\n"); | |||||
printf("#define L2_LINESIZE 128\n"); | |||||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||||
printf("#define DTB_SIZE 4096\n"); | |||||
printf("#define L2_ASSOCIATIVE 16\n"); | |||||
break; | |||||
case CPU_THUNDERX2T99: | |||||
printf("#define VULCAN \n"); | |||||
printf("#define HAVE_VFP \n"); | |||||
printf("#define HAVE_VFPV3 \n"); | |||||
printf("#define HAVE_NEON \n"); | |||||
printf("#define HAVE_VFPV4 \n"); | |||||
printf("#define L1_CODE_SIZE 32768 \n"); | |||||
printf("#define L1_CODE_LINESIZE 64 \n"); | |||||
printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||||
printf("#define L1_DATA_SIZE 32768 \n"); | |||||
printf("#define L1_DATA_LINESIZE 64 \n"); | |||||
printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||||
printf("#define L2_SIZE 262144 \n"); | |||||
printf("#define L2_LINESIZE 64 \n"); | |||||
printf("#define L2_ASSOCIATIVE 8 \n"); | |||||
printf("#define L3_SIZE 33554432 \n"); | |||||
printf("#define L3_LINESIZE 64 \n"); | |||||
printf("#define L3_ASSOCIATIVE 32 \n"); | |||||
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||||
printf("#define DTB_SIZE 4096 \n"); | |||||
break; | break; | ||||
} | } | ||||
} | } | ||||
@@ -995,7 +995,7 @@ void *blas_memory_alloc(int procpos){ | |||||
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); | if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); | ||||
#endif | #endif | ||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) | |||||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) | |||||
#ifndef DYNAMIC_ARCH | #ifndef DYNAMIC_ARCH | ||||
blas_set_parameter(); | blas_set_parameter(); | ||||
#endif | #endif | ||||
@@ -727,3 +727,26 @@ void blas_set_parameter(void){ | |||||
} | } | ||||
#endif | #endif | ||||
#if defined(ARCH_ARM64) | |||||
#if defined(VULCAN) || defined(THUNDERX2T99) | |||||
unsigned long dgemm_prefetch_size_a; | |||||
unsigned long dgemm_prefetch_size_b; | |||||
unsigned long dgemm_prefetch_size_c; | |||||
#endif | |||||
void blas_set_parameter(void) | |||||
{ | |||||
#if defined(VULCAN) || defined(THUNDERX2T99) | |||||
dgemm_p = 160; | |||||
dgemm_q = 128; | |||||
dgemm_r = 4096; | |||||
dgemm_prefetch_size_a = 3584; | |||||
dgemm_prefetch_size_b = 512; | |||||
dgemm_prefetch_size_c = 128; | |||||
#endif | |||||
} | |||||
#endif |
@@ -884,7 +884,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#ifdef FORCE_CORTEXA57 | #ifdef FORCE_CORTEXA57 | ||||
#define FORCE | #define FORCE | ||||
#define ARCHITECTURE "ARM64" | #define ARCHITECTURE "ARM64" | ||||
#define SUBARCHITECTURE "ARMV8" | |||||
#define SUBARCHITECTURE "CORTEXA57" | |||||
#define SUBDIRNAME "arm64" | #define SUBDIRNAME "arm64" | ||||
#define ARCHCONFIG "-DCORTEXA57 " \ | #define ARCHCONFIG "-DCORTEXA57 " \ | ||||
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ | "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ | ||||
@@ -897,6 +897,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#else | #else | ||||
#endif | #endif | ||||
#ifdef FORCE_VULCAN | |||||
#define FORCE | |||||
#define ARCHITECTURE "ARM64" | |||||
#define SUBARCHITECTURE "VULCAN" | |||||
#define SUBDIRNAME "arm64" | |||||
#define ARCHCONFIG "-DVULCAN " \ | |||||
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ | |||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ | |||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||||
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ | |||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" | |||||
#define LIBNAME "vulcan" | |||||
#define CORENAME "VULCAN" | |||||
#else | |||||
#endif | |||||
#ifdef FORCE_THUNDERX | |||||
#define FORCE | |||||
#define ARCHITECTURE "ARM64" | |||||
#define SUBARCHITECTURE "THUNDERX" | |||||
#define SUBDIRNAME "arm64" | |||||
#define ARCHCONFIG "-DTHUNDERX " \ | |||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ | |||||
"-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \ | |||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " | |||||
#define LIBNAME "thunderx" | |||||
#define CORENAME "THUNDERX" | |||||
#else | |||||
#endif | |||||
#ifdef FORCE_THUNDERX2T99 | |||||
#define FORCE | |||||
#define ARCHITECTURE "ARM64" | |||||
#define SUBARCHITECTURE "THUNDERX2T99" | |||||
#define SUBDIRNAME "arm64" | |||||
#define ARCHCONFIG "-DTHUNDERX2T99 " \ | |||||
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ | |||||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ | |||||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||||
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ | |||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" | |||||
#define LIBNAME "thunderx2t99" | |||||
#define CORENAME "THUNDERX2T99" | |||||
#else | |||||
#endif | |||||
#ifndef FORCE | #ifndef FORCE | ||||
#if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ | #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ | ||||
@@ -75,14 +75,29 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | ||||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | ||||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | ||||
ifeq ($(DGEMM_UNROLL_M), 8) | |||||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||||
else | |||||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | ||||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | ||||
endif | |||||
DGEMMINCOPYOBJ = dgemm_incopy.o | DGEMMINCOPYOBJ = dgemm_incopy.o | ||||
DGEMMITCOPYOBJ = dgemm_itcopy.o | DGEMMITCOPYOBJ = dgemm_itcopy.o | ||||
endif | endif | ||||
ifeq ($(DGEMM_UNROLL_N), 4) | |||||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||||
else | |||||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | ||||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | ||||
endif | |||||
DGEMMONCOPYOBJ = dgemm_oncopy.o | DGEMMONCOPYOBJ = dgemm_oncopy.o | ||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o | DGEMMOTCOPYOBJ = dgemm_otcopy.o | ||||
@@ -0,0 +1,6 @@ | |||||
include $(KERNELDIR)/KERNEL.ARMV8 | |||||
SDOTKERNEL=dot-thunderx.c | |||||
DDOTKERNEL=ddot-thunderx.c | |||||
DAXPYKERNEL=daxpy-thunderx.c | |||||
@@ -0,0 +1,2 @@ | |||||
include $(KERNELDIR)/KERNEL.VULCAN | |||||
@@ -0,0 +1,4 @@ | |||||
include $(KERNELDIR)/KERNEL.CORTEXA57 | |||||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_vulcan.S | |||||
@@ -0,0 +1,151 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2014, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#include <arm_neon.h> | |||||
#define prefetch(a) __asm__("prfm PLDL1STRM, [%0]"::"r"(a):"memory"); | |||||
//#define prefetch(a) | |||||
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
{ | |||||
BLASLONG register i = 0; | |||||
double a = *alpha; | |||||
#if 0 | |||||
prefetch(x + 128/sizeof(*x)); | |||||
prefetch(y + 128/sizeof(*y)); | |||||
#endif | |||||
prefetch(x + 2*128/sizeof(*x)); | |||||
prefetch(y + 2*128/sizeof(*y)); | |||||
prefetch(x + 3*128/sizeof(*x)); | |||||
prefetch(y + 3*128/sizeof(*y)); | |||||
prefetch(x + 4*128/sizeof(*x)); | |||||
prefetch(y + 4*128/sizeof(*y)); | |||||
while(i < n) | |||||
{ | |||||
double y0, y1, y2, y3; | |||||
double y4, y5, y6, y7; | |||||
double *xx; | |||||
double *yy; | |||||
y0 = a * x[0] + y[0]; | |||||
y1 = a * x[1] + y[1]; | |||||
y2 = a * x[2] + y[2]; | |||||
y3 = a * x[3] + y[3]; | |||||
y4 = a * x[4] + y[4]; | |||||
y5 = a * x[5] + y[5]; | |||||
y6 = a * x[6] + y[6]; | |||||
y7 = a * x[7] + y[7]; | |||||
asm("":"+w"(y0),"+w"(y1),"+w"(y2),"+w"(y3),"+w"(y4),"+w"(y5),"+w"(y6),"+w"(y7)); | |||||
y[0] = y0; | |||||
y[1] = y1; | |||||
y[2] = y2; | |||||
y[3] = y3; | |||||
y[4] = y4; | |||||
y[5] = y5; | |||||
y[6] = y6; | |||||
y[7] = y7; | |||||
xx = (x + 4*128/sizeof(*x)); | |||||
yy = (y + 4*128/sizeof(*y)); | |||||
asm("":"+r"(yy)::"memory"); | |||||
prefetch(xx); | |||||
prefetch(yy); | |||||
y += 8; | |||||
x += 8; | |||||
i += 8 ; | |||||
} | |||||
} | |||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||||
{ | |||||
BLASLONG i=0; | |||||
BLASLONG ix=0,iy=0; | |||||
if ( n <= 0 ) return(0); | |||||
if ( (inc_x == 1) && (inc_y == 1) ) | |||||
{ | |||||
BLASLONG n1 = n & -32; | |||||
if ( n1 ) | |||||
daxpy_kernel_8(n1, x, y , &da ); | |||||
i = n1; | |||||
while(i < n) | |||||
{ | |||||
y[i] += da * x[i] ; | |||||
i++ ; | |||||
} | |||||
return(0); | |||||
} | |||||
BLASLONG n1 = n & -4; | |||||
while(i < n1) | |||||
{ | |||||
FLOAT m1 = da * x[ix] ; | |||||
FLOAT m2 = da * x[ix+inc_x] ; | |||||
FLOAT m3 = da * x[ix+2*inc_x] ; | |||||
FLOAT m4 = da * x[ix+3*inc_x] ; | |||||
y[iy] += m1 ; | |||||
y[iy+inc_y] += m2 ; | |||||
y[iy+2*inc_y] += m3 ; | |||||
y[iy+3*inc_y] += m4 ; | |||||
ix += inc_x*4 ; | |||||
iy += inc_y*4 ; | |||||
i+=4 ; | |||||
} | |||||
while(i < n) | |||||
{ | |||||
y[iy] += da * x[ix] ; | |||||
ix += inc_x ; | |||||
iy += inc_y ; | |||||
i++ ; | |||||
} | |||||
return(0); | |||||
} | |||||
@@ -0,0 +1,119 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2014, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#include <arm_neon.h> | |||||
#define prefetch(a) __asm__("prfm PLDL1STRM, [%0]"::"r"(a):"memory"); | |||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
{ | |||||
BLASLONG i=0; | |||||
BLASLONG ix=0,iy=0; | |||||
FLOAT dot = 0.0 ; | |||||
if ( n < 0 ) return(dot); | |||||
if ( (inc_x == 1) && (inc_y == 1) ) | |||||
{ | |||||
float64x2_t vdot0 = {0.0, 0.0}; | |||||
float64x2_t vdot1 = {0.0, 0.0}; | |||||
float64x2_t vdot2 = {0.0, 0.0}; | |||||
float64x2_t vdot3 = {0.0, 0.0}; | |||||
float64x2_t *vx = (float64x2_t*)x; | |||||
float64x2_t *vy = (float64x2_t*)y; | |||||
#if 0 | |||||
prefetch(x + 128/sizeof(*x)); | |||||
prefetch(y + 128/sizeof(*y)); | |||||
#endif | |||||
prefetch(x + 2*128/sizeof(*x)); | |||||
prefetch(y + 2*128/sizeof(*y)); | |||||
prefetch(x + 3*128/sizeof(*x)); | |||||
prefetch(y + 3*128/sizeof(*y)); | |||||
int n1 = n&-8; | |||||
while(i < n1) | |||||
{ | |||||
#if 0 | |||||
vdot0 = vfmaq_f64 (vdot0, | |||||
vy[0], | |||||
vx[0]); | |||||
vdot1 = vfmaq_f64 (vdot1, | |||||
vy[1], | |||||
vx[1]); | |||||
vdot2 = vfmaq_f64 (vdot2, | |||||
vy[2], | |||||
vx[2]); | |||||
vdot3 = vfmaq_f64 (vdot3, | |||||
vy[3], | |||||
vx[3]); | |||||
#else | |||||
vdot0 = vy[0] * vx[0] + vdot0; | |||||
vdot1 = vy[1] * vx[1] + vdot1; | |||||
vdot2 = vy[2] * vx[2] + vdot2; | |||||
vdot3 = vy[3] * vx[3] + vdot3; | |||||
#endif | |||||
vy += 4; | |||||
vx += 4; | |||||
i += 8; | |||||
prefetch(vx + 3*128/sizeof(*x)); | |||||
prefetch(vy + 3*128/sizeof(*y)); | |||||
} | |||||
dot = vaddvq_f64 (vdot0 + vdot1); | |||||
dot += vaddvq_f64 (vdot2 + vdot3); | |||||
i = n1; | |||||
while(i < n) | |||||
{ | |||||
dot += y[i] * x[i] ; | |||||
i++ ; | |||||
} | |||||
return(dot); | |||||
} | |||||
while(i < n) | |||||
{ | |||||
dot += y[iy] * x[ix] ; | |||||
ix += inc_x ; | |||||
iy += inc_y ; | |||||
i++ ; | |||||
} | |||||
return(dot); | |||||
} | |||||
@@ -0,0 +1,340 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2016, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#define M x0 | |||||
#define N x1 | |||||
#define A00 x2 | |||||
#define LDA x3 | |||||
#define B00 x4 | |||||
#define A01 x5 | |||||
#define A02 x6 | |||||
#define A03 x7 | |||||
#define A04 x8 | |||||
#define I x9 | |||||
#define J x10 | |||||
#define TEMP1 x11 | |||||
#define TEMP2 x12 | |||||
#define A_PREFETCH 2560 | |||||
/************************************************************************************** | |||||
* Macro definitions | |||||
**************************************************************************************/ | |||||
.macro SAVE_REGS | |||||
add sp, sp, #-(11 * 16) | |||||
stp d8, d9, [sp, #(0 * 16)] | |||||
stp d10, d11, [sp, #(1 * 16)] | |||||
stp d12, d13, [sp, #(2 * 16)] | |||||
stp d14, d15, [sp, #(3 * 16)] | |||||
stp d16, d17, [sp, #(4 * 16)] | |||||
stp x18, x19, [sp, #(5 * 16)] | |||||
stp x20, x21, [sp, #(6 * 16)] | |||||
stp x22, x23, [sp, #(7 * 16)] | |||||
stp x24, x25, [sp, #(8 * 16)] | |||||
stp x26, x27, [sp, #(9 * 16)] | |||||
str x28, [sp, #(10 * 16)] | |||||
.endm | |||||
.macro RESTORE_REGS | |||||
ldp d8, d9, [sp, #(0 * 16)] | |||||
ldp d10, d11, [sp, #(1 * 16)] | |||||
ldp d12, d13, [sp, #(2 * 16)] | |||||
ldp d14, d15, [sp, #(3 * 16)] | |||||
ldp d16, d17, [sp, #(4 * 16)] | |||||
ldp x18, x19, [sp, #(5 * 16)] | |||||
ldp x20, x21, [sp, #(6 * 16)] | |||||
ldp x22, x23, [sp, #(7 * 16)] | |||||
ldp x24, x25, [sp, #(8 * 16)] | |||||
ldp x26, x27, [sp, #(9 * 16)] | |||||
ldr x28, [sp, #(10 * 16)] | |||||
add sp, sp, #(11*16) | |||||
.endm | |||||
.macro COPY4x4 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
ldp q0, q1, [A01], #32 | |||||
ins v8.d[0], v0.d[0] | |||||
ins v10.d[0], v0.d[1] | |||||
ins v12.d[0], v1.d[0] | |||||
ins v14.d[0], v1.d[1] | |||||
ldp q2, q3, [A02], #32 | |||||
ins v8.d[1], v2.d[0] | |||||
ins v10.d[1], v2.d[1] | |||||
ins v12.d[1], v3.d[0] | |||||
ins v14.d[1], v3.d[1] | |||||
ldp q4, q5, [A03], #32 | |||||
ins v9.d[0], v4.d[0] | |||||
ins v11.d[0], v4.d[1] | |||||
ins v13.d[0], v5.d[0] | |||||
ins v15.d[0], v5.d[1] | |||||
ldp q6, q7, [A04], #32 | |||||
ins v9.d[1], v6.d[0] | |||||
ins v11.d[1], v6.d[1] | |||||
ins v13.d[1], v7.d[0] | |||||
ins v15.d[1], v7.d[1] | |||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] | |||||
add B00, B00, #64 | |||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B00] | |||||
add B00, B00, #64 | |||||
.endm | |||||
.macro COPY1x4 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
ldr d0, [A01], #8 | |||||
ldr d1, [A02], #8 | |||||
ldr d2, [A03], #8 | |||||
ldr d3, [A04], #8 | |||||
st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B00] | |||||
add B00, B00, #32 | |||||
.endm | |||||
.macro COPY4x2 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
ldp q0, q1, [A01], #32 | |||||
ins v8.d[0], v0.d[0] | |||||
ins v9.d[0], v0.d[1] | |||||
ins v10.d[0], v1.d[0] | |||||
ins v11.d[0], v1.d[1] | |||||
ldp q2, q3, [A02], #32 | |||||
ins v8.d[1], v2.d[0] | |||||
ins v9.d[1], v2.d[1] | |||||
ins v10.d[1], v3.d[0] | |||||
ins v11.d[1], v3.d[1] | |||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] | |||||
add B00, B00, #64 | |||||
.endm | |||||
.macro COPY1x2 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
ldr d0, [A01], #8 | |||||
ldr d1, [A02], #8 | |||||
stp d0, d1, [B00] | |||||
add B00, B00, #16 | |||||
.endm | |||||
.macro COPY4x1 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
ldp q0, q1, [A01], #32 | |||||
stp q0, q1, [B00], #32 | |||||
.endm | |||||
.macro COPY1x1 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
ldr d0, [A01], #8 | |||||
str d0, [B00], #8 | |||||
.endm | |||||
/************************************************************************************** | |||||
* End of macro definitions | |||||
**************************************************************************************/ | |||||
PROLOGUE | |||||
.align 5 | |||||
SAVE_REGS | |||||
lsl LDA, LDA, #3 // LDA = LDA * SIZE | |||||
dgemm_ncopy_L4_BEGIN: | |||||
asr J, N, #2 // J = N / 4 | |||||
cmp J, #0 | |||||
ble dgemm_ncopy_L2_BEGIN | |||||
.align 5 | |||||
dgemm_ncopy_L4_M4_BEGIN: | |||||
mov A01, A00 | |||||
add A02, A01, LDA | |||||
add A03, A02, LDA | |||||
add A04, A03, LDA | |||||
add A00, A04, LDA | |||||
asr I, M, #2 // I = M / 4 | |||||
cmp I, #0 | |||||
ble dgemm_ncopy_L4_M4_40 | |||||
.align 5 | |||||
dgemm_ncopy_L4_M4_20: | |||||
COPY4x4 | |||||
subs I , I , #1 | |||||
bne dgemm_ncopy_L4_M4_20 | |||||
dgemm_ncopy_L4_M4_40: | |||||
and I, M , #3 | |||||
cmp I, #0 | |||||
ble dgemm_ncopy_L4_M4_END | |||||
.align 5 | |||||
dgemm_ncopy_L4_M4_60: | |||||
COPY1x4 | |||||
subs I , I , #1 | |||||
bne dgemm_ncopy_L4_M4_60 | |||||
dgemm_ncopy_L4_M4_END: | |||||
subs J , J, #1 // j-- | |||||
bne dgemm_ncopy_L4_M4_BEGIN | |||||
/*********************************************************************************************/ | |||||
dgemm_ncopy_L2_BEGIN: | |||||
tst N, #3 | |||||
ble dgemm_ncopy_L999 | |||||
tst N, #2 | |||||
ble dgemm_ncopy_L1_BEGIN | |||||
dgemm_ncopy_L2_M4_BEGIN: | |||||
mov A01, A00 | |||||
add A02, A01, LDA | |||||
add A00, A02, LDA | |||||
asr I, M, #2 // I = M / 4 | |||||
cmp I, #0 | |||||
ble dgemm_ncopy_L2_M4_40 | |||||
.align 5 | |||||
dgemm_ncopy_L2_M4_20: | |||||
COPY4x2 | |||||
subs I , I , #1 | |||||
bne dgemm_ncopy_L2_M4_20 | |||||
dgemm_ncopy_L2_M4_40: | |||||
and I, M , #3 | |||||
cmp I, #0 | |||||
ble dgemm_ncopy_L2_M4_END | |||||
.align 5 | |||||
dgemm_ncopy_L2_M4_60: | |||||
COPY1x2 | |||||
subs I , I , #1 | |||||
bne dgemm_ncopy_L2_M4_60 | |||||
dgemm_ncopy_L2_M4_END: | |||||
/*********************************************************************************************/ | |||||
dgemm_ncopy_L1_BEGIN: | |||||
tst N, #1 | |||||
ble dgemm_ncopy_L999 | |||||
dgemm_ncopy_L1_M4_BEGIN: | |||||
mov A01, A00 | |||||
asr I, M, #2 // I = M / 4 | |||||
cmp I, #0 | |||||
ble dgemm_ncopy_L1_M4_40 | |||||
.align 5 | |||||
dgemm_ncopy_L1_M4_20: | |||||
COPY4x1 | |||||
subs I , I , #1 | |||||
bne dgemm_ncopy_L1_M4_20 | |||||
dgemm_ncopy_L1_M4_40: | |||||
and I, M , #3 | |||||
cmp I, #0 | |||||
ble dgemm_ncopy_L1_M4_END | |||||
.align 5 | |||||
dgemm_ncopy_L1_M4_60: | |||||
COPY1x1 | |||||
subs I , I , #1 | |||||
bne dgemm_ncopy_L1_M4_60 | |||||
dgemm_ncopy_L1_M4_END: | |||||
dgemm_ncopy_L999: | |||||
mov x0, #0 | |||||
RESTORE_REGS | |||||
ret | |||||
EPILOGUE | |||||
@@ -0,0 +1,544 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2016, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#define M x0 | |||||
#define N x1 | |||||
#define A00 x2 | |||||
#define LDA x3 | |||||
#define B00 x4 | |||||
#define A01 x5 | |||||
#define A02 x6 | |||||
#define A03 x7 | |||||
#define A04 x8 | |||||
#define A05 x9 | |||||
#define A06 x10 | |||||
#define A07 x11 | |||||
#define A08 x12 | |||||
#define I x13 | |||||
#define J x14 | |||||
#define TEMP1 x15 | |||||
#define TEMP2 x16 | |||||
#define A_PREFETCH 2560 | |||||
/************************************************************************************** | |||||
* Macro definitions | |||||
**************************************************************************************/ | |||||
.macro SAVE_REGS | |||||
add sp, sp, #-(11 * 16) | |||||
stp d8, d9, [sp, #(0 * 16)] | |||||
stp d10, d11, [sp, #(1 * 16)] | |||||
stp d12, d13, [sp, #(2 * 16)] | |||||
stp d14, d15, [sp, #(3 * 16)] | |||||
stp d16, d17, [sp, #(4 * 16)] | |||||
stp x18, x19, [sp, #(5 * 16)] | |||||
stp x20, x21, [sp, #(6 * 16)] | |||||
stp x22, x23, [sp, #(7 * 16)] | |||||
stp x24, x25, [sp, #(8 * 16)] | |||||
stp x26, x27, [sp, #(9 * 16)] | |||||
str x28, [sp, #(10 * 16)] | |||||
.endm | |||||
.macro RESTORE_REGS | |||||
ldp d8, d9, [sp, #(0 * 16)] | |||||
ldp d10, d11, [sp, #(1 * 16)] | |||||
ldp d12, d13, [sp, #(2 * 16)] | |||||
ldp d14, d15, [sp, #(3 * 16)] | |||||
ldp d16, d17, [sp, #(4 * 16)] | |||||
ldp x18, x19, [sp, #(5 * 16)] | |||||
ldp x20, x21, [sp, #(6 * 16)] | |||||
ldp x22, x23, [sp, #(7 * 16)] | |||||
ldp x24, x25, [sp, #(8 * 16)] | |||||
ldp x26, x27, [sp, #(9 * 16)] | |||||
ldr x28, [sp, #(10 * 16)] | |||||
add sp, sp, #(11*16) | |||||
.endm | |||||
/*************************************************************************************/ | |||||
.macro COPY8x8 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||||
COPY4x8 | |||||
COPY4x8 | |||||
.endm | |||||
.macro COPY4x8 | |||||
ldp q0, q1, [A01], #32 | |||||
ins v16.d[0], v0.d[0] | |||||
ins v20.d[0], v0.d[1] | |||||
ins v24.d[0], v1.d[0] | |||||
ins v28.d[0], v1.d[1] | |||||
ldp q2, q3, [A02], #32 | |||||
ins v16.d[1], v2.d[0] | |||||
ins v20.d[1], v2.d[1] | |||||
ins v24.d[1], v3.d[0] | |||||
ins v28.d[1], v3.d[1] | |||||
ldp q4, q5, [A03], #32 | |||||
ins v17.d[0], v4.d[0] | |||||
ins v21.d[0], v4.d[1] | |||||
ins v25.d[0], v5.d[0] | |||||
ins v29.d[0], v5.d[1] | |||||
ldp q6, q7, [A04], #32 | |||||
ins v17.d[1], v6.d[0] | |||||
ins v21.d[1], v6.d[1] | |||||
ins v25.d[1], v7.d[0] | |||||
ins v29.d[1], v7.d[1] | |||||
ldp q8, q9, [A05], #32 | |||||
ins v18.d[0], v8.d[0] | |||||
ins v22.d[0], v8.d[1] | |||||
ins v26.d[0], v9.d[0] | |||||
ins v30.d[0], v9.d[1] | |||||
ldp q10, q11, [A06], #32 | |||||
ins v18.d[1], v10.d[0] | |||||
ins v22.d[1], v10.d[1] | |||||
ins v26.d[1], v11.d[0] | |||||
ins v30.d[1], v11.d[1] | |||||
ldp q12, q13, [A07], #32 | |||||
ins v19.d[0], v12.d[0] | |||||
ins v23.d[0], v12.d[1] | |||||
ins v27.d[0], v13.d[0] | |||||
ins v31.d[0], v13.d[1] | |||||
ldp q14, q15, [A08], #32 | |||||
ins v19.d[1], v14.d[0] | |||||
ins v23.d[1], v14.d[1] | |||||
ins v27.d[1], v15.d[0] | |||||
ins v31.d[1], v15.d[1] | |||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [B00] | |||||
add B00, B00, #64 | |||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [B00] | |||||
add B00, B00, #64 | |||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [B00] | |||||
add B00, B00, #64 | |||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [B00] | |||||
add B00, B00, #64 | |||||
.endm | |||||
.macro COPY1x8 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||||
ldr d0, [A01], #8 | |||||
ldr d1, [A02], #8 | |||||
ldr d2, [A03], #8 | |||||
ldr d3, [A04], #8 | |||||
ldr d4, [A05], #8 | |||||
ldr d5, [A06], #8 | |||||
ldr d6, [A07], #8 | |||||
ldr d7, [A08], #8 | |||||
st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B00] | |||||
add B00, B00, #32 | |||||
st1 {v4.1d, v5.1d, v6.1d, v7.1d}, [B00] | |||||
add B00, B00, #32 | |||||
.endm | |||||
/*************************************************************************************/ | |||||
.macro COPY8x4 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
ldp q0, q1, [A01], #32 | |||||
ins v8.d[0], v0.d[0] | |||||
ins v10.d[0], v0.d[1] | |||||
ins v12.d[0], v1.d[0] | |||||
ins v14.d[0], v1.d[1] | |||||
ldp q2, q3, [A02], #32 | |||||
ins v8.d[1], v2.d[0] | |||||
ins v10.d[1], v2.d[1] | |||||
ins v12.d[1], v3.d[0] | |||||
ins v14.d[1], v3.d[1] | |||||
ldp q4, q5, [A03], #32 | |||||
ins v9.d[0], v4.d[0] | |||||
ins v11.d[0], v4.d[1] | |||||
ins v13.d[0], v5.d[0] | |||||
ins v15.d[0], v5.d[1] | |||||
ldp q6, q7, [A04], #32 | |||||
ins v9.d[1], v6.d[0] | |||||
ins v11.d[1], v6.d[1] | |||||
ins v13.d[1], v7.d[0] | |||||
ins v15.d[1], v7.d[1] | |||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] | |||||
add B00, B00, #64 | |||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B00] | |||||
add B00, B00, #64 | |||||
ldp q16, q17, [A01], #32 | |||||
ins v24.d[0], v16.d[0] | |||||
ins v26.d[0], v16.d[1] | |||||
ins v28.d[0], v17.d[0] | |||||
ins v30.d[0], v17.d[1] | |||||
ldp q18, q19, [A02], #32 | |||||
ins v24.d[1], v18.d[0] | |||||
ins v26.d[1], v18.d[1] | |||||
ins v28.d[1], v19.d[0] | |||||
ins v30.d[1], v19.d[1] | |||||
ldp q20, q21, [A03], #32 | |||||
ins v25.d[0], v20.d[0] | |||||
ins v27.d[0], v20.d[1] | |||||
ins v29.d[0], v21.d[0] | |||||
ins v31.d[0], v21.d[1] | |||||
ldp q22, q23, [A04], #32 | |||||
ins v25.d[1], v22.d[0] | |||||
ins v27.d[1], v22.d[1] | |||||
ins v29.d[1], v23.d[0] | |||||
ins v31.d[1], v23.d[1] | |||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [B00] | |||||
add B00, B00, #64 | |||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [B00] | |||||
add B00, B00, #64 | |||||
.endm | |||||
.macro COPY1x4 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
ldr d0, [A01], #8 | |||||
ldr d1, [A02], #8 | |||||
ldr d2, [A03], #8 | |||||
ldr d3, [A04], #8 | |||||
st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B00] | |||||
add B00, B00, #32 | |||||
.endm | |||||
/*************************************************************************************/ | |||||
.macro COPY8x2 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
ldp q0, q1, [A01], #32 | |||||
ldp q2, q3, [A01], #32 | |||||
ins v8.d[0], v0.d[0] | |||||
ins v9.d[0], v0.d[1] | |||||
ins v10.d[0], v1.d[0] | |||||
ins v11.d[0], v1.d[1] | |||||
ins v12.d[0], v2.d[0] | |||||
ins v13.d[0], v2.d[1] | |||||
ins v14.d[0], v3.d[0] | |||||
ins v15.d[0], v3.d[1] | |||||
ldp q4, q5, [A02], #32 | |||||
ldp q6, q7, [A02], #32 | |||||
ins v8.d[1], v4.d[0] | |||||
ins v9.d[1], v4.d[1] | |||||
ins v10.d[1], v5.d[0] | |||||
ins v11.d[1], v5.d[1] | |||||
ins v12.d[1], v6.d[0] | |||||
ins v13.d[1], v6.d[1] | |||||
ins v14.d[1], v7.d[0] | |||||
ins v15.d[1], v7.d[1] | |||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] | |||||
add B00, B00, #64 | |||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B00] | |||||
add B00, B00, #64 | |||||
.endm | |||||
.macro COPY1x2 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
ldr d0, [A01], #8 | |||||
ldr d1, [A02], #8 | |||||
stp d0, d1, [B00] | |||||
add B00, B00, #16 | |||||
.endm | |||||
/*************************************************************************************/ | |||||
.macro COPY8x1 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
ldp q0, q1, [A01], #32 | |||||
ldp q2, q3, [A01], #32 | |||||
stp q0, q1, [B00], #32 | |||||
stp q2, q3, [B00], #32 | |||||
.endm | |||||
.macro COPY1x1 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
ldr d0, [A01], #8 | |||||
str d0, [B00], #8 | |||||
.endm | |||||
/************************************************************************************** | |||||
* End of macro definitions | |||||
**************************************************************************************/ | |||||
PROLOGUE | |||||
.align 5 | |||||
SAVE_REGS | |||||
lsl LDA, LDA, #3 // LDA = LDA * SIZE | |||||
dgemm_ncopy_L8_BEGIN: | |||||
asr J, N, #3 // J = N / 8 | |||||
cmp J, #0 | |||||
ble dgemm_ncopy_L4_BEGIN | |||||
dgemm_ncopy_L8_M8_BEGIN: | |||||
mov A01, A00 | |||||
add A02, A01, LDA | |||||
add A03, A02, LDA | |||||
add A04, A03, LDA | |||||
add A05, A04, LDA | |||||
add A06, A05, LDA | |||||
add A07, A06, LDA | |||||
add A08, A07, LDA | |||||
add A00, A08, LDA | |||||
asr I, M, #3 // I = M / 8 | |||||
cmp I, #0 | |||||
ble dgemm_ncopy_L8_M8_40 | |||||
dgemm_ncopy_L8_M8_20: | |||||
COPY8x8 | |||||
subs I , I , #1 | |||||
bne dgemm_ncopy_L8_M8_20 | |||||
dgemm_ncopy_L8_M8_40: | |||||
and I, M , #7 | |||||
cmp I, #0 | |||||
ble dgemm_ncopy_L8_M8_END | |||||
dgemm_ncopy_L8_M8_60: | |||||
COPY1x8 | |||||
subs I , I , #1 | |||||
bne dgemm_ncopy_L8_M8_60 | |||||
dgemm_ncopy_L8_M8_END: | |||||
subs J , J, #1 // j-- | |||||
bne dgemm_ncopy_L8_M8_BEGIN | |||||
/*********************************************************************************************/ | |||||
dgemm_ncopy_L4_BEGIN: | |||||
tst N, #7 | |||||
ble dgemm_ncopy_L999 | |||||
tst N, #4 | |||||
ble dgemm_ncopy_L2_BEGIN | |||||
dgemm_ncopy_L4_M8_BEGIN: | |||||
mov A01, A00 | |||||
add A02, A01, LDA | |||||
add A03, A02, LDA | |||||
add A04, A03, LDA | |||||
add A00, A04, LDA | |||||
asr I, M, #3 // I = M / 8 | |||||
cmp I, #0 | |||||
ble dgemm_ncopy_L4_M8_40 | |||||
dgemm_ncopy_L4_M8_20: | |||||
COPY8x4 | |||||
subs I , I , #1 | |||||
bne dgemm_ncopy_L4_M8_20 | |||||
dgemm_ncopy_L4_M8_40: | |||||
and I, M , #7 | |||||
cmp I, #0 | |||||
ble dgemm_ncopy_L4_M8_END | |||||
dgemm_ncopy_L4_M8_60: | |||||
COPY1x4 | |||||
subs I , I , #1 | |||||
bne dgemm_ncopy_L4_M8_60 | |||||
dgemm_ncopy_L4_M8_END: | |||||
/*********************************************************************************************/ | |||||
dgemm_ncopy_L2_BEGIN: | |||||
tst N, #3 | |||||
ble dgemm_ncopy_L999 | |||||
tst N, #2 | |||||
ble dgemm_ncopy_L1_BEGIN | |||||
dgemm_ncopy_L2_M8_BEGIN: | |||||
mov A01, A00 | |||||
add A02, A01, LDA | |||||
add A00, A02, LDA | |||||
asr I, M, #3 // I = M / 8 | |||||
cmp I, #0 | |||||
ble dgemm_ncopy_L2_M8_40 | |||||
dgemm_ncopy_L2_M8_20: | |||||
COPY8x2 | |||||
subs I , I , #1 | |||||
bne dgemm_ncopy_L2_M8_20 | |||||
dgemm_ncopy_L2_M8_40: | |||||
and I, M , #7 | |||||
cmp I, #0 | |||||
ble dgemm_ncopy_L2_M8_END | |||||
dgemm_ncopy_L2_M8_60: | |||||
COPY1x2 | |||||
subs I , I , #1 | |||||
bne dgemm_ncopy_L2_M8_60 | |||||
dgemm_ncopy_L2_M8_END: | |||||
/*********************************************************************************************/ | |||||
dgemm_ncopy_L1_BEGIN: | |||||
tst N, #1 | |||||
ble dgemm_ncopy_L999 | |||||
dgemm_ncopy_L1_M8_BEGIN: | |||||
mov A01, A00 | |||||
asr I, M, #3 // I = M / 8 | |||||
cmp I, #0 | |||||
ble dgemm_ncopy_L1_M8_40 | |||||
dgemm_ncopy_L1_M8_20: | |||||
COPY8x1 | |||||
subs I , I , #1 | |||||
bne dgemm_ncopy_L1_M8_20 | |||||
dgemm_ncopy_L1_M8_40: | |||||
and I, M , #7 | |||||
cmp I, #0 | |||||
ble dgemm_ncopy_L1_M8_END | |||||
dgemm_ncopy_L1_M8_60: | |||||
COPY1x1 | |||||
subs I , I , #1 | |||||
bne dgemm_ncopy_L1_M8_60 | |||||
dgemm_ncopy_L1_M8_END: | |||||
dgemm_ncopy_L999: | |||||
mov x0, #0 | |||||
RESTORE_REGS | |||||
ret | |||||
EPILOGUE | |||||
@@ -0,0 +1,402 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2016, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#define M x0 | |||||
#define N x1 | |||||
#define A x2 | |||||
#define LDA x3 | |||||
#define B x4 | |||||
#define M4 x5 | |||||
#define A01 x6 | |||||
#define A02 x7 | |||||
#define A03 x8 | |||||
#define A04 x9 | |||||
#define B01 x10 | |||||
#define B02 x11 | |||||
#define B03 x12 | |||||
#define B04 x13 | |||||
#define I x14 | |||||
#define J x15 | |||||
#define TEMP1 x16 | |||||
#define TEMP2 x17 | |||||
#define A_PREFETCH 2560 | |||||
#define B_PREFETCH 256 | |||||
/************************************************************************************** | |||||
* Macro definitions | |||||
**************************************************************************************/ | |||||
.macro SAVE_REGS | |||||
add sp, sp, #-(11 * 16) | |||||
stp d8, d9, [sp, #(0 * 16)] | |||||
stp d10, d11, [sp, #(1 * 16)] | |||||
stp d12, d13, [sp, #(2 * 16)] | |||||
stp d14, d15, [sp, #(3 * 16)] | |||||
stp d16, d17, [sp, #(4 * 16)] | |||||
stp x18, x19, [sp, #(5 * 16)] | |||||
stp x20, x21, [sp, #(6 * 16)] | |||||
stp x22, x23, [sp, #(7 * 16)] | |||||
stp x24, x25, [sp, #(8 * 16)] | |||||
stp x26, x27, [sp, #(9 * 16)] | |||||
str x28, [sp, #(10 * 16)] | |||||
.endm | |||||
.macro RESTORE_REGS | |||||
ldp d8, d9, [sp, #(0 * 16)] | |||||
ldp d10, d11, [sp, #(1 * 16)] | |||||
ldp d12, d13, [sp, #(2 * 16)] | |||||
ldp d14, d15, [sp, #(3 * 16)] | |||||
ldp d16, d17, [sp, #(4 * 16)] | |||||
ldp x18, x19, [sp, #(5 * 16)] | |||||
ldp x20, x21, [sp, #(6 * 16)] | |||||
ldp x22, x23, [sp, #(7 * 16)] | |||||
ldp x24, x25, [sp, #(8 * 16)] | |||||
ldp x26, x27, [sp, #(9 * 16)] | |||||
ldr x28, [sp, #(10 * 16)] | |||||
add sp, sp, #(11*16) | |||||
.endm | |||||
.macro COPY4x4 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
ldp q0, q1, [A01], #32 | |||||
ldp q2, q3, [A02], #32 | |||||
////prfm PLDL1KEEP, [B01, #B_PREFETCH] | |||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] | |||||
add TEMP1, B01, #64 | |||||
ldp q4, q5, [A03], #32 | |||||
ldp q6, q7, [A04], #32 | |||||
////prfm PLDL1KEEP, [B01, #B_PREFETCH] | |||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1] | |||||
add B01, B01, M4 | |||||
.endm | |||||
.macro COPY2x4 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
ldr q0, [A01], #16 | |||||
ldr q1, [A02], #16 | |||||
ldr q2, [A03], #16 | |||||
ldr q3, [A04], #16 | |||||
////prfm PLDL1KEEP, [B02, #B_PREFETCH] | |||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02] | |||||
add B02, B02, #64 | |||||
.endm | |||||
.macro COPY1x4 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
ldr d0, [A01], #8 | |||||
ldr d1, [A02], #8 | |||||
ldr d2, [A03], #8 | |||||
ldr d3, [A04], #8 | |||||
////prfm PLDL1KEEP, [B03, #B_PREFETCH] | |||||
st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B03] | |||||
add B03, B03, #32 | |||||
.endm | |||||
/*************************************************************************************************************************/ | |||||
.macro COPY4x2 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
ldp q0, q1, [A01], #32 | |||||
ldp q2, q3, [A02], #32 | |||||
////prfm PLDL1KEEP, [B01, #B_PREFETCH] | |||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] | |||||
add B01, B01, M4 | |||||
.endm | |||||
.macro COPY2x2 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
ldr q0, [A01], #16 | |||||
ldr q1, [A02], #16 | |||||
////prfm PLDL1KEEP, [B02, #B_PREFETCH] | |||||
stp q0, q1, [B02] | |||||
add B02, B02, #32 | |||||
.endm | |||||
.macro COPY1x2 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
ldr d0, [A01], #8 | |||||
ldr d1, [A02], #8 | |||||
////prfm PLDL1KEEP, [B03, #B_PREFETCH] | |||||
stp d0, d1, [B03] | |||||
add B03, B03, #16 | |||||
.endm | |||||
/*************************************************************************************************************************/ | |||||
.macro COPY4x1 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
ldp q0, q1, [A01], #32 | |||||
////prfm PLDL1KEEP, [B01, #B_PREFETCH] | |||||
stp q0, q1, [B01] | |||||
add B01, B01, M4 | |||||
.endm | |||||
.macro COPY2x1 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
ldr q0, [A01], #16 | |||||
////prfm PLDL1KEEP, [B02, #B_PREFETCH] | |||||
str q0, [B02] | |||||
add B02, B02, #16 | |||||
.endm | |||||
.macro COPY1x1 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
ldr d0, [A01], #8 | |||||
////prfm PLDL1KEEP, [B03, #B_PREFETCH] | |||||
str d0, [B03] | |||||
add B03, B03, #8 | |||||
.endm | |||||
/************************************************************************************** | |||||
* End of macro definitions | |||||
**************************************************************************************/ | |||||
PROLOGUE | |||||
.align 5 | |||||
SAVE_REGS | |||||
lsl LDA, LDA, #3 // LDA = LDA * SIZE | |||||
lsl TEMP1, M, #3 // x12 = M * SIZE | |||||
and B02 , N , #-4 | |||||
and B03 , N , #-2 | |||||
mul B02, B02, TEMP1 | |||||
mul B03, B03, TEMP1 | |||||
add B02 , B02, B | |||||
add B03 , B03, B | |||||
lsl M4, M, #5 // M4 = M * 4 * SIZE | |||||
dgemm_tcopy_L4_BEGIN: | |||||
asr J, M, #2 // J = M / 4 | |||||
cmp J, #0 | |||||
ble dgemm_tcopy_L2_BEGIN | |||||
.align 5 | |||||
dgemm_tcopy_L4_M4_BEGIN: | |||||
mov A01, A | |||||
add A02, A01, LDA | |||||
add A03, A02, LDA | |||||
add A04, A03, LDA | |||||
add A, A04, LDA | |||||
mov B01, B | |||||
add B, B01, #128 // B = B + 16 * SIZE | |||||
asr I, N, #2 // I = N / 4 | |||||
cmp I, #0 | |||||
ble dgemm_tcopy_L4_M4_40 | |||||
.align 5 | |||||
dgemm_tcopy_L4_M4_20: | |||||
COPY4x4 | |||||
subs I , I , #1 | |||||
bne dgemm_tcopy_L4_M4_20 | |||||
dgemm_tcopy_L4_M4_40: | |||||
tst N , #2 | |||||
ble dgemm_tcopy_L4_M4_60 | |||||
COPY2x4 | |||||
dgemm_tcopy_L4_M4_60: | |||||
tst N, #1 | |||||
ble dgemm_tcopy_L4_M4_END | |||||
COPY1x4 | |||||
dgemm_tcopy_L4_M4_END: | |||||
subs J , J, #1 // j-- | |||||
bne dgemm_tcopy_L4_M4_BEGIN | |||||
/*********************************************************************************************/ | |||||
dgemm_tcopy_L2_BEGIN: | |||||
tst M, #3 | |||||
ble dgemm_tcopy_L999 | |||||
tst M, #2 | |||||
ble dgemm_tcopy_L1_BEGIN | |||||
dgemm_tcopy_L2_M4_BEGIN: | |||||
mov A01, A | |||||
add A02, A01, LDA | |||||
add A, A02, LDA | |||||
mov B01, B | |||||
add B, B01, #64 // B = B + 8 * SIZE | |||||
asr I, N, #2 // I = N / 4 | |||||
cmp I, #0 | |||||
ble dgemm_tcopy_L2_M4_40 | |||||
.align 5 | |||||
dgemm_tcopy_L2_M4_20: | |||||
COPY4x2 | |||||
subs I , I , #1 | |||||
bne dgemm_tcopy_L2_M4_20 | |||||
dgemm_tcopy_L2_M4_40: | |||||
tst N , #2 | |||||
ble dgemm_tcopy_L2_M4_60 | |||||
COPY2x2 | |||||
dgemm_tcopy_L2_M4_60: | |||||
tst N , #1 | |||||
ble dgemm_tcopy_L2_M4_END | |||||
COPY1x2 | |||||
dgemm_tcopy_L2_M4_END: | |||||
/*********************************************************************************************/ | |||||
dgemm_tcopy_L1_BEGIN: | |||||
tst M, #1 | |||||
ble dgemm_tcopy_L999 | |||||
dgemm_tcopy_L1_M4_BEGIN: | |||||
mov A01, A // A01 = A | |||||
mov B01, B | |||||
asr I, N, #2 // I = M / 4 | |||||
cmp I, #0 | |||||
ble dgemm_tcopy_L1_M4_40 | |||||
.align 5 | |||||
dgemm_tcopy_L1_M4_20: | |||||
COPY4x1 | |||||
subs I , I , #1 | |||||
bne dgemm_tcopy_L1_M4_20 | |||||
dgemm_tcopy_L1_M4_40: | |||||
tst N , #2 | |||||
ble dgemm_tcopy_L1_M4_60 | |||||
COPY2x1 | |||||
dgemm_tcopy_L1_M4_60: | |||||
tst N , #1 | |||||
ble dgemm_tcopy_L1_M4_END | |||||
COPY1x1 | |||||
dgemm_tcopy_L1_M4_END: | |||||
dgemm_tcopy_L999: | |||||
mov x0, #0 // set return value | |||||
RESTORE_REGS | |||||
ret | |||||
EPILOGUE | |||||
@@ -0,0 +1,682 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2016, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#define M x0 | |||||
#define N x1 | |||||
#define A x2 | |||||
#define LDA x3 | |||||
#define B x4 | |||||
#define M8 x5 | |||||
#define A01 x6 | |||||
#define A02 x7 | |||||
#define A03 x8 | |||||
#define A04 x9 | |||||
#define A05 x10 | |||||
#define A06 x11 | |||||
#define A07 x12 | |||||
#define A08 x13 | |||||
#define B01 x14 | |||||
#define B02 x15 | |||||
#define B03 x16 | |||||
#define B04 x17 | |||||
#define I x18 | |||||
#define J x19 | |||||
#define TEMP1 x20 | |||||
#define TEMP2 x21 | |||||
#define A_PREFETCH 2560 | |||||
#define B_PREFETCH 256 | |||||
/************************************************************************************** | |||||
* Macro definitions | |||||
**************************************************************************************/ | |||||
.macro SAVE_REGS | |||||
add sp, sp, #-(11 * 16) | |||||
stp d8, d9, [sp, #(0 * 16)] | |||||
stp d10, d11, [sp, #(1 * 16)] | |||||
stp d12, d13, [sp, #(2 * 16)] | |||||
stp d14, d15, [sp, #(3 * 16)] | |||||
stp d16, d17, [sp, #(4 * 16)] | |||||
stp x18, x19, [sp, #(5 * 16)] | |||||
stp x20, x21, [sp, #(6 * 16)] | |||||
stp x22, x23, [sp, #(7 * 16)] | |||||
stp x24, x25, [sp, #(8 * 16)] | |||||
stp x26, x27, [sp, #(9 * 16)] | |||||
str x28, [sp, #(10 * 16)] | |||||
.endm | |||||
.macro RESTORE_REGS | |||||
ldp d8, d9, [sp, #(0 * 16)] | |||||
ldp d10, d11, [sp, #(1 * 16)] | |||||
ldp d12, d13, [sp, #(2 * 16)] | |||||
ldp d14, d15, [sp, #(3 * 16)] | |||||
ldp d16, d17, [sp, #(4 * 16)] | |||||
ldp x18, x19, [sp, #(5 * 16)] | |||||
ldp x20, x21, [sp, #(6 * 16)] | |||||
ldp x22, x23, [sp, #(7 * 16)] | |||||
ldp x24, x25, [sp, #(8 * 16)] | |||||
ldp x26, x27, [sp, #(9 * 16)] | |||||
ldr x28, [sp, #(10 * 16)] | |||||
add sp, sp, #(11*16) | |||||
.endm | |||||
/*************************************************************************************************************************/ | |||||
.macro COPY8x8 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||||
ldp q0, q1, [A01], #32 | |||||
ldp q2, q3, [A01], #32 | |||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] | |||||
add TEMP1, B01, #64 | |||||
ldp q4, q5, [A02], #32 | |||||
ldp q6, q7, [A02], #32 | |||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1] | |||||
add TEMP1, TEMP1, #64 | |||||
ldp q8, q9, [A03], #32 | |||||
ldp q10, q11, [A03], #32 | |||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [TEMP1] | |||||
add TEMP1, TEMP1, #64 | |||||
ldp q12, q13, [A04], #32 | |||||
ldp q14, q15, [A04], #32 | |||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [TEMP1] | |||||
add TEMP1, TEMP1, #64 | |||||
ldp q16, q17, [A05], #32 | |||||
ldp q18, q19, [A05], #32 | |||||
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [TEMP1] | |||||
add TEMP1, TEMP1, #64 | |||||
ldp q20, q21, [A06], #32 | |||||
ldp q22, q23, [A06], #32 | |||||
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [TEMP1] | |||||
add TEMP1, TEMP1, #64 | |||||
ldp q24, q25, [A07], #32 | |||||
ldp q26, q27, [A07], #32 | |||||
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [TEMP1] | |||||
add TEMP1, TEMP1, #64 | |||||
ldp q28, q29, [A08], #32 | |||||
ldp q30, q31, [A08], #32 | |||||
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [TEMP1] | |||||
add TEMP1, TEMP1, #64 | |||||
add B01, B01, M8 | |||||
.endm | |||||
.macro COPY4x8 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||||
ldp q0, q1, [A01], #32 | |||||
ldp q2, q3, [A02], #32 | |||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02] | |||||
add B02, B02, #64 | |||||
ldp q4, q5, [A03], #32 | |||||
ldp q6, q7, [A04], #32 | |||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [B02] | |||||
add B02, B02, #64 | |||||
ldp q8, q9, [A05], #32 | |||||
ldp q10, q11, [A06], #32 | |||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B02] | |||||
add B02, B02, #64 | |||||
ldp q12, q13, [A07], #32 | |||||
ldp q14, q15, [A08], #32 | |||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B02] | |||||
add B02, B02, #64 | |||||
.endm | |||||
.macro COPY2x8 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||||
ldr q0, [A01], #16 | |||||
ldr q1, [A02], #16 | |||||
ldr q2, [A03], #16 | |||||
ldr q3, [A04], #16 | |||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B03] | |||||
add B03, B03, #64 | |||||
ldr q4, [A05], #16 | |||||
ldr q5, [A06], #16 | |||||
ldr q6, [A07], #16 | |||||
ldr q7, [A08], #16 | |||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [B03] | |||||
add B03, B03, #64 | |||||
.endm | |||||
.macro COPY1x8 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||||
ldr d0, [A01], #8 | |||||
ldr d1, [A02], #8 | |||||
ldr d2, [A03], #8 | |||||
ldr d3, [A04], #8 | |||||
st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B04] | |||||
add B04, B04, #32 | |||||
ldr d4, [A05], #8 | |||||
ldr d5, [A06], #8 | |||||
ldr d6, [A07], #8 | |||||
ldr d7, [A08], #8 | |||||
st1 {v4.1d, v5.1d, v6.1d, v7.1d}, [B04] | |||||
add B04, B04, #32 | |||||
.endm | |||||
/*************************************************************************************************************************/ | |||||
.macro COPY8x4 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
ldp q0, q1, [A01], #32 | |||||
ldp q2, q3, [A01], #32 | |||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] | |||||
add TEMP1, B01, #64 | |||||
ldp q4, q5, [A02], #32 | |||||
ldp q6, q7, [A02], #32 | |||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1] | |||||
add TEMP1, TEMP1, #64 | |||||
ldp q8, q9, [A03], #32 | |||||
ldp q10, q11, [A03], #32 | |||||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [TEMP1] | |||||
add TEMP1, TEMP1, #64 | |||||
ldp q12, q13, [A04], #32 | |||||
ldp q14, q15, [A04], #32 | |||||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [TEMP1] | |||||
add TEMP1, TEMP1, #64 | |||||
add B01, B01, M8 | |||||
.endm | |||||
.macro COPY4x4 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
ldp q0, q1, [A01], #32 | |||||
ldp q2, q3, [A02], #32 | |||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02] | |||||
add B02, B02, #64 | |||||
ldp q4, q5, [A03], #32 | |||||
ldp q6, q7, [A04], #32 | |||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [B02] | |||||
add B02, B02, #64 | |||||
.endm | |||||
.macro COPY2x4 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
ldr q0, [A01], #16 | |||||
ldr q1, [A02], #16 | |||||
ldr q2, [A03], #16 | |||||
ldr q3, [A04], #16 | |||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B03] | |||||
add B03, B03, #64 | |||||
.endm | |||||
.macro COPY1x4 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||||
ldr d0, [A01], #8 | |||||
ldr d1, [A02], #8 | |||||
ldr d2, [A03], #8 | |||||
ldr d3, [A04], #8 | |||||
st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B04] | |||||
add B04, B04, #32 | |||||
.endm | |||||
/*************************************************************************************************************************/ | |||||
.macro COPY8x2 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
ldp q0, q1, [A01], #32 | |||||
ldp q2, q3, [A01], #32 | |||||
ldp q4, q5, [A02], #32 | |||||
ldp q6, q7, [A02], #32 | |||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] | |||||
add TEMP1, B01, #64 | |||||
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1] | |||||
add B01, B01, M8 | |||||
.endm | |||||
.macro COPY4x2 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
ldp q0, q1, [A01], #32 | |||||
ldp q2, q3, [A02], #32 | |||||
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02] | |||||
add B02, B02, #64 | |||||
.endm | |||||
.macro COPY2x2 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
ldr q0, [A01], #16 | |||||
ldr q1, [A02], #16 | |||||
stp q0, q1, [B03] | |||||
add B03, B03, #32 | |||||
.endm | |||||
.macro COPY1x2 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||||
ldr d0, [A01], #8 | |||||
ldr d1, [A02], #8 | |||||
stp d0, d1, [B04] | |||||
add B04, B04, #16 | |||||
.endm | |||||
/*************************************************************************************************************************/ | |||||
.macro COPY8x1 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
ldp q0, q1, [A01], #32 | |||||
ldp q2, q3, [A01], #32 | |||||
stp q0, q1, [B01] | |||||
add TEMP1, B01, #32 | |||||
stp q2, q3, [TEMP1] | |||||
add B01, B01, M8 | |||||
.endm | |||||
.macro COPY4x1 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
ldp q0, q1, [A01], #32 | |||||
stp q0, q1, [B02] | |||||
add B02, B02, #32 | |||||
.endm | |||||
.macro COPY2x1 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
ldr q0, [A01], #16 | |||||
str q0, [B03] | |||||
add B03, B03, #16 | |||||
.endm | |||||
.macro COPY1x1 | |||||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||||
ldr d0, [A01], #8 | |||||
str d0, [B04] | |||||
add B04, B04, #8 | |||||
.endm | |||||
/************************************************************************************** | |||||
* End of macro definitions | |||||
**************************************************************************************/ | |||||
PROLOGUE | |||||
.align 5 | |||||
SAVE_REGS | |||||
lsl LDA, LDA, #3 // LDA = LDA * SIZE | |||||
lsl TEMP1, M, #3 // TEMP1 = M * SIZE | |||||
and B02 , N , #-8 | |||||
and B03 , N , #-4 | |||||
and B04 , N , #-2 | |||||
mul B02, B02, TEMP1 | |||||
mul B03, B03, TEMP1 | |||||
mul B04, B04, TEMP1 | |||||
add B02 , B02, B | |||||
add B03 , B03, B | |||||
add B04 , B04, B | |||||
lsl M8, M, #6 // M8 = M * 8 * SIZE | |||||
dgemm_tcopy_L8_BEGIN: | |||||
asr J, M, #3 // J = M / 4 | |||||
cmp J, #0 | |||||
ble dgemm_tcopy_L4_BEGIN | |||||
.align 5 | |||||
dgemm_tcopy_L8_M8_BEGIN: | |||||
mov A01, A | |||||
add A02, A01, LDA | |||||
add A03, A02, LDA | |||||
add A04, A03, LDA | |||||
add A05, A04, LDA | |||||
add A06, A05, LDA | |||||
add A07, A06, LDA | |||||
add A08, A07, LDA | |||||
add A, A08, LDA | |||||
mov B01, B | |||||
add B, B01, #512 // B = B + 64 * SIZE | |||||
asr I, N, #3 // I = N / 8 | |||||
cmp I, #0 | |||||
ble dgemm_tcopy_L8_M8_40 | |||||
.align 5 | |||||
dgemm_tcopy_L8_M8_20: | |||||
COPY8x8 | |||||
subs I , I , #1 | |||||
bne dgemm_tcopy_L8_M8_20 | |||||
dgemm_tcopy_L8_M8_40: | |||||
tst N , #4 | |||||
ble dgemm_tcopy_L8_M8_60 | |||||
COPY4x8 | |||||
dgemm_tcopy_L8_M8_60: | |||||
tst N , #2 | |||||
ble dgemm_tcopy_L8_M8_80 | |||||
COPY2x8 | |||||
dgemm_tcopy_L8_M8_80: | |||||
tst N, #1 | |||||
ble dgemm_tcopy_L8_M8_END | |||||
COPY1x8 | |||||
dgemm_tcopy_L8_M8_END: | |||||
subs J , J, #1 // j-- | |||||
bne dgemm_tcopy_L8_M8_BEGIN | |||||
/*********************************************************************************************/ | |||||
dgemm_tcopy_L4_BEGIN: | |||||
tst M, #7 | |||||
ble dgemm_tcopy_L999 | |||||
tst M, #4 | |||||
ble dgemm_tcopy_L2_BEGIN | |||||
dgemm_tcopy_L4_M8_BEGIN: | |||||
mov A01, A | |||||
add A02, A01, LDA | |||||
add A03, A02, LDA | |||||
add A04, A03, LDA | |||||
add A, A04, LDA | |||||
mov B01, B | |||||
add B, B01, #256 // B = B + 32 * SIZE | |||||
asr I, N, #3 // I = N / 8 | |||||
cmp I, #0 | |||||
ble dgemm_tcopy_L4_M8_40 | |||||
.align 5 | |||||
dgemm_tcopy_L4_M8_20: | |||||
COPY8x4 | |||||
subs I , I , #1 | |||||
bne dgemm_tcopy_L4_M8_20 | |||||
dgemm_tcopy_L4_M8_40: | |||||
tst N , #4 | |||||
ble dgemm_tcopy_L4_M8_60 | |||||
COPY4x4 | |||||
dgemm_tcopy_L4_M8_60: | |||||
tst N , #2 | |||||
ble dgemm_tcopy_L4_M8_80 | |||||
COPY2x4 | |||||
dgemm_tcopy_L4_M8_80: | |||||
tst N, #1 | |||||
ble dgemm_tcopy_L4_M8_END | |||||
COPY1x4 | |||||
dgemm_tcopy_L4_M8_END: | |||||
/*********************************************************************************************/ | |||||
dgemm_tcopy_L2_BEGIN: | |||||
tst M, #3 | |||||
ble dgemm_tcopy_L999 | |||||
tst M, #2 | |||||
ble dgemm_tcopy_L1_BEGIN | |||||
dgemm_tcopy_L2_M8_BEGIN: | |||||
mov A01, A | |||||
add A02, A01, LDA | |||||
add A, A02, LDA | |||||
mov B01, B | |||||
add B, B01, #128 // B = B + 16 * SIZE | |||||
asr I, N, #3 // I = N / 8 | |||||
cmp I, #0 | |||||
ble dgemm_tcopy_L2_M8_40 | |||||
.align 5 | |||||
dgemm_tcopy_L2_M8_20: | |||||
COPY8x2 | |||||
subs I , I , #1 | |||||
bne dgemm_tcopy_L2_M8_20 | |||||
dgemm_tcopy_L2_M8_40: | |||||
tst N , #4 | |||||
ble dgemm_tcopy_L2_M8_60 | |||||
COPY4x2 | |||||
dgemm_tcopy_L2_M8_60: | |||||
tst N , #2 | |||||
ble dgemm_tcopy_L2_M8_80 | |||||
COPY2x2 | |||||
dgemm_tcopy_L2_M8_80: | |||||
tst N , #1 | |||||
ble dgemm_tcopy_L2_M8_END | |||||
COPY1x2 | |||||
dgemm_tcopy_L2_M8_END: | |||||
/*********************************************************************************************/ | |||||
dgemm_tcopy_L1_BEGIN: | |||||
tst M, #1 | |||||
ble dgemm_tcopy_L999 | |||||
dgemm_tcopy_L1_M8_BEGIN: | |||||
mov A01, A // A01 = A | |||||
mov B01, B | |||||
asr I, N, #3 // I = M / 8 | |||||
cmp I, #0 | |||||
ble dgemm_tcopy_L1_M8_40 | |||||
.align 5 | |||||
dgemm_tcopy_L1_M8_20: | |||||
COPY8x1 | |||||
subs I , I , #1 | |||||
bne dgemm_tcopy_L1_M8_20 | |||||
dgemm_tcopy_L1_M8_40: | |||||
tst N , #4 | |||||
ble dgemm_tcopy_L1_M8_60 | |||||
COPY4x1 | |||||
dgemm_tcopy_L1_M8_60: | |||||
tst N , #2 | |||||
ble dgemm_tcopy_L1_M8_80 | |||||
COPY2x1 | |||||
dgemm_tcopy_L1_M8_80: | |||||
tst N , #1 | |||||
ble dgemm_tcopy_L1_M8_END | |||||
COPY1x1 | |||||
dgemm_tcopy_L1_M8_END: | |||||
dgemm_tcopy_L999: | |||||
mov x0, #0 // set return value | |||||
RESTORE_REGS | |||||
ret | |||||
EPILOGUE | |||||
@@ -0,0 +1,104 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2014, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include "common.h" | |||||
#if defined(DSDOT) | |||||
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
#else | |||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
#endif | |||||
{ | |||||
BLASLONG i=0; | |||||
BLASLONG ix=0,iy=0; | |||||
#if defined(DSDOT) | |||||
double dot = 0.0 ; | |||||
#else | |||||
FLOAT dot = 0.0 ; | |||||
#endif | |||||
if ( n < 0 ) return(dot); | |||||
if ( (inc_x == 1) && (inc_y == 1) ) | |||||
{ | |||||
int n1 = n & -4; | |||||
while(i < n1) | |||||
{ | |||||
#if defined(DSDOT) | |||||
dot += (double) y[i] * (double) x[i] | |||||
+ (double) y[i+1] * (double) x[i+1] | |||||
+ (double) y[i+2] * (double) x[i+2] | |||||
+ (double) y[i+3] * (double) x[i+3] ; | |||||
#else | |||||
dot += y[i] * x[i] | |||||
+ y[i+1] * x[i+1] | |||||
+ y[i+2] * x[i+2] | |||||
+ y[i+3] * x[i+3] ; | |||||
#endif | |||||
i+=4 ; | |||||
} | |||||
while(i < n) | |||||
{ | |||||
#if defined(DSDOT) | |||||
dot += (double) y[i] * (double) x[i] ; | |||||
#else | |||||
dot += y[i] * x[i] ; | |||||
#endif | |||||
i++ ; | |||||
} | |||||
return(dot); | |||||
} | |||||
while(i < n) | |||||
{ | |||||
#if defined(DSDOT) | |||||
dot += (double) y[iy] * (double) x[ix] ; | |||||
#else | |||||
dot += y[iy] * x[ix] ; | |||||
#endif | |||||
ix += inc_x ; | |||||
iy += inc_y ; | |||||
i++ ; | |||||
} | |||||
return(dot); | |||||
} | |||||
@@ -2303,6 +2303,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define ZGEMM_DEFAULT_R 4096 | #define ZGEMM_DEFAULT_R 4096 | ||||
#define SYMV_P 16 | |||||
#endif | |||||
#if defined(VULCAN) | |||||
#define SNUMOPT 2 | |||||
#define DNUMOPT 2 | |||||
#define GEMM_DEFAULT_OFFSET_A 0 | |||||
#define GEMM_DEFAULT_OFFSET_B 0 | |||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
#define SGEMM_DEFAULT_UNROLL_M 16 | |||||
#define SGEMM_DEFAULT_UNROLL_N 4 | |||||
#define DGEMM_DEFAULT_UNROLL_M 8 | |||||
#define DGEMM_DEFAULT_UNROLL_N 4 | |||||
#define CGEMM_DEFAULT_UNROLL_M 8 | |||||
#define CGEMM_DEFAULT_UNROLL_N 4 | |||||
#define ZGEMM_DEFAULT_UNROLL_M 4 | |||||
#define ZGEMM_DEFAULT_UNROLL_N 4 | |||||
#define SGEMM_DEFAULT_P 512 | |||||
#define DGEMM_DEFAULT_P dgemm_p | |||||
#define CGEMM_DEFAULT_P 256 | |||||
#define ZGEMM_DEFAULT_P 128 | |||||
#define SGEMM_DEFAULT_Q 1024 | |||||
#define DGEMM_DEFAULT_Q dgemm_q | |||||
#define CGEMM_DEFAULT_Q 512 | |||||
#define ZGEMM_DEFAULT_Q 512 | |||||
#define SGEMM_DEFAULT_R 4096 | |||||
#define DGEMM_DEFAULT_R dgemm_r | |||||
#define CGEMM_DEFAULT_R 4096 | |||||
#define ZGEMM_DEFAULT_R 2048 | |||||
#define SYMV_P 16 | #define SYMV_P 16 | ||||
#endif | #endif | ||||
@@ -2385,6 +2423,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define SYMV_P 16 | #define SYMV_P 16 | ||||
#endif | #endif | ||||
#if defined(THUNDERX) | |||||
#define SNUMOPT 2 | |||||
#define DNUMOPT 2 | |||||
#define GEMM_DEFAULT_OFFSET_A 0 | |||||
#define GEMM_DEFAULT_OFFSET_B 0 | |||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
#define SGEMM_DEFAULT_UNROLL_M 4 | |||||
#define SGEMM_DEFAULT_UNROLL_N 4 | |||||
#define DGEMM_DEFAULT_UNROLL_M 2 | |||||
#define DGEMM_DEFAULT_UNROLL_N 2 | |||||
#define CGEMM_DEFAULT_UNROLL_M 2 | |||||
#define CGEMM_DEFAULT_UNROLL_N 2 | |||||
#define ZGEMM_DEFAULT_UNROLL_M 2 | |||||
#define ZGEMM_DEFAULT_UNROLL_N 2 | |||||
#define SGEMM_DEFAULT_P 128 | |||||
#define DGEMM_DEFAULT_P 128 | |||||
#define CGEMM_DEFAULT_P 96 | |||||
#define ZGEMM_DEFAULT_P 64 | |||||
#define SGEMM_DEFAULT_Q 240 | |||||
#define DGEMM_DEFAULT_Q 120 | |||||
#define CGEMM_DEFAULT_Q 120 | |||||
#define ZGEMM_DEFAULT_Q 120 | |||||
#define SGEMM_DEFAULT_R 12288 | |||||
#define DGEMM_DEFAULT_R 8192 | |||||
#define CGEMM_DEFAULT_R 4096 | |||||
#define ZGEMM_DEFAULT_R 4096 | |||||
#define SYMV_P 16 | |||||
#endif | |||||
#if defined(THUNDERX2T99) | |||||
#define SNUMOPT 2 | |||||
#define DNUMOPT 2 | |||||
#define GEMM_DEFAULT_OFFSET_A 0 | |||||
#define GEMM_DEFAULT_OFFSET_B 0 | |||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL | |||||
#define SGEMM_DEFAULT_UNROLL_M 16 | |||||
#define SGEMM_DEFAULT_UNROLL_N 4 | |||||
#define DGEMM_DEFAULT_UNROLL_M 8 | |||||
#define DGEMM_DEFAULT_UNROLL_N 4 | |||||
#define CGEMM_DEFAULT_UNROLL_M 8 | |||||
#define CGEMM_DEFAULT_UNROLL_N 4 | |||||
#define ZGEMM_DEFAULT_UNROLL_M 4 | |||||
#define ZGEMM_DEFAULT_UNROLL_N 4 | |||||
#define SGEMM_DEFAULT_P 512 | |||||
#define DGEMM_DEFAULT_P dgemm_p | |||||
#define CGEMM_DEFAULT_P 256 | |||||
#define ZGEMM_DEFAULT_P 128 | |||||
#define SGEMM_DEFAULT_Q 1024 | |||||
#define DGEMM_DEFAULT_Q dgemm_q | |||||
#define CGEMM_DEFAULT_Q 512 | |||||
#define ZGEMM_DEFAULT_Q 512 | |||||
#define SGEMM_DEFAULT_R 4096 | |||||
#define DGEMM_DEFAULT_R dgemm_r | |||||
#define CGEMM_DEFAULT_R 4096 | |||||
#define ZGEMM_DEFAULT_R 2048 | |||||
#define SYMV_P 16 | |||||
#endif | |||||
#if defined(ARMV5) | #if defined(ARMV5) | ||||
#define SNUMOPT 2 | #define SNUMOPT 2 | ||||