Add new targets for ARM64
@@ -9,3 +9,17 @@ CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 | |||
FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 | |||
endif | |||
ifeq ($(CORE), VULCAN) | |||
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||
endif | |||
ifeq ($(CORE), THUNDERX) | |||
CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx | |||
FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx | |||
endif | |||
ifeq ($(CORE), THUNDERX2T99) | |||
CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||
FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||
endif |
@@ -80,4 +80,7 @@ ARMV5 | |||
8.ARM 64-bit CPU: | |||
ARMV8 | |||
CORTEXA57 | |||
VULCAN | |||
THUNDERX | |||
THUNDERX2T99 | |||
@@ -2193,7 +2193,7 @@ | |||
#endif | |||
#ifndef ASSEMBLER | |||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) | |||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) | |||
extern BLASLONG gemm_offset_a; | |||
extern BLASLONG gemm_offset_b; | |||
extern BLASLONG sgemm_p; | |||
@@ -30,17 +30,26 @@ | |||
#define CPU_UNKNOWN 0 | |||
#define CPU_ARMV8 1 | |||
#define CPU_CORTEXA57 2 | |||
#define CPU_VULCAN 3 | |||
#define CPU_THUNDERX 4 | |||
#define CPU_THUNDERX2T99 5 | |||
static char *cpuname[] = { | |||
"UNKNOWN", | |||
"ARMV8" , | |||
"CORTEXA57" | |||
"CORTEXA57", | |||
"VULCAN", | |||
"THUNDERX", | |||
"THUNDERX2T99" | |||
}; | |||
static char *cpuname_lower[] = { | |||
"unknown", | |||
"armv8" , | |||
"cortexa57" | |||
"cortexa57", | |||
"vulcan", | |||
"thunderx", | |||
"thunderx2t99" | |||
}; | |||
int get_feature(char *search) | |||
@@ -85,25 +94,34 @@ int detect(void) | |||
#ifdef linux | |||
FILE *infile; | |||
char buffer[512], *p; | |||
p = (char *) NULL ; | |||
infile = fopen("/proc/cpuinfo", "r"); | |||
while (fgets(buffer, sizeof(buffer), infile)) | |||
{ | |||
char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; | |||
p = (char *) NULL ; | |||
if (!strncmp("CPU part", buffer, 8)) | |||
{ | |||
p = strchr(buffer, ':') + 2; | |||
infile = fopen("/proc/cpuinfo", "r"); | |||
while (fgets(buffer, sizeof(buffer), infile)) { | |||
if ((cpu_part != NULL) && (cpu_implementer != NULL)) { | |||
break; | |||
} | |||
if ((cpu_part == NULL) && !strncmp("CPU part", buffer, 8)) { | |||
cpu_part = strchr(buffer, ':') + 2; | |||
cpu_part = strdup(cpu_part); | |||
} else if ((cpu_implementer == NULL) && !strncmp("CPU implementer", buffer, 15)) { | |||
cpu_implementer = strchr(buffer, ':') + 2; | |||
cpu_implementer = strdup(cpu_implementer); | |||
} | |||
} | |||
fclose(infile); | |||
if(p != NULL) { | |||
if (strstr(p, "0xd07")) { | |||
return CPU_CORTEXA57; | |||
} | |||
if(cpu_part != NULL && cpu_implementer != NULL) { | |||
if (strstr(cpu_part, "0xd07") && strstr(cpu_implementer, "0x41")) | |||
return CPU_CORTEXA57; | |||
else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42")) | |||
return CPU_VULCAN; | |||
else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) | |||
return CPU_THUNDERX; | |||
else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */ | |||
return CPU_THUNDERX2T99; | |||
} | |||
p = (char *) NULL ; | |||
@@ -176,6 +194,28 @@ void get_cpuconfig(void) | |||
printf("#define L2_ASSOCIATIVE 4\n"); | |||
break; | |||
case CPU_VULCAN: | |||
printf("#define VULCAN \n"); | |||
printf("#define HAVE_VFP \n"); | |||
printf("#define HAVE_VFPV3 \n"); | |||
printf("#define HAVE_NEON \n"); | |||
printf("#define HAVE_VFPV4 \n"); | |||
printf("#define L1_CODE_SIZE 32768 \n"); | |||
printf("#define L1_CODE_LINESIZE 64 \n"); | |||
printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
printf("#define L1_DATA_SIZE 32768 \n"); | |||
printf("#define L1_DATA_LINESIZE 64 \n"); | |||
printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
printf("#define L2_SIZE 262144 \n"); | |||
printf("#define L2_LINESIZE 64 \n"); | |||
printf("#define L2_ASSOCIATIVE 8 \n"); | |||
printf("#define L3_SIZE 33554432 \n"); | |||
printf("#define L3_LINESIZE 64 \n"); | |||
printf("#define L3_ASSOCIATIVE 32 \n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
printf("#define DTB_SIZE 4096 \n"); | |||
break; | |||
case CPU_CORTEXA57: | |||
printf("#define CORTEXA57\n"); | |||
printf("#define HAVE_VFP\n"); | |||
@@ -191,8 +231,42 @@ void get_cpuconfig(void) | |||
printf("#define L2_SIZE 2097152\n"); | |||
printf("#define L2_LINESIZE 64\n"); | |||
printf("#define L2_ASSOCIATIVE 16\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
break; | |||
case CPU_THUNDERX: | |||
printf("#define ARMV8\n"); | |||
printf("#define THUNDERX\n"); | |||
printf("#define L1_DATA_SIZE 32768\n"); | |||
printf("#define L1_DATA_LINESIZE 128\n"); | |||
printf("#define L2_SIZE 16777216\n"); | |||
printf("#define L2_LINESIZE 128\n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
printf("#define DTB_SIZE 4096\n"); | |||
printf("#define L2_ASSOCIATIVE 16\n"); | |||
break; | |||
case CPU_THUNDERX2T99: | |||
printf("#define VULCAN \n"); | |||
printf("#define HAVE_VFP \n"); | |||
printf("#define HAVE_VFPV3 \n"); | |||
printf("#define HAVE_NEON \n"); | |||
printf("#define HAVE_VFPV4 \n"); | |||
printf("#define L1_CODE_SIZE 32768 \n"); | |||
printf("#define L1_CODE_LINESIZE 64 \n"); | |||
printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
printf("#define L1_DATA_SIZE 32768 \n"); | |||
printf("#define L1_DATA_LINESIZE 64 \n"); | |||
printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
printf("#define L2_SIZE 262144 \n"); | |||
printf("#define L2_LINESIZE 64 \n"); | |||
printf("#define L2_ASSOCIATIVE 8 \n"); | |||
printf("#define L3_SIZE 33554432 \n"); | |||
printf("#define L3_LINESIZE 64 \n"); | |||
printf("#define L3_ASSOCIATIVE 32 \n"); | |||
printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
printf("#define DTB_SIZE 4096 \n"); | |||
break; | |||
} | |||
} | |||
@@ -995,7 +995,7 @@ void *blas_memory_alloc(int procpos){ | |||
if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); | |||
#endif | |||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) | |||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) | |||
#ifndef DYNAMIC_ARCH | |||
blas_set_parameter(); | |||
#endif | |||
@@ -727,3 +727,26 @@ void blas_set_parameter(void){ | |||
} | |||
#endif | |||
#if defined(ARCH_ARM64) | |||
#if defined(VULCAN) || defined(THUNDERX2T99) | |||
unsigned long dgemm_prefetch_size_a; | |||
unsigned long dgemm_prefetch_size_b; | |||
unsigned long dgemm_prefetch_size_c; | |||
#endif | |||
void blas_set_parameter(void) | |||
{ | |||
#if defined(VULCAN) || defined(THUNDERX2T99) | |||
dgemm_p = 160; | |||
dgemm_q = 128; | |||
dgemm_r = 4096; | |||
dgemm_prefetch_size_a = 3584; | |||
dgemm_prefetch_size_b = 512; | |||
dgemm_prefetch_size_c = 128; | |||
#endif | |||
} | |||
#endif |
@@ -884,7 +884,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#ifdef FORCE_CORTEXA57 | |||
#define FORCE | |||
#define ARCHITECTURE "ARM64" | |||
#define SUBARCHITECTURE "ARMV8" | |||
#define SUBARCHITECTURE "CORTEXA57" | |||
#define SUBDIRNAME "arm64" | |||
#define ARCHCONFIG "-DCORTEXA57 " \ | |||
"-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ | |||
@@ -897,6 +897,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#else | |||
#endif | |||
#ifdef FORCE_VULCAN | |||
#define FORCE | |||
#define ARCHITECTURE "ARM64" | |||
#define SUBARCHITECTURE "VULCAN" | |||
#define SUBDIRNAME "arm64" | |||
#define ARCHCONFIG "-DVULCAN " \ | |||
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ | |||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ | |||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" | |||
#define LIBNAME "vulcan" | |||
#define CORENAME "VULCAN" | |||
#else | |||
#endif | |||
#ifdef FORCE_THUNDERX | |||
#define FORCE | |||
#define ARCHITECTURE "ARM64" | |||
#define SUBARCHITECTURE "THUNDERX" | |||
#define SUBDIRNAME "arm64" | |||
#define ARCHCONFIG "-DTHUNDERX " \ | |||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ | |||
"-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " | |||
#define LIBNAME "thunderx" | |||
#define CORENAME "THUNDERX" | |||
#else | |||
#endif | |||
#ifdef FORCE_THUNDERX2T99 | |||
#define FORCE | |||
#define ARCHITECTURE "ARM64" | |||
#define SUBARCHITECTURE "THUNDERX2T99" | |||
#define SUBDIRNAME "arm64" | |||
#define ARCHCONFIG "-DTHUNDERX2T99 " \ | |||
"-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ | |||
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ | |||
"-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||
"-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ | |||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" | |||
#define LIBNAME "thunderx2t99" | |||
#define CORENAME "THUNDERX2T99" | |||
#else | |||
#endif | |||
#ifndef FORCE | |||
#if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ | |||
@@ -75,14 +75,29 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
ifeq ($(DGEMM_UNROLL_M), 8) | |||
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||
else | |||
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
endif | |||
DGEMMINCOPYOBJ = dgemm_incopy.o | |||
DGEMMITCOPYOBJ = dgemm_itcopy.o | |||
endif | |||
ifeq ($(DGEMM_UNROLL_N), 4) | |||
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
else | |||
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
endif | |||
DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
@@ -0,0 +1,6 @@ | |||
include $(KERNELDIR)/KERNEL.ARMV8 | |||
SDOTKERNEL=dot-thunderx.c | |||
DDOTKERNEL=ddot-thunderx.c | |||
DAXPYKERNEL=daxpy-thunderx.c | |||
@@ -0,0 +1,2 @@ | |||
include $(KERNELDIR)/KERNEL.VULCAN | |||
@@ -0,0 +1,4 @@ | |||
include $(KERNELDIR)/KERNEL.CORTEXA57 | |||
DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_vulcan.S | |||
@@ -0,0 +1,151 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2014, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <arm_neon.h> | |||
#define prefetch(a) __asm__("prfm PLDL1STRM, [%0]"::"r"(a):"memory"); | |||
//#define prefetch(a) | |||
static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
{ | |||
BLASLONG register i = 0; | |||
double a = *alpha; | |||
#if 0 | |||
prefetch(x + 128/sizeof(*x)); | |||
prefetch(y + 128/sizeof(*y)); | |||
#endif | |||
prefetch(x + 2*128/sizeof(*x)); | |||
prefetch(y + 2*128/sizeof(*y)); | |||
prefetch(x + 3*128/sizeof(*x)); | |||
prefetch(y + 3*128/sizeof(*y)); | |||
prefetch(x + 4*128/sizeof(*x)); | |||
prefetch(y + 4*128/sizeof(*y)); | |||
while(i < n) | |||
{ | |||
double y0, y1, y2, y3; | |||
double y4, y5, y6, y7; | |||
double *xx; | |||
double *yy; | |||
y0 = a * x[0] + y[0]; | |||
y1 = a * x[1] + y[1]; | |||
y2 = a * x[2] + y[2]; | |||
y3 = a * x[3] + y[3]; | |||
y4 = a * x[4] + y[4]; | |||
y5 = a * x[5] + y[5]; | |||
y6 = a * x[6] + y[6]; | |||
y7 = a * x[7] + y[7]; | |||
asm("":"+w"(y0),"+w"(y1),"+w"(y2),"+w"(y3),"+w"(y4),"+w"(y5),"+w"(y6),"+w"(y7)); | |||
y[0] = y0; | |||
y[1] = y1; | |||
y[2] = y2; | |||
y[3] = y3; | |||
y[4] = y4; | |||
y[5] = y5; | |||
y[6] = y6; | |||
y[7] = y7; | |||
xx = (x + 4*128/sizeof(*x)); | |||
yy = (y + 4*128/sizeof(*y)); | |||
asm("":"+r"(yy)::"memory"); | |||
prefetch(xx); | |||
prefetch(yy); | |||
y += 8; | |||
x += 8; | |||
i += 8 ; | |||
} | |||
} | |||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
{ | |||
BLASLONG i=0; | |||
BLASLONG ix=0,iy=0; | |||
if ( n <= 0 ) return(0); | |||
if ( (inc_x == 1) && (inc_y == 1) ) | |||
{ | |||
BLASLONG n1 = n & -32; | |||
if ( n1 ) | |||
daxpy_kernel_8(n1, x, y , &da ); | |||
i = n1; | |||
while(i < n) | |||
{ | |||
y[i] += da * x[i] ; | |||
i++ ; | |||
} | |||
return(0); | |||
} | |||
BLASLONG n1 = n & -4; | |||
while(i < n1) | |||
{ | |||
FLOAT m1 = da * x[ix] ; | |||
FLOAT m2 = da * x[ix+inc_x] ; | |||
FLOAT m3 = da * x[ix+2*inc_x] ; | |||
FLOAT m4 = da * x[ix+3*inc_x] ; | |||
y[iy] += m1 ; | |||
y[iy+inc_y] += m2 ; | |||
y[iy+2*inc_y] += m3 ; | |||
y[iy+3*inc_y] += m4 ; | |||
ix += inc_x*4 ; | |||
iy += inc_y*4 ; | |||
i+=4 ; | |||
} | |||
while(i < n) | |||
{ | |||
y[iy] += da * x[ix] ; | |||
ix += inc_x ; | |||
iy += inc_y ; | |||
i++ ; | |||
} | |||
return(0); | |||
} | |||
@@ -0,0 +1,119 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2014, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#include <arm_neon.h> | |||
#define prefetch(a) __asm__("prfm PLDL1STRM, [%0]"::"r"(a):"memory"); | |||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
{ | |||
BLASLONG i=0; | |||
BLASLONG ix=0,iy=0; | |||
FLOAT dot = 0.0 ; | |||
if ( n < 0 ) return(dot); | |||
if ( (inc_x == 1) && (inc_y == 1) ) | |||
{ | |||
float64x2_t vdot0 = {0.0, 0.0}; | |||
float64x2_t vdot1 = {0.0, 0.0}; | |||
float64x2_t vdot2 = {0.0, 0.0}; | |||
float64x2_t vdot3 = {0.0, 0.0}; | |||
float64x2_t *vx = (float64x2_t*)x; | |||
float64x2_t *vy = (float64x2_t*)y; | |||
#if 0 | |||
prefetch(x + 128/sizeof(*x)); | |||
prefetch(y + 128/sizeof(*y)); | |||
#endif | |||
prefetch(x + 2*128/sizeof(*x)); | |||
prefetch(y + 2*128/sizeof(*y)); | |||
prefetch(x + 3*128/sizeof(*x)); | |||
prefetch(y + 3*128/sizeof(*y)); | |||
int n1 = n&-8; | |||
while(i < n1) | |||
{ | |||
#if 0 | |||
vdot0 = vfmaq_f64 (vdot0, | |||
vy[0], | |||
vx[0]); | |||
vdot1 = vfmaq_f64 (vdot1, | |||
vy[1], | |||
vx[1]); | |||
vdot2 = vfmaq_f64 (vdot2, | |||
vy[2], | |||
vx[2]); | |||
vdot3 = vfmaq_f64 (vdot3, | |||
vy[3], | |||
vx[3]); | |||
#else | |||
vdot0 = vy[0] * vx[0] + vdot0; | |||
vdot1 = vy[1] * vx[1] + vdot1; | |||
vdot2 = vy[2] * vx[2] + vdot2; | |||
vdot3 = vy[3] * vx[3] + vdot3; | |||
#endif | |||
vy += 4; | |||
vx += 4; | |||
i += 8; | |||
prefetch(vx + 3*128/sizeof(*x)); | |||
prefetch(vy + 3*128/sizeof(*y)); | |||
} | |||
dot = vaddvq_f64 (vdot0 + vdot1); | |||
dot += vaddvq_f64 (vdot2 + vdot3); | |||
i = n1; | |||
while(i < n) | |||
{ | |||
dot += y[i] * x[i] ; | |||
i++ ; | |||
} | |||
return(dot); | |||
} | |||
while(i < n) | |||
{ | |||
dot += y[iy] * x[ix] ; | |||
ix += inc_x ; | |||
iy += inc_y ; | |||
i++ ; | |||
} | |||
return(dot); | |||
} | |||
@@ -0,0 +1,340 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2016, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#define M x0 | |||
#define N x1 | |||
#define A00 x2 | |||
#define LDA x3 | |||
#define B00 x4 | |||
#define A01 x5 | |||
#define A02 x6 | |||
#define A03 x7 | |||
#define A04 x8 | |||
#define I x9 | |||
#define J x10 | |||
#define TEMP1 x11 | |||
#define TEMP2 x12 | |||
#define A_PREFETCH 2560 | |||
/************************************************************************************** | |||
* Macro definitions | |||
**************************************************************************************/ | |||
.macro SAVE_REGS | |||
add sp, sp, #-(11 * 16) | |||
stp d8, d9, [sp, #(0 * 16)] | |||
stp d10, d11, [sp, #(1 * 16)] | |||
stp d12, d13, [sp, #(2 * 16)] | |||
stp d14, d15, [sp, #(3 * 16)] | |||
stp d16, d17, [sp, #(4 * 16)] | |||
stp x18, x19, [sp, #(5 * 16)] | |||
stp x20, x21, [sp, #(6 * 16)] | |||
stp x22, x23, [sp, #(7 * 16)] | |||
stp x24, x25, [sp, #(8 * 16)] | |||
stp x26, x27, [sp, #(9 * 16)] | |||
str x28, [sp, #(10 * 16)] | |||
.endm | |||
.macro RESTORE_REGS | |||
ldp d8, d9, [sp, #(0 * 16)] | |||
ldp d10, d11, [sp, #(1 * 16)] | |||
ldp d12, d13, [sp, #(2 * 16)] | |||
ldp d14, d15, [sp, #(3 * 16)] | |||
ldp d16, d17, [sp, #(4 * 16)] | |||
ldp x18, x19, [sp, #(5 * 16)] | |||
ldp x20, x21, [sp, #(6 * 16)] | |||
ldp x22, x23, [sp, #(7 * 16)] | |||
ldp x24, x25, [sp, #(8 * 16)] | |||
ldp x26, x27, [sp, #(9 * 16)] | |||
ldr x28, [sp, #(10 * 16)] | |||
add sp, sp, #(11*16) | |||
.endm | |||
.macro COPY4x4 | |||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
ldp q0, q1, [A01], #32 | |||
ins v8.d[0], v0.d[0] | |||
ins v10.d[0], v0.d[1] | |||
ins v12.d[0], v1.d[0] | |||
ins v14.d[0], v1.d[1] | |||
ldp q2, q3, [A02], #32 | |||
ins v8.d[1], v2.d[0] | |||
ins v10.d[1], v2.d[1] | |||
ins v12.d[1], v3.d[0] | |||
ins v14.d[1], v3.d[1] | |||
ldp q4, q5, [A03], #32 | |||
ins v9.d[0], v4.d[0] | |||
ins v11.d[0], v4.d[1] | |||
ins v13.d[0], v5.d[0] | |||
ins v15.d[0], v5.d[1] | |||
ldp q6, q7, [A04], #32 | |||
ins v9.d[1], v6.d[0] | |||
ins v11.d[1], v6.d[1] | |||
ins v13.d[1], v7.d[0] | |||
ins v15.d[1], v7.d[1] | |||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] | |||
add B00, B00, #64 | |||
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B00] | |||
add B00, B00, #64 | |||
.endm | |||
.macro COPY1x4 | |||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
//prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
//prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
ldr d0, [A01], #8 | |||
ldr d1, [A02], #8 | |||
ldr d2, [A03], #8 | |||
ldr d3, [A04], #8 | |||
st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B00] | |||
add B00, B00, #32 | |||
.endm | |||
.macro COPY4x2 | |||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
ldp q0, q1, [A01], #32 | |||
ins v8.d[0], v0.d[0] | |||
ins v9.d[0], v0.d[1] | |||
ins v10.d[0], v1.d[0] | |||
ins v11.d[0], v1.d[1] | |||
ldp q2, q3, [A02], #32 | |||
ins v8.d[1], v2.d[0] | |||
ins v9.d[1], v2.d[1] | |||
ins v10.d[1], v3.d[0] | |||
ins v11.d[1], v3.d[1] | |||
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] | |||
add B00, B00, #64 | |||
.endm | |||
.macro COPY1x2 | |||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
//prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
ldr d0, [A01], #8 | |||
ldr d1, [A02], #8 | |||
stp d0, d1, [B00] | |||
add B00, B00, #16 | |||
.endm | |||
.macro COPY4x1 | |||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
ldp q0, q1, [A01], #32 | |||
stp q0, q1, [B00], #32 | |||
.endm | |||
.macro COPY1x1 | |||
//prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
ldr d0, [A01], #8 | |||
str d0, [B00], #8 | |||
.endm | |||
/************************************************************************************** | |||
* End of macro definitions | |||
**************************************************************************************/ | |||
PROLOGUE | |||
.align 5 | |||
SAVE_REGS | |||
lsl LDA, LDA, #3 // LDA = LDA * SIZE | |||
dgemm_ncopy_L4_BEGIN: | |||
asr J, N, #2 // J = N / 4 | |||
cmp J, #0 | |||
ble dgemm_ncopy_L2_BEGIN | |||
.align 5 | |||
dgemm_ncopy_L4_M4_BEGIN: | |||
mov A01, A00 | |||
add A02, A01, LDA | |||
add A03, A02, LDA | |||
add A04, A03, LDA | |||
add A00, A04, LDA | |||
asr I, M, #2 // I = M / 4 | |||
cmp I, #0 | |||
ble dgemm_ncopy_L4_M4_40 | |||
.align 5 | |||
dgemm_ncopy_L4_M4_20: | |||
COPY4x4 | |||
subs I , I , #1 | |||
bne dgemm_ncopy_L4_M4_20 | |||
dgemm_ncopy_L4_M4_40: | |||
and I, M , #3 | |||
cmp I, #0 | |||
ble dgemm_ncopy_L4_M4_END | |||
.align 5 | |||
dgemm_ncopy_L4_M4_60: | |||
COPY1x4 | |||
subs I , I , #1 | |||
bne dgemm_ncopy_L4_M4_60 | |||
dgemm_ncopy_L4_M4_END: | |||
subs J , J, #1 // j-- | |||
bne dgemm_ncopy_L4_M4_BEGIN | |||
/*********************************************************************************************/ | |||
dgemm_ncopy_L2_BEGIN: | |||
tst N, #3 | |||
ble dgemm_ncopy_L999 | |||
tst N, #2 | |||
ble dgemm_ncopy_L1_BEGIN | |||
dgemm_ncopy_L2_M4_BEGIN: | |||
mov A01, A00 | |||
add A02, A01, LDA | |||
add A00, A02, LDA | |||
asr I, M, #2 // I = M / 4 | |||
cmp I, #0 | |||
ble dgemm_ncopy_L2_M4_40 | |||
.align 5 | |||
dgemm_ncopy_L2_M4_20: | |||
COPY4x2 | |||
subs I , I , #1 | |||
bne dgemm_ncopy_L2_M4_20 | |||
dgemm_ncopy_L2_M4_40: | |||
and I, M , #3 | |||
cmp I, #0 | |||
ble dgemm_ncopy_L2_M4_END | |||
.align 5 | |||
dgemm_ncopy_L2_M4_60: | |||
COPY1x2 | |||
subs I , I , #1 | |||
bne dgemm_ncopy_L2_M4_60 | |||
dgemm_ncopy_L2_M4_END: | |||
/*********************************************************************************************/ | |||
dgemm_ncopy_L1_BEGIN: | |||
tst N, #1 | |||
ble dgemm_ncopy_L999 | |||
dgemm_ncopy_L1_M4_BEGIN: | |||
mov A01, A00 | |||
asr I, M, #2 // I = M / 4 | |||
cmp I, #0 | |||
ble dgemm_ncopy_L1_M4_40 | |||
.align 5 | |||
dgemm_ncopy_L1_M4_20: | |||
COPY4x1 | |||
subs I , I , #1 | |||
bne dgemm_ncopy_L1_M4_20 | |||
dgemm_ncopy_L1_M4_40: | |||
and I, M , #3 | |||
cmp I, #0 | |||
ble dgemm_ncopy_L1_M4_END | |||
.align 5 | |||
dgemm_ncopy_L1_M4_60: | |||
COPY1x1 | |||
subs I , I , #1 | |||
bne dgemm_ncopy_L1_M4_60 | |||
dgemm_ncopy_L1_M4_END: | |||
dgemm_ncopy_L999: | |||
mov x0, #0 | |||
RESTORE_REGS | |||
ret | |||
EPILOGUE | |||
@@ -0,0 +1,544 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2016, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#define M x0 | |||
#define N x1 | |||
#define A00 x2 | |||
#define LDA x3 | |||
#define B00 x4 | |||
#define A01 x5 | |||
#define A02 x6 | |||
#define A03 x7 | |||
#define A04 x8 | |||
#define A05 x9 | |||
#define A06 x10 | |||
#define A07 x11 | |||
#define A08 x12 | |||
#define I x13 | |||
#define J x14 | |||
#define TEMP1 x15 | |||
#define TEMP2 x16 | |||
#define A_PREFETCH 2560 | |||
/************************************************************************************** | |||
* Macro definitions | |||
**************************************************************************************/ | |||
.macro SAVE_REGS | |||
add sp, sp, #-(11 * 16) | |||
stp d8, d9, [sp, #(0 * 16)] | |||
stp d10, d11, [sp, #(1 * 16)] | |||
stp d12, d13, [sp, #(2 * 16)] | |||
stp d14, d15, [sp, #(3 * 16)] | |||
stp d16, d17, [sp, #(4 * 16)] | |||
stp x18, x19, [sp, #(5 * 16)] | |||
stp x20, x21, [sp, #(6 * 16)] | |||
stp x22, x23, [sp, #(7 * 16)] | |||
stp x24, x25, [sp, #(8 * 16)] | |||
stp x26, x27, [sp, #(9 * 16)] | |||
str x28, [sp, #(10 * 16)] | |||
.endm | |||
.macro RESTORE_REGS | |||
ldp d8, d9, [sp, #(0 * 16)] | |||
ldp d10, d11, [sp, #(1 * 16)] | |||
ldp d12, d13, [sp, #(2 * 16)] | |||
ldp d14, d15, [sp, #(3 * 16)] | |||
ldp d16, d17, [sp, #(4 * 16)] | |||
ldp x18, x19, [sp, #(5 * 16)] | |||
ldp x20, x21, [sp, #(6 * 16)] | |||
ldp x22, x23, [sp, #(7 * 16)] | |||
ldp x24, x25, [sp, #(8 * 16)] | |||
ldp x26, x27, [sp, #(9 * 16)] | |||
ldr x28, [sp, #(10 * 16)] | |||
add sp, sp, #(11*16) | |||
.endm | |||
/*************************************************************************************/ | |||
// COPY8x8: pack an 8x8 block of doubles — 8 elements from each of the
// 8 column pointers A01..A08 — into the output buffer B00.
// Implemented as two consecutive COPY4x8 transpose passes.
.macro COPY8x8
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
//prfm PLDL1KEEP, [A05, #A_PREFETCH]
//prfm PLDL1KEEP, [A06, #A_PREFETCH]
//prfm PLDL1KEEP, [A07, #A_PREFETCH]
//prfm PLDL1KEEP, [A08, #A_PREFETCH]
COPY4x8
COPY4x8
.endm
// COPY4x8: load 4 doubles (32 bytes) from each of A01..A08 with
// post-increment, transpose them element-by-element into v16..v31 via
// `ins`, then store 4 x 64 bytes to B00.  After the ins shuffles, each
// stored q-register pairs one element of an odd-numbered source pointer
// (in d[0]) with the same-index element of the following pointer (in
// d[1]), so B00 receives the block column-interleaved.
.macro COPY4x8
ldp q0, q1, [A01], #32
ins v16.d[0], v0.d[0]
ins v20.d[0], v0.d[1]
ins v24.d[0], v1.d[0]
ins v28.d[0], v1.d[1]
ldp q2, q3, [A02], #32
ins v16.d[1], v2.d[0]
ins v20.d[1], v2.d[1]
ins v24.d[1], v3.d[0]
ins v28.d[1], v3.d[1]
ldp q4, q5, [A03], #32
ins v17.d[0], v4.d[0]
ins v21.d[0], v4.d[1]
ins v25.d[0], v5.d[0]
ins v29.d[0], v5.d[1]
ldp q6, q7, [A04], #32
ins v17.d[1], v6.d[0]
ins v21.d[1], v6.d[1]
ins v25.d[1], v7.d[0]
ins v29.d[1], v7.d[1]
ldp q8, q9, [A05], #32
ins v18.d[0], v8.d[0]
ins v22.d[0], v8.d[1]
ins v26.d[0], v9.d[0]
ins v30.d[0], v9.d[1]
ldp q10, q11, [A06], #32
ins v18.d[1], v10.d[0]
ins v22.d[1], v10.d[1]
ins v26.d[1], v11.d[0]
ins v30.d[1], v11.d[1]
ldp q12, q13, [A07], #32
ins v19.d[0], v12.d[0]
ins v23.d[0], v12.d[1]
ins v27.d[0], v13.d[0]
ins v31.d[0], v13.d[1]
ldp q14, q15, [A08], #32
ins v19.d[1], v14.d[0]
ins v23.d[1], v14.d[1]
ins v27.d[1], v15.d[0]
ins v31.d[1], v15.d[1]
// 4 stores of 64 bytes each = 256 bytes total (4 doubles x 8 columns).
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [B00]
add B00, B00, #64
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [B00]
add B00, B00, #64
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [B00]
add B00, B00, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [B00]
add B00, B00, #64
.endm
// COPY1x8: remainder case — copy a single double from each of
// A01..A08 (post-incremented) into 8 consecutive doubles at B00.
.macro COPY1x8
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
//prfm PLDL1KEEP, [A05, #A_PREFETCH]
//prfm PLDL1KEEP, [A06, #A_PREFETCH]
//prfm PLDL1KEEP, [A07, #A_PREFETCH]
//prfm PLDL1KEEP, [A08, #A_PREFETCH]
ldr d0, [A01], #8
ldr d1, [A02], #8
ldr d2, [A03], #8
ldr d3, [A04], #8
ldr d4, [A05], #8
ldr d5, [A06], #8
ldr d6, [A07], #8
ldr d7, [A08], #8
st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B00]
add B00, B00, #32
st1 {v4.1d, v5.1d, v6.1d, v7.1d}, [B00]
add B00, B00, #32
.endm
/*************************************************************************************/
// COPY8x4: pack 8 doubles from each of 4 column pointers A01..A04.
// Two identical transpose passes of 4 doubles each; every stored
// q-register pairs one element of A01/A03 (d[0]) with the same-index
// element of A02/A04 (d[1]).
.macro COPY8x4
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
// First pass: doubles 0..3 of each column -> v8..v15.
ldp q0, q1, [A01], #32
ins v8.d[0], v0.d[0]
ins v10.d[0], v0.d[1]
ins v12.d[0], v1.d[0]
ins v14.d[0], v1.d[1]
ldp q2, q3, [A02], #32
ins v8.d[1], v2.d[0]
ins v10.d[1], v2.d[1]
ins v12.d[1], v3.d[0]
ins v14.d[1], v3.d[1]
ldp q4, q5, [A03], #32
ins v9.d[0], v4.d[0]
ins v11.d[0], v4.d[1]
ins v13.d[0], v5.d[0]
ins v15.d[0], v5.d[1]
ldp q6, q7, [A04], #32
ins v9.d[1], v6.d[0]
ins v11.d[1], v6.d[1]
ins v13.d[1], v7.d[0]
ins v15.d[1], v7.d[1]
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00]
add B00, B00, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B00]
add B00, B00, #64
// Second pass: doubles 4..7 of each column -> v24..v31.
ldp q16, q17, [A01], #32
ins v24.d[0], v16.d[0]
ins v26.d[0], v16.d[1]
ins v28.d[0], v17.d[0]
ins v30.d[0], v17.d[1]
ldp q18, q19, [A02], #32
ins v24.d[1], v18.d[0]
ins v26.d[1], v18.d[1]
ins v28.d[1], v19.d[0]
ins v30.d[1], v19.d[1]
ldp q20, q21, [A03], #32
ins v25.d[0], v20.d[0]
ins v27.d[0], v20.d[1]
ins v29.d[0], v21.d[0]
ins v31.d[0], v21.d[1]
ldp q22, q23, [A04], #32
ins v25.d[1], v22.d[0]
ins v27.d[1], v22.d[1]
ins v29.d[1], v23.d[0]
ins v31.d[1], v23.d[1]
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [B00]
add B00, B00, #64
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [B00]
add B00, B00, #64
.endm
// COPY1x4: remainder case — one double from each of A01..A04 packed
// into 4 consecutive doubles at B00.
.macro COPY1x4
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
ldr d0, [A01], #8
ldr d1, [A02], #8
ldr d2, [A03], #8
ldr d3, [A04], #8
st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B00]
add B00, B00, #32
.endm
/*************************************************************************************/
// COPY8x2: pack 8 doubles from each of the 2 column pointers A01/A02.
// Each stored q-register pairs element i of A01 (d[0]) with element i
// of A02 (d[1]).
.macro COPY8x2
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
ldp q0, q1, [A01], #32
ldp q2, q3, [A01], #32
ins v8.d[0], v0.d[0]
ins v9.d[0], v0.d[1]
ins v10.d[0], v1.d[0]
ins v11.d[0], v1.d[1]
ins v12.d[0], v2.d[0]
ins v13.d[0], v2.d[1]
ins v14.d[0], v3.d[0]
ins v15.d[0], v3.d[1]
ldp q4, q5, [A02], #32
ldp q6, q7, [A02], #32
ins v8.d[1], v4.d[0]
ins v9.d[1], v4.d[1]
ins v10.d[1], v5.d[0]
ins v11.d[1], v5.d[1]
ins v12.d[1], v6.d[0]
ins v13.d[1], v6.d[1]
ins v14.d[1], v7.d[0]
ins v15.d[1], v7.d[1]
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00]
add B00, B00, #64
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B00]
add B00, B00, #64
.endm
// COPY1x2: remainder case — one double from each of A01/A02 stored as
// an adjacent pair at B00.
.macro COPY1x2
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
ldr d0, [A01], #8
ldr d1, [A02], #8
stp d0, d1, [B00]
add B00, B00, #16
.endm
/*************************************************************************************/
// COPY8x1: single-column case — straight 64-byte copy of 8 doubles
// from A01 to B00 (no transpose needed).
.macro COPY8x1
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
ldp q0, q1, [A01], #32
ldp q2, q3, [A01], #32
stp q0, q1, [B00], #32
stp q2, q3, [B00], #32
.endm
// COPY1x1: copy a single double from A01 to B00.
.macro COPY1x1
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
ldr d0, [A01], #8
str d0, [B00], #8
.endm
/************************************************************************************** | |||
* End of macro definitions | |||
**************************************************************************************/ | |||
// Entry point: pack matrix A (M x N, column stride LDA) into buffer
// B00 for the DGEMM kernel, processing 8 columns at a time, then the
// N%8 remainder in 4/2/1-column tail sections.  Register aliases
// (M, N, A00, B00, LDA, I, J) are #define'd earlier in this file.
PROLOGUE
.align 5
SAVE_REGS
lsl LDA, LDA, #3 // LDA = LDA * SIZE  (SIZE = 8 bytes, double)
dgemm_ncopy_L8_BEGIN:
asr J, N, #3 // J = N / 8
cmp J, #0
ble dgemm_ncopy_L4_BEGIN
dgemm_ncopy_L8_M8_BEGIN:
// Set up 8 column pointers, advance A00 past them.
mov A01, A00
add A02, A01, LDA
add A03, A02, LDA
add A04, A03, LDA
add A05, A04, LDA
add A06, A05, LDA
add A07, A06, LDA
add A08, A07, LDA
add A00, A08, LDA
asr I, M, #3 // I = M / 8
cmp I, #0
ble dgemm_ncopy_L8_M8_40
dgemm_ncopy_L8_M8_20:
COPY8x8
subs I , I , #1
bne dgemm_ncopy_L8_M8_20
dgemm_ncopy_L8_M8_40:
// M % 8 remainder rows, one at a time.
and I, M , #7
cmp I, #0
ble dgemm_ncopy_L8_M8_END
dgemm_ncopy_L8_M8_60:
COPY1x8
subs I , I , #1
bne dgemm_ncopy_L8_M8_60
dgemm_ncopy_L8_M8_END:
subs J , J, #1 // j--
bne dgemm_ncopy_L8_M8_BEGIN
/*********************************************************************************************/
dgemm_ncopy_L4_BEGIN:
// Tail: handle N % 8 columns in 4-, 2-, then 1-column passes.
tst N, #7
ble dgemm_ncopy_L999
tst N, #4
ble dgemm_ncopy_L2_BEGIN
dgemm_ncopy_L4_M8_BEGIN:
mov A01, A00
add A02, A01, LDA
add A03, A02, LDA
add A04, A03, LDA
add A00, A04, LDA
asr I, M, #3 // I = M / 8
cmp I, #0
ble dgemm_ncopy_L4_M8_40
dgemm_ncopy_L4_M8_20:
COPY8x4
subs I , I , #1
bne dgemm_ncopy_L4_M8_20
dgemm_ncopy_L4_M8_40:
and I, M , #7
cmp I, #0
ble dgemm_ncopy_L4_M8_END
dgemm_ncopy_L4_M8_60:
COPY1x4
subs I , I , #1
bne dgemm_ncopy_L4_M8_60
dgemm_ncopy_L4_M8_END:
/*********************************************************************************************/
dgemm_ncopy_L2_BEGIN:
tst N, #3
ble dgemm_ncopy_L999
tst N, #2
ble dgemm_ncopy_L1_BEGIN
dgemm_ncopy_L2_M8_BEGIN:
mov A01, A00
add A02, A01, LDA
add A00, A02, LDA
asr I, M, #3 // I = M / 8
cmp I, #0
ble dgemm_ncopy_L2_M8_40
dgemm_ncopy_L2_M8_20:
COPY8x2
subs I , I , #1
bne dgemm_ncopy_L2_M8_20
dgemm_ncopy_L2_M8_40:
and I, M , #7
cmp I, #0
ble dgemm_ncopy_L2_M8_END
dgemm_ncopy_L2_M8_60:
COPY1x2
subs I , I , #1
bne dgemm_ncopy_L2_M8_60
dgemm_ncopy_L2_M8_END:
/*********************************************************************************************/
dgemm_ncopy_L1_BEGIN:
tst N, #1
ble dgemm_ncopy_L999
dgemm_ncopy_L1_M8_BEGIN:
mov A01, A00
asr I, M, #3 // I = M / 8
cmp I, #0
ble dgemm_ncopy_L1_M8_40
dgemm_ncopy_L1_M8_20:
COPY8x1
subs I , I , #1
bne dgemm_ncopy_L1_M8_20
dgemm_ncopy_L1_M8_40:
and I, M , #7
cmp I, #0
ble dgemm_ncopy_L1_M8_END
dgemm_ncopy_L1_M8_60:
COPY1x1
subs I , I , #1
bne dgemm_ncopy_L1_M8_60
dgemm_ncopy_L1_M8_END:
dgemm_ncopy_L999:
mov x0, #0 // return value 0
RESTORE_REGS
ret
EPILOGUE
@@ -0,0 +1,402 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2016, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#define M x0 | |||
#define N x1 | |||
#define A x2 | |||
#define LDA x3 | |||
#define B x4 | |||
#define M4 x5 | |||
#define A01 x6 | |||
#define A02 x7 | |||
#define A03 x8 | |||
#define A04 x9 | |||
#define B01 x10 | |||
#define B02 x11 | |||
#define B03 x12 | |||
#define B04 x13 | |||
#define I x14 | |||
#define J x15 | |||
#define TEMP1 x16 | |||
#define TEMP2 x17 | |||
#define A_PREFETCH 2560 | |||
#define B_PREFETCH 256 | |||
/************************************************************************************** | |||
* Macro definitions | |||
**************************************************************************************/ | |||
// Save callee-context registers: 176 bytes of stack for d8-d17 and
// x18-x28.  NOTE(review): x18 is reserved as the platform register on
// some AArch64 ABIs (e.g. Darwin/Windows) — confirm saving/restoring
// it here is acceptable on all supported targets.
.macro SAVE_REGS
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
.endm
// Restore the registers saved by SAVE_REGS and release the stack frame.
.macro RESTORE_REGS
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
.endm
// COPY4x4: copy 4 doubles from each of the 4 row pointers A01..A04
// into a 128-byte panel at B01, then advance B01 by M4 (the per-panel
// output stride set up in the prologue).  No transpose — rows are
// stored back-to-back.
.macro COPY4x4
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
ldp q0, q1, [A01], #32
ldp q2, q3, [A02], #32
////prfm PLDL1KEEP, [B01, #B_PREFETCH]
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01]
add TEMP1, B01, #64
ldp q4, q5, [A03], #32
ldp q6, q7, [A04], #32
////prfm PLDL1KEEP, [B01, #B_PREFETCH]
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1]
add B01, B01, M4
.endm
// COPY2x4: 2 doubles from each of A01..A04 into the N%4>=2 tail
// region addressed by B02 (sequential, 64 bytes per call).
.macro COPY2x4
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
ldr q0, [A01], #16
ldr q1, [A02], #16
ldr q2, [A03], #16
ldr q3, [A04], #16
////prfm PLDL1KEEP, [B02, #B_PREFETCH]
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02]
add B02, B02, #64
.endm
// COPY1x4: 1 double from each of A01..A04 into the N%2 tail region
// addressed by B03.
.macro COPY1x4
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
ldr d0, [A01], #8
ldr d1, [A02], #8
ldr d2, [A03], #8
ldr d3, [A04], #8
////prfm PLDL1KEEP, [B03, #B_PREFETCH]
st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B03]
add B03, B03, #32
.endm
/*************************************************************************************************************************/
// Two-row variants: same layout as the 4-row macros above but reading
// only A01/A02.  B01 advances by the panel stride M4; B02/B03 advance
// sequentially.
.macro COPY4x2
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
ldp q0, q1, [A01], #32
ldp q2, q3, [A02], #32
////prfm PLDL1KEEP, [B01, #B_PREFETCH]
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01]
add B01, B01, M4
.endm
.macro COPY2x2
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
ldr q0, [A01], #16
ldr q1, [A02], #16
////prfm PLDL1KEEP, [B02, #B_PREFETCH]
stp q0, q1, [B02]
add B02, B02, #32
.endm
.macro COPY1x2
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
ldr d0, [A01], #8
ldr d1, [A02], #8
////prfm PLDL1KEEP, [B03, #B_PREFETCH]
stp d0, d1, [B03]
add B03, B03, #16
.endm
/*************************************************************************************************************************/
// Single-row variants: plain copies of 4/2/1 doubles from A01 into
// the respective output regions.
.macro COPY4x1
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
ldp q0, q1, [A01], #32
////prfm PLDL1KEEP, [B01, #B_PREFETCH]
stp q0, q1, [B01]
add B01, B01, M4
.endm
.macro COPY2x1
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
ldr q0, [A01], #16
////prfm PLDL1KEEP, [B02, #B_PREFETCH]
str q0, [B02]
add B02, B02, #16
.endm
.macro COPY1x1
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
ldr d0, [A01], #8
////prfm PLDL1KEEP, [B03, #B_PREFETCH]
str d0, [B03]
add B03, B03, #8
.endm
/************************************************************************************** | |||
* End of macro definitions | |||
**************************************************************************************/ | |||
// Entry point: pack the transpose of A (M x N, row stride LDA) into B
// in 4-row panels.  B02/B03 are precomputed tail destinations for the
// N%4 and N%2 remainder columns; M4 is the output stride between
// consecutive 4x4 panels.
PROLOGUE
.align 5
SAVE_REGS
lsl LDA, LDA, #3 // LDA = LDA * SIZE  (SIZE = 8 bytes, double)
lsl TEMP1, M, #3 // x12 = M * SIZE
// Tail destinations: B02 = B + (N & ~3)*M*SIZE, B03 = B + (N & ~1)*M*SIZE.
and B02 , N , #-4
and B03 , N , #-2
mul B02, B02, TEMP1
mul B03, B03, TEMP1
add B02 , B02, B
add B03 , B03, B
lsl M4, M, #5 // M4 = M * 4 * SIZE
dgemm_tcopy_L4_BEGIN:
asr J, M, #2 // J = M / 4
cmp J, #0
ble dgemm_tcopy_L2_BEGIN
.align 5
dgemm_tcopy_L4_M4_BEGIN:
// Set up 4 row pointers; advance A past them.
mov A01, A
add A02, A01, LDA
add A03, A02, LDA
add A04, A03, LDA
add A, A04, LDA
mov B01, B
add B, B01, #128 // B = B + 16 * SIZE
asr I, N, #2 // I = N / 4
cmp I, #0
ble dgemm_tcopy_L4_M4_40
.align 5
dgemm_tcopy_L4_M4_20:
COPY4x4
subs I , I , #1
bne dgemm_tcopy_L4_M4_20
dgemm_tcopy_L4_M4_40:
// N % 4 remainder columns for this 4-row panel.
tst N , #2
ble dgemm_tcopy_L4_M4_60
COPY2x4
dgemm_tcopy_L4_M4_60:
tst N, #1
ble dgemm_tcopy_L4_M4_END
COPY1x4
dgemm_tcopy_L4_M4_END:
subs J , J, #1 // j--
bne dgemm_tcopy_L4_M4_BEGIN
/*********************************************************************************************/
dgemm_tcopy_L2_BEGIN:
// Tail: handle M % 4 rows (2-row, then 1-row pass).
tst M, #3
ble dgemm_tcopy_L999
tst M, #2
ble dgemm_tcopy_L1_BEGIN
dgemm_tcopy_L2_M4_BEGIN:
mov A01, A
add A02, A01, LDA
add A, A02, LDA
mov B01, B
add B, B01, #64 // B = B + 8 * SIZE
asr I, N, #2 // I = N / 4
cmp I, #0
ble dgemm_tcopy_L2_M4_40
.align 5
dgemm_tcopy_L2_M4_20:
COPY4x2
subs I , I , #1
bne dgemm_tcopy_L2_M4_20
dgemm_tcopy_L2_M4_40:
tst N , #2
ble dgemm_tcopy_L2_M4_60
COPY2x2
dgemm_tcopy_L2_M4_60:
tst N , #1
ble dgemm_tcopy_L2_M4_END
COPY1x2
dgemm_tcopy_L2_M4_END:
/*********************************************************************************************/
dgemm_tcopy_L1_BEGIN:
tst M, #1
ble dgemm_tcopy_L999
dgemm_tcopy_L1_M4_BEGIN:
mov A01, A // A01 = A
mov B01, B
asr I, N, #2 // I = N / 4
cmp I, #0
ble dgemm_tcopy_L1_M4_40
.align 5
dgemm_tcopy_L1_M4_20:
COPY4x1
subs I , I , #1
bne dgemm_tcopy_L1_M4_20
dgemm_tcopy_L1_M4_40:
tst N , #2
ble dgemm_tcopy_L1_M4_60
COPY2x1
dgemm_tcopy_L1_M4_60:
tst N , #1
ble dgemm_tcopy_L1_M4_END
COPY1x1
dgemm_tcopy_L1_M4_END:
dgemm_tcopy_L999:
mov x0, #0 // set return value
RESTORE_REGS
ret
EPILOGUE
@@ -0,0 +1,682 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2016, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#define M x0 | |||
#define N x1 | |||
#define A x2 | |||
#define LDA x3 | |||
#define B x4 | |||
#define M8 x5 | |||
#define A01 x6 | |||
#define A02 x7 | |||
#define A03 x8 | |||
#define A04 x9 | |||
#define A05 x10 | |||
#define A06 x11 | |||
#define A07 x12 | |||
#define A08 x13 | |||
#define B01 x14 | |||
#define B02 x15 | |||
#define B03 x16 | |||
#define B04 x17 | |||
#define I x18 | |||
#define J x19 | |||
#define TEMP1 x20 | |||
#define TEMP2 x21 | |||
#define A_PREFETCH 2560 | |||
#define B_PREFETCH 256 | |||
/************************************************************************************** | |||
* Macro definitions | |||
**************************************************************************************/ | |||
// Save callee-context registers: 176 bytes of stack for d8-d17 and
// x18-x28.  NOTE(review): x18 is reserved as the platform register on
// some AArch64 ABIs (e.g. Darwin/Windows) — confirm saving/restoring
// it here is acceptable on all supported targets.
.macro SAVE_REGS
add sp, sp, #-(11 * 16)
stp d8, d9, [sp, #(0 * 16)]
stp d10, d11, [sp, #(1 * 16)]
stp d12, d13, [sp, #(2 * 16)]
stp d14, d15, [sp, #(3 * 16)]
stp d16, d17, [sp, #(4 * 16)]
stp x18, x19, [sp, #(5 * 16)]
stp x20, x21, [sp, #(6 * 16)]
stp x22, x23, [sp, #(7 * 16)]
stp x24, x25, [sp, #(8 * 16)]
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
.endm
// Restore the registers saved by SAVE_REGS and release the stack frame.
.macro RESTORE_REGS
ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)]
ldp d12, d13, [sp, #(2 * 16)]
ldp d14, d15, [sp, #(3 * 16)]
ldp d16, d17, [sp, #(4 * 16)]
ldp x18, x19, [sp, #(5 * 16)]
ldp x20, x21, [sp, #(6 * 16)]
ldp x22, x23, [sp, #(7 * 16)]
ldp x24, x25, [sp, #(8 * 16)]
ldp x26, x27, [sp, #(9 * 16)]
ldr x28, [sp, #(10 * 16)]
add sp, sp, #(11*16)
.endm
/*************************************************************************************************************************/
// COPY8x8: copy 8 doubles from each of the 8 row pointers A01..A08
// into a 512-byte panel at B01 (rows stored back-to-back via TEMP1),
// then advance B01 by the panel stride M8.
.macro COPY8x8
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
//prfm PLDL1KEEP, [A05, #A_PREFETCH]
//prfm PLDL1KEEP, [A06, #A_PREFETCH]
//prfm PLDL1KEEP, [A07, #A_PREFETCH]
//prfm PLDL1KEEP, [A08, #A_PREFETCH]
ldp q0, q1, [A01], #32
ldp q2, q3, [A01], #32
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01]
add TEMP1, B01, #64
ldp q4, q5, [A02], #32
ldp q6, q7, [A02], #32
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1]
add TEMP1, TEMP1, #64
ldp q8, q9, [A03], #32
ldp q10, q11, [A03], #32
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [TEMP1]
add TEMP1, TEMP1, #64
ldp q12, q13, [A04], #32
ldp q14, q15, [A04], #32
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [TEMP1]
add TEMP1, TEMP1, #64
ldp q16, q17, [A05], #32
ldp q18, q19, [A05], #32
st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [TEMP1]
add TEMP1, TEMP1, #64
ldp q20, q21, [A06], #32
ldp q22, q23, [A06], #32
st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [TEMP1]
add TEMP1, TEMP1, #64
ldp q24, q25, [A07], #32
ldp q26, q27, [A07], #32
st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [TEMP1]
add TEMP1, TEMP1, #64
ldp q28, q29, [A08], #32
ldp q30, q31, [A08], #32
st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [TEMP1]
add TEMP1, TEMP1, #64
add B01, B01, M8
.endm
// COPY4x8: 4 doubles from each of A01..A08 into the N%8>=4 tail
// region addressed by B02 (sequential, 256 bytes per call).
.macro COPY4x8
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
//prfm PLDL1KEEP, [A05, #A_PREFETCH]
//prfm PLDL1KEEP, [A06, #A_PREFETCH]
//prfm PLDL1KEEP, [A07, #A_PREFETCH]
//prfm PLDL1KEEP, [A08, #A_PREFETCH]
ldp q0, q1, [A01], #32
ldp q2, q3, [A02], #32
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02]
add B02, B02, #64
ldp q4, q5, [A03], #32
ldp q6, q7, [A04], #32
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [B02]
add B02, B02, #64
ldp q8, q9, [A05], #32
ldp q10, q11, [A06], #32
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B02]
add B02, B02, #64
ldp q12, q13, [A07], #32
ldp q14, q15, [A08], #32
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B02]
add B02, B02, #64
.endm
// COPY2x8: 2 doubles from each of A01..A08 into the N%4>=2 tail
// region addressed by B03.
.macro COPY2x8
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
//prfm PLDL1KEEP, [A05, #A_PREFETCH]
//prfm PLDL1KEEP, [A06, #A_PREFETCH]
//prfm PLDL1KEEP, [A07, #A_PREFETCH]
//prfm PLDL1KEEP, [A08, #A_PREFETCH]
ldr q0, [A01], #16
ldr q1, [A02], #16
ldr q2, [A03], #16
ldr q3, [A04], #16
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B03]
add B03, B03, #64
ldr q4, [A05], #16
ldr q5, [A06], #16
ldr q6, [A07], #16
ldr q7, [A08], #16
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [B03]
add B03, B03, #64
.endm
// COPY1x8: 1 double from each of A01..A08 into the N%2 tail region
// addressed by B04.
.macro COPY1x8
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
//prfm PLDL1KEEP, [A05, #A_PREFETCH]
//prfm PLDL1KEEP, [A06, #A_PREFETCH]
//prfm PLDL1KEEP, [A07, #A_PREFETCH]
//prfm PLDL1KEEP, [A08, #A_PREFETCH]
ldr d0, [A01], #8
ldr d1, [A02], #8
ldr d2, [A03], #8
ldr d3, [A04], #8
st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B04]
add B04, B04, #32
ldr d4, [A05], #8
ldr d5, [A06], #8
ldr d6, [A07], #8
ldr d7, [A08], #8
st1 {v4.1d, v5.1d, v6.1d, v7.1d}, [B04]
add B04, B04, #32
.endm
/*************************************************************************************************************************/
// Four-row variants: same layout as the 8-row macros above but reading
// only A01..A04.  B01 advances by the panel stride M8; B02/B03/B04
// advance sequentially through their tail regions.
.macro COPY8x4
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
ldp q0, q1, [A01], #32
ldp q2, q3, [A01], #32
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01]
add TEMP1, B01, #64
ldp q4, q5, [A02], #32
ldp q6, q7, [A02], #32
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1]
add TEMP1, TEMP1, #64
ldp q8, q9, [A03], #32
ldp q10, q11, [A03], #32
st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [TEMP1]
add TEMP1, TEMP1, #64
ldp q12, q13, [A04], #32
ldp q14, q15, [A04], #32
st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [TEMP1]
add TEMP1, TEMP1, #64
add B01, B01, M8
.endm
.macro COPY4x4
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
ldp q0, q1, [A01], #32
ldp q2, q3, [A02], #32
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02]
add B02, B02, #64
ldp q4, q5, [A03], #32
ldp q6, q7, [A04], #32
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [B02]
add B02, B02, #64
.endm
.macro COPY2x4
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
ldr q0, [A01], #16
ldr q1, [A02], #16
ldr q2, [A03], #16
ldr q3, [A04], #16
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B03]
add B03, B03, #64
.endm
.macro COPY1x4
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
//prfm PLDL1KEEP, [A03, #A_PREFETCH]
//prfm PLDL1KEEP, [A04, #A_PREFETCH]
ldr d0, [A01], #8
ldr d1, [A02], #8
ldr d2, [A03], #8
ldr d3, [A04], #8
st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B04]
add B04, B04, #32
.endm
/*************************************************************************************************************************/
// Two-row variants (A01/A02 only).  B01 advances by M8; B02/B03/B04
// advance sequentially.
.macro COPY8x2
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
ldp q0, q1, [A01], #32
ldp q2, q3, [A01], #32
ldp q4, q5, [A02], #32
ldp q6, q7, [A02], #32
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01]
add TEMP1, B01, #64
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1]
add B01, B01, M8
.endm
.macro COPY4x2
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
ldp q0, q1, [A01], #32
ldp q2, q3, [A02], #32
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02]
add B02, B02, #64
.endm
.macro COPY2x2
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
ldr q0, [A01], #16
ldr q1, [A02], #16
stp q0, q1, [B03]
add B03, B03, #32
.endm
.macro COPY1x2
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
//prfm PLDL1KEEP, [A02, #A_PREFETCH]
ldr d0, [A01], #8
ldr d1, [A02], #8
stp d0, d1, [B04]
add B04, B04, #16
.endm
/*************************************************************************************************************************/
// Single-row variants: plain copies of 8/4/2/1 doubles from A01.
.macro COPY8x1
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
ldp q0, q1, [A01], #32
ldp q2, q3, [A01], #32
stp q0, q1, [B01]
add TEMP1, B01, #32
stp q2, q3, [TEMP1]
add B01, B01, M8
.endm
.macro COPY4x1
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
ldp q0, q1, [A01], #32
stp q0, q1, [B02]
add B02, B02, #32
.endm
.macro COPY2x1
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
ldr q0, [A01], #16
str q0, [B03]
add B03, B03, #16
.endm
.macro COPY1x1
//prfm PLDL1KEEP, [A01, #A_PREFETCH]
ldr d0, [A01], #8
str d0, [B04]
add B04, B04, #8
.endm
/************************************************************************************** | |||
* End of macro definitions | |||
**************************************************************************************/ | |||
// Entry point: pack the transpose of A (M x N, row stride LDA) into B
// in 8-row panels.  B02/B03/B04 are precomputed tail destinations for
// the N%8, N%4 and N%2 remainder columns; M8 is the output stride
// between consecutive 8x8 panels.
PROLOGUE
.align 5
SAVE_REGS
lsl LDA, LDA, #3 // LDA = LDA * SIZE  (SIZE = 8 bytes, double)
lsl TEMP1, M, #3 // TEMP1 = M * SIZE
// Tail destinations: B0k = B + (N & ~(2^k mask))*M*SIZE.
and B02 , N , #-8
and B03 , N , #-4
and B04 , N , #-2
mul B02, B02, TEMP1
mul B03, B03, TEMP1
mul B04, B04, TEMP1
add B02 , B02, B
add B03 , B03, B
add B04 , B04, B
lsl M8, M, #6 // M8 = M * 8 * SIZE
dgemm_tcopy_L8_BEGIN:
asr J, M, #3 // J = M / 8
cmp J, #0
ble dgemm_tcopy_L4_BEGIN
.align 5
dgemm_tcopy_L8_M8_BEGIN:
// Set up 8 row pointers; advance A past them.
mov A01, A
add A02, A01, LDA
add A03, A02, LDA
add A04, A03, LDA
add A05, A04, LDA
add A06, A05, LDA
add A07, A06, LDA
add A08, A07, LDA
add A, A08, LDA
mov B01, B
add B, B01, #512 // B = B + 64 * SIZE
asr I, N, #3 // I = N / 8
cmp I, #0
ble dgemm_tcopy_L8_M8_40
.align 5
dgemm_tcopy_L8_M8_20:
COPY8x8
subs I , I , #1
bne dgemm_tcopy_L8_M8_20
dgemm_tcopy_L8_M8_40:
// N % 8 remainder columns for this 8-row panel.
tst N , #4
ble dgemm_tcopy_L8_M8_60
COPY4x8
dgemm_tcopy_L8_M8_60:
tst N , #2
ble dgemm_tcopy_L8_M8_80
COPY2x8
dgemm_tcopy_L8_M8_80:
tst N, #1
ble dgemm_tcopy_L8_M8_END
COPY1x8
dgemm_tcopy_L8_M8_END:
subs J , J, #1 // j--
bne dgemm_tcopy_L8_M8_BEGIN
/*********************************************************************************************/
dgemm_tcopy_L4_BEGIN:
// Tail: handle M % 8 rows (4-, 2-, then 1-row passes).
tst M, #7
ble dgemm_tcopy_L999
tst M, #4
ble dgemm_tcopy_L2_BEGIN
dgemm_tcopy_L4_M8_BEGIN:
mov A01, A
add A02, A01, LDA
add A03, A02, LDA
add A04, A03, LDA
add A, A04, LDA
mov B01, B
add B, B01, #256 // B = B + 32 * SIZE
asr I, N, #3 // I = N / 8
cmp I, #0
ble dgemm_tcopy_L4_M8_40
.align 5
dgemm_tcopy_L4_M8_20:
COPY8x4
subs I , I , #1
bne dgemm_tcopy_L4_M8_20
dgemm_tcopy_L4_M8_40:
tst N , #4
ble dgemm_tcopy_L4_M8_60
COPY4x4
dgemm_tcopy_L4_M8_60:
tst N , #2
ble dgemm_tcopy_L4_M8_80
COPY2x4
dgemm_tcopy_L4_M8_80:
tst N, #1
ble dgemm_tcopy_L4_M8_END
COPY1x4
dgemm_tcopy_L4_M8_END:
/*********************************************************************************************/
dgemm_tcopy_L2_BEGIN:
tst M, #3
ble dgemm_tcopy_L999
tst M, #2
ble dgemm_tcopy_L1_BEGIN
dgemm_tcopy_L2_M8_BEGIN:
mov A01, A
add A02, A01, LDA
add A, A02, LDA
mov B01, B
add B, B01, #128 // B = B + 16 * SIZE
asr I, N, #3 // I = N / 8
cmp I, #0
ble dgemm_tcopy_L2_M8_40
.align 5
dgemm_tcopy_L2_M8_20:
COPY8x2
subs I , I , #1
bne dgemm_tcopy_L2_M8_20
dgemm_tcopy_L2_M8_40:
tst N , #4
ble dgemm_tcopy_L2_M8_60
COPY4x2
dgemm_tcopy_L2_M8_60:
tst N , #2
ble dgemm_tcopy_L2_M8_80
COPY2x2
dgemm_tcopy_L2_M8_80:
tst N , #1
ble dgemm_tcopy_L2_M8_END
COPY1x2
dgemm_tcopy_L2_M8_END:
/*********************************************************************************************/
dgemm_tcopy_L1_BEGIN:
tst M, #1
ble dgemm_tcopy_L999
dgemm_tcopy_L1_M8_BEGIN:
mov A01, A // A01 = A
mov B01, B
asr I, N, #3 // I = N / 8
cmp I, #0
ble dgemm_tcopy_L1_M8_40
.align 5
dgemm_tcopy_L1_M8_20:
COPY8x1
subs I , I , #1
bne dgemm_tcopy_L1_M8_20
dgemm_tcopy_L1_M8_40:
tst N , #4
ble dgemm_tcopy_L1_M8_60
COPY4x1
dgemm_tcopy_L1_M8_60:
tst N , #2
ble dgemm_tcopy_L1_M8_80
COPY2x1
dgemm_tcopy_L1_M8_80:
tst N , #1
ble dgemm_tcopy_L1_M8_END
COPY1x1
dgemm_tcopy_L1_M8_END:
dgemm_tcopy_L999:
mov x0, #0 // set return value
RESTORE_REGS
ret
EPILOGUE
@@ -0,0 +1,104 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2014, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
/* Generic dot-product kernel: returns sum over i of x[i]*y[i].
 *
 * n              number of elements; n <= 0 returns 0
 * x, inc_x       first vector and its element stride
 * y, inc_y       second vector and its element stride
 *
 * When DSDOT is defined, products are accumulated in double precision
 * even for single-precision inputs (BLAS sdsdot/dsdot convention) and
 * a double is returned; otherwise accumulation and return are FLOAT.
 */
#if defined(DSDOT)
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#else
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
#endif
{
	BLASLONG i = 0;
	BLASLONG ix = 0, iy = 0;

#if defined(DSDOT)
	double dot = 0.0;
#else
	FLOAT dot = 0.0;
#endif

	if (n <= 0) return (dot);

	if ((inc_x == 1) && (inc_y == 1))
	{
		/* Contiguous fast path, unrolled by 4.
		 * BUGFIX: n1 was declared "int", which truncates the unrolled
		 * trip count when n > INT_MAX and BLASLONG is 64-bit. */
		BLASLONG n1 = n & -4;

		while (i < n1)
		{
#if defined(DSDOT)
			dot += (double) y[i]   * (double) x[i]
			     + (double) y[i+1] * (double) x[i+1]
			     + (double) y[i+2] * (double) x[i+2]
			     + (double) y[i+3] * (double) x[i+3];
#else
			dot += y[i]   * x[i]
			     + y[i+1] * x[i+1]
			     + y[i+2] * x[i+2]
			     + y[i+3] * x[i+3];
#endif
			i += 4;
		}

		/* remaining 0..3 elements */
		while (i < n)
		{
#if defined(DSDOT)
			dot += (double) y[i] * (double) x[i];
#else
			dot += y[i] * x[i];
#endif
			i++;
		}

		return (dot);
	}

	/* Strided path.
	 * NOTE(review): with a negative stride this walks backwards from
	 * element 0; presumably the BLAS driver pre-adjusts x/y in that
	 * case — verify against the interface layer. */
	while (i < n)
	{
#if defined(DSDOT)
		dot += (double) y[iy] * (double) x[ix];
#else
		dot += y[iy] * x[ix];
#endif
		ix += inc_x;
		iy += inc_y;
		i++;
	}
	return (dot);
}
@@ -2303,6 +2303,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define ZGEMM_DEFAULT_R 4096 | |||
#define SYMV_P 16 | |||
#endif | |||
#if defined(VULCAN)
/* Tuning parameters for the Broadcom Vulcan core (the design later sold
 * as Cavium/Marvell ThunderX2; the THUNDERX2T99 block uses identical
 * values). */
#define SNUMOPT 2
#define DNUMOPT 2
/* GEMM buffer offsets and alignment mask (0x3fff -> 16 KiB). */
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
/* Register-blocking (unroll) factors — must match the assembly kernels
 * selected for this core (TODO confirm against the kernel list). */
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4
/* Cache-blocking sizes P/Q/R. The DGEMM values are runtime variables
 * (extern BLASLONG dgemm_p/q/r declared in common.h) rather than
 * compile-time constants. */
#define SGEMM_DEFAULT_P 512
#define DGEMM_DEFAULT_P dgemm_p
#define CGEMM_DEFAULT_P 256
#define ZGEMM_DEFAULT_P 128
#define SGEMM_DEFAULT_Q 1024
#define DGEMM_DEFAULT_Q dgemm_q
#define CGEMM_DEFAULT_Q 512
#define ZGEMM_DEFAULT_Q 512
#define SGEMM_DEFAULT_R 4096
#define DGEMM_DEFAULT_R dgemm_r
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 2048
#define SYMV_P 16
#endif
@@ -2385,6 +2423,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define SYMV_P 16 | |||
#endif | |||
#if defined(THUNDERX)
/* Tuning parameters for the Cavium ThunderX core. Smaller unroll
 * factors than the Vulcan/ThunderX2 block below. */
#define SNUMOPT 2
#define DNUMOPT 2
/* GEMM buffer offsets and alignment mask (0x3fff -> 16 KiB). */
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
/* Register-blocking (unroll) factors — must match the assembly kernels
 * selected for this core (TODO confirm against the kernel list). */
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 2
#define DGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 2
/* Cache-blocking sizes P/Q/R — all compile-time constants here, unlike
 * the Vulcan/ThunderX2 blocks which compute the DGEMM sizes at runtime. */
#define SGEMM_DEFAULT_P 128
#define DGEMM_DEFAULT_P 128
#define CGEMM_DEFAULT_P 96
#define ZGEMM_DEFAULT_P 64
#define SGEMM_DEFAULT_Q 240
#define DGEMM_DEFAULT_Q 120
#define CGEMM_DEFAULT_Q 120
#define ZGEMM_DEFAULT_Q 120
#define SGEMM_DEFAULT_R 12288
#define DGEMM_DEFAULT_R 8192
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 4096
#define SYMV_P 16
#endif
#if defined(THUNDERX2T99)
/* Tuning parameters for the Cavium/Marvell ThunderX2 T99. Identical to
 * the VULCAN block: the T99 is the productized Vulcan core (the build
 * also compiles it with -mcpu=vulcan). */
#define SNUMOPT 2
#define DNUMOPT 2
/* GEMM buffer offsets and alignment mask (0x3fff -> 16 KiB). */
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
/* Register-blocking (unroll) factors — must match the assembly kernels
 * selected for this core (TODO confirm against the kernel list). */
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4
/* Cache-blocking sizes P/Q/R. The DGEMM values are runtime variables
 * (extern BLASLONG dgemm_p/q/r declared in common.h) rather than
 * compile-time constants. */
#define SGEMM_DEFAULT_P 512
#define DGEMM_DEFAULT_P dgemm_p
#define CGEMM_DEFAULT_P 256
#define ZGEMM_DEFAULT_P 128
#define SGEMM_DEFAULT_Q 1024
#define DGEMM_DEFAULT_Q dgemm_q
#define CGEMM_DEFAULT_Q 512
#define ZGEMM_DEFAULT_Q 512
#define SGEMM_DEFAULT_R 4096
#define DGEMM_DEFAULT_R dgemm_r
#define CGEMM_DEFAULT_R 4096
#define ZGEMM_DEFAULT_R 2048
#define SYMV_P 16
#endif
#if defined(ARMV5) | |||
#define SNUMOPT 2 | |||