This patch makes use of the new POWER10 vector pair instructions for loads and stores.
tags/v0.3.13^2
@@ -151,9 +151,9 @@ DCOPYKERNEL = dcopy_power10.c | |||
CCOPYKERNEL = ccopy_power10.c | |||
ZCOPYKERNEL = zcopy_power10.c | |||
# | |||
SDOTKERNEL = sdot.c | |||
DDOTKERNEL = ddot.c | |||
DSDOTKERNEL = sdot.c | |||
SDOTKERNEL = sdot_power10.c | |||
DDOTKERNEL = ddot_power10.c | |||
DSDOTKERNEL = sdot_power10.c | |||
ifneq ($(GCCVERSIONGTEQ9),1) | |||
CDOTKERNEL = cdot_power9.S | |||
else | |||
@@ -0,0 +1,131 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2020, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#define HAVE_KERNEL_8 1 | |||
static double ddot_kernel_8 (long n, double *x, double *y) | |||
{ | |||
double dot; | |||
__asm__ | |||
( | |||
"dcbt 0, %2 \n\t" | |||
"dcbt 0, %3 \n\t" | |||
"xxlxor 32, 32, 32 \n\t" | |||
"xxlxor 33, 33, 33 \n\t" | |||
"xxlxor 34, 34, 34 \n\t" | |||
"xxlxor 35, 35, 35 \n\t" | |||
"xxlxor 36, 36, 36 \n\t" | |||
"xxlxor 37, 37, 37 \n\t" | |||
"xxlxor 38, 38, 38 \n\t" | |||
"xxlxor 39, 39, 39 \n\t" | |||
"lxvp 40, 0(%2) \n\t" | |||
"lxvp 42, 32(%2) \n\t" | |||
"lxvp 44, 64(%2) \n\t" | |||
"lxvp 46, 96(%2) \n\t" | |||
"lxvp 48, 0(%3) \n\t" | |||
"lxvp 50, 32(%3) \n\t" | |||
"lxvp 52, 64(%3) \n\t" | |||
"lxvp 54, 96(%3) \n\t" | |||
"addi %2, %2, 128 \n\t" | |||
"addi %3, %3, 128 \n\t" | |||
"addic. %1, %1, -16 \n\t" | |||
"ble two%= \n\t" | |||
".align 5 \n" | |||
"one%=: \n\t" | |||
"xvmaddadp 32, 40, 48 \n\t" | |||
"xvmaddadp 33, 41, 49 \n\t" | |||
"lxvp 40, 0(%2) \n\t" | |||
"lxvp 48, 0(%3) \n\t" | |||
"xvmaddadp 34, 42, 50 \n\t" | |||
"xvmaddadp 35, 43, 51 \n\t" | |||
"lxvp 42, 32(%2) \n\t" | |||
"lxvp 50, 32(%3) \n\t" | |||
"xvmaddadp 36, 44, 52 \n\t" | |||
"xvmaddadp 37, 45, 53 \n\t" | |||
"lxvp 44, 64(%2) \n\t" | |||
"lxvp 52, 64(%3) \n\t" | |||
"xvmaddadp 38, 46, 54 \n\t" | |||
"xvmaddadp 39, 47, 55 \n\t" | |||
"lxvp 46, 96(%2) \n\t" | |||
"lxvp 54, 96(%3) \n\t" | |||
"addi %2, %2, 128 \n\t" | |||
"addi %3, %3, 128 \n\t" | |||
"addic. %1, %1, -16 \n\t" | |||
"bgt one%= \n" | |||
"two%=: \n\t" | |||
"xvmaddadp 32, 40, 48 \n\t" | |||
"xvmaddadp 33, 41, 49 \n\t" | |||
"xvmaddadp 34, 42, 50 \n\t" | |||
"xvmaddadp 35, 43, 51 \n\t" | |||
"xvmaddadp 36, 44, 52 \n\t" | |||
"xvmaddadp 37, 45, 53 \n\t" | |||
"xvmaddadp 38, 46, 54 \n\t" | |||
"xvmaddadp 39, 47, 55 \n\t" | |||
"xvadddp 32, 32, 33 \n\t" | |||
"xvadddp 34, 34, 35 \n\t" | |||
"xvadddp 36, 36, 37 \n\t" | |||
"xvadddp 38, 38, 39 \n\t" | |||
"xvadddp 32, 32, 34 \n\t" | |||
"xvadddp 36, 36, 38 \n\t" | |||
"xvadddp 32, 32, 36 \n\t" | |||
XXSWAPD_S(33,32) | |||
"xsadddp %x0, 32, 33 \n" | |||
"#dot=%0 n=%1 x=%4=%2 y=%5=%3\n" | |||
: | |||
"=d" (dot), // 0 | |||
"+r" (n), // 1 | |||
"+b" (x), // 2 | |||
"+b" (y) // 3 | |||
: | |||
"m" (*x), | |||
"m" (*y) | |||
: | |||
"cr0", | |||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55" | |||
); | |||
return dot; | |||
} |
@@ -0,0 +1,130 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013-2016, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#if defined(__VEC__) || defined(__ALTIVEC__) | |||
#include "ddot_microk_power10.c" | |||
#endif | |||
#ifndef HAVE_KERNEL_8 | |||
static FLOAT ddot_kernel_8 (BLASLONG n, FLOAT *x, FLOAT *y) | |||
{ | |||
BLASLONG register i = 0; | |||
FLOAT dot = 0.0; | |||
while(i < n) | |||
{ | |||
dot += y[i] * x[i] | |||
+ y[i+1] * x[i+1] | |||
+ y[i+2] * x[i+2] | |||
+ y[i+3] * x[i+3] | |||
+ y[i+4] * x[i+4] | |||
+ y[i+5] * x[i+5] | |||
+ y[i+6] * x[i+6] | |||
+ y[i+7] * x[i+7] ; | |||
i+=8 ; | |||
} | |||
return dot; | |||
} | |||
#endif | |||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
{ | |||
BLASLONG i=0; | |||
BLASLONG ix=0,iy=0; | |||
FLOAT dot = 0.0 ; | |||
if ( n <= 0 ) return(dot); | |||
if ( (inc_x == 1) && (inc_y == 1) ) | |||
{ | |||
BLASLONG n1 = n & -16; | |||
if ( n1 ) | |||
dot = ddot_kernel_8(n1, x, y); | |||
i = n1; | |||
while(i < n) | |||
{ | |||
dot += y[i] * x[i] ; | |||
i++ ; | |||
} | |||
return(dot); | |||
} | |||
FLOAT temp1 = 0.0; | |||
FLOAT temp2 = 0.0; | |||
BLASLONG n1 = n & -4; | |||
while(i < n1) | |||
{ | |||
FLOAT m1 = y[iy] * x[ix] ; | |||
FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; | |||
FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; | |||
FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; | |||
ix += inc_x*4 ; | |||
iy += inc_y*4 ; | |||
temp1 += m1+m3; | |||
temp2 += m2+m4; | |||
i+=4 ; | |||
} | |||
while(i < n) | |||
{ | |||
temp1 += y[iy] * x[ix] ; | |||
ix += inc_x ; | |||
iy += inc_y ; | |||
i++ ; | |||
} | |||
dot = temp1 + temp2; | |||
return(dot); | |||
} | |||
@@ -0,0 +1,135 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2020, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#define HAVE_KERNEL_16 1 | |||
static float sdot_kernel_16 (long n, float *x, float *y) | |||
{ | |||
float dot; | |||
__asm__ | |||
( | |||
"dcbt 0, %2 \n\t" | |||
"dcbt 0, %3 \n\t" | |||
"xxlxor 32, 32, 32 \n\t" | |||
"xxlxor 33, 33, 33 \n\t" | |||
"xxlxor 34, 34, 34 \n\t" | |||
"xxlxor 35, 35, 35 \n\t" | |||
"xxlxor 36, 36, 36 \n\t" | |||
"xxlxor 37, 37, 37 \n\t" | |||
"xxlxor 38, 38, 38 \n\t" | |||
"xxlxor 39, 39, 39 \n\t" | |||
"lxvp 40, 0(%2) \n\t" | |||
"lxvp 42, 32(%2) \n\t" | |||
"lxvp 44, 64(%2) \n\t" | |||
"lxvp 46, 96(%2) \n\t" | |||
"lxvp 48, 0(%3) \n\t" | |||
"lxvp 50, 32(%3) \n\t" | |||
"lxvp 52, 64(%3) \n\t" | |||
"lxvp 54, 96(%3) \n\t" | |||
"addi %2, %2, 128 \n\t" | |||
"addi %3, %3, 128 \n\t" | |||
"addic. %1, %1, -32 \n\t" | |||
"ble two%= \n\t" | |||
".align 5 \n" | |||
"one%=: \n\t" | |||
"xvmaddasp 32, 40, 48 \n\t" | |||
"xvmaddasp 33, 41, 49 \n\t" | |||
"lxvp 40, 0(%2) \n\t" | |||
"lxvp 48, 0(%3) \n\t" | |||
"xvmaddasp 34, 42, 50 \n\t" | |||
"xvmaddasp 35, 43, 51 \n\t" | |||
"lxvp 42, 32(%2) \n\t" | |||
"lxvp 50, 32(%3) \n\t" | |||
"xvmaddasp 36, 44, 52 \n\t" | |||
"xvmaddasp 37, 45, 53 \n\t" | |||
"lxvp 44, 64(%2) \n\t" | |||
"lxvp 52, 64(%3) \n\t" | |||
"xvmaddasp 38, 46, 54 \n\t" | |||
"xvmaddasp 39, 47, 55 \n\t" | |||
"lxvp 46, 96(%2) \n\t" | |||
"lxvp 54, 96(%3) \n\t" | |||
"addi %2, %2, 128 \n\t" | |||
"addi %3, %3, 128 \n\t" | |||
"addic. %1, %1, -32 \n\t" | |||
"bgt one%= \n" | |||
"two%=: \n\t" | |||
"xvmaddasp 32, 40, 48 \n\t" | |||
"xvmaddasp 33, 41, 49 \n\t" | |||
"xvmaddasp 34, 42, 50 \n\t" | |||
"xvmaddasp 35, 43, 51 \n\t" | |||
"xvmaddasp 36, 44, 52 \n\t" | |||
"xvmaddasp 37, 45, 53 \n\t" | |||
"xvmaddasp 38, 46, 54 \n\t" | |||
"xvmaddasp 39, 47, 55 \n\t" | |||
"xvaddsp 32, 32, 33 \n\t" | |||
"xvaddsp 34, 34, 35 \n\t" | |||
"xvaddsp 36, 36, 37 \n\t" | |||
"xvaddsp 38, 38, 39 \n\t" | |||
"xvaddsp 32, 32, 34 \n\t" | |||
"xvaddsp 36, 36, 38 \n\t" | |||
"xvaddsp 32, 32, 36 \n\t" | |||
"xxsldwi 33, 32, 32, 2 \n\t" | |||
"xvaddsp 32, 32, 33 \n\t" | |||
"xxsldwi 33, 32, 32, 1 \n\t" | |||
"xvaddsp 32, 32, 33 \n\t" | |||
"xscvspdp %x0, 32 \n" | |||
"#dot=%0 n=%1 x=%4=%2 y=%5=%3\n" | |||
: | |||
"=f" (dot), // 0 | |||
"+r" (n), // 1 | |||
"+b" (x), // 2 | |||
"+b" (y) // 3 | |||
: | |||
"m" (*x), | |||
"m" (*y) | |||
: | |||
"cr0", | |||
"vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", | |||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55" | |||
); | |||
return dot; | |||
} |
@@ -0,0 +1,154 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2020, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
#if defined(__VEC__) || defined(__ALTIVEC__) | |||
#include "sdot_microk_power10.c" | |||
#endif | |||
#ifndef HAVE_KERNEL_16 | |||
static FLOAT sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) | |||
{ | |||
BLASLONG register i = 0; | |||
FLOAT dot = 0.0; | |||
while(i < n) | |||
{ | |||
dot += y[i] * x[i] | |||
+ y[i+1] * x[i+1] | |||
+ y[i+2] * x[i+2] | |||
+ y[i+3] * x[i+3] | |||
+ y[i+4] * x[i+4] | |||
+ y[i+5] * x[i+5] | |||
+ y[i+6] * x[i+6] | |||
+ y[i+7] * x[i+7] ; | |||
i+=8 ; | |||
} | |||
return dot; | |||
} | |||
#endif | |||
#if defined (DSDOT) | |||
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
#else | |||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
#endif | |||
{ | |||
BLASLONG i=0; | |||
BLASLONG ix=0,iy=0; | |||
double dot = 0.0 ; | |||
#if defined (DSDOT) | |||
double mydot = 0.0; | |||
FLOAT asmdot = 0.0; | |||
#else | |||
FLOAT mydot=0.0; | |||
#endif | |||
BLASLONG n1; | |||
if ( n <= 0 ) return(dot); | |||
if ( (inc_x == 1) && (inc_y == 1) ) | |||
{ | |||
n1 = n & (BLASLONG)(-32); | |||
if ( n1 ) | |||
#if defined(DSDOT) | |||
{ | |||
FLOAT *x1=x; | |||
FLOAT *y1=y; | |||
BLASLONG n2 = 32; | |||
while (i<n1) { | |||
asmdot = sdot_kernel_16(n2, x1, y1); | |||
mydot += (double)asmdot; | |||
asmdot=0.; | |||
x1+=32; | |||
y1+=32; | |||
i+=32; | |||
} | |||
} | |||
#else | |||
mydot = sdot_kernel_16(n1, x, y); | |||
#endif | |||
i = n1; | |||
while(i < n) | |||
{ | |||
#if defined(DSDOT) | |||
dot += (double)y[i] * (double)x[i] ; | |||
#else | |||
dot += y[i] * x[i] ; | |||
#endif | |||
i++ ; | |||
} | |||
dot+=mydot; | |||
return(dot); | |||
} | |||
n1 = n & (BLASLONG)(-2); | |||
while(i < n1) | |||
{ | |||
#if defined (DSDOT) | |||
dot += (double)y[iy] * (double)x[ix] + (double)y[iy+inc_y] * (double)x[ix+inc_x]; | |||
#else | |||
dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; | |||
#endif | |||
ix += inc_x*2 ; | |||
iy += inc_y*2 ; | |||
i+=2 ; | |||
} | |||
while(i < n) | |||
{ | |||
#if defined (DSDOT) | |||
dot += (double)y[iy] * (double)x[ix] ; | |||
#else | |||
dot += y[iy] * x[ix] ; | |||
#endif | |||
ix += inc_x ; | |||
iy += inc_y ; | |||
i++ ; | |||
} | |||
return(dot); | |||
} | |||