POWER10: Optimize dgemv_n
@@ -25,14 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-/**************************************************************************************
-* 2016/03/30 Werner Saar (wernsaar@googlemail.com)
-* BLASTEST : OK
-* CTEST : OK
-* TEST : OK
-* LAPACK-TEST : OK
-**************************************************************************************/
 #define HAVE_KERNEL_4x4 1
 static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha)
@@ -266,3 +258,145 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
     "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
   );
 }
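+
+/*
+ * dgemv_kernel_4x8: y[0..n-1] += alpha * A[0..n-1][0..7] * x[0..7],
+ * with A column-major and leading dimension lda.  The driver below
+ * only calls this with n > 0 and n a multiple of 4.
+ */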
+static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y, double alpha)
+{
+    double *a0;
+    double *a1;
+    double *a2;
+    double *a3;
+    double *a4;
+    double *a5;
+    double *a6;
+    double *a7;
+    long tmp;
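+
+    /* VSX register map for the asm below:
+     *   vs58                    alpha splatted to both doublewords
+     *   vs34-35, vs38-39        x[0..7] as loaded, then scaled by alpha
+     *   vs32-35, 38-39, 48-49   one alpha*x[j] broadcast per register
+     *   vs40-47, 50-57          vector pairs holding 4 rows of columns a0..a7
+     *   vs36-37                 running y[i..i+3] accumulator
+     */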
+    __asm__
+    (
+        "lxvp 34, 0( %15) \n\t" // x0, x1
+        "lxvp 38, 32( %15) \n\t" // x4, x5
+        XXSPLTD_S(58,%x14,0) // alpha, alpha
+        "sldi %10, %17, 3 \n\t" // lda * sizeof (double)
+        "xvmuldp 34, 34, 58 \n\t" // x0 * alpha, x1 * alpha
+        "xvmuldp 35, 35, 58 \n\t" // x2 * alpha, x3 * alpha
+        "xvmuldp 38, 38, 58 \n\t" // x4 * alpha, x5 * alpha
+        "xvmuldp 39, 39, 58 \n\t" // x6 * alpha, x7 * alpha
+        "li %11, 32 \n\t"
+        "add %4, %3, %10 \n\t" // a0 = ap, a1 = a0 + lda
+        "add %10, %10, %10 \n\t" // 2 * lda
+        XXSPLTD_S(32,34,1) // x0 * alpha, x0 * alpha
+        XXSPLTD_S(33,34,0) // x1 * alpha, x1 * alpha
+        XXSPLTD_S(34,35,1) // x2 * alpha, x2 * alpha
+        XXSPLTD_S(35,35,0) // x3 * alpha, x3 * alpha
+        XXSPLTD_S(48,39,1) // x6 * alpha, x6 * alpha
+        XXSPLTD_S(49,39,0) // x7 * alpha, x7 * alpha
+        XXSPLTD_S(39,38,0) // x5 * alpha, x5 * alpha
+        XXSPLTD_S(38,38,1) // x4 * alpha, x4 * alpha
+        "add %5, %3, %10 \n\t" // a2 = a0 + 2 * lda
+        "add %6, %4, %10 \n\t" // a3 = a1 + 2 * lda
+        "add %7, %5, %10 \n\t" // a4 = a2 + 2 * lda
+        "add %8, %6, %10 \n\t" // a5 = a3 + 2 * lda
+        "add %9, %7, %10 \n\t" // a6 = a4 + 2 * lda
+        "add %10, %8, %10 \n\t" // a7 = a5 + 2 * lda
+        "lxvp 40, 0( %3) \n\t" // a0[0], a0[1]
+        "lxvp 42, 0( %4) \n\t" // a1[0], a1[1]
+        "lxvp 44, 0( %5) \n\t" // a2[0], a2[1]
+        "lxvp 46, 0( %6) \n\t" // a3[0], a3[1]
+        "lxvp 50, 0( %7) \n\t" // a4[0], a4[1]
+        "lxvp 52, 0( %8) \n\t" // a5[0], a5[1]
+        "lxvp 54, 0( %9) \n\t" // a6[0], a6[1]
+        "lxvp 56, 0( %10) \n\t" // a7[0], a7[1]
+        "addic. %1, %1, -4 \n\t"
+        "ble two%= \n\t"
".align 5 \n" | |||
"one%=: \n\t" | |||
"lxvp 36, 0( %2) \n\t" // y0, y1 | |||
"xvmaddadp 36, 40, 34 \n\t" | |||
"xvmaddadp 37, 41, 34 \n\t" | |||
"lxvpx 40, %3, %11 \n\t" // a0[0], a0[1] | |||
"xvmaddadp 36, 42, 35 \n\t" | |||
"xvmaddadp 37, 43, 35 \n\t" | |||
"lxvpx 42, %4, %11 \n\t" // a1[0], a1[1] | |||
"xvmaddadp 36, 44, 32 \n\t" | |||
"xvmaddadp 37, 45, 32 \n\t" | |||
"lxvpx 44, %5, %11 \n\t" // a2[0], a2[1] | |||
"xvmaddadp 36, 46, 33 \n\t" | |||
"xvmaddadp 37, 47, 33 \n\t" | |||
"lxvpx 46, %6, %11 \n\t" // a3[0], a3[1] | |||
"xvmaddadp 36, 50, 48 \n\t" | |||
"xvmaddadp 37, 51, 48 \n\t" | |||
"lxvpx 50, %7, %11 \n\t" // a4[0] | |||
"xvmaddadp 36, 52, 49 \n\t" | |||
"xvmaddadp 37, 53, 49 \n\t" | |||
"lxvpx 52, %8, %11 \n\t" // a5[0] | |||
"xvmaddadp 36, 54, 38 \n\t" | |||
"xvmaddadp 37, 55, 38 \n\t" | |||
"lxvpx 54, %9, %11 \n\t" // a6[0] | |||
"xvmaddadp 36, 56, 39 \n\t" | |||
"xvmaddadp 37, 57, 39 \n\t" | |||
"lxvpx 56, %10, %11 \n\t" // a7[0] | |||
"addi %11, %11, 32 \n\t" | |||
"stxvp 36, 0( %2) \n\t" // y0, y1 | |||
"addi %2, %2, 32 \n\t" | |||
"addic. %1, %1, -4 \n\t" | |||
"bgt one%= \n" | |||
"two%=: \n\t" | |||
"lxvp 36, 0( %2) \n\t" // y0, y1 | |||
"xvmaddadp 36, 40, 34 \n\t" | |||
"xvmaddadp 37, 41, 34 \n\t" | |||
"xvmaddadp 36, 42, 35 \n\t" | |||
"xvmaddadp 37, 43, 35 \n\t" | |||
"xvmaddadp 36, 44, 32 \n\t" | |||
"xvmaddadp 37, 45, 32 \n\t" | |||
"xvmaddadp 36, 46, 33 \n\t" | |||
"xvmaddadp 37, 47, 33 \n\t" | |||
"xvmaddadp 36, 50, 48 \n\t" | |||
"xvmaddadp 37, 51, 48 \n\t" | |||
"xvmaddadp 36, 52, 49 \n\t" | |||
"xvmaddadp 37, 53, 49 \n\t" | |||
"xvmaddadp 36, 54, 38 \n\t" | |||
"xvmaddadp 37, 55, 38 \n\t" | |||
"xvmaddadp 36, 56, 39 \n\t" | |||
"xvmaddadp 37, 57, 39 \n\t" | |||
"stxvp 36, 0( %2) \n\t" // y0, y1 | |||
: | |||
"+m" (*y), | |||
"+r" (n), // 1 | |||
"+b" (y), // 2 | |||
"=b" (a0), // 3 | |||
"=b" (a1), // 4 | |||
"=&b" (a2), // 5 | |||
"=&b" (a3), // 6 | |||
"=&b" (a4), // 7 | |||
"=&b" (a5), // 8 | |||
"=&b" (a6), // 9 | |||
"=&b" (a7), // 10 | |||
"=b" (tmp) | |||
: | |||
"m" (*x), | |||
"m" (*ap), | |||
"d" (alpha), // 14 | |||
"r" (x), // 15 | |||
"3" (ap), // 16 | |||
"4" (lda) // 17 | |||
: | |||
"cr0", | |||
"vs32","vs33","vs34","vs35","vs36","vs37", | |||
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48", | |||
"vs49","vs50","vs51","vs52","vs53","vs54","vs55","vs56", "vs57", "vs58" | |||
); | |||
} |
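+
+/* For reference, a plain-C sketch of what the kernel above computes
+ * (illustrative only; the _ref name is made up and the block is not
+ * compiled):
+ */
+#if 0
+static void dgemv_kernel_4x8_ref (long n, double *ap, long lda, double *x, double *y, double alpha)
+{
+    for (long i = 0; i < n; i++)       // rows; the asm does four per step
+        for (long j = 0; j < 8; j++)   // the eight columns a0..a7
+            y[i] += alpha * ap[j * lda + i] * x[j];
+}
+#endif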
@@ -26,165 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
 #include "common.h"
 #include <altivec.h>
-typedef __vector unsigned char vec_t;
-typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
-typedef __vector_pair __attribute__((aligned(8))) vecp_t;
 #include "dgemv_n_microk_power10.c"
-#define MMA(X, APTR, ACC) \
-    rX = (vec_t *) & X; \
-    rowA = *((vecp_t*)((void*)&APTR)); \
-    __builtin_mma_xvf64gerpp (ACC, rowA, rX[0]);
-#define SAVE(ACC, Z) \
-    rowC = (v4sf_t *) &y[Z]; \
-    __builtin_mma_disassemble_acc ((void *)result, ACC); \
-    result[0][1] = result[1][0]; \
-    result[2][1] = result[3][0]; \
-    rowC[0] += valpha * result[0]; \
-    rowC[1] += valpha * result[2];
-void
-dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo,
-                    FLOAT * y, FLOAT alpha)
-{
-    BLASLONG i, j, tmp;
-    FLOAT *a0 = a_ptr;
-    FLOAT *x1 = xo;
-    vector double valpha = { alpha, alpha };
-    v4sf_t *rowC;
-    __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
-    v4sf_t result[4];
-    vecp_t rowA;
-    vec_t *rX;
-    tmp = (n / 32) * 32;
-    for (i = 0; i < tmp; i += 32)
-    {
-        xo = x1;
-        a0 = a_ptr;
-        __builtin_mma_xxsetaccz (&acc0);
-        __builtin_mma_xxsetaccz (&acc1);
-        __builtin_mma_xxsetaccz (&acc2);
-        __builtin_mma_xxsetaccz (&acc3);
-        __builtin_mma_xxsetaccz (&acc4);
-        __builtin_mma_xxsetaccz (&acc5);
-        __builtin_mma_xxsetaccz (&acc6);
-        __builtin_mma_xxsetaccz (&acc7);
-        for (j = 0; j < 32; j++)
-        {
-            __builtin_prefetch (xo+j);
-            __builtin_prefetch (a0+i+j+lda);
-            MMA (xo[j], a0[i + 0 + j * lda], &acc0);
-            MMA (xo[j], a0[i + 4 + j * lda], &acc1);
-            MMA (xo[j], a0[i + 8 + j * lda], &acc2);
-            MMA (xo[j], a0[i + 12 + j * lda], &acc3);
-            MMA (xo[j], a0[i + 16 + j * lda], &acc4);
-            MMA (xo[j], a0[i + 20 + j * lda], &acc5);
-            MMA (xo[j], a0[i + 24 + j * lda], &acc6);
-            MMA (xo[j], a0[i + 28 + j * lda], &acc7);
-        }
-        xo += 32;
-        a0 += lda << 5;
-        for (j = 0; j < 32; j++)
-        {
-            __builtin_prefetch (xo+j);
-            __builtin_prefetch (a0+i+j+lda);
-            MMA (xo[j], a0[i + 0 + j * lda], &acc0);
-            MMA (xo[j], a0[i + 4 + j * lda], &acc1);
-            MMA (xo[j], a0[i + 8 + j * lda], &acc2);
-            MMA (xo[j], a0[i + 12 + j * lda], &acc3);
-            MMA (xo[j], a0[i + 16 + j * lda], &acc4);
-            MMA (xo[j], a0[i + 20 + j * lda], &acc5);
-            MMA (xo[j], a0[i + 24 + j * lda], &acc6);
-            MMA (xo[j], a0[i + 28 + j * lda], &acc7);
-        }
-        xo += 32;
-        a0 += lda << 5;
-        for (j = 0; j < 32; j++)
-        {
-            __builtin_prefetch (xo+j);
-            __builtin_prefetch (a0+i+j+lda);
-            MMA (xo[j], a0[i + 0 + j * lda], &acc0);
-            MMA (xo[j], a0[i + 4 + j * lda], &acc1);
-            MMA (xo[j], a0[i + 8 + j * lda], &acc2);
-            MMA (xo[j], a0[i + 12 + j * lda], &acc3);
-            MMA (xo[j], a0[i + 16 + j * lda], &acc4);
-            MMA (xo[j], a0[i + 20 + j * lda], &acc5);
-            MMA (xo[j], a0[i + 24 + j * lda], &acc6);
-            MMA (xo[j], a0[i + 28 + j * lda], &acc7);
-        }
-        xo += 32;
-        a0 += lda << 5;
-        for (j = 0; j < 32; j++)
-        {
-            __builtin_prefetch (xo+j);
-            __builtin_prefetch (a0+i+j+lda);
-            MMA (xo[j], a0[i + 0 + j * lda], &acc0);
-            MMA (xo[j], a0[i + 4 + j * lda], &acc1);
-            MMA (xo[j], a0[i + 8 + j * lda], &acc2);
-            MMA (xo[j], a0[i + 12 + j * lda], &acc3);
-            MMA (xo[j], a0[i + 16 + j * lda], &acc4);
-            MMA (xo[j], a0[i + 20 + j * lda], &acc5);
-            MMA (xo[j], a0[i + 24 + j * lda], &acc6);
-            MMA (xo[j], a0[i + 28 + j * lda], &acc7);
-        }
-        xo += 32;
-        a0 += lda << 5;
-        SAVE (&acc0, i + 0);
-        SAVE (&acc1, i + 4);
-        SAVE (&acc2, i + 8);
-        SAVE (&acc3, i + 12);
-        SAVE (&acc4, i + 16);
-        SAVE (&acc5, i + 20);
-        SAVE (&acc6, i + 24);
-        SAVE (&acc7, i + 28);
-    }
-    for (i = tmp; i < n; i += 4)
-    {
-        xo = x1;
-        a0 = a_ptr;
-        __builtin_mma_xxsetaccz (&acc0);
-        for (j = 0; j < 32; j++)
-        {
-            __builtin_prefetch (xo+j);
-            __builtin_prefetch (a0+i+j+lda);
-            MMA (xo[j], a0[i + j * lda], &acc0);
-        }
-        xo += 32;
-        a0 += lda << 5;
-        for (j = 0; j < 32; j++)
-        {
-            __builtin_prefetch (xo+j);
-            __builtin_prefetch (a0+i+j+lda);
-            MMA (xo[j], a0[i + j * lda], &acc0);
-        }
-        xo += 32;
-        a0 += lda << 5;
-        for (j = 0; j < 32; j++)
-        {
-            __builtin_prefetch (xo+j);
-            __builtin_prefetch (a0+i+j+lda);
-            MMA (xo[j], a0[i + j * lda], &acc0);
-        }
-        xo += 32;
-        a0 += lda << 5;
-        for (j = 0; j < 32; j++)
-        {
-            __builtin_prefetch (xo+j);
-            __builtin_prefetch (a0+i+j+lda);
-            MMA (xo[j], a0[i + j * lda], &acc0);
-        }
-        xo += 32;
-        a0 += lda << 5;
-        SAVE (&acc0, i);
-    }
-}
 #define NBMAX 4096
 #ifndef HAVE_KERNEL_4x4
@@ -281,13 +125,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
     FLOAT *a_ptr;
     FLOAT *x_ptr;
     FLOAT *y_ptr;
-    BLASLONG n1;
     BLASLONG m1;
     BLASLONG m2;
     BLASLONG m3;
     BLASLONG n2;
     BLASLONG lda4 = lda << 2;
-    BLASLONG lda128 = lda << 7;
+    BLASLONG lda8 = lda << 3;
     FLOAT xbuffer[8] __attribute__ ((aligned (16)));
     FLOAT *ybuffer;
@@ -296,9 +139,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
     if ( n < 1 ) return(0);
     ybuffer = buffer;
-    BLASLONG n128 = n >> 7;
-    n1 = (n - (n128 * 128)) >> 2;
-    n2 = (n - (n128 * 128)) & 3;
+    BLASLONG n8 = n >> 3;
+    n2 = n & 3;
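+    /* column blocking: n8 blocks of 8 columns for dgemv_kernel_4x8, then
+       at most one block of 4 (n & 4), then n2 = n & 3 leftover columns */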
     m3 = m & 3 ;
     m1 = m & -4 ;
@@ -329,14 +171,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
     if ( inc_x == 1 )
     {
-        for( i = 0; i < n128 ; i++)
+        for( i = 0; i < n8 ; i++)
         {
-            dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
-            a_ptr += lda128;
-            x_ptr += 128;
+            dgemv_kernel_4x8(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
+            a_ptr += lda8;
+            x_ptr += 8;
         }
-        for( i = 0; i < n1 ; i++)
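+        /* fewer than 8 columns remain after the 8-wide loop, so at most
+           one 4-wide pass is needed */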
+        if( n & 4 )
         {
             dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
             a_ptr += lda4;
@@ -363,20 +205,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
     }
     else
     {
-        for( i = 0; i < n128 ; i++)
+        for( i = 0; i < n8 ; i++)
         {
-            FLOAT xbuffer[128] __attribute__ ((aligned (16)));
             BLASLONG j;
-            for ( j = 0; j < 128 ; j++)
+            for ( j = 0; j < 8 ; j++)
             {
                 xbuffer[j] = x_ptr[0];
                 x_ptr += inc_x;
             }
-            dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
-            a_ptr += lda128;
+            dgemv_kernel_4x8(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
+            a_ptr += lda8;
         }
-        for( i = 0; i < n1 ; i++)
+        if( n & 4 )
        {
             xbuffer[0] = x_ptr[0];
             x_ptr += inc_x;