Browse Source

POWER10: Improve axpy performance

This patch aligns the stores to a 32-byte boundary for saxpy and daxpy
before entering the vector-pair loop. For caxpy, the store instructions
are changed to stxv to improve performance in unaligned cases.
tags/v0.3.13^2
Rajalakshmi Srinivasaraghavan 5 years ago
parent
commit
346e30a46a
3 changed files with 38 additions and 17 deletions
  1. +16
    -8
      kernel/power/caxpy_microk_power10.c
  2. +12
    -5
      kernel/power/daxpy_power10.c
  3. +10
    -4
      kernel/power/saxpy_power10.c

+ 16
- 8
kernel/power/caxpy_microk_power10.c View File

@@ -112,10 +112,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"xvmaddasp 38, 58, 33 \n\t"
"xvmaddasp 39, 59, 33 \n\t"

"stxvp 48, 0(%4) \n\t"
"stxvp 50, 32(%4) \n\t"
"stxvp 34, 64(%4) \n\t"
"stxvp 38, 96(%4) \n\t"
"stxv 49, 0(%4) \n\t"
"stxv 48, 16(%4) \n\t"
"stxv 51, 32(%4) \n\t"
"stxv 50, 48(%4) \n\t"
"stxv 35, 64(%4) \n\t"
"stxv 34, 80(%4) \n\t"
"stxv 39, 96(%4) \n\t"
"stxv 38, 112(%4) \n\t"

"addi %4, %4, 128 \n\t"
"xxperm 52, 40, %x10 \n\t" // exchange real and imag part
@@ -159,10 +163,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y,
"xvmaddasp 38, 58, 33 \n\t"
"xvmaddasp 39, 59, 33 \n\t"

"stxvp 48, 0(%4) \n\t"
"stxvp 50, 32(%4) \n\t"
"stxvp 34, 64(%4) \n\t"
"stxvp 38, 96(%4) \n\t"
"stxv 49, 0(%4) \n\t"
"stxv 48, 16(%4) \n\t"
"stxv 51, 32(%4) \n\t"
"stxv 50, 48(%4) \n\t"
"stxv 35, 64(%4) \n\t"
"stxv 34, 80(%4) \n\t"
"stxv 39, 96(%4) \n\t"
"stxv 38, 112(%4) \n\t"

"#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n"
:


+ 12
- 5
kernel/power/daxpy_power10.c View File

@@ -66,12 +66,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( (inc_x == 1) && (inc_y == 1) )
{

BLASLONG n1 = n & -16;
if ( n >= 16 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3;
for (i = 0; i < align; i++) {
y[i] += da * x[i] ;
}
}
BLASLONG n1 = (n-i) & -16;
if ( n1 )
daxpy_kernel_8(n1, &x[i], &y[i], da);

i += n1;

if ( n1 )
daxpy_kernel_8(n1, x, y, da);

i = n1;
while(i < n)
{



+ 10
- 4
kernel/power/saxpy_power10.c View File

@@ -64,12 +64,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
if ( (inc_x == 1) && (inc_y == 1) )
{

BLASLONG n1 = n & -64;

if ( n >= 64 )
{
BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7;
for (i = 0; i < align; i++) {
y[i] += da * x[i] ;
}
}
BLASLONG n1 = (n-i) & -64;
if ( n1 )
saxpy_kernel_64(n1, x, y, da);
saxpy_kernel_64(n1, &x[i], &y[i], da);

i = n1;
i += n1;
while(i < n)
{



Loading…
Cancel
Save