POWER10: Optimize dgemv_n
@@ -25,14 +25,6 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/03/30 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#define HAVE_KERNEL_4x4 1
static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha)
@@ -266,3 +258,145 @@ static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y
    "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
  );
}
static void dgemv_kernel_4x8 (long n, double *ap, long lda, double *x, double *y, double alpha)
{
  double *a0;
  double *a1;
  double *a2;
  double *a3;
  double *a4;
  double *a5;
  double *a6;
  double *a7;
  long tmp;
  __asm__
  (
    "lxvp 34, 0( %15)  \n\t"  // x0, x1
    "lxvp 38, 32( %15) \n\t"  // x4, x5
    XXSPLTD_S(58,%x14,0)      // alpha, alpha
    "sldi %10, %17, 3  \n\t"  // lda * sizeof (double)
    "xvmuldp 34, 34, 58 \n\t" // x0 * alpha, x1 * alpha
    "xvmuldp 35, 35, 58 \n\t" // x2 * alpha, x3 * alpha
    "xvmuldp 38, 38, 58 \n\t" // x4 * alpha, x5 * alpha
    "xvmuldp 39, 39, 58 \n\t" // x6 * alpha, x7 * alpha
    "li %11, 32        \n\t"
    "add %4, %3, %10   \n\t"  // a0 = ap, a1 = a0 + lda
    "add %10, %10, %10 \n\t"  // 2 * lda
    XXSPLTD_S(32,34,1)        // x0 * alpha, x0 * alpha
    XXSPLTD_S(33,34,0)        // x1 * alpha, x1 * alpha
    XXSPLTD_S(34,35,1)        // x2 * alpha, x2 * alpha
    XXSPLTD_S(35,35,0)        // x3 * alpha, x3 * alpha
    XXSPLTD_S(48,39,1)        // x6 * alpha, x6 * alpha
    XXSPLTD_S(49,39,0)        // x7 * alpha, x7 * alpha
    XXSPLTD_S(39,38,0)        // x5 * alpha, x5 * alpha
    XXSPLTD_S(38,38,1)        // x4 * alpha, x4 * alpha
    "add %5, %3, %10   \n\t"  // a2 = a0 + 2 * lda
    "add %6, %4, %10   \n\t"  // a3 = a1 + 2 * lda
    "add %7, %5, %10   \n\t"  // a4 = a2 + 2 * lda
    "add %8, %6, %10   \n\t"  // a5 = a3 + 2 * lda
    "add %9, %7, %10   \n\t"  // a6 = a4 + 2 * lda
    "add %10, %8, %10  \n\t"  // a7 = a5 + 2 * lda
    "lxvp 40, 0( %3)   \n\t"  // a0[0], a0[1]
    "lxvp 42, 0( %4)   \n\t"  // a1[0], a1[1]
    "lxvp 44, 0( %5)   \n\t"  // a2[0], a2[1]
    "lxvp 46, 0( %6)   \n\t"  // a3[0], a3[1]
| "lxvp 50, 0( %7) \n\t" // a4[0] | |||
| "lxvp 52, 0( %8) \n\t" // a5[0] | |||
| "lxvp 54, 0( %9) \n\t" // a6[0] | |||
| "lxvp 56, 0( %10) \n\t" // a7[0] | |||
| "addic. %1, %1, -4 \n\t" | |||
| "ble two%= \n\t" | |||
| ".align 5 \n" | |||
| "one%=: \n\t" | |||
| "lxvp 36, 0( %2) \n\t" // y0, y1 | |||
| "xvmaddadp 36, 40, 34 \n\t" | |||
| "xvmaddadp 37, 41, 34 \n\t" | |||
| "lxvpx 40, %3, %11 \n\t" // a0[0], a0[1] | |||
| "xvmaddadp 36, 42, 35 \n\t" | |||
| "xvmaddadp 37, 43, 35 \n\t" | |||
| "lxvpx 42, %4, %11 \n\t" // a1[0], a1[1] | |||
| "xvmaddadp 36, 44, 32 \n\t" | |||
| "xvmaddadp 37, 45, 32 \n\t" | |||
| "lxvpx 44, %5, %11 \n\t" // a2[0], a2[1] | |||
| "xvmaddadp 36, 46, 33 \n\t" | |||
| "xvmaddadp 37, 47, 33 \n\t" | |||
| "lxvpx 46, %6, %11 \n\t" // a3[0], a3[1] | |||
| "xvmaddadp 36, 50, 48 \n\t" | |||
| "xvmaddadp 37, 51, 48 \n\t" | |||
| "lxvpx 50, %7, %11 \n\t" // a4[0] | |||
| "xvmaddadp 36, 52, 49 \n\t" | |||
| "xvmaddadp 37, 53, 49 \n\t" | |||
| "lxvpx 52, %8, %11 \n\t" // a5[0] | |||
| "xvmaddadp 36, 54, 38 \n\t" | |||
| "xvmaddadp 37, 55, 38 \n\t" | |||
| "lxvpx 54, %9, %11 \n\t" // a6[0] | |||
| "xvmaddadp 36, 56, 39 \n\t" | |||
| "xvmaddadp 37, 57, 39 \n\t" | |||
| "lxvpx 56, %10, %11 \n\t" // a7[0] | |||
| "addi %11, %11, 32 \n\t" | |||
| "stxvp 36, 0( %2) \n\t" // y0, y1 | |||
| "addi %2, %2, 32 \n\t" | |||
| "addic. %1, %1, -4 \n\t" | |||
| "bgt one%= \n" | |||
| "two%=: \n\t" | |||
| "lxvp 36, 0( %2) \n\t" // y0, y1 | |||
| "xvmaddadp 36, 40, 34 \n\t" | |||
| "xvmaddadp 37, 41, 34 \n\t" | |||
| "xvmaddadp 36, 42, 35 \n\t" | |||
| "xvmaddadp 37, 43, 35 \n\t" | |||
| "xvmaddadp 36, 44, 32 \n\t" | |||
| "xvmaddadp 37, 45, 32 \n\t" | |||
| "xvmaddadp 36, 46, 33 \n\t" | |||
| "xvmaddadp 37, 47, 33 \n\t" | |||
| "xvmaddadp 36, 50, 48 \n\t" | |||
| "xvmaddadp 37, 51, 48 \n\t" | |||
| "xvmaddadp 36, 52, 49 \n\t" | |||
| "xvmaddadp 37, 53, 49 \n\t" | |||
| "xvmaddadp 36, 54, 38 \n\t" | |||
| "xvmaddadp 37, 55, 38 \n\t" | |||
| "xvmaddadp 36, 56, 39 \n\t" | |||
| "xvmaddadp 37, 57, 39 \n\t" | |||
| "stxvp 36, 0( %2) \n\t" // y0, y1 | |||
| : | |||
| "+m" (*y), | |||
| "+r" (n), // 1 | |||
| "+b" (y), // 2 | |||
| "=b" (a0), // 3 | |||
| "=b" (a1), // 4 | |||
| "=&b" (a2), // 5 | |||
| "=&b" (a3), // 6 | |||
| "=&b" (a4), // 7 | |||
| "=&b" (a5), // 8 | |||
| "=&b" (a6), // 9 | |||
| "=&b" (a7), // 10 | |||
| "=b" (tmp) | |||
| : | |||
| "m" (*x), | |||
| "m" (*ap), | |||
| "d" (alpha), // 14 | |||
| "r" (x), // 15 | |||
| "3" (ap), // 16 | |||
| "4" (lda) // 17 | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", "vs48", | |||
| "vs49","vs50","vs51","vs52","vs53","vs54","vs55","vs56", "vs57", "vs58" | |||
  );
}
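For reference, the following scalar sketch is an editorial addition, not part of the patch: the name dgemv_kernel_4x8_ref is hypothetical, and it assumes, as the caller guarantees, that n is a multiple of 4 and that ap points at the first of 8 consecutive columns of A.

/* Editorial sketch only -- not in the patch.  Scalar equivalent of the
   4x8 kernel above: y[0..n-1] += alpha * A[:, 0..7] * x[0..7]. */
static void dgemv_kernel_4x8_ref (long n, double *ap, long lda,
                                  double *x, double *y, double alpha)
{
  long i, k;
  for (i = 0; i < n; i++)           /* n is a multiple of 4 in the caller */
    {
      double t = 0.0;
      for (k = 0; k < 8; k++)       /* column k starts at ap + k * lda */
        t += ap[i + k * lda] * x[k];
      y[i] += alpha * t;
    }
}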
@@ -26,165 +26,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#include <altivec.h>
typedef __vector unsigned char vec_t;
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
typedef __vector_pair __attribute__((aligned(8))) vecp_t;
#include "dgemv_n_microk_power10.c"
#define MMA(X, APTR, ACC) \
  rX = (vec_t *) & X; \
  rowA = *((vecp_t*)((void*)&APTR)); \
  __builtin_mma_xvf64gerpp (ACC, rowA, rX[0]);
#define SAVE(ACC, Z) \
  rowC = (v4sf_t *) &y[Z]; \
  __builtin_mma_disassemble_acc ((void *)result, ACC); \
  result[0][1] = result[1][0]; \
  result[2][1] = result[3][0]; \
  rowC[0] += valpha * result[0]; \
  rowC[1] += valpha * result[2];
void
dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo,
		    FLOAT * y, FLOAT alpha)
{
  BLASLONG i, j, tmp;
  FLOAT *a0 = a_ptr;
  FLOAT *x1 = xo;
  vector double valpha = { alpha, alpha };
  v4sf_t *rowC;
  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  v4sf_t result[4];
  vecp_t rowA;
  vec_t *rX;
  tmp = (n / 32) * 32;
  for (i = 0; i < tmp; i += 32)
    {
      xo = x1;
      a0 = a_ptr;
      __builtin_mma_xxsetaccz (&acc0);
      __builtin_mma_xxsetaccz (&acc1);
      __builtin_mma_xxsetaccz (&acc2);
      __builtin_mma_xxsetaccz (&acc3);
      __builtin_mma_xxsetaccz (&acc4);
      __builtin_mma_xxsetaccz (&acc5);
      __builtin_mma_xxsetaccz (&acc6);
      __builtin_mma_xxsetaccz (&acc7);
      for (j = 0; j < 32; j++)
	{
	  __builtin_prefetch (xo+j);
	  __builtin_prefetch (a0+i+j+lda);
	  MMA (xo[j], a0[i + 0 + j * lda], &acc0);
	  MMA (xo[j], a0[i + 4 + j * lda], &acc1);
	  MMA (xo[j], a0[i + 8 + j * lda], &acc2);
	  MMA (xo[j], a0[i + 12 + j * lda], &acc3);
	  MMA (xo[j], a0[i + 16 + j * lda], &acc4);
	  MMA (xo[j], a0[i + 20 + j * lda], &acc5);
	  MMA (xo[j], a0[i + 24 + j * lda], &acc6);
	  MMA (xo[j], a0[i + 28 + j * lda], &acc7);
	}
      xo += 32;
      a0 += lda << 5;
      for (j = 0; j < 32; j++)
	{
	  __builtin_prefetch (xo+j);
	  __builtin_prefetch (a0+i+j+lda);
	  MMA (xo[j], a0[i + 0 + j * lda], &acc0);
	  MMA (xo[j], a0[i + 4 + j * lda], &acc1);
	  MMA (xo[j], a0[i + 8 + j * lda], &acc2);
	  MMA (xo[j], a0[i + 12 + j * lda], &acc3);
	  MMA (xo[j], a0[i + 16 + j * lda], &acc4);
	  MMA (xo[j], a0[i + 20 + j * lda], &acc5);
	  MMA (xo[j], a0[i + 24 + j * lda], &acc6);
	  MMA (xo[j], a0[i + 28 + j * lda], &acc7);
	}
      xo += 32;
      a0 += lda << 5;
      for (j = 0; j < 32; j++)
	{
	  __builtin_prefetch (xo+j);
	  __builtin_prefetch (a0+i+j+lda);
	  MMA (xo[j], a0[i + 0 + j * lda], &acc0);
	  MMA (xo[j], a0[i + 4 + j * lda], &acc1);
	  MMA (xo[j], a0[i + 8 + j * lda], &acc2);
	  MMA (xo[j], a0[i + 12 + j * lda], &acc3);
	  MMA (xo[j], a0[i + 16 + j * lda], &acc4);
	  MMA (xo[j], a0[i + 20 + j * lda], &acc5);
	  MMA (xo[j], a0[i + 24 + j * lda], &acc6);
	  MMA (xo[j], a0[i + 28 + j * lda], &acc7);
	}
      xo += 32;
      a0 += lda << 5;
      for (j = 0; j < 32; j++)
	{
	  __builtin_prefetch (xo+j);
	  __builtin_prefetch (a0+i+j+lda);
	  MMA (xo[j], a0[i + 0 + j * lda], &acc0);
	  MMA (xo[j], a0[i + 4 + j * lda], &acc1);
	  MMA (xo[j], a0[i + 8 + j * lda], &acc2);
	  MMA (xo[j], a0[i + 12 + j * lda], &acc3);
	  MMA (xo[j], a0[i + 16 + j * lda], &acc4);
	  MMA (xo[j], a0[i + 20 + j * lda], &acc5);
	  MMA (xo[j], a0[i + 24 + j * lda], &acc6);
	  MMA (xo[j], a0[i + 28 + j * lda], &acc7);
	}
      xo += 32;
      a0 += lda << 5;
      SAVE (&acc0, i + 0);
      SAVE (&acc1, i + 4);
      SAVE (&acc2, i + 8);
      SAVE (&acc3, i + 12);
      SAVE (&acc4, i + 16);
      SAVE (&acc5, i + 20);
      SAVE (&acc6, i + 24);
      SAVE (&acc7, i + 28);
    }
  for (i = tmp; i < n; i += 4)
    {
      xo = x1;
      a0 = a_ptr;
      __builtin_mma_xxsetaccz (&acc0);
      for (j = 0; j < 32; j++)
	{
	  __builtin_prefetch (xo+j);
	  __builtin_prefetch (a0+i+j+lda);
	  MMA (xo[j], a0[i + j * lda], &acc0);
	}
      xo += 32;
      a0 += lda << 5;
      for (j = 0; j < 32; j++)
	{
	  __builtin_prefetch (xo+j);
	  __builtin_prefetch (a0+i+j+lda);
	  MMA (xo[j], a0[i + j * lda], &acc0);
	}
      xo += 32;
      a0 += lda << 5;
      for (j = 0; j < 32; j++)
	{
	  __builtin_prefetch (xo+j);
	  __builtin_prefetch (a0+i+j+lda);
	  MMA (xo[j], a0[i + j * lda], &acc0);
	}
      xo += 32;
      a0 += lda << 5;
      for (j = 0; j < 32; j++)
	{
	  __builtin_prefetch (xo+j);
	  __builtin_prefetch (a0+i+j+lda);
	  MMA (xo[j], a0[i + j * lda], &acc0);
	}
      xo += 32;
      a0 += lda << 5;
      SAVE (&acc0, i);
    }
}
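The removed MMA path above did the same job over 128 columns at a time, accumulating 4x2 rank-1 updates with __builtin_mma_xvf64gerpp; reading the SAVE macro, only column 0 of each accumulator reaches y. A hedged scalar equivalent, added here for orientation only (the _ref name is hypothetical):

/* Editorial sketch only -- not in the patch.  Net effect of the removed
   dgemv_kernel_4x128: each row accumulates a dot product over the 128
   columns starting at a_ptr, scaled by alpha. */
static void dgemv_kernel_4x128_ref (BLASLONG n, FLOAT *a_ptr, BLASLONG lda,
                                    FLOAT *xo, FLOAT *y, FLOAT alpha)
{
  BLASLONG i, j;
  for (i = 0; i < n; i++)
    {
      FLOAT t = 0.0;
      for (j = 0; j < 128; j++)
        t += a_ptr[i + j * lda] * xo[j];
      y[i] += alpha * t;
    }
}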
#define NBMAX 4096
#ifndef HAVE_KERNEL_4x4
@@ -281,13 +125,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
	FLOAT *a_ptr;
	FLOAT *x_ptr;
	FLOAT *y_ptr;
	BLASLONG n1;
	BLASLONG m1;
	BLASLONG m2;
	BLASLONG m3;
	BLASLONG n2;
	BLASLONG lda4 = lda << 2;
	BLASLONG lda128 = lda << 7;
	BLASLONG lda8 = lda << 3;
	FLOAT xbuffer[8] __attribute__ ((aligned (16)));
	FLOAT *ybuffer;
@@ -296,9 +139,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
	if ( n < 1 ) return(0);
	ybuffer = buffer;
	BLASLONG n128 = n >> 7;
	n1 = (n - (n128 * 128)) >> 2;
	n2 = (n - (n128 * 128)) & 3;
	BLASLONG n8 = n >> 3;
	n2 = n & 3;
	m3 = m & 3 ;
	m1 = m & -4 ;
@@ -329,14 +171,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
	if ( inc_x == 1 )
	{
		for( i = 0; i < n128 ; i++)
		for( i = 0; i < n8 ; i++)
		{
			dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
			a_ptr += lda128;
			x_ptr += 128;
			dgemv_kernel_4x8(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
			a_ptr += lda8;
			x_ptr += 8;
		}
		for( i = 0; i < n1 ; i++)
		if( n & 4 )
		{
			dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
			a_ptr += lda4;
@@ -363,20 +205,19 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
	}
	else
	{
		for( i = 0; i < n128 ; i++)
		for( i = 0; i < n8 ; i++)
		{
			FLOAT xbuffer[128] __attribute__ ((aligned (16)));
			BLASLONG j;
			for ( j = 0; j < 128 ; j++)
			for ( j = 0; j < 8 ; j++)
			{
				xbuffer[j] = x_ptr[0];
				x_ptr += inc_x;
			}
			dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
			a_ptr += lda128;
			dgemv_kernel_4x8(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
			a_ptr += lda8;
		}
		for( i = 0; i < n1 ; i++)
		if( n & 4 )
		{
			xbuffer[0] = x_ptr[0];
			x_ptr += inc_x;