Browse Source

Merge pull request #2190 from martin-frbg/zdot-zen

Replace vpermpd with vpermilpd in the Haswell/Zen zdot microkernel
tags/v0.3.7
Martin Kroeker GitHub 6 years ago
parent
commit
7b0b7c11d2
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 16 additions and 8 deletions
  1. +16
    -8
      kernel/x86_64/zdot_microk_haswell-2.c

+ 16
- 8
kernel/x86_64/zdot_microk_haswell-2.c View File

@@ -66,13 +66,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)

"vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i
"vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i
-"vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
-"vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"
+"vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t"
+"vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t"
+// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
+// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"

"vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i
"vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i
-"vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
-"vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"
+"vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t"
+"vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t"
+// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
+// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"

"vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r
"addq $16 , %0 \n\t"
@@ -151,13 +155,17 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)

"vfmadd231pd %%ymm8 , %%ymm12, %%ymm0 \n\t" // x_r * y_r, x_i * y_i
"vfmadd231pd %%ymm9 , %%ymm13, %%ymm1 \n\t" // x_r * y_r, x_i * y_i
-"vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
-"vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"
+"vpermilpd $0x05 , %%ymm12, %%ymm12 \n\t"
+"vpermilpd $0x05 , %%ymm13, %%ymm13 \n\t"
+// "vpermpd $0xb1 , %%ymm12, %%ymm12 \n\t"
+// "vpermpd $0xb1 , %%ymm13, %%ymm13 \n\t"

"vfmadd231pd %%ymm10, %%ymm14, %%ymm2 \n\t" // x_r * y_r, x_i * y_i
"vfmadd231pd %%ymm11, %%ymm15, %%ymm3 \n\t" // x_r * y_r, x_i * y_i
-"vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
-"vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"
+"vpermilpd $0x05 , %%ymm14, %%ymm14 \n\t"
+"vpermilpd $0x05 , %%ymm15, %%ymm15 \n\t"
+// "vpermpd $0xb1 , %%ymm14, %%ymm14 \n\t"
+// "vpermpd $0xb1 , %%ymm15, %%ymm15 \n\t"

"vfmadd231pd %%ymm8 , %%ymm12, %%ymm4 \n\t" // x_r * y_i, x_i * y_r
"addq $16 , %0 \n\t"


Loading…
Cancel
Save