Browse Source

optimized dgemv_t kernel for haswell

tags/v0.2.12^2
wernsaar 11 years ago
parent
commit
8109d8232c
2 changed files with 12 additions and 20 deletions
  1. kernel/x86_64/dgemv_t_4.c (+1, -9)
  2. kernel/x86_64/dgemv_t_microk_haswell-4.c (+11, -11)

+ 1
- 9
kernel/x86_64/dgemv_t_4.c View File

@@ -28,17 +28,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "common.h"

-/*
-#if defined(NEHALEM)
-#include "dgemv_t_microk_nehalem-4.c"
-#elif defined(BULLDOZER) || defined(PILEDRIVER)
-#include "dgemv_t_microk_bulldozer-4.c"
-#elif defined(SANDYBRIDGE)
-#include "dgemv_t_microk_sandy-4.c"
-#elif defined(HASWELL)
+#if defined(HASWELL)
 #include "dgemv_t_microk_haswell-4.c"
 #endif
-*/

#define NBMAX 2048



+ 11
- 11
kernel/x86_64/dgemv_t_microk_haswell-4.c View File

@@ -61,25 +61,25 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)

 ".align 16 \n\t"
 ".L01LOOP%=: \n\t"
-"prefetcht0 384(%2,%0,8) \n\t"
+// "prefetcht0 384(%2,%0,8) \n\t"
 "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
 "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x

-"prefetcht0 384(%4,%0,8) \n\t"
+// "prefetcht0 384(%4,%0,8) \n\t"
 "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
 "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t"
-"prefetcht0 384(%5,%0,8) \n\t"
-"vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t"
-"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
-"prefetcht0 384(%6,%0,8) \n\t"
+// "prefetcht0 384(%5,%0,8) \n\t"
 "vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t"
 "vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t"
-"prefetcht0 384(%7,%0,8) \n\t"
-"vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm6 \n\t"
-"vfmadd231pd 32(%7,%0,8), %%ymm13, %%ymm7 \n\t"
+// "prefetcht0 384(%6,%0,8) \n\t"
+"vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t"
+"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
+"addq $8 , %0 \n\t"
+// "prefetcht0 384(%7,%0,8) \n\t"
+"vfmadd231pd -32(%6,%0,8), %%ymm13, %%ymm6 \n\t"
+"subq $8 , %1 \n\t"
+"vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t"

-"addq $8 , %0 \n\t"
-"subq $8 , %1 \n\t"
 "jnz .L01LOOP%= \n\t"

".L16END%=: \n\t"


Loading…
Cancel
Save