
Merge pull request #473 from wernsaar/develop

changed inline assembler labels to short form
tags/v0.2.14^2
Zhang Xianyi, 11 years ago
commit 8fe7a9ce6f
38 changed files with 241 additions and 241 deletions
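
All 241 changed lines follow the same mechanical pattern: the uniquely-suffixed GNU-as labels of the form `.L01LOOP%=:` (made unique per asm statement by GCC's `%=` token) become numeric local labels such as `1:`, and the jumps to them use the short `1b` / `2f` form, which binds to the nearest matching label backward or forward. Numeric local labels may be repeated freely within a translation unit, so the same numbers can be reused in every kernel and stay valid even if the compiler duplicates or inlines an asm block. Below is a minimal sketch of the pattern in GCC/Clang extended asm for x86-64; the function, loop body, and operand layout are invented for illustration and do not come from any file in this pull request.

```c
#include <stddef.h>

// Illustrative only: a trivial store loop written with the short numeric
// labels this commit switches to. "1:" plays the role of the old
// ".L01LOOP%=:" loop label and "2:" the role of an end label such as
// ".L01END%=:". Requires an x86-64 target; n must be even.
static void zero_pairs(double *x, size_t n)   // hypothetical helper, not from the PR
{
    size_t i = 0;

    __asm__ __volatile__
    (
        "xorpd   %%xmm0, %%xmm0      \n\t"
        "cmpq    $0, %1              \n\t"
        "je      2f                  \n\t"   // old form: "je .L01END%="

        ".align 16                   \n\t"
        "1:                          \n\t"   // old form: ".L01LOOP%=:"
        "movups  %%xmm0, (%2,%0,8)   \n\t"   // store two zeroed doubles
        "addq    $2, %0              \n\t"
        "subq    $2, %1              \n\t"
        "jnz     1b                  \n\t"   // old form: "jnz .L01LOOP%="

        "2:                          \n\t"   // old form: ".L01END%=:"
        : "+r" (i), "+r" (n)                 // %0, %1
        : "r" (x)                            // %2
        : "xmm0", "memory", "cc"
    );
}
```

The relevant property is that `1b` and `2f` always resolve to the nearest preceding `1:` or following `2:`, so label names cannot collide even when the same asm text is emitted many times in one object file, which is what the longer `%=`-based names previously had to guarantee.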
  1. +2  -2   kernel/x86_64/caxpy_microk_bulldozer-2.c
  2. +24 -24  kernel/x86_64/cgemv_n_microk_haswell-4.c
  3. +18 -18  kernel/x86_64/cgemv_t_microk_haswell-4.c
  4. +2  -2   kernel/x86_64/daxpy_microk_bulldozer-2.c
  5. +2  -2   kernel/x86_64/daxpy_microk_nehalem-2.c
  6. +2  -2   kernel/x86_64/ddot_microk_bulldozer-2.c
  7. +2  -2   kernel/x86_64/ddot_microk_nehalem-2.c
  8. +4  -4   kernel/x86_64/dgemv_n_4.c
  9. +12 -12  kernel/x86_64/dgemv_n_microk_haswell-4.c
 10. +4  -4   kernel/x86_64/dgemv_n_microk_nehalem-4.c
 11. +14 -14  kernel/x86_64/dgemv_t_4.c
 12. +6  -6   kernel/x86_64/dgemv_t_microk_haswell-4.c
 13. +2  -2   kernel/x86_64/dsymv_L_microk_bulldozer-2.c
 14. +2  -2   kernel/x86_64/dsymv_L_microk_nehalem-2.c
 15. +2  -2   kernel/x86_64/dsymv_U_microk_bulldozer-2.c
 16. +2  -2   kernel/x86_64/dsymv_U_microk_nehalem-2.c
 17. +2  -2   kernel/x86_64/saxpy_microk_nehalem-2.c
 18. +2  -2   kernel/x86_64/sdot_microk_bulldozer-2.c
 19. +2  -2   kernel/x86_64/sdot_microk_nehalem-2.c
 20. +11 -11  kernel/x86_64/sgemv_n_4.c
 21. +10 -10  kernel/x86_64/sgemv_n_microk_bulldozer-4.c
 22. +16 -16  kernel/x86_64/sgemv_n_microk_haswell-4.c
 23. +4  -4   kernel/x86_64/sgemv_n_microk_nehalem-4.c
 24. +16 -16  kernel/x86_64/sgemv_n_microk_sandy-4.c
 25. +14 -14  kernel/x86_64/sgemv_t_4.c
 26. +8  -8   kernel/x86_64/sgemv_t_microk_bulldozer-4.c
 27. +8  -8   kernel/x86_64/sgemv_t_microk_haswell-4.c
 28. +2  -2   kernel/x86_64/sgemv_t_microk_nehalem-4.c
 29. +8  -8   kernel/x86_64/sgemv_t_microk_sandy-4.c
 30. +2  -2   kernel/x86_64/ssymv_L_microk_bulldozer-2.c
 31. +2  -2   kernel/x86_64/ssymv_L_microk_nehalem-2.c
 32. +2  -2   kernel/x86_64/ssymv_U_microk_bulldozer-2.c
 33. +2  -2   kernel/x86_64/ssymv_U_microk_nehalem-2.c
 34. +2  -2   kernel/x86_64/zaxpy_microk_bulldozer-2.c
 35. +8  -8   kernel/x86_64/zgemv_n_microk_haswell-4.c
 36. +8  -8   kernel/x86_64/zgemv_n_microk_sandy-4.c
 37. +6  -6   kernel/x86_64/zgemv_t_microk_bulldozer-4.c
 38. +6  -6   kernel/x86_64/zgemv_t_microk_haswell-4.c

+2 -2  kernel/x86_64/caxpy_microk_bulldozer-2.c

@@ -40,7 +40,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "prefetcht0 768(%2,%0,4) \n\t"
 "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x
@@ -113,7 +113,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)

 "addq $16, %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :


+24 -24  kernel/x86_64/cgemv_n_microk_haswell-4.c

@@ -49,10 +49,10 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3

 "cmpq $0 , %1 \n\t"
-"je .L01END%= \n\t"
+"je 2f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 320(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
 "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
@@ -115,12 +115,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)

 "addq $16, %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L01END%=: \n\t"
+"2: \n\t"

 "cmpq $4, %8 \n\t"
-"jne .L02END%= \n\t"
+"jne 3f \n\t"

 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
 "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
@@ -155,7 +155,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)

 "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y

-".L02END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"

 :
@@ -200,10 +200,10 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1

 "cmpq $0 , %1 \n\t"
-"je .L01END%= \n\t"
+"je 2f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 320(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
 "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
@@ -248,12 +248,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)

 "addq $16, %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L01END%=: \n\t"
+"2: \n\t"

 "cmpq $4, %6 \n\t"
-"jne .L02END%= \n\t"
+"jne 3f \n\t"

 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
 "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
@@ -279,7 +279,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)

 "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y

-".L02END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"

 :
@@ -320,10 +320,10 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0

 "cmpq $0 , %1 \n\t"
-"je .L01END%= \n\t"
+"je 2f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 320(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
 "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
@@ -359,12 +359,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
 "vmovups %%ymm13,-32(%3,%0,4) \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L01END%=: \n\t"
+"2: \n\t"

 "cmpq $4, %5 \n\t"
-"jne .L02END%= \n\t"
+"jne 3f \n\t"

 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0

@@ -386,7 +386,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)

 "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y

-".L02END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"

 :
@@ -452,10 +452,10 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 "vbroadcastss (%5), %%ymm1 \n\t" // alpha_i

 "cmpq $0 , %1 \n\t"
-"je .L01END%= \n\t"
+"je 2f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src
 "vmovups 32(%2,%0,4), %%ymm9 \n\t"

@@ -489,12 +489,12 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
 "vmovups %%ymm13,-32(%3,%0,4) \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L01END%=: \n\t"
+"2: \n\t"

 "cmpq $4, %6 \n\t"
-"jne .L02END%= \n\t"
+"jne 3f \n\t"

 "vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values src

@@ -516,7 +516,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a

 "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y

-".L02END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"

 :


+18 -18  kernel/x86_64/cgemv_t_microk_haswell-4.c

@@ -47,7 +47,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
 "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
@@ -72,12 +72,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L08END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
 "prefetcht0 192(%5,%0,4) \n\t"
@@ -125,9 +125,9 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $16 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L08END%=: \n\t"
+"3: \n\t"

 "vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha
 "vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha
@@ -269,7 +269,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
 "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
@@ -288,12 +288,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L08END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
 "prefetcht0 192(%5,%0,4) \n\t"
@@ -325,9 +325,9 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $16 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L08END%=: \n\t"
+"3: \n\t"

 "vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha
 "vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha
@@ -426,7 +426,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0

@@ -442,12 +442,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L08END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0

@@ -472,9 +472,9 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *

 "addq $16 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L08END%=: \n\t"
+"3: \n\t"

 "vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha
 "vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha


+2 -2  kernel/x86_64/daxpy_microk_bulldozer-2.c

@@ -39,7 +39,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "vmovddup (%4), %%xmm0 \n\t" // alpha

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "prefetcht0 768(%3,%0,8) \n\t"
 "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
@@ -61,7 +61,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)

 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :


+2 -2  kernel/x86_64/daxpy_microk_nehalem-2.c

@@ -40,7 +40,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "shufpd $0, %%xmm0, %%xmm0 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 // "prefetcht0 192(%2,%0,8) \n\t"
 // "prefetcht0 192(%3,%0,8) \n\t"

@@ -70,7 +70,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)

 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :


+2 -2  kernel/x86_64/ddot_microk_bulldozer-2.c

@@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
 "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
 "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
@@ -55,7 +55,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)

 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"
 "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t"


+2 -2  kernel/x86_64/ddot_microk_nehalem-2.c

@@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 "xorpd %%xmm7, %%xmm7 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%2,%0,8), %%xmm12 \n\t" // 2 * x
 "movups (%3,%0,8), %%xmm8 \n\t" // 2 * y
@@ -65,7 +65,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)

 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "addpd %%xmm5, %%xmm4 \n\t"
 "addpd %%xmm7, %%xmm6 \n\t"


+4 -4  kernel/x86_64/dgemv_n_4.c

@@ -125,7 +125,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "shufpd $0, %%xmm13, %%xmm13 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
 "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y

@@ -148,7 +148,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -187,7 +187,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 "shufpd $0, %%xmm12, %%xmm12 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%4,%0,8), %%xmm8 \n\t" // 2 * a
 "movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a
 "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
@@ -203,7 +203,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :


+12 -12  kernel/x86_64/dgemv_n_microk_haswell-4.c

@@ -50,7 +50,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "vbroadcastsd (%9), %%ymm6 \n\t" // alpha

 "testq $0x04, %1 \n\t"
-"jz .L8LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
 "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
@@ -77,14 +77,14 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L8LABEL%=: \n\t"
+"2: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
@@ -118,9 +118,9 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "subq $8 , %1 \n\t"
 "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"

 :
@@ -168,7 +168,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vbroadcastsd (%8), %%ymm6 \n\t" // alpha

 "testq $0x04, %1 \n\t"
-"jz .L8LABEL%= \n\t"
+"jz 2f \n\t"

 "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
@@ -188,14 +188,14 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L8LABEL%=: \n\t"
+"2: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L8END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
 "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
@@ -218,9 +218,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L8END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"

 :


+4 -4  kernel/x86_64/dgemv_n_microk_nehalem-4.c

@@ -60,7 +60,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "xorpd %%xmm4 , %%xmm4 \n\t"
 "xorpd %%xmm5 , %%xmm5 \n\t"
 "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
@@ -142,7 +142,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -194,7 +194,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "shufpd $0, %%xmm6 , %%xmm6 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "xorpd %%xmm4 , %%xmm4 \n\t"
 "xorpd %%xmm5 , %%xmm5 \n\t"
 "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
@@ -239,7 +239,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :


+14 -14  kernel/x86_64/dgemv_t_4.c

@@ -78,7 +78,7 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 "xorpd %%xmm11 , %%xmm11 \n\t"
 "testq $2 , %1 \n\t"
-"jz .L01LABEL%= \n\t"
+"jz 2f \n\t"

 "movups (%5,%0,8) , %%xmm14 \n\t" // x
 "movups (%3,%0,8) , %%xmm12 \n\t" // ap0
@@ -90,13 +90,13 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 "subq $2 , %1 \n\t"
 "addpd %%xmm13 , %%xmm11 \n\t"

-".L01LABEL%=: \n\t"
+"2: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L01END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%5,%0,8) , %%xmm14 \n\t" // x
 "movups (%3,%0,8) , %%xmm12 \n\t" // ap0
@@ -116,9 +116,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT

 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L01END%=: \n\t"
+"3: \n\t"

 "haddpd %%xmm10, %%xmm10 \n\t"
 "haddpd %%xmm11, %%xmm11 \n\t"
@@ -157,7 +157,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "xorpd %%xmm10 , %%xmm10 \n\t"
 "testq $2 , %1 \n\t"
-"jz .L01LABEL%= \n\t"
+"jz 2f \n\t"

 "movups (%3,%0,8) , %%xmm12 \n\t"
 "movups (%4,%0,8) , %%xmm11 \n\t"
@@ -166,13 +166,13 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "addpd %%xmm12 , %%xmm10 \n\t"
 "subq $2 , %1 \n\t"

-".L01LABEL%=: \n\t"
+"2: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L01END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%3,%0,8) , %%xmm12 \n\t"
 "movups 16(%3,%0,8) , %%xmm14 \n\t"
@@ -185,9 +185,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "subq $4 , %1 \n\t"
 "addpd %%xmm14 , %%xmm9 \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L01END%=: \n\t"
+"3: \n\t"

 "addpd %%xmm9 , %%xmm10 \n\t"
 "haddpd %%xmm10, %%xmm10 \n\t"
@@ -246,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
 "shufpd $0 , %%xmm10 , %%xmm10 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%3,%0,8) , %%xmm12 \n\t"
 "movups (%4,%0,8) , %%xmm11 \n\t"
@@ -256,7 +256,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
 "subq $2 , %1 \n\t"
 "movups %%xmm11, -16(%4,%0,8) \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :


+6 -6  kernel/x86_64/dgemv_t_microk_haswell-4.c

@@ -42,7 +42,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t"

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x

@@ -54,13 +54,13 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 // "prefetcht0 384(%2,%0,8) \n\t"
 "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
 "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x
@@ -80,9 +80,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "subq $8 , %1 \n\t"
 "vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"3: \n\t"

 "vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
 "vextractf128 $1 , %%ymm5, %%xmm13 \n\t"


+2 -2  kernel/x86_64/dsymv_L_microk_bulldozer-2.c

@@ -44,7 +44,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "vmovddup 24(%8), %%xmm7 \n\t" // temp1[1]

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
 "vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
@@ -90,7 +90,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "vmovups %%xmm11 , -16(%3,%0,8) \n\t"

 "cmpq %0 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "vmovsd (%9), %%xmm4 \n\t"
 "vmovsd 8(%9), %%xmm5 \n\t"


+2 -2  kernel/x86_64/dsymv_L_microk_nehalem-2.c

@@ -48,7 +48,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "shufpd $0, %%xmm7, %%xmm7 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
 "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
 "movups %%xmm12 , %%xmm11 \n\t"
@@ -85,7 +85,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y

 "cmpq %0 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "movsd (%9), %%xmm4 \n\t" // temp1[0]
 "movsd 8(%9), %%xmm5 \n\t" // temp1[1]


+2 -2  kernel/x86_64/dsymv_U_microk_bulldozer-2.c

@@ -47,7 +47,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "xorq %0,%0 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
 "vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
@@ -93,7 +93,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "vmovups %%xmm9 , -32(%3,%0,8) \n\t"
 "vmovups %%xmm11 , -16(%3,%0,8) \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
 "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"


+2 -2  kernel/x86_64/dsymv_U_microk_nehalem-2.c

@@ -51,7 +51,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "xorq %0,%0 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
 "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
 "movups %%xmm12 , %%xmm11 \n\t"
@@ -88,7 +88,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y

 "subq $2 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "haddpd %%xmm0, %%xmm0 \n\t"
 "haddpd %%xmm1, %%xmm1 \n\t"


+2 -2  kernel/x86_64/saxpy_microk_nehalem-2.c

@@ -40,7 +40,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "shufps $0, %%xmm0, %%xmm0 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 // "prefetcht0 192(%2,%0,4) \n\t"
 // "prefetcht0 192(%3,%0,4) \n\t"

@@ -70,7 +70,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)

 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :


+2 -2  kernel/x86_64/sdot_microk_bulldozer-2.c

@@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
 "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
 "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
@@ -55,7 +55,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)

 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"
 "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t"


+2 -2  kernel/x86_64/sdot_microk_nehalem-2.c

@@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 "xorps %%xmm7, %%xmm7 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
 "movups (%3,%0,4), %%xmm8 \n\t" // 4 * x
 "movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
@@ -64,7 +64,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)

 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "addps %%xmm5, %%xmm4 \n\t"
 "addps %%xmm7, %%xmm6 \n\t"


+11 -11  kernel/x86_64/sgemv_n_4.c

@@ -129,7 +129,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "shufps $0, %%xmm13, %%xmm13 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y

 "movups (%4,%0,4), %%xmm8 \n\t"
@@ -143,7 +143,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y

 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -166,7 +166,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 #endif

-#ifndef HAVE_KERNEL_4x2
+#ifndef HAVE_KERNEL_4x1

 static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

@@ -184,10 +184,10 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 "shufps $0, %%xmm12, %%xmm12 \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 2f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
 "movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y
 "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
@@ -203,12 +203,12 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a

 "subq $8 , %1 \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"2: \n\t"

 "testq $0x04, %5 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 3f \n\t"

 "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
 "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
@@ -218,7 +218,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"3: \n\t"
 :
 :
 "r" (i), // 0
@@ -262,7 +262,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
 (

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%2,%0,4) , %%xmm12 \n\t"
 "movups (%3,%0,4) , %%xmm11 \n\t"
@@ -271,7 +271,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
 "movups %%xmm11, -16(%3,%0,4) \n\t"

 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :


+10 -10  kernel/x86_64/sgemv_n_microk_bulldozer-4.c

@@ -49,7 +49,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "vbroadcastss (%9), %%xmm8 \n\t" // alpha

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
 "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@@ -71,10 +71,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "subq $4 , %1 \n\t"
 "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"

 "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
 "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@@ -107,13 +107,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "subq $8 , %1 \n\t"

-".L16LABEL%=: \n\t"
+"3: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
 "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@@ -178,9 +178,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y

 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"4: \n\t"

 :
 :
@@ -227,7 +227,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vbroadcastss (%8), %%xmm8 \n\t" // alpha

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
 "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"

@@ -243,7 +243,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :


+16 -16  kernel/x86_64/sgemv_n_microk_haswell-4.c

@@ -50,7 +50,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "vbroadcastss (%9), %%ymm6 \n\t" // alpha

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
 "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
@@ -76,10 +76,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"

 "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
 "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
@@ -106,14 +106,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"

-".L16LABEL%=: \n\t"
+"3: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
@@ -147,9 +147,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "subq $16, %1 \n\t"
 "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"4: \n\t"
 "vzeroupper \n\t"

 :
@@ -197,7 +197,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vbroadcastss (%8), %%ymm6 \n\t" // alpha

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
@@ -217,10 +217,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"

 "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
@@ -240,14 +240,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"

-".L16LABEL%=: \n\t"
+"3: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
 "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
@@ -270,9 +270,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"4: \n\t"
 "vzeroupper \n\t"

 :


+4 -4  kernel/x86_64/sgemv_n_microk_nehalem-4.c

@@ -60,7 +60,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "xorps %%xmm4 , %%xmm4 \n\t"
 "xorps %%xmm5 , %%xmm5 \n\t"
 "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
@@ -103,7 +103,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

 "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :
@@ -155,7 +155,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "shufps $0, %%xmm6 , %%xmm6 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "xorps %%xmm4 , %%xmm4 \n\t"
 "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y

@@ -178,7 +178,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addps %%xmm7 , %%xmm11 \n\t"
 "movups %%xmm11, -16(%3,%0,4) \n\t" // 4 * y

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :


+16 -16  kernel/x86_64/sgemv_n_microk_sandy-4.c

@@ -51,7 +51,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "vbroadcastss (%9), %%ymm6 \n\t" // alpha

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
 "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
@@ -85,10 +85,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "addq $4, %0 \n\t"
 "subq $4, %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"

 "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
@@ -123,14 +123,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "subq $8, %1 \n\t"

-".L16LABEL%=: \n\t"
+"3: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"

@@ -190,9 +190,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "addq $16, %8 \n\t"
 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"4: \n\t"
 "vzeroupper \n\t"

 :
@@ -241,7 +241,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vbroadcastss (%8), %%ymm6 \n\t" // alpha

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
@@ -265,10 +265,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $4, %0 \n\t"
 "subq $4, %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"

 "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
@@ -293,14 +293,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "subq $8, %1 \n\t"

-".L16LABEL%=: \n\t"
+"3: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
 "vmovups (%3,%0,4), %%ymm0 \n\t" // 8 * y
@@ -339,9 +339,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"4: \n\t"
 "vzeroupper \n\t"

 :


+14 -14  kernel/x86_64/sgemv_t_4.c

@@ -84,7 +84,7 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 "xorps %%xmm11 , %%xmm11 \n\t"
 "testq $4 , %1 \n\t"
-"jz .L01LABEL%= \n\t"
+"jz 2f \n\t"

 "movups (%5,%0,4) , %%xmm14 \n\t" // x
 "movups (%3,%0,4) , %%xmm12 \n\t" // ap0
@@ -96,13 +96,13 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 "subq $4 , %1 \n\t"
 "addps %%xmm13 , %%xmm11 \n\t"

-".L01LABEL%=: \n\t"
+"2: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L01END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%5,%0,4) , %%xmm14 \n\t" // x
 "movups (%3,%0,4) , %%xmm12 \n\t" // ap0
@@ -122,9 +122,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT

 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L01END%=: \n\t"
+"3: \n\t"

 "haddps %%xmm10, %%xmm10 \n\t"
 "haddps %%xmm11, %%xmm11 \n\t"
@@ -165,7 +165,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "xorps %%xmm10 , %%xmm10 \n\t"
 "testq $4 , %1 \n\t"
-"jz .L01LABEL%= \n\t"
+"jz 2f \n\t"

 "movups (%3,%0,4) , %%xmm12 \n\t"
 "movups (%4,%0,4) , %%xmm11 \n\t"
@@ -174,13 +174,13 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "addps %%xmm12 , %%xmm10 \n\t"
 "subq $4 , %1 \n\t"

-".L01LABEL%=: \n\t"
+"2: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L01END%= \n\t"
+"je 3f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%3,%0,4) , %%xmm12 \n\t"
 "movups 16(%3,%0,4) , %%xmm14 \n\t"
@@ -193,9 +193,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "subq $8 , %1 \n\t"
 "addps %%xmm14 , %%xmm9 \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L01END%=: \n\t"
+"3: \n\t"

 "addps %%xmm9 , %%xmm10 \n\t"
 "haddps %%xmm10, %%xmm10 \n\t"
@@ -255,7 +255,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
 "shufps $0 , %%xmm10 , %%xmm10 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%3,%0,4) , %%xmm12 \n\t"
 "movups (%4,%0,4) , %%xmm11 \n\t"
@@ -265,7 +265,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
 "subq $4 , %1 \n\t"
 "movups %%xmm11, -16(%4,%0,4) \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 :
 :


+8 -8  kernel/x86_64/sgemv_t_microk_bulldozer-4.c

@@ -41,7 +41,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
 "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
@@ -51,10 +51,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"

 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
 "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
@@ -70,13 +70,13 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"

-".L16LABEL%=: \n\t"
+"3: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x

 "prefetcht0 384(%4,%0,4) \n\t"
@@ -107,9 +107,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "subq $16, %1 \n\t"
 "vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"4: \n\t"
 "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
 "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
 "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"


+8 -8  kernel/x86_64/sgemv_t_microk_haswell-4.c

@@ -42,7 +42,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t"

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x

@@ -54,10 +54,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"

 "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x

@@ -69,14 +69,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"

-".L16LABEL%=: \n\t"
+"3: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 384(%2,%0,4) \n\t"
 "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
 "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x
@@ -96,9 +96,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)

 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"4: \n\t"

 "vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
 "vextractf128 $1 , %%ymm5, %%xmm13 \n\t"


+2 -2  kernel/x86_64/sgemv_t_microk_nehalem-4.c

@@ -41,7 +41,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "xorps %%xmm7 , %%xmm7 \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
 "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a0
@@ -60,7 +60,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addps %%xmm10, %%xmm6 \n\t"
 "addps %%xmm11, %%xmm7 \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "haddps %%xmm4, %%xmm4 \n\t"
 "haddps %%xmm5, %%xmm5 \n\t"


+8 -8  kernel/x86_64/sgemv_t_microk_sandy-4.c

@@ -46,7 +46,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t"

 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"

 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x

@@ -61,10 +61,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "subq $4 , %1 \n\t"
 "vaddps %%xmm7, %%xmm11, %%xmm7 \n\t"

-".L08LABEL%=: \n\t"
+"2: \n\t"

 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"

 "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x

@@ -79,14 +79,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "subq $8 , %1 \n\t"
 "vaddps %%ymm7, %%ymm11, %%ymm7 \n\t"

-".L16LABEL%=: \n\t"
+"3: \n\t"

 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 384(%2,%0,4) \n\t"
 "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
 "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x
@@ -114,9 +114,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "subq $16, %1 \n\t"
 "vaddps %%ymm3, %%ymm11, %%ymm3 \n\t"

-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

-".L16END%=: \n\t"
+"4: \n\t"

 "vaddps %%ymm4, %%ymm0, %%ymm4 \n\t"
 "vaddps %%ymm5, %%ymm1, %%ymm5 \n\t"


+2 -2  kernel/x86_64/ssymv_L_microk_bulldozer-2.c

@@ -44,7 +44,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[3]

 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"

 "vmovups (%4,%0,4), %%xmm12 \n\t" // 2 * a
 "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x
@@ -71,7 +71,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "vmovups %%xmm9 , -16(%3,%0,4) \n\t"

 "cmpq %0 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"

 "vmovss (%9), %%xmm4 \n\t"
 "vmovss 4(%9), %%xmm5 \n\t"


+ 2
- 2
kernel/x86_64/ssymv_L_microk_nehalem-2.c View File

@@ -48,7 +48,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F
"shufps $0, %%xmm7, %%xmm7 \n\t" "shufps $0, %%xmm7, %%xmm7 \n\t"


".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%2,%0,4), %%xmm8 \n\t" // 4 * x "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x
"movups (%3,%0,4), %%xmm9 \n\t" // 4 * y "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y


@@ -86,7 +86,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F


"addq $4 , %0 \n\t" "addq $4 , %0 \n\t"
"cmpq %0 , %1 \n\t" "cmpq %0 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"


"movss (%9), %%xmm4 \n\t" // temp1[0] "movss (%9), %%xmm4 \n\t" // temp1[0]
"movss 4(%9), %%xmm5 \n\t" // temp1[1] "movss 4(%9), %%xmm5 \n\t" // temp1[1]


+ 2
- 2
kernel/x86_64/ssymv_U_microk_bulldozer-2.c View File

@@ -47,7 +47,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"xorq %0,%0 \n\t" "xorq %0,%0 \n\t"


".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"


"vmovups (%2,%0,4), %%xmm8 \n\t" // 4 * x "vmovups (%2,%0,4), %%xmm8 \n\t" // 4 * x
"vmovups (%3,%0,4), %%xmm9 \n\t" // 4 * y "vmovups (%3,%0,4), %%xmm9 \n\t" // 4 * y
@@ -73,7 +73,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT


"addq $4 , %0 \n\t" "addq $4 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"


"vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t"
"vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t"


+ 2
- 2
kernel/x86_64/ssymv_U_microk_nehalem-2.c View File

@@ -51,7 +51,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"xorq %0,%0 \n\t" "xorq %0,%0 \n\t"


".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"movups (%2,%0,4), %%xmm8 \n\t" // 4 * x "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x
"movups (%3,%0,4), %%xmm9 \n\t" // 4 * y "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y


@@ -89,7 +89,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT


"addq $4 , %0 \n\t" "addq $4 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"


"haddps %%xmm0, %%xmm0 \n\t" "haddps %%xmm0, %%xmm0 \n\t"
"haddps %%xmm1, %%xmm1 \n\t" "haddps %%xmm1, %%xmm1 \n\t"


+ 2
- 2
kernel/x86_64/zaxpy_microk_bulldozer-2.c View File

@@ -40,7 +40,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha


".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"


"prefetcht0 768(%2,%0,8) \n\t" "prefetcht0 768(%2,%0,8) \n\t"
"vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x
@@ -113,7 +113,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)


"addq $8 , %0 \n\t" "addq $8 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"


: :
: :


+ 8
- 8
kernel/x86_64/zgemv_n_microk_haswell-4.c View File

@@ -48,7 +48,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)




".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 192(%4,%0,8) \n\t" "prefetcht0 192(%4,%0,8) \n\t"
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
@@ -111,7 +111,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)


"addq $8 , %0 \n\t" "addq $8 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t" "vzeroupper \n\t"


: :
@@ -153,7 +153,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)




".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 192(%4,%0,8) \n\t" "prefetcht0 192(%4,%0,8) \n\t"
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
@@ -199,7 +199,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)


"addq $8 , %0 \n\t" "addq $8 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t" "vzeroupper \n\t"


: :
@@ -237,7 +237,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0


".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 192(%4,%0,8) \n\t" "prefetcht0 192(%4,%0,8) \n\t"
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
"vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
@@ -273,7 +273,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)


"addq $8 , %0 \n\t" "addq $8 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t" "vzeroupper \n\t"


: :
@@ -339,7 +339,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
"vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i "vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i


".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
"prefetcht0 192(%2,%0,8) \n\t" "prefetcht0 192(%2,%0,8) \n\t"
"vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src
"vmovups 32(%2,%0,8), %%ymm9 \n\t" "vmovups 32(%2,%0,8), %%ymm9 \n\t"
@@ -375,7 +375,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a


"addq $8 , %0 \n\t" "addq $8 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t" "vzeroupper \n\t"


: :
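
Every loop in this file follows the same countdown skeleton: %0 is the element index used for scaled addressing, %1 the remaining length, reduced by the unroll factor and tested by "jnz 1b", and "vzeroupper" after the loop clears the upper YMM halves before returning to SSE code. A hedged, daxpy-style sketch of that skeleton (daxpy4_sketch is hypothetical, not the OpenBLAS kernel; it assumes an AVX-capable CPU and that n is a positive multiple of 4):

#include <stdio.h>

/* Sketch of the countdown-loop skeleton shared by the AVX kernels above. */
static void daxpy4_sketch(long n, double alpha, double *x, double *y)
{
    long i = 0;

    __asm__ __volatile__
    (
        "vbroadcastsd   %4, %%ymm0              \n\t"  /* alpha in all 4 lanes  */
        ".align 16                              \n\t"
        "1:                                     \n\t"
        "vmovupd        (%2,%0,8), %%ymm1       \n\t"  /* 4 doubles from x      */
        "vmovupd        (%3,%0,8), %%ymm2       \n\t"  /* 4 doubles from y      */
        "vmulpd         %%ymm0, %%ymm1, %%ymm1  \n\t"
        "vaddpd         %%ymm1, %%ymm2, %%ymm2  \n\t"  /* y + alpha*x           */
        "vmovupd        %%ymm2, (%3,%0,8)       \n\t"
        "addq           $4, %0                  \n\t"  /* advance element index */
        "subq           $4, %1                  \n\t"  /* count down by unroll  */
        "jnz            1b                      \n\t"
        "vzeroupper                             \n\t"
        : "+r" (i), "+r" (n)
        : "r" (x), "r" (y), "m" (alpha)
        : "cc", "memory", "xmm0", "xmm1", "xmm2"
    );
}

int main(void)
{
    double x[4] = { 1, 2, 3, 4 }, y[4] = { 0, 0, 0, 0 };
    daxpy4_sketch(4, 2.0, x, y);
    printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]);   /* prints 2 4 6 8 */
    return 0;
}
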


+ 8
- 8
kernel/x86_64/zgemv_n_microk_sandy-4.c View File

@@ -48,7 +48,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)




".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"


//"prefetcht0 256(%4,%0,8) \n\t" //"prefetcht0 256(%4,%0,8) \n\t"
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
@@ -123,7 +123,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)


"addq $8 , %0 \n\t" "addq $8 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t" "vzeroupper \n\t"


: :
@@ -165,7 +165,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
"vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1


".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"


// "prefetcht0 256(%4,%0,8) \n\t" // "prefetcht0 256(%4,%0,8) \n\t"
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
@@ -216,7 +216,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)


"addq $8 , %0 \n\t" "addq $8 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t" "vzeroupper \n\t"


: :
@@ -254,7 +254,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0


".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"


// "prefetcht0 256(%4,%0,8) \n\t" // "prefetcht0 256(%4,%0,8) \n\t"
"vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
@@ -291,7 +291,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)


"addq $8 , %0 \n\t" "addq $8 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t" "vzeroupper \n\t"


: :
@@ -356,7 +356,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
"vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i "vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i


".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"
// "prefetcht0 192(%2,%0,8) \n\t" // "prefetcht0 192(%2,%0,8) \n\t"
"vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src
"vmovups 32(%2,%0,8), %%ymm9 \n\t" "vmovups 32(%2,%0,8), %%ymm9 \n\t"
@@ -392,7 +392,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a


"addq $8 , %0 \n\t" "addq $8 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"
"vzeroupper \n\t" "vzeroupper \n\t"


: :


+ 6
- 6
kernel/x86_64/zgemv_t_microk_bulldozer-4.c View File

@@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" "vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t"


".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"


"vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
"vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
@@ -123,7 +123,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT


"addq $8 , %0 \n\t" "addq $8 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"


"vmovddup (%8) , %%xmm0 \n\t" // value from alpha "vmovddup (%8) , %%xmm0 \n\t" // value from alpha
"vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha
@@ -236,7 +236,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp


".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"


"vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
"vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
@@ -286,7 +286,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT


"addq $8 , %0 \n\t" "addq $8 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"


"vmovddup (%6) , %%xmm0 \n\t" // value from alpha "vmovddup (%6) , %%xmm0 \n\t" // value from alpha
"vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha
@@ -369,7 +369,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
"vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp


".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"


"vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
"vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
@@ -404,7 +404,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
"vfmaddpd %%xmm8 , %%xmm5 , %%xmm2, %%xmm8 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm8 , %%xmm5 , %%xmm2, %%xmm8 \n\t" // ar0*xr0,al0*xr0
"vfmaddpd %%xmm9 , %%xmm5 , %%xmm3, %%xmm9 \n\t" // ar0*xl0,al0*xl0 "vfmaddpd %%xmm9 , %%xmm5 , %%xmm3, %%xmm9 \n\t" // ar0*xl0,al0*xl0


"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"


"vmovddup (%5) , %%xmm0 \n\t" // value from alpha "vmovddup (%5) , %%xmm0 \n\t" // value from alpha
"vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha


+ 6
- 6
kernel/x86_64/zgemv_t_microk_haswell-4.c View File

@@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"


".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"


"prefetcht0 192(%2,%0,8) \n\t" "prefetcht0 192(%2,%0,8) \n\t"
"vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
@@ -96,7 +96,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT


"addq $8 , %0 \n\t" "addq $8 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"


"vmovddup (%8) , %%xmm0 \n\t" // value from alpha "vmovddup (%8) , %%xmm0 \n\t" // value from alpha
"vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha
@@ -220,7 +220,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp


".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"


"prefetcht0 192(%2,%0,8) \n\t" "prefetcht0 192(%2,%0,8) \n\t"
"vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
@@ -255,7 +255,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT


"addq $8 , %0 \n\t" "addq $8 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"


"vmovddup (%6) , %%xmm0 \n\t" // value from alpha "vmovddup (%6) , %%xmm0 \n\t" // value from alpha
"vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha
@@ -342,7 +342,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
"vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp


".align 16 \n\t" ".align 16 \n\t"
".L01LOOP%=: \n\t"
"1: \n\t"


"prefetcht0 192(%2,%0,8) \n\t" "prefetcht0 192(%2,%0,8) \n\t"
"vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
@@ -370,7 +370,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *


"addq $8 , %0 \n\t" "addq $8 , %0 \n\t"
"subq $4 , %1 \n\t" "subq $4 , %1 \n\t"
"jnz .L01LOOP%= \n\t"
"jnz 1b \n\t"


"vmovddup (%5) , %%xmm0 \n\t" // value from alpha "vmovddup (%5) , %%xmm0 \n\t" // value from alpha
"vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha

