changed inline assembler labels to short form (tags/v0.2.14^2)
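The pattern throughout: compiler-generated unique labels such as ".L01LOOP%=" (where "%=" expands to a number unique to each asm statement) become GAS local numeric labels, written "1:" and referenced as "1b" (nearest "1:" looking backward) or "1f" (looking forward). Numeric labels may be defined any number of times, so the templates no longer depend on "%=" for uniqueness. A minimal sketch of the new style on a hypothetical two-doubles-per-iteration copy loop (not one of the kernels changed below; assumes n > 0 and even, x and y non-overlapping):

    #include <stdint.h>

    static void copy_kernel_2(int64_t n, double *x, double *y)
    {
        int64_t i = 0;

        __asm__ __volatile__
        (
            ".align 16                 \n\t"
            "1:                        \n\t"  // numeric local label: loop head
            "movups (%2,%0,8), %%xmm0  \n\t"  // load 2 doubles from x
            "movups %%xmm0, (%3,%0,8)  \n\t"  // store 2 doubles to y
            "addq   $2, %0             \n\t"
            "subq   $2, %1             \n\t"
            "jnz    1b                 \n\t"  // branch back to the nearest "1:"
            : "+r" (i),   // 0
              "+r" (n)    // 1
            : "r" (x),    // 2
              "r" (y)     // 3
            : "cc", "xmm0", "memory"
        );
    }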
@@ -40,7 +40,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "vbroadcastss 4(%4), %%xmm1 \n\t" // imag part of alpha
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 768(%2,%0,4) \n\t"
 "vmovups (%2,%0,4), %%xmm5 \n\t" // 2 complex values from x
@@ -113,7 +113,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "addq $16, %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 :
 :
@@ -49,10 +49,10 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vbroadcastss 28(%2), %%ymm7 \n\t" // imag part x3
 "cmpq $0 , %1 \n\t"
-"je .L01END%= \n\t"
+"je 2f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 320(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
 "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
@@ -115,12 +115,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $16, %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L01END%=: \n\t"
+"2: \n\t"
 "cmpq $4, %8 \n\t"
-"jne .L02END%= \n\t"
+"jne 3f \n\t"
 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
 "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
@@ -155,7 +155,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
-".L02END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"
 :
@@ -200,10 +200,10 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vbroadcastss 12(%2), %%ymm3 \n\t" // imag part x1
 "cmpq $0 , %1 \n\t"
-"je .L01END%= \n\t"
+"je 2f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 320(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
 "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
@@ -248,12 +248,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $16, %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L01END%=: \n\t"
+"2: \n\t"
 "cmpq $4, %6 \n\t"
-"jne .L02END%= \n\t"
+"jne 3f \n\t"
 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
 "vmovups (%5,%0,4), %%ymm10 \n\t" // 4 complex values form a1
@@ -279,7 +279,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
-".L02END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"
 :
@@ -320,10 +320,10 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "vbroadcastss 4(%2), %%ymm1 \n\t" // imag part x0
 "cmpq $0 , %1 \n\t"
-"je .L01END%= \n\t"
+"je 2f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 320(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
 "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 4 complex values form a0
@@ -359,12 +359,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
 "vmovups %%ymm13,-32(%3,%0,4) \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L01END%=: \n\t"
+"2: \n\t"
 "cmpq $4, %5 \n\t"
-"jne .L02END%= \n\t"
+"jne 3f \n\t"
 "vmovups (%4,%0,4), %%ymm8 \n\t" // 4 complex values form a0
@@ -386,7 +386,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
-".L02END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"
 :
@@ -452,10 +452,10 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 "vbroadcastss (%5), %%ymm1 \n\t" // alpha_i
 "cmpq $0 , %1 \n\t"
-"je .L01END%= \n\t"
+"je 2f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values from src
 "vmovups 32(%2,%0,4), %%ymm9 \n\t"
@@ -489,12 +489,12 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 "vmovups %%ymm12,-64(%3,%0,4) \n\t" // 4 complex values to y
 "vmovups %%ymm13,-32(%3,%0,4) \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L01END%=: \n\t"
+"2: \n\t"
 "cmpq $4, %6 \n\t"
-"jne .L02END%= \n\t"
+"jne 3f \n\t"
 "vmovups (%2,%0,4), %%ymm8 \n\t" // 4 complex values src
@@ -516,7 +516,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 "vmovups %%ymm12, (%3,%0,4) \n\t" // 4 complex values to y
-".L02END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"
 :
@@ -47,7 +47,7 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vxorps %%ymm15, %%ymm15, %%ymm15 \n\t"
 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"
 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
 "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
@@ -72,12 +72,12 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-".L08LABEL%=: \n\t"
+"2: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L08END%= \n\t"
+"je 3f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
 "prefetcht0 192(%5,%0,4) \n\t"
@@ -125,9 +125,9 @@ static void cgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $16 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L08END%=: \n\t"
+"3: \n\t"
 "vbroadcastss (%8) , %%xmm0 \n\t" // value from alpha
 "vbroadcastss 4(%8) , %%xmm1 \n\t" // value from alpha
@@ -269,7 +269,7 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vxorps %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"
 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
 "vmovups (%5,%0,4), %%ymm5 \n\t" // 4 complex values from a1
@@ -288,12 +288,12 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-".L08LABEL%=: \n\t"
+"2: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L08END%= \n\t"
+"je 3f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
 "prefetcht0 192(%5,%0,4) \n\t"
@@ -325,9 +325,9 @@ static void cgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $16 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L08END%=: \n\t"
+"3: \n\t"
 "vbroadcastss (%6) , %%xmm0 \n\t" // value from alpha
 "vbroadcastss 4(%6) , %%xmm1 \n\t" // value from alpha
@@ -426,7 +426,7 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"
 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
@@ -442,12 +442,12 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-".L08LABEL%=: \n\t"
+"2: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L08END%= \n\t"
+"je 3f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%4,%0,4) \n\t"
 "vmovups (%4,%0,4), %%ymm4 \n\t" // 4 complex values from a0
@@ -472,9 +472,9 @@ static void cgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 "addq $16 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L08END%=: \n\t"
+"3: \n\t"
 "vbroadcastss (%5) , %%xmm0 \n\t" // value from alpha
 "vbroadcastss 4(%5) , %%xmm1 \n\t" // value from alpha
@@ -39,7 +39,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "vmovddup (%4), %%xmm0 \n\t" // alpha
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 768(%3,%0,8) \n\t"
 "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
@@ -61,7 +61,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 :
 :
@@ -40,7 +40,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "shufpd $0, %%xmm0, %%xmm0 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 // "prefetcht0 192(%2,%0,8) \n\t"
 // "prefetcht0 192(%3,%0,8) \n\t"
@@ -70,7 +70,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 :
 :
@@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovups (%2,%0,8), %%xmm12 \n\t" // 2 * x
 "vmovups 16(%2,%0,8), %%xmm13 \n\t" // 2 * x
 "vmovups 32(%2,%0,8), %%xmm14 \n\t" // 2 * x
@@ -55,7 +55,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t"
 "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t"
@@ -42,7 +42,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 "xorpd %%xmm7, %%xmm7 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%2,%0,8), %%xmm12 \n\t" // 2 * x
 "movups (%3,%0,8), %%xmm8 \n\t" // 2 * y
@@ -65,7 +65,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "addpd %%xmm5, %%xmm4 \n\t"
 "addpd %%xmm7, %%xmm6 \n\t"
@@ -125,7 +125,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "shufpd $0, %%xmm13, %%xmm13 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
 "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y
@@ -148,7 +148,7 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 :
 :
@@ -187,7 +187,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 "shufpd $0, %%xmm12, %%xmm12 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%4,%0,8), %%xmm8 \n\t" // 2 * a
 "movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a
 "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y
@@ -203,7 +203,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 :
 :
@@ -50,7 +50,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "vbroadcastsd (%9), %%ymm6 \n\t" // alpha
 "testq $0x04, %1 \n\t"
-"jz .L8LABEL%= \n\t"
+"jz 2f \n\t"
 "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
 "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
@@ -77,14 +77,14 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-".L8LABEL%=: \n\t"
+"2: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 3f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
@@ -118,9 +118,9 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "subq $8 , %1 \n\t"
 "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L16END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"
 :
@@ -168,7 +168,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vbroadcastsd (%8), %%ymm6 \n\t" // alpha
 "testq $0x04, %1 \n\t"
-"jz .L8LABEL%= \n\t"
+"jz 2f \n\t"
 "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
@@ -188,14 +188,14 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-".L8LABEL%=: \n\t"
+"2: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L8END%= \n\t"
+"je 3f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
 "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
@@ -218,9 +218,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L8END%=: \n\t"
+"3: \n\t"
 "vzeroupper \n\t"
 :
@@ -60,7 +60,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "xorpd %%xmm4 , %%xmm4 \n\t"
 "xorpd %%xmm5 , %%xmm5 \n\t"
 "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
@@ -142,7 +142,7 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 :
 :
@@ -194,7 +194,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "shufpd $0, %%xmm6 , %%xmm6 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "xorpd %%xmm4 , %%xmm4 \n\t"
 "xorpd %%xmm5 , %%xmm5 \n\t"
 "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y
@@ -239,7 +239,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 :
 :
@@ -78,7 +78,7 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 "xorpd %%xmm11 , %%xmm11 \n\t"
 "testq $2 , %1 \n\t"
-"jz .L01LABEL%= \n\t"
+"jz 2f \n\t"
 "movups (%5,%0,8) , %%xmm14 \n\t" // x
 "movups (%3,%0,8) , %%xmm12 \n\t" // ap0
@@ -90,13 +90,13 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 "subq $2 , %1 \n\t"
 "addpd %%xmm13 , %%xmm11 \n\t"
-".L01LABEL%=: \n\t"
+"2: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L01END%= \n\t"
+"je 3f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%5,%0,8) , %%xmm14 \n\t" // x
 "movups (%3,%0,8) , %%xmm12 \n\t" // ap0
@@ -116,9 +116,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L01END%=: \n\t"
+"3: \n\t"
 "haddpd %%xmm10, %%xmm10 \n\t"
 "haddpd %%xmm11, %%xmm11 \n\t"
@@ -157,7 +157,7 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "xorpd %%xmm10 , %%xmm10 \n\t"
 "testq $2 , %1 \n\t"
-"jz .L01LABEL%= \n\t"
+"jz 2f \n\t"
 "movups (%3,%0,8) , %%xmm12 \n\t"
 "movups (%4,%0,8) , %%xmm11 \n\t"
@@ -166,13 +166,13 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "addpd %%xmm12 , %%xmm10 \n\t"
 "subq $2 , %1 \n\t"
-".L01LABEL%=: \n\t"
+"2: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L01END%= \n\t"
+"je 3f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%3,%0,8) , %%xmm12 \n\t"
 "movups 16(%3,%0,8) , %%xmm14 \n\t"
@@ -185,9 +185,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "subq $4 , %1 \n\t"
 "addpd %%xmm14 , %%xmm9 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L01END%=: \n\t"
+"3: \n\t"
 "addpd %%xmm9 , %%xmm10 \n\t"
 "haddpd %%xmm10, %%xmm10 \n\t"
@@ -246,7 +246,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
 "shufpd $0 , %%xmm10 , %%xmm10 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%3,%0,8) , %%xmm12 \n\t"
 "movups (%4,%0,8) , %%xmm11 \n\t"
@@ -256,7 +256,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
 "subq $2 , %1 \n\t"
 "movups %%xmm11, -16(%4,%0,8) \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 :
 :
@@ -42,7 +42,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t"
 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"
 "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
@@ -54,13 +54,13 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-".L08LABEL%=: \n\t"
+"2: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 3f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 // "prefetcht0 384(%2,%0,8) \n\t"
 "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x
 "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x
@@ -80,9 +80,9 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "subq $8 , %1 \n\t"
 "vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L16END%=: \n\t"
+"3: \n\t"
 "vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
 "vextractf128 $1 , %%ymm5, %%xmm13 \n\t"
@@ -44,7 +44,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "vmovddup 24(%8), %%xmm7 \n\t" // temp1[1]
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
 "vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
@@ -90,7 +90,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "vmovups %%xmm11 , -16(%3,%0,8) \n\t"
 "cmpq %0 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vmovsd (%9), %%xmm4 \n\t"
 "vmovsd 8(%9), %%xmm5 \n\t"
@@ -48,7 +48,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "shufpd $0, %%xmm7, %%xmm7 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
 "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
 "movups %%xmm12 , %%xmm11 \n\t"
@@ -85,7 +85,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
 "cmpq %0 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "movsd (%9), %%xmm4 \n\t" // temp1[0]
 "movsd 8(%9), %%xmm5 \n\t" // temp1[1]
@@ -47,7 +47,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "xorq %0,%0 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovups (%4,%0,8), %%xmm12 \n\t" // 2 * a
 "vmovups (%2,%0,8), %%xmm8 \n\t" // 2 * x
@@ -93,7 +93,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "vmovups %%xmm9 , -32(%3,%0,8) \n\t"
 "vmovups %%xmm11 , -16(%3,%0,8) \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vhaddpd %%xmm0, %%xmm0, %%xmm0 \n\t"
 "vhaddpd %%xmm1, %%xmm1, %%xmm1 \n\t"
@@ -51,7 +51,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "xorq %0,%0 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a
 "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x
 "movups %%xmm12 , %%xmm11 \n\t"
@@ -88,7 +88,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "movups %%xmm9,-16(%3,%0,8) \n\t" // 2 * y
 "subq $2 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "haddpd %%xmm0, %%xmm0 \n\t"
 "haddpd %%xmm1, %%xmm1 \n\t"
@@ -40,7 +40,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "shufps $0, %%xmm0, %%xmm0 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 // "prefetcht0 192(%2,%0,4) \n\t"
 // "prefetcht0 192(%3,%0,4) \n\t"
@@ -70,7 +70,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 :
 :
@@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
 "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
 "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x
@@ -55,7 +55,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t"
 "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t"
@@ -42,7 +42,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 "xorps %%xmm7, %%xmm7 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
 "movups (%3,%0,4), %%xmm8 \n\t" // 4 * x
 "movups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
@@ -64,7 +64,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "addps %%xmm5, %%xmm4 \n\t"
 "addps %%xmm7, %%xmm6 \n\t"
@@ -129,7 +129,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "shufps $0, %%xmm13, %%xmm13 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
 "movups (%4,%0,4), %%xmm8 \n\t"
@@ -143,7 +143,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 :
 :
@@ -166,7 +166,7 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 #endif
 #ifndef HAVE_KERNEL_4x2
 #ifndef HAVE_KERNEL_4x1
 static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
@@ -184,10 +184,10 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 "shufps $0, %%xmm12, %%xmm12 \n\t"
 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 2f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
 "movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y
 "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
@@ -203,12 +203,12 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L16END%=: \n\t"
+"2: \n\t"
 "testq $0x04, %5 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 3f \n\t"
 "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y
 "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a
@@ -218,7 +218,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-".L08LABEL%=: \n\t"
+"3: \n\t"
 :
 :
 "r" (i), // 0
@@ -262,7 +262,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
 (
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%2,%0,4) , %%xmm12 \n\t"
 "movups (%3,%0,4) , %%xmm11 \n\t"
@@ -271,7 +271,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
 "movups %%xmm11, -16(%3,%0,4) \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 :
 :
@@ -49,7 +49,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "vbroadcastss (%9), %%xmm8 \n\t" // alpha
 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"
 "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
 "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@@ -71,10 +71,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "subq $4 , %1 \n\t"
 "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
-".L08LABEL%=: \n\t"
+"2: \n\t"
 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"
 "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
 "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@@ -107,13 +107,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "subq $8 , %1 \n\t"
-".L16LABEL%=: \n\t"
+"3: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
 "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@@ -178,9 +178,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L16END%=: \n\t"
+"4: \n\t"
 :
 :
@@ -227,7 +227,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vbroadcastss (%8), %%xmm8 \n\t" // alpha
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
 "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
@@ -243,7 +243,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 :
 :
@@ -50,7 +50,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "vbroadcastss (%9), %%ymm6 \n\t" // alpha
 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"
 "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
 "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
@@ -76,10 +76,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-".L08LABEL%=: \n\t"
+"2: \n\t"
 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"
 "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
 "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
@@ -106,14 +106,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-".L16LABEL%=: \n\t"
+"3: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
@@ -147,9 +147,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "subq $16, %1 \n\t"
 "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L16END%=: \n\t"
+"4: \n\t"
 "vzeroupper \n\t"
 :
@@ -197,7 +197,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vbroadcastss (%8), %%ymm6 \n\t" // alpha
 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"
 "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
@@ -217,10 +217,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-".L08LABEL%=: \n\t"
+"2: \n\t"
 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"
 "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
@@ -240,14 +240,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-".L16LABEL%=: \n\t"
+"3: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
 "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
@@ -270,9 +270,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L16END%=: \n\t"
+"4: \n\t"
 "vzeroupper \n\t"
 :
@@ -60,7 +60,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "xorps %%xmm4 , %%xmm4 \n\t"
 "xorps %%xmm5 , %%xmm5 \n\t"
 "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
@@ -103,7 +103,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 :
 :
@@ -155,7 +155,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "shufps $0, %%xmm6 , %%xmm6 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "xorps %%xmm4 , %%xmm4 \n\t"
 "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
@@ -178,7 +178,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addps %%xmm7 , %%xmm11 \n\t"
 "movups %%xmm11, -16(%3,%0,4) \n\t" // 4 * y
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 :
 :
@@ -51,7 +51,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "vbroadcastss (%9), %%ymm6 \n\t" // alpha
 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"
 "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
 "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
@@ -85,10 +85,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "addq $4, %0 \n\t"
 "subq $4, %1 \n\t"
-".L08LABEL%=: \n\t"
+"2: \n\t"
 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"
 "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
@@ -123,14 +123,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "subq $8, %1 \n\t"
-".L16LABEL%=: \n\t"
+"3: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
@@ -190,9 +190,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
 "addq $16, %8 \n\t"
 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L16END%=: \n\t"
+"4: \n\t"
 "vzeroupper \n\t"
 :
@@ -241,7 +241,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vbroadcastss (%8), %%ymm6 \n\t" // alpha
 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"
 "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
@@ -265,10 +265,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $4, %0 \n\t"
 "subq $4, %1 \n\t"
-".L08LABEL%=: \n\t"
+"2: \n\t"
 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"
 "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
@@ -293,14 +293,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "subq $8, %1 \n\t"
-".L16LABEL%=: \n\t"
+"3: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
 "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
 "vmovups (%3,%0,4), %%ymm0 \n\t" // 8 * y
@@ -339,9 +339,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L16END%=: \n\t"
+"4: \n\t"
 "vzeroupper \n\t"
 :
@@ -84,7 +84,7 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 "xorps %%xmm11 , %%xmm11 \n\t"
 "testq $4 , %1 \n\t"
-"jz .L01LABEL%= \n\t"
+"jz 2f \n\t"
 "movups (%5,%0,4) , %%xmm14 \n\t" // x
 "movups (%3,%0,4) , %%xmm12 \n\t" // ap0
@@ -96,13 +96,13 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 "subq $4 , %1 \n\t"
 "addps %%xmm13 , %%xmm11 \n\t"
-".L01LABEL%=: \n\t"
+"2: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L01END%= \n\t"
+"je 3f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%5,%0,4) , %%xmm14 \n\t" // x
 "movups (%3,%0,4) , %%xmm12 \n\t" // ap0
@@ -122,9 +122,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L01END%=: \n\t"
+"3: \n\t"
 "haddps %%xmm10, %%xmm10 \n\t"
 "haddps %%xmm11, %%xmm11 \n\t"
@@ -165,7 +165,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "xorps %%xmm10 , %%xmm10 \n\t"
 "testq $4 , %1 \n\t"
-"jz .L01LABEL%= \n\t"
+"jz 2f \n\t"
 "movups (%3,%0,4) , %%xmm12 \n\t"
 "movups (%4,%0,4) , %%xmm11 \n\t"
@@ -174,13 +174,13 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "addps %%xmm12 , %%xmm10 \n\t"
 "subq $4 , %1 \n\t"
-".L01LABEL%=: \n\t"
+"2: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L01END%= \n\t"
+"je 3f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%3,%0,4) , %%xmm12 \n\t"
 "movups 16(%3,%0,4) , %%xmm14 \n\t"
@@ -193,9 +193,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "subq $8 , %1 \n\t"
 "addps %%xmm14 , %%xmm9 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L01END%=: \n\t"
+"3: \n\t"
 "addps %%xmm9 , %%xmm10 \n\t"
 "haddps %%xmm10, %%xmm10 \n\t"
@@ -255,7 +255,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
 "shufps $0 , %%xmm10 , %%xmm10 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%3,%0,4) , %%xmm12 \n\t"
 "movups (%4,%0,4) , %%xmm11 \n\t"
@@ -265,7 +265,7 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
 "subq $4 , %1 \n\t"
 "movups %%xmm11, -16(%4,%0,4) \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 :
 :
@@ -41,7 +41,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t"
 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"
 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
 "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
@@ -51,10 +51,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-".L08LABEL%=: \n\t"
+"2: \n\t"
 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"
 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
 "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x
@@ -70,13 +70,13 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-".L16LABEL%=: \n\t"
+"3: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
 "prefetcht0 384(%4,%0,4) \n\t"
@@ -107,9 +107,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "subq $16, %1 \n\t"
 "vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L16END%=: \n\t"
+"4: \n\t"
 "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t"
 "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t"
 "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t"
@@ -42,7 +42,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t"
 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"
 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
@@ -54,10 +54,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-".L08LABEL%=: \n\t"
+"2: \n\t"
 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"
 "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
@@ -69,14 +69,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $8 , %0 \n\t"
 "subq $8 , %1 \n\t"
-".L16LABEL%=: \n\t"
+"3: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 384(%2,%0,4) \n\t"
 "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
 "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x
@@ -96,9 +96,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $16, %0 \n\t"
 "subq $16, %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L16END%=: \n\t"
+"4: \n\t"
 "vextractf128 $1 , %%ymm4, %%xmm12 \n\t"
 "vextractf128 $1 , %%ymm5, %%xmm13 \n\t"
@@ -41,7 +41,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "xorps %%xmm7 , %%xmm7 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x
 "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a0
@@ -60,7 +60,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addps %%xmm10, %%xmm6 \n\t"
 "addps %%xmm11, %%xmm7 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "haddps %%xmm4, %%xmm4 \n\t"
 "haddps %%xmm5, %%xmm5 \n\t"
@@ -46,7 +46,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t"
 "testq $0x04, %1 \n\t"
-"jz .L08LABEL%= \n\t"
+"jz 2f \n\t"
 "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x
@@ -61,10 +61,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "subq $4 , %1 \n\t"
 "vaddps %%xmm7, %%xmm11, %%xmm7 \n\t"
-".L08LABEL%=: \n\t"
+"2: \n\t"
 "testq $0x08, %1 \n\t"
-"jz .L16LABEL%= \n\t"
+"jz 3f \n\t"
 "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
@@ -79,14 +79,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "subq $8 , %1 \n\t"
 "vaddps %%ymm7, %%ymm11, %%ymm7 \n\t"
-".L16LABEL%=: \n\t"
+"3: \n\t"
 "cmpq $0, %1 \n\t"
-"je .L16END%= \n\t"
+"je 4f \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 384(%2,%0,4) \n\t"
 "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x
 "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x
@@ -114,9 +114,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "subq $16, %1 \n\t"
 "vaddps %%ymm3, %%ymm11, %%ymm3 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
-".L16END%=: \n\t"
+"4: \n\t"
 "vaddps %%ymm4, %%ymm0, %%ymm4 \n\t"
 "vaddps %%ymm5, %%ymm1, %%ymm5 \n\t"
@@ -44,7 +44,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[3]
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovups (%4,%0,4), %%xmm12 \n\t" // 2 * a
 "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x
@@ -71,7 +71,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
 "vmovups %%xmm9 , -16(%3,%0,4) \n\t"
 "cmpq %0 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vmovss (%9), %%xmm4 \n\t"
 "vmovss 4(%9), %%xmm5 \n\t"
@@ -48,7 +48,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F
 "shufps $0, %%xmm7, %%xmm7 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x
 "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y
@@ -86,7 +86,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F
 "addq $4 , %0 \n\t"
 "cmpq %0 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "movss (%9), %%xmm4 \n\t" // temp1[0]
 "movss 4(%9), %%xmm5 \n\t" // temp1[1]
@@ -47,7 +47,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "xorq %0,%0 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovups (%2,%0,4), %%xmm8 \n\t" // 4 * x
 "vmovups (%3,%0,4), %%xmm9 \n\t" // 4 * y
@@ -73,7 +73,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t"
 "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t"
@@ -51,7 +51,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "xorq %0,%0 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x
 "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y
@@ -89,7 +89,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
 "addq $4 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "haddps %%xmm0, %%xmm0 \n\t"
 "haddps %%xmm1, %%xmm1 \n\t"
@@ -40,7 +40,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 768(%2,%0,8) \n\t"
 "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x
@@ -113,7 +113,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 :
 :
@@ -48,7 +48,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%4,%0,8) \n\t"
 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
@@ -111,7 +111,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vzeroupper \n\t"
 :
@@ -153,7 +153,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%4,%0,8) \n\t"
 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
@@ -199,7 +199,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vzeroupper \n\t"
 :
@@ -237,7 +237,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%4,%0,8) \n\t"
 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0
@@ -273,7 +273,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vzeroupper \n\t"
 :
@@ -339,7 +339,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 "vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%2,%0,8) \n\t"
 "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src
 "vmovups 32(%2,%0,8), %%ymm9 \n\t"
@@ -375,7 +375,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vzeroupper \n\t"
 :
@@ -48,7 +48,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 //"prefetcht0 256(%4,%0,8) \n\t"
 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
@@ -123,7 +123,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vzeroupper \n\t"
 :
@@ -165,7 +165,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 // "prefetcht0 256(%4,%0,8) \n\t"
 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
@@ -216,7 +216,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vzeroupper \n\t"
 :
@@ -254,7 +254,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 // "prefetcht0 256(%4,%0,8) \n\t"
 "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0
@@ -291,7 +291,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vzeroupper \n\t"
 :
@@ -356,7 +356,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 "vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 // "prefetcht0 192(%2,%0,8) \n\t"
 "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src
 "vmovups 32(%2,%0,8), %%ymm9 \n\t"
@@ -392,7 +392,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vzeroupper \n\t"
 :
@@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
@@ -123,7 +123,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vmovddup (%8) , %%xmm0 \n\t" // value from alpha
 "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha
@@ -236,7 +236,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
@@ -286,7 +286,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vmovddup (%6) , %%xmm0 \n\t" // value from alpha
 "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha
@@ -369,7 +369,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0
@@ -404,7 +404,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 "vfmaddpd %%xmm8 , %%xmm5 , %%xmm2, %%xmm8 \n\t" // ar0*xr0,al0*xr0
 "vfmaddpd %%xmm9 , %%xmm5 , %%xmm3, %%xmm9 \n\t" // ar0*xl0,al0*xl0
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vmovddup (%5) , %%xmm0 \n\t" // value from alpha
 "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha
@@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t"
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%2,%0,8) \n\t"
 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
@@ -96,7 +96,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vmovddup (%8) , %%xmm0 \n\t" // value from alpha
 "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha
@@ -220,7 +220,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%2,%0,8) \n\t"
 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
@@ -255,7 +255,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vmovddup (%6) , %%xmm0 \n\t" // value from alpha
 "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha
@@ -342,7 +342,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 "vxorpd %%ymm9 , %%ymm9 , %%ymm9 \n\t" // temp
 ".align 16 \n\t"
-".L01LOOP%=: \n\t"
+"1: \n\t"
 "prefetcht0 192(%2,%0,8) \n\t"
 "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0
@@ -370,7 +370,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *
 "addq $8 , %0 \n\t"
 "subq $4 , %1 \n\t"
-"jnz .L01LOOP%= \n\t"
+"jnz 1b \n\t"
 "vmovddup (%5) , %%xmm0 \n\t" // value from alpha
 "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha