Replace .align with .p2align in dscal.c and the Nehalem microkernels as well (tags/v0.3.0)
| @@ -39,7 +39,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| "movsd (%4), %%xmm0 \n\t" // alpha | "movsd (%4), %%xmm0 \n\t" // alpha | ||||
| "shufpd $0, %%xmm0, %%xmm0 \n\t" | "shufpd $0, %%xmm0, %%xmm0 \n\t" | ||||
| ".align 16 \n\t" | |||||
| ".p2align 4 \n\t" | |||||
| "1: \n\t" | "1: \n\t" | ||||
| // "prefetcht0 192(%2,%0,8) \n\t" | // "prefetcht0 192(%2,%0,8) \n\t" | ||||
| // "prefetcht0 192(%3,%0,8) \n\t" | // "prefetcht0 192(%3,%0,8) \n\t" | ||||
| @@ -41,7 +41,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||||
| "xorpd %%xmm6, %%xmm6 \n\t" | "xorpd %%xmm6, %%xmm6 \n\t" | ||||
| "xorpd %%xmm7, %%xmm7 \n\t" | "xorpd %%xmm7, %%xmm7 \n\t" | ||||
| ".align 16 \n\t" | |||||
| ".p2align 4 \n\t" | |||||
| "1: \n\t" | "1: \n\t" | ||||
| "movups (%2,%0,8), %%xmm12 \n\t" // 2 * x | "movups (%2,%0,8), %%xmm12 \n\t" // 2 * x | ||||
| @@ -62,7 +62,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||||
| "subq $4 , %1 \n\t" | "subq $4 , %1 \n\t" | ||||
| "jz 2f \n\t" | "jz 2f \n\t" | ||||
| ".align 16 \n\t" | |||||
| ".p2align 4 \n\t" | |||||
| "1: \n\t" | "1: \n\t" | ||||
| "xorpd %%xmm4 , %%xmm4 \n\t" | "xorpd %%xmm4 , %%xmm4 \n\t" | ||||
| @@ -99,7 +99,7 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ | |||||
| "leaq (%1,%4,4), %2 \n\t" | "leaq (%1,%4,4), %2 \n\t" | ||||
| ".align 16 \n\t" | |||||
| ".p2align 4 \n\t" | |||||
| "1: \n\t" | "1: \n\t" | ||||
| "movsd (%1) , %%xmm4 \n\t" | "movsd (%1) , %%xmm4 \n\t" | ||||
| @@ -47,7 +47,7 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL | |||||
| "shufpd $0, %%xmm6, %%xmm6 \n\t" | "shufpd $0, %%xmm6, %%xmm6 \n\t" | ||||
| "shufpd $0, %%xmm7, %%xmm7 \n\t" | "shufpd $0, %%xmm7, %%xmm7 \n\t" | ||||
| ".align 16 \n\t" | |||||
| ".p2align 4 \n\t" | |||||
| "1: \n\t" | "1: \n\t" | ||||
| "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a | "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a | ||||
| "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x | "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x | ||||
| @@ -50,7 +50,7 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT | |||||
| "xorq %0,%0 \n\t" | "xorq %0,%0 \n\t" | ||||
| ".align 16 \n\t" | |||||
| ".p2align 4 \n\t" | |||||
| "1: \n\t" | "1: \n\t" | ||||
| "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a | "movups (%4,%0,8), %%xmm12 \n\t" // 2 * a | ||||
| "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x | "movups (%2,%0,8), %%xmm8 \n\t" // 2 * x | ||||
| @@ -39,7 +39,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| "movss (%4), %%xmm0 \n\t" // alpha | "movss (%4), %%xmm0 \n\t" // alpha | ||||
| "shufps $0, %%xmm0, %%xmm0 \n\t" | "shufps $0, %%xmm0, %%xmm0 \n\t" | ||||
| ".align 16 \n\t" | |||||
| ".p2align 4 \n\t" | |||||
| "1: \n\t" | "1: \n\t" | ||||
| // "prefetcht0 192(%2,%0,4) \n\t" | // "prefetcht0 192(%2,%0,4) \n\t" | ||||
| // "prefetcht0 192(%3,%0,4) \n\t" | // "prefetcht0 192(%3,%0,4) \n\t" | ||||
| @@ -41,7 +41,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||||
| "xorps %%xmm6, %%xmm6 \n\t" | "xorps %%xmm6, %%xmm6 \n\t" | ||||
| "xorps %%xmm7, %%xmm7 \n\t" | "xorps %%xmm7, %%xmm7 \n\t" | ||||
| ".align 16 \n\t" | |||||
| ".p2align 4 \n\t" | |||||
| "1: \n\t" | "1: \n\t" | ||||
| "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x | "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x | ||||
| "movups (%3,%0,4), %%xmm8 \n\t" // 4 * x | "movups (%3,%0,4), %%xmm8 \n\t" // 4 * x | ||||
| @@ -59,18 +59,18 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||||
| "shufps $0, %%xmm6 , %%xmm6 \n\t" | "shufps $0, %%xmm6 , %%xmm6 \n\t" | ||||
| ".align 16 \n\t" | |||||
| ".p2align 4 \n\t" | |||||
| "1: \n\t" | "1: \n\t" | ||||
| "xorps %%xmm4 , %%xmm4 \n\t" | "xorps %%xmm4 , %%xmm4 \n\t" | ||||
| "xorps %%xmm5 , %%xmm5 \n\t" | "xorps %%xmm5 , %%xmm5 \n\t" | ||||
| "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y | "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y | ||||
| ".align 2 \n\t" | |||||
| ".p2align 1 \n\t" | |||||
| "movups (%4,%0,4), %%xmm8 \n\t" | "movups (%4,%0,4), %%xmm8 \n\t" | ||||
| "movups (%5,%0,4), %%xmm9 \n\t" | "movups (%5,%0,4), %%xmm9 \n\t" | ||||
| "movups (%6,%0,4), %%xmm10 \n\t" | "movups (%6,%0,4), %%xmm10 \n\t" | ||||
| "movups (%7,%0,4), %%xmm11 \n\t" | "movups (%7,%0,4), %%xmm11 \n\t" | ||||
| ".align 2 \n\t" | |||||
| ".p2align 1 \n\t" | |||||
| "mulps %%xmm12, %%xmm8 \n\t" | "mulps %%xmm12, %%xmm8 \n\t" | ||||
| "mulps %%xmm13, %%xmm9 \n\t" | "mulps %%xmm13, %%xmm9 \n\t" | ||||
| "mulps %%xmm14, %%xmm10 \n\t" | "mulps %%xmm14, %%xmm10 \n\t" | ||||
| @@ -84,7 +84,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||||
| "movups (%5,%8,4), %%xmm9 \n\t" | "movups (%5,%8,4), %%xmm9 \n\t" | ||||
| "movups (%6,%8,4), %%xmm10 \n\t" | "movups (%6,%8,4), %%xmm10 \n\t" | ||||
| "movups (%7,%8,4), %%xmm11 \n\t" | "movups (%7,%8,4), %%xmm11 \n\t" | ||||
| ".align 2 \n\t" | |||||
| ".p2align 1 \n\t" | |||||
| "mulps %%xmm0 , %%xmm8 \n\t" | "mulps %%xmm0 , %%xmm8 \n\t" | ||||
| "mulps %%xmm1 , %%xmm9 \n\t" | "mulps %%xmm1 , %%xmm9 \n\t" | ||||
| "mulps %%xmm2 , %%xmm10 \n\t" | "mulps %%xmm2 , %%xmm10 \n\t" | ||||
| @@ -154,7 +154,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||||
| "movss (%8), %%xmm6 \n\t" // alpha | "movss (%8), %%xmm6 \n\t" // alpha | ||||
| "shufps $0, %%xmm6 , %%xmm6 \n\t" | "shufps $0, %%xmm6 , %%xmm6 \n\t" | ||||
| ".align 16 \n\t" | |||||
| ".p2align 4 \n\t" | |||||
| "1: \n\t" | "1: \n\t" | ||||
| "xorps %%xmm4 , %%xmm4 \n\t" | "xorps %%xmm4 , %%xmm4 \n\t" | ||||
| "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y | "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y | ||||
| @@ -40,7 +40,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||||
| "xorps %%xmm6 , %%xmm6 \n\t" | "xorps %%xmm6 , %%xmm6 \n\t" | ||||
| "xorps %%xmm7 , %%xmm7 \n\t" | "xorps %%xmm7 , %%xmm7 \n\t" | ||||
| ".align 16 \n\t" | |||||
| ".p2align 4 \n\t" | |||||
| "1: \n\t" | "1: \n\t" | ||||
| "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x | "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x | ||||
| @@ -47,7 +47,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F | |||||
| "shufps $0, %%xmm6, %%xmm6 \n\t" | "shufps $0, %%xmm6, %%xmm6 \n\t" | ||||
| "shufps $0, %%xmm7, %%xmm7 \n\t" | "shufps $0, %%xmm7, %%xmm7 \n\t" | ||||
| ".align 16 \n\t" | |||||
| ".p2align 4 \n\t" | |||||
| "1: \n\t" | "1: \n\t" | ||||
| "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x | "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x | ||||
| "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y | "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y | ||||
| @@ -50,7 +50,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT | |||||
| "xorq %0,%0 \n\t" | "xorq %0,%0 \n\t" | ||||
| ".align 16 \n\t" | |||||
| ".p2align 4 \n\t" | |||||
| "1: \n\t" | "1: \n\t" | ||||
| "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x | "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x | ||||
| "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y | "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y | ||||