@@ -146,6 +146,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define JMP jmp | #define JMP jmp | ||||
#define NOP | #define NOP | ||||
#define XOR xorpd | #define XOR xorpd | ||||
#undef MOVQ | |||||
#define MOVQ movq | #define MOVQ movq | ||||
#define XOR_SY vxorps | #define XOR_SY vxorps | ||||
@@ -305,7 +306,7 @@ movq %r11, kk; | |||||
MOVQ bn,j; | MOVQ bn,j; | ||||
SARQ $2,j; # Rn = 4 | SARQ $2,j; # Rn = 4 | ||||
JLE .L0_loopE; | JLE .L0_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L0_bodyB:; | .L0_bodyB:; | ||||
#if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
MOVQ OFFSET, %rax; | MOVQ OFFSET, %rax; | ||||
@@ -320,7 +321,7 @@ MOVQ ba,ptrba; | |||||
MOVQ bm,i; | MOVQ bm,i; | ||||
SARQ $3,i; # Rm = 8 | SARQ $3,i; # Rm = 8 | ||||
JLE .L1_loopE; | JLE .L1_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L1_bodyB:; | .L1_bodyB:; | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -367,7 +368,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2,k; # Unroll 4 times | SARQ $2,k; # Unroll 4 times | ||||
JLE .L2_loopE; | JLE .L2_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L2_bodyB:; | .L2_bodyB:; | ||||
# Computing kernel | # Computing kernel | ||||
@@ -591,7 +592,7 @@ ADD2_SY yvec7, yvec8, yvec8; | |||||
.L2_bodyE:; | .L2_bodyE:; | ||||
DECQ k; | DECQ k; | ||||
JG .L2_bodyB; | JG .L2_bodyB; | ||||
.align 64; | |||||
ALIGN_5 | |||||
.L2_loopE:; | .L2_loopE:; | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -599,7 +600,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L3_loopE; | JLE .L3_loopE; | ||||
.align 64 | |||||
ALIGN_5 | |||||
.L3_loopB: | .L3_loopB: | ||||
######### Unroll 1 ################## | ######### Unroll 1 ################## | ||||
PREFETCH0 PRESIZE*SIZE(ptrba) | PREFETCH0 PRESIZE*SIZE(ptrba) | ||||
@@ -717,7 +718,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L4_loopE; | JLE .L4_loopE; | ||||
.align 64 | |||||
ALIGN_5 | |||||
.L4_loopB:; | .L4_loopB:; | ||||
######### Unroll 1 ################## | ######### Unroll 1 ################## | ||||
PREFETCH0 PRESIZE*SIZE(ptrba) | PREFETCH0 PRESIZE*SIZE(ptrba) | ||||
@@ -875,7 +876,7 @@ MOVQ C0, %rax; | |||||
OR ldc, %rax; | OR ldc, %rax; | ||||
TEST $15, %rax; | TEST $15, %rax; | ||||
JNE .L4_loopEx; | JNE .L4_loopEx; | ||||
.align 32 | |||||
ALIGN_5 | |||||
EXTRA_SY $1,yvec15,xvec7; | EXTRA_SY $1,yvec15,xvec7; | ||||
EXTRA_SY $1,yvec14,xvec6; | EXTRA_SY $1,yvec14,xvec6; | ||||
EXTRA_SY $1,yvec13,xvec5; | EXTRA_SY $1,yvec13,xvec5; | ||||
@@ -934,7 +935,7 @@ ADDQ $16*SIZE,C1; | |||||
DECQ i; | DECQ i; | ||||
JG .L1_bodyB; | JG .L1_bodyB; | ||||
JMP .L1_loopE; | JMP .L1_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L4_loopEx: | .L4_loopEx: | ||||
EXTRA_SY $1, yvec15, xvec7; | EXTRA_SY $1, yvec15, xvec7; | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
@@ -1077,11 +1078,11 @@ ADDQ $16*SIZE, C0; | |||||
ADDQ $16*SIZE, C1; | ADDQ $16*SIZE, C1; | ||||
DECQ i; | DECQ i; | ||||
JG .L1_bodyB; | JG .L1_bodyB; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L1_loopE:; | .L1_loopE:; | ||||
TEST $4, bm; | TEST $4, bm; | ||||
JLE .L5_loopE; | JLE .L5_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L5_bodyB: | .L5_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -1113,7 +1114,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L8_loopE; | JLE .L8_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L8_bodyB: | .L8_bodyB: | ||||
#### Unroll times 1 #### | #### Unroll times 1 #### | ||||
LD_SY 0*SIZE(ptrba), yvec0; | LD_SY 0*SIZE(ptrba), yvec0; | ||||
@@ -1242,7 +1243,7 @@ ADDQ $32*SIZE, ptrba; | |||||
ADDQ $32*SIZE, ptrbb; | ADDQ $32*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L8_bodyB; | JG .L8_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L8_loopE: | .L8_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -1250,7 +1251,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L9_loopE; | JLE .L9_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L9_bodyB: | .L9_bodyB: | ||||
#### Unroll times 1 #### | #### Unroll times 1 #### | ||||
LD_SY 0*SIZE(ptrba), yvec0; | LD_SY 0*SIZE(ptrba), yvec0; | ||||
@@ -1323,7 +1324,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L10_loopE; | JLE .L10_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L10_bodyB: | .L10_bodyB: | ||||
#### Unroll times 1 #### | #### Unroll times 1 #### | ||||
LD_SY 0*SIZE(ptrba), yvec0; | LD_SY 0*SIZE(ptrba), yvec0; | ||||
@@ -1494,7 +1495,7 @@ ADDQ $8*SIZE, C1; | |||||
.L5_loopE: | .L5_loopE: | ||||
TEST $2, bm; | TEST $2, bm; | ||||
JLE .L6_loopE; | JLE .L6_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L6_bodyB: | .L6_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -1527,7 +1528,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L11_loopE; | JLE .L11_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L11_bodyB: | .L11_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 | LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 | ||||
EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 | EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 | ||||
@@ -1652,7 +1653,7 @@ ADDQ $16*SIZE, ptrba; | |||||
ADDQ $32*SIZE, ptrbb; | ADDQ $32*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L11_bodyB; | JG .L11_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L11_loopE: | .L11_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -1660,7 +1661,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L12_loopE; | JLE .L12_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L12_bodyB: | .L12_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 | LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 | ||||
EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 | EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 | ||||
@@ -1731,7 +1732,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L13_loopE; | JLE .L13_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L13_bodyB: | .L13_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 | LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 | ||||
EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 | EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 | ||||
@@ -1875,7 +1876,7 @@ ADDQ $4*SIZE, C1; | |||||
.L6_loopE: | .L6_loopE: | ||||
TEST $1, bm; | TEST $1, bm; | ||||
JLE .L7_loopE; | JLE .L7_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L7_bodyB: | .L7_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -1905,7 +1906,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L14_loopE; | JLE .L14_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L14_bodyB: | .L14_bodyB: | ||||
BROAD_SX 0*SIZE(ptrba), xvec0; | BROAD_SX 0*SIZE(ptrba), xvec0; | ||||
LD_SX 0*SIZE(ptrbb), xvec2; | LD_SX 0*SIZE(ptrbb), xvec2; | ||||
@@ -1978,7 +1979,7 @@ ADDQ $8*SIZE, ptrba; | |||||
ADDQ $32*SIZE, ptrbb; | ADDQ $32*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L14_bodyB; | JG .L14_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L14_loopE: | .L14_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -1986,7 +1987,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L15_loopE; | JLE .L15_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L15_bodyB: | .L15_bodyB: | ||||
BROAD_SX 0*SIZE(ptrba), xvec0; | BROAD_SX 0*SIZE(ptrba), xvec0; | ||||
LD_SX 0*SIZE(ptrbb), xvec2; | LD_SX 0*SIZE(ptrbb), xvec2; | ||||
@@ -2031,7 +2032,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L16_loopE; | JLE .L16_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L16_bodyB: | .L16_bodyB: | ||||
BROAD_SX 0*SIZE(ptrba), xvec0; | BROAD_SX 0*SIZE(ptrba), xvec0; | ||||
LD_SX 0*SIZE(ptrbb), xvec2; | LD_SX 0*SIZE(ptrbb), xvec2; | ||||
@@ -2129,11 +2130,11 @@ LEAQ (C,ldc,4),C; | |||||
.L0_bodyE:; | .L0_bodyE:; | ||||
DECQ j; | DECQ j; | ||||
JG .L0_bodyB; | JG .L0_bodyB; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L0_loopE:; | .L0_loopE:; | ||||
TEST $2, bn; | TEST $2, bn; | ||||
JLE .L20_loopE; | JLE .L20_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L20_bodyB: | .L20_bodyB: | ||||
#if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
MOVQ OFFSET, %rax; | MOVQ OFFSET, %rax; | ||||
@@ -2145,7 +2146,7 @@ MOVQ ba, ptrba; | |||||
MOVQ bm, i; | MOVQ bm, i; | ||||
SARQ $3, i; | SARQ $3, i; | ||||
JLE .L21_loopE; | JLE .L21_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L21_bodyB: | .L21_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -2181,7 +2182,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L211_loopE; | JLE .L211_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L211_bodyB: | .L211_bodyB: | ||||
EDUP_SX 0*SIZE(ptrbb), xvec4; | EDUP_SX 0*SIZE(ptrbb), xvec4; | ||||
ODUP_SX 0*SIZE(ptrbb), xvec5; | ODUP_SX 0*SIZE(ptrbb), xvec5; | ||||
@@ -2430,7 +2431,7 @@ ADDQ $64*SIZE, ptrba; | |||||
ADDQ $16*SIZE, ptrbb; | ADDQ $16*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L211_bodyB; | JG .L211_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L211_loopE: | .L211_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -2438,7 +2439,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L212_loopE; | JLE .L212_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L212_bodyB: | .L212_bodyB: | ||||
EDUP_SX 0*SIZE(ptrbb), xvec4; | EDUP_SX 0*SIZE(ptrbb), xvec4; | ||||
ODUP_SX 0*SIZE(ptrbb), xvec5; | ODUP_SX 0*SIZE(ptrbb), xvec5; | ||||
@@ -2571,7 +2572,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L213_loopE; | JLE .L213_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L213_bodyB: | .L213_bodyB: | ||||
EDUP_SX 0*SIZE(ptrbb), xvec4; | EDUP_SX 0*SIZE(ptrbb), xvec4; | ||||
ODUP_SX 0*SIZE(ptrbb), xvec5; | ODUP_SX 0*SIZE(ptrbb), xvec5; | ||||
@@ -2825,11 +2826,11 @@ ADDQ $16*SIZE, C0; | |||||
ADDQ $16*SIZE, C1; | ADDQ $16*SIZE, C1; | ||||
DECQ i; | DECQ i; | ||||
JG .L21_bodyB; | JG .L21_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L21_loopE: | .L21_loopE: | ||||
TEST $4, bm; | TEST $4, bm; | ||||
JLE .L22_loopE; | JLE .L22_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L22_bodyB: | .L22_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -2862,7 +2863,7 @@ MOVQ %rax, kkk; | |||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L221_loopE; | JLE .L221_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L221_bodyB: | .L221_bodyB: | ||||
EDUP_SX 0*SIZE(ptrbb), xvec4; | EDUP_SX 0*SIZE(ptrbb), xvec4; | ||||
ODUP_SX 0*SIZE(ptrbb), xvec5; | ODUP_SX 0*SIZE(ptrbb), xvec5; | ||||
@@ -3002,7 +3003,7 @@ ADDQ $32*SIZE, ptrba; | |||||
ADDQ $16*SIZE, ptrbb; | ADDQ $16*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L221_bodyB; | JG .L221_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L221_loopE: | .L221_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -3010,7 +3011,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L222_loopE; | JLE .L222_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L222_bodyB: | .L222_bodyB: | ||||
EDUP_SX 0*SIZE(ptrbb), xvec4; | EDUP_SX 0*SIZE(ptrbb), xvec4; | ||||
ODUP_SX 0*SIZE(ptrbb), xvec5; | ODUP_SX 0*SIZE(ptrbb), xvec5; | ||||
@@ -3089,7 +3090,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L223_loopE; | JLE .L223_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L223_bodyB: | .L223_bodyB: | ||||
EDUP_SX 0*SIZE(ptrbb), xvec4; | EDUP_SX 0*SIZE(ptrbb), xvec4; | ||||
ODUP_SX 0*SIZE(ptrbb), xvec5; | ODUP_SX 0*SIZE(ptrbb), xvec5; | ||||
@@ -3237,7 +3238,7 @@ ADDQ $8*SIZE, C1; | |||||
.L22_loopE: | .L22_loopE: | ||||
TEST $2, bm; | TEST $2, bm; | ||||
JLE .L23_loopE; | JLE .L23_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L23_bodyB: | .L23_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -3267,7 +3268,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L231_loopE; | JLE .L231_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L231_bodyB: | .L231_bodyB: | ||||
EDUP_SX 0*SIZE(ptrbb), xvec4; | EDUP_SX 0*SIZE(ptrbb), xvec4; | ||||
ODUP_SX 0*SIZE(ptrbb), xvec5; | ODUP_SX 0*SIZE(ptrbb), xvec5; | ||||
@@ -3351,7 +3352,7 @@ ADDQ $16*SIZE, ptrba; | |||||
ADDQ $16*SIZE, ptrbb; | ADDQ $16*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L231_bodyB; | JG .L231_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L231_loopE: | .L231_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -3359,7 +3360,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L232_loopE; | JLE .L232_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L232_bodyB: | .L232_bodyB: | ||||
EDUP_SX 0*SIZE(ptrbb), xvec4; | EDUP_SX 0*SIZE(ptrbb), xvec4; | ||||
ODUP_SX 0*SIZE(ptrbb), xvec5; | ODUP_SX 0*SIZE(ptrbb), xvec5; | ||||
@@ -3409,7 +3410,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L233_loopE; | JLE .L233_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L233_bodyB: | .L233_bodyB: | ||||
EDUP_SX 0*SIZE(ptrbb), xvec4; | EDUP_SX 0*SIZE(ptrbb), xvec4; | ||||
ODUP_SX 0*SIZE(ptrbb), xvec5; | ODUP_SX 0*SIZE(ptrbb), xvec5; | ||||
@@ -3503,7 +3504,7 @@ ADDQ $4*SIZE, C1; | |||||
.L23_loopE: | .L23_loopE: | ||||
TEST $1, bm; | TEST $1, bm; | ||||
JLE .L24_loopE; | JLE .L24_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L24_bodyB: | .L24_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -3532,7 +3533,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L241_loopE; | JLE .L241_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L241_bodyB: | .L241_bodyB: | ||||
BROAD_SX 0*SIZE(ptrba), xvec0; | BROAD_SX 0*SIZE(ptrba), xvec0; | ||||
LD_SX 0*SIZE(ptrbb), xvec2; | LD_SX 0*SIZE(ptrbb), xvec2; | ||||
@@ -3585,7 +3586,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L242_loopE; | JLE .L242_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L242_bodyB: | .L242_bodyB: | ||||
BROAD_SX 0*SIZE(ptrba), xvec0; | BROAD_SX 0*SIZE(ptrba), xvec0; | ||||
LD_SX 0*SIZE(ptrbb), xvec2; | LD_SX 0*SIZE(ptrbb), xvec2; | ||||
@@ -3616,7 +3617,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L243_loopE; | JLE .L243_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L243_bodyB: | .L243_bodyB: | ||||
BROAD_SX 0*SIZE(ptrba), xvec0; | BROAD_SX 0*SIZE(ptrba), xvec0; | ||||
LD_SX 0*SIZE(ptrbb), xvec2; | LD_SX 0*SIZE(ptrbb), xvec2; | ||||
@@ -3684,7 +3685,7 @@ LEAQ (C, ldc, 2), C; | |||||
.L20_loopE: | .L20_loopE: | ||||
TEST $1, bn; | TEST $1, bn; | ||||
JLE .L30_loopE; | JLE .L30_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L30_bodyB: | .L30_bodyB: | ||||
#if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
MOVQ OFFSET, %rax; | MOVQ OFFSET, %rax; | ||||
@@ -3695,7 +3696,7 @@ MOVQ ba, ptrba; | |||||
MOVQ bm, i; | MOVQ bm, i; | ||||
SARQ $3, i; | SARQ $3, i; | ||||
JLE .L31_loopE; | JLE .L31_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L31_bodyB: | .L31_bodyB: | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
@@ -3727,7 +3728,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L311_loopE; | JLE .L311_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L311_bodyB: | .L311_bodyB: | ||||
#### Unroll 1 #### | #### Unroll 1 #### | ||||
LD_SY 0*SIZE(ptrba), yvec0; | LD_SY 0*SIZE(ptrba), yvec0; | ||||
@@ -3800,7 +3801,7 @@ ADDQ $64*SIZE, ptrba; | |||||
ADDQ $8*SIZE, ptrbb; | ADDQ $8*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L311_bodyB; | JG .L311_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L311_loopE: | .L311_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -3808,7 +3809,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L312_loopE; | JLE .L312_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L312_bodyB: | .L312_bodyB: | ||||
#### Unroll 1 #### | #### Unroll 1 #### | ||||
LD_SY 0*SIZE(ptrba), yvec0; | LD_SY 0*SIZE(ptrba), yvec0; | ||||
@@ -3853,7 +3854,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L313_loopE; | JLE .L313_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L313_bodyB: | .L313_bodyB: | ||||
#### Unroll 1 #### | #### Unroll 1 #### | ||||
LD_SY 0*SIZE(ptrba), yvec0; | LD_SY 0*SIZE(ptrba), yvec0; | ||||
@@ -3941,11 +3942,11 @@ ADDQ $8, kk; | |||||
ADDQ $16*SIZE, C0; | ADDQ $16*SIZE, C0; | ||||
DECQ i; | DECQ i; | ||||
JG .L31_bodyB; | JG .L31_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L31_loopE: | .L31_loopE: | ||||
TEST $4, bm; | TEST $4, bm; | ||||
JLE .L32_loopE; | JLE .L32_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L32_bodyB: | .L32_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -3974,7 +3975,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L321_loopE; | JLE .L321_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L321_bodyB: | .L321_bodyB: | ||||
#### Unroll 1 #### | #### Unroll 1 #### | ||||
LD_SY 0*SIZE(ptrba), yvec0; | LD_SY 0*SIZE(ptrba), yvec0; | ||||
@@ -4023,7 +4024,7 @@ ADDQ $32*SIZE, ptrba; | |||||
ADDQ $8*SIZE, ptrbb; | ADDQ $8*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L321_bodyB; | JG .L321_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L321_loopE: | .L321_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -4031,7 +4032,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L322_loopE; | JLE .L322_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L322_bodyB: | .L322_bodyB: | ||||
#### Unroll 1 #### | #### Unroll 1 #### | ||||
LD_SY 0*SIZE(ptrba), yvec0; | LD_SY 0*SIZE(ptrba), yvec0; | ||||
@@ -4064,7 +4065,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L323_loopE; | JLE .L323_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L323_bodyB: | .L323_bodyB: | ||||
#### Unroll 1 #### | #### Unroll 1 #### | ||||
LD_SY 0*SIZE(ptrba), yvec0; | LD_SY 0*SIZE(ptrba), yvec0; | ||||
@@ -4128,7 +4129,7 @@ ADDQ $8*SIZE, C0; | |||||
.L32_loopE: | .L32_loopE: | ||||
TEST $2, bm; | TEST $2, bm; | ||||
JLE .L33_loopE; | JLE .L33_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L33_bodyB: | .L33_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -4157,7 +4158,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L331_loopE; | JLE .L331_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L331_bodyB: | .L331_bodyB: | ||||
#### Unroll 1 #### | #### Unroll 1 #### | ||||
LD_SX 0*SIZE(ptrba), xvec0; | LD_SX 0*SIZE(ptrba), xvec0; | ||||
@@ -4202,7 +4203,7 @@ ADDQ $16*SIZE, ptrba; | |||||
ADDQ $8*SIZE, ptrbb; | ADDQ $8*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L331_bodyB; | JG .L331_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L331_loopE: | .L331_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -4210,7 +4211,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L332_loopE; | JLE .L332_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L332_bodyB: | .L332_bodyB: | ||||
#### Unroll 1 #### | #### Unroll 1 #### | ||||
LD_SX 0*SIZE(ptrba), xvec0; | LD_SX 0*SIZE(ptrba), xvec0; | ||||
@@ -4241,7 +4242,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L333_loopE; | JLE .L333_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L333_bodyB: | .L333_bodyB: | ||||
#### Unroll 1 #### | #### Unroll 1 #### | ||||
LD_SX 0*SIZE(ptrba), xvec0; | LD_SX 0*SIZE(ptrba), xvec0; | ||||
@@ -4300,7 +4301,7 @@ ADDQ $4*SIZE, C0; | |||||
.L33_loopE: | .L33_loopE: | ||||
TEST $1, bm; | TEST $1, bm; | ||||
JLE .L34_loopE; | JLE .L34_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L34_bodyB: | .L34_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -4329,7 +4330,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L341_loopE; | JLE .L341_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L341_bodyB: | .L341_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; | LD_SX 0*SIZE(ptrba), xvec0; | ||||
LD_SX 0*SIZE(ptrbb), xvec2; | LD_SX 0*SIZE(ptrbb), xvec2; | ||||
@@ -4354,7 +4355,7 @@ ADDQ $8*SIZE, ptrba; | |||||
ADDQ $8*SIZE, ptrbb; | ADDQ $8*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L341_bodyB; | JG .L341_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L341_loopE: | .L341_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -4362,7 +4363,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L342_loopE; | JLE .L342_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L342_bodyB: | .L342_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; | LD_SX 0*SIZE(ptrba), xvec0; | ||||
LD_SX 0*SIZE(ptrbb), xvec2; | LD_SX 0*SIZE(ptrbb), xvec2; | ||||
@@ -4383,7 +4384,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L343_loopE; | JLE .L343_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L343_bodyB: | .L343_bodyB: | ||||
XOR_SY yvec0, yvec0, yvec0; | XOR_SY yvec0, yvec0, yvec0; | ||||
XOR_SY yvec2, yvec2, yvec2; | XOR_SY yvec2, yvec2, yvec2; | ||||
@@ -140,6 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define JNE jne | #define JNE jne | ||||
#define NOP | #define NOP | ||||
#define XOR xorpd | #define XOR xorpd | ||||
#undef MOVQ | |||||
#define MOVQ movq | #define MOVQ movq | ||||
#define XOR_SY vxorps | #define XOR_SY vxorps | ||||
@@ -265,7 +266,7 @@ movq %r11, kk | |||||
MOVQ bn,j; | MOVQ bn,j; | ||||
SARQ $2,j; # Rn = 4 | SARQ $2,j; # Rn = 4 | ||||
JLE .L0_loopE; | JLE .L0_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L0_bodyB:; | .L0_bodyB:; | ||||
#if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
MOVQ OFFSET, %rax; | MOVQ OFFSET, %rax; | ||||
@@ -281,7 +282,7 @@ MOVQ ba,ptrba; | |||||
MOVQ bm,i; | MOVQ bm,i; | ||||
SARQ $3,i; # Rm = 8 | SARQ $3,i; # Rm = 8 | ||||
JLE .L1_loopE; | JLE .L1_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L1_bodyB:; | .L1_bodyB:; | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -328,7 +329,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2,k; | SARQ $2,k; | ||||
JLE .L2_loopE; | JLE .L2_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L2_bodyB:; | .L2_bodyB:; | ||||
# Computing kernel | # Computing kernel | ||||
@@ -448,7 +449,7 @@ ADD_DY yvec8, yvec7, yvec8; | |||||
.L2_bodyE:; | .L2_bodyE:; | ||||
DECQ k; | DECQ k; | ||||
JG .L2_bodyB; | JG .L2_bodyB; | ||||
.align 64; | |||||
ALIGN_5 | |||||
.L2_loopE:; | .L2_loopE:; | ||||
PREFETCH2 0*SIZE(prebb); | PREFETCH2 0*SIZE(prebb); | ||||
ADDQ $8*SIZE, prebb; | ADDQ $8*SIZE, prebb; | ||||
@@ -459,7 +460,7 @@ MOVQ kkk, %rax; | |||||
TEST $2, %rax; | TEST $2, %rax; | ||||
#endif | #endif | ||||
JLE .L3_loopE; | JLE .L3_loopE; | ||||
.align 64 | |||||
ALIGN_5 | |||||
.L3_bodyB: | .L3_bodyB: | ||||
#### Unroll times 1 #### | #### Unroll times 1 #### | ||||
PREFETCH0 64*SIZE(ptrba) | PREFETCH0 64*SIZE(ptrba) | ||||
@@ -529,7 +530,7 @@ MOVQ kkk, %rax; | |||||
TEST $1, %rax; | TEST $1, %rax; | ||||
#endif | #endif | ||||
JLE .L4_loopE; | JLE .L4_loopE; | ||||
.align 64 | |||||
ALIGN_5 | |||||
.L4_bodyB:; | .L4_bodyB:; | ||||
#### Unroll times 1 #### | #### Unroll times 1 #### | ||||
PREFETCH0 64*SIZE(ptrba) | PREFETCH0 64*SIZE(ptrba) | ||||
@@ -588,7 +589,7 @@ MOVQ C0, %rax; | |||||
OR ldc, %rax; | OR ldc, %rax; | ||||
TEST $15, %rax; | TEST $15, %rax; | ||||
JNE .L4_loopEx; # Unalign part write back | JNE .L4_loopEx; # Unalign part write back | ||||
.align 32 | |||||
ALIGN_5 | |||||
#### Writing Back #### | #### Writing Back #### | ||||
EXTRA_DY $1,yvec15,xvec7; | EXTRA_DY $1,yvec15,xvec7; | ||||
EXTRA_DY $1,yvec14,xvec6; | EXTRA_DY $1,yvec14,xvec6; | ||||
@@ -648,7 +649,7 @@ ADDQ $8*SIZE,C1; | |||||
DECQ i; | DECQ i; | ||||
JG .L1_bodyB; | JG .L1_bodyB; | ||||
JMP .L1_loopE; | JMP .L1_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L4_loopEx:; | .L4_loopEx:; | ||||
EXTRA_DY $1, yvec15, xvec7; | EXTRA_DY $1, yvec15, xvec7; | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
@@ -776,11 +777,11 @@ ADDQ $8*SIZE, C0; | |||||
ADDQ $8*SIZE, C1; | ADDQ $8*SIZE, C1; | ||||
DECQ i; | DECQ i; | ||||
JG .L1_bodyB; | JG .L1_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L1_loopE:; | .L1_loopE:; | ||||
TEST $4, bm; # Rm = 4 | TEST $4, bm; # Rm = 4 | ||||
JLE .L5_loopE; | JLE .L5_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L5_bodyB:; | .L5_bodyB:; | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -816,7 +817,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L6_loopE; | JLE .L6_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L6_bodyB:; | .L6_bodyB:; | ||||
# Computing kernel | # Computing kernel | ||||
@@ -887,7 +888,7 @@ MUL_DY yvec1, yvec5, yvec7; | |||||
ADD_DY yvec9, yvec7, yvec9; | ADD_DY yvec9, yvec7, yvec9; | ||||
DECQ k; | DECQ k; | ||||
JG .L6_bodyB; | JG .L6_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L6_loopE:; | .L6_loopE:; | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -896,7 +897,7 @@ MOVQ kkk, %rax; | |||||
TEST $2, %rax; | TEST $2, %rax; | ||||
#endif | #endif | ||||
JLE .L7_loopE; | JLE .L7_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L7_bodyB:; | .L7_bodyB:; | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
LD_DY 4*SIZE(ptrba), yvec1; | LD_DY 4*SIZE(ptrba), yvec1; | ||||
@@ -940,7 +941,7 @@ MOVQ kkk, %rax; | |||||
TEST $1, %rax; | TEST $1, %rax; | ||||
#endif | #endif | ||||
JLE .L8_loopE; | JLE .L8_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L8_bodyB:; | .L8_bodyB:; | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
MUL_DY yvec0, yvec2, yvec6; | MUL_DY yvec0, yvec2, yvec6; | ||||
@@ -977,7 +978,7 @@ MOVQ C0, %rax; | |||||
OR ldc, %rax; | OR ldc, %rax; | ||||
TEST $15, %rax; | TEST $15, %rax; | ||||
JNE .L8_loopEx; # Unalign part write back | JNE .L8_loopEx; # Unalign part write back | ||||
.align 32 | |||||
ALIGN_5 | |||||
#### Writing Back #### | #### Writing Back #### | ||||
EXTRA_DY $1,yvec15,xvec7; | EXTRA_DY $1,yvec15,xvec7; | ||||
EXTRA_DY $1,yvec13,xvec5; | EXTRA_DY $1,yvec13,xvec5; | ||||
@@ -1014,7 +1015,7 @@ ADDQ $4, kk | |||||
ADDQ $4*SIZE, C0; | ADDQ $4*SIZE, C0; | ||||
ADDQ $4*SIZE, C1; | ADDQ $4*SIZE, C1; | ||||
JMP .L5_loopE; | JMP .L5_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L8_loopEx:; | .L8_loopEx:; | ||||
EXTRA_DY $1,yvec15,xvec7; | EXTRA_DY $1,yvec15,xvec7; | ||||
EXTRA_DY $1,yvec13,xvec5; | EXTRA_DY $1,yvec13,xvec5; | ||||
@@ -1080,7 +1081,7 @@ ADDQ $4*SIZE, C1; | |||||
.L5_loopE:; | .L5_loopE:; | ||||
TEST $2, bm; | TEST $2, bm; | ||||
JLE .L9_loopE; | JLE .L9_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L9_bodyB:; | .L9_bodyB:; | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -1117,7 +1118,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L10_loopE; | JLE .L10_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L10_bodyB:; | .L10_bodyB:; | ||||
# Computing kernel | # Computing kernel | ||||
@@ -1192,7 +1193,7 @@ MUL_DX xvec1, xvec5; | |||||
ADD_DX xvec5, xvec9; | ADD_DX xvec5, xvec9; | ||||
DECQ k; | DECQ k; | ||||
JG .L10_bodyB; | JG .L10_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L10_loopE:; | .L10_loopE:; | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk | TEST $2, bk | ||||
@@ -1201,7 +1202,7 @@ MOVQ kkk, %rax; | |||||
TEST $2, %rax; | TEST $2, %rax; | ||||
#endif | #endif | ||||
JLE .L11_loopE; | JLE .L11_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L11_bodyB:; | .L11_bodyB:; | ||||
##### Unroll time 1 #### | ##### Unroll time 1 #### | ||||
LD_DX 4*SIZE(ptrbb), xvec6; | LD_DX 4*SIZE(ptrbb), xvec6; | ||||
@@ -1248,7 +1249,7 @@ MOVQ kkk, %rax; | |||||
TEST $1, %rax; | TEST $1, %rax; | ||||
#endif | #endif | ||||
JLE .L12_loopE; | JLE .L12_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L12_bodyB:; | .L12_bodyB:; | ||||
SHUF_DX $0x4e, xvec3, xvec5; | SHUF_DX $0x4e, xvec3, xvec5; | ||||
MUL_DX xvec0, xvec2; | MUL_DX xvec0, xvec2; | ||||
@@ -1285,7 +1286,7 @@ MOVQ C0, %rax; | |||||
OR ldc, %rax; | OR ldc, %rax; | ||||
TEST $15, %rax; | TEST $15, %rax; | ||||
JNE .L12_loopEx; | JNE .L12_loopEx; | ||||
.align 32 | |||||
ALIGN_5 | |||||
#### Writing Back #### | #### Writing Back #### | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
ADD_DX 0*SIZE(C0), xvec13; | ADD_DX 0*SIZE(C0), xvec13; | ||||
@@ -1310,7 +1311,7 @@ ADDQ $2, kk | |||||
ADDQ $2*SIZE, C0 | ADDQ $2*SIZE, C0 | ||||
ADDQ $2*SIZE, C1 | ADDQ $2*SIZE, C1 | ||||
JMP .L9_loopE; | JMP .L9_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L12_loopEx: | .L12_loopEx: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
LDL_DX 0*SIZE(C0), xvec14; | LDL_DX 0*SIZE(C0), xvec14; | ||||
@@ -1349,7 +1350,7 @@ ADDQ $2*SIZE, C1; | |||||
.L9_loopE:; | .L9_loopE:; | ||||
TEST $1, bm | TEST $1, bm | ||||
JLE .L13_loopE; | JLE .L13_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L13_bodyB:; | .L13_bodyB:; | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -1379,7 +1380,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L14_loopE; | JLE .L14_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L14_bodyB:; | .L14_bodyB:; | ||||
BROAD_DY 0*SIZE(ptrba), yvec0; | BROAD_DY 0*SIZE(ptrba), yvec0; | ||||
LD_DY 0*SIZE(ptrbb), yvec2; | LD_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -1404,7 +1405,7 @@ ADDQ $4*SIZE, ptrba; | |||||
ADDQ $16*SIZE, ptrbb; | ADDQ $16*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L14_bodyB; | JG .L14_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L14_loopE: | .L14_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -1413,7 +1414,7 @@ MOVQ kkk, %rax; | |||||
TEST $2, %rax; | TEST $2, %rax; | ||||
#endif | #endif | ||||
JLE .L15_loopE; | JLE .L15_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L15_bodyB: | .L15_bodyB: | ||||
BROAD_DY 0*SIZE(ptrba), yvec0; | BROAD_DY 0*SIZE(ptrba), yvec0; | ||||
LD_DY 0*SIZE(ptrbb), yvec2; | LD_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -1434,7 +1435,7 @@ MOVQ kkk, %rax; | |||||
TEST $1, %rax; | TEST $1, %rax; | ||||
#endif | #endif | ||||
JLE .L16_loopE; | JLE .L16_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L16_bodyB:; | .L16_bodyB:; | ||||
BROAD_DY 0*SIZE(ptrba), yvec0; | BROAD_DY 0*SIZE(ptrba), yvec0; | ||||
LD_DY 0*SIZE(ptrbb), yvec2; | LD_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -1485,11 +1486,11 @@ LEAQ (C,ldc,4),C; | |||||
.L0_bodyE:; | .L0_bodyE:; | ||||
DECQ j; | DECQ j; | ||||
JG .L0_bodyB; | JG .L0_bodyB; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L0_loopE:; | .L0_loopE:; | ||||
TEST $2, bn; | TEST $2, bn; | ||||
JLE .L20_loopE; | JLE .L20_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L20_loopB:; | .L20_loopB:; | ||||
#if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
MOVQ OFFSET, %rax; | MOVQ OFFSET, %rax; | ||||
@@ -1501,7 +1502,7 @@ MOVQ ba, ptrba; | |||||
MOVQ bm, i; | MOVQ bm, i; | ||||
SARQ $3, i; # Rm = 8 | SARQ $3, i; # Rm = 8 | ||||
JLE .L21_loopE; | JLE .L21_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L21_bodyB:; | .L21_bodyB:; | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -1538,7 +1539,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L211_loopE; | JLE .L211_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L211_bodyB: | .L211_bodyB: | ||||
# Computing kernel | # Computing kernel | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
@@ -1692,7 +1693,7 @@ MUL_DX xvec3, xvec7; | |||||
ADD_DX xvec7, xvec8; | ADD_DX xvec7, xvec8; | ||||
DECQ k; | DECQ k; | ||||
JG .L211_bodyB; | JG .L211_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L211_loopE: | .L211_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -1701,7 +1702,7 @@ MOVQ kkk, %rax; | |||||
TEST $2, %rax; | TEST $2, %rax; | ||||
#endif | #endif | ||||
JLE .L212_loopE; | JLE .L212_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L212_bodyB: | .L212_bodyB: | ||||
# Computing kernel | # Computing kernel | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
@@ -1788,7 +1789,7 @@ MOVQ kkk, %rax; | |||||
TEST $1, %rax; | TEST $1, %rax; | ||||
#endif | #endif | ||||
JLE .L213_loopE; | JLE .L213_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L213_bodyB: | .L213_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
LD_DX 0*SIZE(ptrba), xvec0; | LD_DX 0*SIZE(ptrba), xvec0; | ||||
@@ -1858,7 +1859,7 @@ MOVQ C0, %rax; | |||||
OR ldc, %rax; | OR ldc, %rax; | ||||
TEST $15, %rax; | TEST $15, %rax; | ||||
JNE .L213_loopEx; | JNE .L213_loopEx; | ||||
.align 32 | |||||
ALIGN_5 | |||||
#### Writing Back #### | #### Writing Back #### | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
ADD_DX 0*SIZE(C0), xvec11; | ADD_DX 0*SIZE(C0), xvec11; | ||||
@@ -1893,7 +1894,7 @@ ADDQ $8*SIZE, C1; | |||||
DECQ i; | DECQ i; | ||||
JG .L21_bodyB; | JG .L21_bodyB; | ||||
JMP .L21_loopE; | JMP .L21_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L213_loopEx:; | .L213_loopEx:; | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
LDL_DX 0*SIZE(C0), xvec0; | LDL_DX 0*SIZE(C0), xvec0; | ||||
@@ -1956,7 +1957,7 @@ JG .L21_bodyB; | |||||
.L21_loopE:; | .L21_loopE:; | ||||
TEST $4, bm; # Rm = 4 | TEST $4, bm; # Rm = 4 | ||||
JLE .L22_loopE; | JLE .L22_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L22_bodyB:; | .L22_bodyB:; | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -1989,7 +1990,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L221_loopE; | JLE .L221_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L221_bodyB:; | .L221_bodyB:; | ||||
# Computing kernel | # Computing kernel | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
@@ -2071,7 +2072,7 @@ MUL_DX xvec1, xvec5; | |||||
ADD_DX xvec5, xvec10; | ADD_DX xvec5, xvec10; | ||||
DECQ k; | DECQ k; | ||||
JG .L221_bodyB; | JG .L221_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L221_loopE:; | .L221_loopE:; | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -2080,7 +2081,7 @@ MOVQ kkk, %rax; | |||||
TEST $2, %rax; | TEST $2, %rax; | ||||
#endif | #endif | ||||
JLE .L222_loopE; | JLE .L222_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L222_bodyB: | .L222_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
LD_DX 0*SIZE(ptrba), xvec0; | LD_DX 0*SIZE(ptrba), xvec0; | ||||
@@ -2129,7 +2130,7 @@ MOVQ kkk, %rax; | |||||
TEST $1, %rax; | TEST $1, %rax; | ||||
#endif | #endif | ||||
JLE .L223_loopE; | JLE .L223_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L223_bodyB: | .L223_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
LD_DX 0*SIZE(ptrba), xvec0; | LD_DX 0*SIZE(ptrba), xvec0; | ||||
@@ -2171,7 +2172,7 @@ MOVQ C0, %rax; | |||||
OR ldc, %rax; | OR ldc, %rax; | ||||
TEST $15, %rax; | TEST $15, %rax; | ||||
JNE .L223_loopEx; | JNE .L223_loopEx; | ||||
.align 32 | |||||
ALIGN_5 | |||||
#### Writing Back #### | #### Writing Back #### | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
ADD_DX 0*SIZE(C0), xvec11; | ADD_DX 0*SIZE(C0), xvec11; | ||||
@@ -2196,7 +2197,7 @@ ADDQ $4, kk | |||||
ADDQ $4*SIZE, C0; | ADDQ $4*SIZE, C0; | ||||
ADDQ $4*SIZE, C1; | ADDQ $4*SIZE, C1; | ||||
JMP .L22_loopE; | JMP .L22_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L223_loopEx:; | .L223_loopEx:; | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
LDL_DX 0*SIZE(C0), xvec0; | LDL_DX 0*SIZE(C0), xvec0; | ||||
@@ -2237,7 +2238,7 @@ ADDQ $4*SIZE, C1; | |||||
.L22_loopE:; | .L22_loopE:; | ||||
TEST $2, bm; # Rm = 2 | TEST $2, bm; # Rm = 2 | ||||
JLE .L23_loopE; | JLE .L23_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L23_bodyB: | .L23_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -2267,7 +2268,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L231_loopE; | JLE .L231_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L231_bodyB: | .L231_bodyB: | ||||
# Computing kernel | # Computing kernel | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
@@ -2309,7 +2310,7 @@ ADD_DX xvec5, xvec11; | |||||
ADDQ $8*SIZE, ptrbb; | ADDQ $8*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L231_bodyB; | JG .L231_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L231_loopE: | .L231_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -2318,7 +2319,7 @@ MOVQ kkk, %rax; | |||||
TEST $2, %rax; | TEST $2, %rax; | ||||
#endif | #endif | ||||
JLE .L232_loopE; | JLE .L232_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L232_bodyB: | .L232_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
LD_DX 0*SIZE(ptrba), xvec0; | LD_DX 0*SIZE(ptrba), xvec0; | ||||
@@ -2347,7 +2348,7 @@ MOVQ kkk, %rax; | |||||
TEST $1, %rax; | TEST $1, %rax; | ||||
#endif | #endif | ||||
JLE .L233_loopE; | JLE .L233_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L233_bodyB: | .L233_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
LD_DX 0*SIZE(ptrba), xvec0; | LD_DX 0*SIZE(ptrba), xvec0; | ||||
@@ -2373,7 +2374,7 @@ MOVQ C0, %rax; | |||||
OR ldc, %rax; | OR ldc, %rax; | ||||
TEST $15, %rax; | TEST $15, %rax; | ||||
JNE .L233_loopEx; | JNE .L233_loopEx; | ||||
.align 32 | |||||
ALIGN_5 | |||||
#### Writing Back #### | #### Writing Back #### | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
ADD_DX 0*SIZE(C0), xvec11; | ADD_DX 0*SIZE(C0), xvec11; | ||||
@@ -2394,7 +2395,7 @@ ADDQ $2, kk; | |||||
ADDQ $2*SIZE, C0; | ADDQ $2*SIZE, C0; | ||||
ADDQ $2*SIZE, C1; | ADDQ $2*SIZE, C1; | ||||
JMP .L23_loopE; | JMP .L23_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L233_loopEx:; | .L233_loopEx:; | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
LDL_DX 0*SIZE(C0), xvec0; | LDL_DX 0*SIZE(C0), xvec0; | ||||
@@ -2425,7 +2426,7 @@ ADDQ $2*SIZE, C1; | |||||
.L23_loopE: | .L23_loopE: | ||||
TEST $1, bm; # Rm = 1 | TEST $1, bm; # Rm = 1 | ||||
JLE .L24_loopE; | JLE .L24_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L24_bodyB: | .L24_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -2454,7 +2455,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L241_loopE; | JLE .L241_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L241_bodyB: | .L241_bodyB: | ||||
BROAD_DX 0*SIZE(ptrba), xvec0; | BROAD_DX 0*SIZE(ptrba), xvec0; | ||||
LD_DX 0*SIZE(ptrbb), xvec2; | LD_DX 0*SIZE(ptrbb), xvec2; | ||||
@@ -2479,7 +2480,7 @@ ADDQ $4*SIZE, ptrba; | |||||
ADDQ $8*SIZE, ptrbb; | ADDQ $8*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L241_bodyB; | JG .L241_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L241_loopE: | .L241_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -2488,7 +2489,7 @@ MOVQ kkk, %rax; | |||||
TEST $2, %rax; | TEST $2, %rax; | ||||
#endif | #endif | ||||
JLE .L242_loopE; | JLE .L242_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L242_bodyB: | .L242_bodyB: | ||||
BROAD_DX 0*SIZE(ptrba), xvec0; | BROAD_DX 0*SIZE(ptrba), xvec0; | ||||
LD_DX 0*SIZE(ptrbb), xvec2; | LD_DX 0*SIZE(ptrbb), xvec2; | ||||
@@ -2509,7 +2510,7 @@ MOVQ kkk, %rax; | |||||
TEST $1, %rax; | TEST $1, %rax; | ||||
#endif | #endif | ||||
JLE .L243_loopE; | JLE .L243_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L243_bodyB: | .L243_bodyB: | ||||
BROAD_DX 0*SIZE(ptrba), xvec0; | BROAD_DX 0*SIZE(ptrba), xvec0; | ||||
LD_DX 0*SIZE(ptrbb), xvec2; | LD_DX 0*SIZE(ptrbb), xvec2; | ||||
@@ -2551,7 +2552,7 @@ LEAQ (C, ldc, 2), C; | |||||
.L20_loopE:; | .L20_loopE:; | ||||
TEST $1, bn; # Rn = 1 | TEST $1, bn; # Rn = 1 | ||||
JLE .L30_loopE; | JLE .L30_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L30_bodyB: | .L30_bodyB: | ||||
#if defined(TRMMKERNEL)&&defined(LEFT) | #if defined(TRMMKERNEL)&&defined(LEFT) | ||||
MOVQ OFFSET, %rax; | MOVQ OFFSET, %rax; | ||||
@@ -2562,7 +2563,7 @@ MOVQ ba, ptrba; | |||||
MOVQ bm, i; | MOVQ bm, i; | ||||
SARQ $3, i; | SARQ $3, i; | ||||
JLE .L31_loopE; | JLE .L31_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L31_bodyB: | .L31_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -2593,7 +2594,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L311_loopE; | JLE .L311_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L311_bodyB: | .L311_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
LD_DY 0*SIZE(ptrba), yvec0; | LD_DY 0*SIZE(ptrba), yvec0; | ||||
@@ -2634,7 +2635,7 @@ ADD_DY yvec4, yvec14, yvec14; | |||||
ADDQ $4*SIZE, ptrbb; | ADDQ $4*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L311_bodyB; | JG .L311_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L311_loopE: | .L311_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -2643,7 +2644,7 @@ MOVQ kkk, %rax; | |||||
TEST $2, %rax; | TEST $2, %rax; | ||||
#endif | #endif | ||||
JLE .L312_loopE; | JLE .L312_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L312_bodyB: | .L312_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
LD_DY 0*SIZE(ptrba), yvec0; | LD_DY 0*SIZE(ptrba), yvec0; | ||||
@@ -2673,7 +2674,7 @@ MOVQ kkk, %rax; | |||||
TEST $1, %rax; | TEST $1, %rax; | ||||
#endif | #endif | ||||
JLE .L313_loopE; | JLE .L313_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L313_bodyB: | .L313_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
LD_DY 0*SIZE(ptrba), yvec0; | LD_DY 0*SIZE(ptrba), yvec0; | ||||
@@ -2696,7 +2697,7 @@ MOVQ C0, %rax; | |||||
OR ldc, %rax; | OR ldc, %rax; | ||||
TEST $15, %rax; | TEST $15, %rax; | ||||
JNE .L313_loopEx; | JNE .L313_loopEx; | ||||
.align 32 | |||||
ALIGN_5 | |||||
#### Writing Back #### | #### Writing Back #### | ||||
EXTRA_DY $1, yvec15, xvec13; | EXTRA_DY $1, yvec15, xvec13; | ||||
EXTRA_DY $1, yvec14, xvec12; | EXTRA_DY $1, yvec14, xvec12; | ||||
@@ -2724,7 +2725,7 @@ ADDQ $8*SIZE, C0; | |||||
DECQ i; | DECQ i; | ||||
JG .L31_bodyB; | JG .L31_bodyB; | ||||
JMP .L31_loopE; | JMP .L31_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L313_loopEx: | .L313_loopEx: | ||||
EXTRA_DY $1, yvec15, xvec13; | EXTRA_DY $1, yvec15, xvec13; | ||||
EXTRA_DY $1, yvec14, xvec12; | EXTRA_DY $1, yvec14, xvec12; | ||||
@@ -2766,7 +2767,7 @@ JG .L31_bodyB; | |||||
.L31_loopE: | .L31_loopE: | ||||
TEST $4, bm | TEST $4, bm | ||||
JLE .L32_loopE; | JLE .L32_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L32_bodyB: | .L32_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -2796,7 +2797,7 @@ MOVQ %rax, kkk | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L321_loopE; | JLE .L321_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L321_bodyB: | .L321_bodyB: | ||||
LD_DY 0*SIZE(ptrba), yvec0; | LD_DY 0*SIZE(ptrba), yvec0; | ||||
BROAD_DY 0*SIZE(ptrbb), yvec1; | BROAD_DY 0*SIZE(ptrbb), yvec1; | ||||
@@ -2821,7 +2822,7 @@ ADDQ $16*SIZE, ptrba; | |||||
ADDQ $4*SIZE, ptrbb; | ADDQ $4*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L321_bodyB; | JG .L321_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L321_loopE: | .L321_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -2830,7 +2831,7 @@ MOVQ kkk, %rax; | |||||
TEST $2, %rax; | TEST $2, %rax; | ||||
#endif | #endif | ||||
JLE .L322_loopE; | JLE .L322_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L322_bodyB: | .L322_bodyB: | ||||
LD_DY 0*SIZE(ptrba), yvec0; | LD_DY 0*SIZE(ptrba), yvec0; | ||||
BROAD_DY 0*SIZE(ptrbb), yvec1; | BROAD_DY 0*SIZE(ptrbb), yvec1; | ||||
@@ -2852,7 +2853,7 @@ MOVQ kkk, %rax; | |||||
TEST $1, %rax; | TEST $1, %rax; | ||||
#endif | #endif | ||||
JLE .L323_loopE; | JLE .L323_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L323_bodyB: | .L323_bodyB: | ||||
LD_DY 0*SIZE(ptrba), yvec0; | LD_DY 0*SIZE(ptrba), yvec0; | ||||
BROAD_DY 0*SIZE(ptrbb), yvec1; | BROAD_DY 0*SIZE(ptrbb), yvec1; | ||||
@@ -2870,7 +2871,7 @@ MOVQ C0, %rax; | |||||
OR ldc, %rax; | OR ldc, %rax; | ||||
TEST $15, %rax; | TEST $15, %rax; | ||||
JNE .L323_loopEx; | JNE .L323_loopEx; | ||||
.align 32 | |||||
ALIGN_5 | |||||
#### Writing Back #### | #### Writing Back #### | ||||
EXTRA_DY $1, yvec15, xvec14; | EXTRA_DY $1, yvec15, xvec14; | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
@@ -2891,7 +2892,7 @@ ADDQ $4, kk | |||||
#endif | #endif | ||||
ADDQ $4*SIZE, C0; | ADDQ $4*SIZE, C0; | ||||
JMP .L32_loopE; | JMP .L32_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L323_loopEx: | .L323_loopEx: | ||||
#### Writing Back #### | #### Writing Back #### | ||||
EXTRA_DY $1, yvec15, xvec14; | EXTRA_DY $1, yvec15, xvec14; | ||||
@@ -2921,7 +2922,7 @@ ADDQ $4*SIZE, C0; | |||||
.L32_loopE: | .L32_loopE: | ||||
TEST $2, bm | TEST $2, bm | ||||
JLE .L33_loopE; | JLE .L33_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L33_bodyB: | .L33_bodyB: | ||||
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -2951,7 +2952,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L331_loopE; | JLE .L331_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L331_bodyB: | .L331_bodyB: | ||||
LD_DX 0*SIZE(ptrba), xvec0; | LD_DX 0*SIZE(ptrba), xvec0; | ||||
BROAD_DX 0*SIZE(ptrbb), xvec2; | BROAD_DX 0*SIZE(ptrbb), xvec2; | ||||
@@ -2976,7 +2977,7 @@ ADDQ $8*SIZE, ptrba; | |||||
ADDQ $4*SIZE, ptrbb; | ADDQ $4*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L331_bodyB; | JG .L331_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L331_loopE: | .L331_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2,bk; | TEST $2,bk; | ||||
@@ -2985,7 +2986,7 @@ MOVQ kkk, %rax; | |||||
TEST $2, %rax | TEST $2, %rax | ||||
#endif | #endif | ||||
JLE .L332_loopE; | JLE .L332_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L332_bodyB: | .L332_bodyB: | ||||
LD_DX 0*SIZE(ptrba), xvec0; | LD_DX 0*SIZE(ptrba), xvec0; | ||||
BROAD_DX 0*SIZE(ptrbb), xvec2; | BROAD_DX 0*SIZE(ptrbb), xvec2; | ||||
@@ -3006,7 +3007,7 @@ MOVQ kkk, %rax; | |||||
TEST $1, %rax; | TEST $1, %rax; | ||||
#endif | #endif | ||||
JLE .L333_loopE; | JLE .L333_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L333_bodyB: | .L333_bodyB: | ||||
LD_DX 0*SIZE(ptrba), xvec0; | LD_DX 0*SIZE(ptrba), xvec0; | ||||
BROAD_DX 0*SIZE(ptrbb), xvec2; | BROAD_DX 0*SIZE(ptrbb), xvec2; | ||||
@@ -3039,7 +3040,7 @@ ADDQ $2*SIZE, C0; | |||||
.L33_loopE: | .L33_loopE: | ||||
TEST $1, bm | TEST $1, bm | ||||
JLE .L34_loopE; | JLE .L34_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L34_bodyB: | .L34_bodyB: | ||||
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -3068,7 +3069,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L341_loopE; | JLE .L341_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L341_bodyB: | .L341_bodyB: | ||||
movsd 0*SIZE(ptrba), xvec0; | movsd 0*SIZE(ptrba), xvec0; | ||||
movsd 0*SIZE(ptrbb), xvec1; | movsd 0*SIZE(ptrbb), xvec1; | ||||
@@ -3093,7 +3094,7 @@ addq $4*SIZE, ptrba; | |||||
addq $4*SIZE, ptrbb; | addq $4*SIZE, ptrbb; | ||||
decq k; | decq k; | ||||
JG .L341_bodyB; | JG .L341_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L341_loopE: | .L341_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -3102,7 +3103,7 @@ MOVQ kkk, %rax; | |||||
TEST $2, %rax; | TEST $2, %rax; | ||||
#endif | #endif | ||||
JLE .L342_loopE; | JLE .L342_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L342_bodyB: | .L342_bodyB: | ||||
movsd 0*SIZE(ptrba), xvec0; | movsd 0*SIZE(ptrba), xvec0; | ||||
movsd 0*SIZE(ptrbb), xvec1; | movsd 0*SIZE(ptrbb), xvec1; | ||||
@@ -3124,7 +3125,7 @@ MOVQ kkk, %rax; | |||||
TEST $1, %rax; | TEST $1, %rax; | ||||
#endif | #endif | ||||
JLE .L343_loopE; | JLE .L343_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L343_bodyB: | .L343_bodyB: | ||||
movsd 0*SIZE(ptrba), xvec0; | movsd 0*SIZE(ptrba), xvec0; | ||||
movsd 0*SIZE(ptrbb), xvec1; | movsd 0*SIZE(ptrbb), xvec1; | ||||
@@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define JMP jmp | #define JMP jmp | ||||
#define NOP | #define NOP | ||||
#define XOR xorpd | #define XOR xorpd | ||||
#undef MOVQ | |||||
#define MOVQ movq | #define MOVQ movq | ||||
#define XOR_SY vxorps | #define XOR_SY vxorps | ||||
@@ -273,7 +274,7 @@ movq %r11, kk | |||||
MOVQ bn,j; | MOVQ bn,j; | ||||
SARQ $3,j; | SARQ $3,j; | ||||
JLE .L0_loopE; | JLE .L0_loopE; | ||||
.align 16; | |||||
ALIGN_4; | |||||
.L0_bodyB:; | .L0_bodyB:; | ||||
#if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
MOVQ OFFSET, %rax; | MOVQ OFFSET, %rax; | ||||
@@ -289,7 +290,7 @@ MOVQ ba,ptrba; | |||||
MOVQ bm,i; | MOVQ bm,i; | ||||
SARQ $3,i; | SARQ $3,i; | ||||
JLE .L1_loopE; | JLE .L1_loopE; | ||||
.align 16; | |||||
ALIGN_4; | |||||
.L1_bodyB:; | .L1_bodyB:; | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -342,7 +343,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2,k; | SARQ $2,k; | ||||
JLE .L2_loopE; | JLE .L2_loopE; | ||||
.align 16; | |||||
ALIGN_4; | |||||
.L2_bodyB:; | .L2_bodyB:; | ||||
# Computing kernel | # Computing kernel | ||||
@@ -472,7 +473,7 @@ ADD_SY yvec8, yvec7, yvec8; | |||||
.L2_bodyE:; | .L2_bodyE:; | ||||
DECQ k; | DECQ k; | ||||
JG .L2_bodyB; | JG .L2_bodyB; | ||||
.align 64; | |||||
ALIGN_4 | |||||
.L2_loopE:; | .L2_loopE:; | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -480,7 +481,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L3_loopE; | JLE .L3_loopE; | ||||
.align 64 | |||||
ALIGN_4 | |||||
.L3_loobB: | .L3_loobB: | ||||
#### Unroll times 1 #### | #### Unroll times 1 #### | ||||
MUL_SY yvec0, yvec2, yvec6; | MUL_SY yvec0, yvec2, yvec6; | ||||
@@ -550,7 +551,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L4_loopE; | JLE .L4_loopE; | ||||
.align 64 | |||||
ALIGN_4 | |||||
.L4_loopB:; | .L4_loopB:; | ||||
#### Unroll times 1 #### | #### Unroll times 1 #### | ||||
MUL_SY yvec0, yvec2, yvec6; | MUL_SY yvec0, yvec2, yvec6; | ||||
@@ -609,7 +610,7 @@ MOVQ C0, %rax; | |||||
OR ldc, %rax; | OR ldc, %rax; | ||||
TEST $15, %rax; | TEST $15, %rax; | ||||
JNE .L4_loopEx; | JNE .L4_loopEx; | ||||
.align 16 | |||||
ALIGN_4 | |||||
LEAQ (ldc,ldc,2),%rax; | LEAQ (ldc,ldc,2),%rax; | ||||
EXTRA_SY $1,yvec15,xvec7; | EXTRA_SY $1,yvec15,xvec7; | ||||
EXTRA_SY $1,yvec14,xvec6; | EXTRA_SY $1,yvec14,xvec6; | ||||
@@ -669,7 +670,7 @@ ADDQ $8*SIZE,C1; | |||||
DECQ i; | DECQ i; | ||||
JG .L1_bodyB; | JG .L1_bodyB; | ||||
JMP .L1_loopE; | JMP .L1_loopE; | ||||
.align 16; | |||||
ALIGN_4; | |||||
.L4_loopEx: | .L4_loopEx: | ||||
LEAQ (ldc,ldc,2),%rax; | LEAQ (ldc,ldc,2),%rax; | ||||
EXTRA_SY $1, yvec15, xvec7; | EXTRA_SY $1, yvec15, xvec7; | ||||
@@ -813,11 +814,11 @@ ADDQ $8*SIZE, C0; | |||||
ADDQ $8*SIZE, C1; | ADDQ $8*SIZE, C1; | ||||
DECQ i; | DECQ i; | ||||
JG .L1_bodyB; | JG .L1_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L1_loopE:; | .L1_loopE:; | ||||
TEST $4, bm; | TEST $4, bm; | ||||
JLE .L5_loopE; | JLE .L5_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L5_bodyB: | .L5_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -857,7 +858,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L8_loopE; | JLE .L8_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L8_bodyB: | .L8_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
@@ -983,7 +984,7 @@ MUL_SX xvec1, xvec5; | |||||
ADD_SX xvec5, xvec8; | ADD_SX xvec5, xvec8; | ||||
DECQ k; | DECQ k; | ||||
JG .L8_bodyB; | JG .L8_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L8_loopE: | .L8_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -991,7 +992,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L9_loopE; | JLE .L9_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L9_bodyB: | .L9_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
SHUF_SX $0x4e, xvec2, xvec4; | SHUF_SX $0x4e, xvec2, xvec4; | ||||
@@ -1062,7 +1063,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L10_loopE; | JLE .L10_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L10_bodyB: | .L10_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
SHUF_SX $0x4e, xvec2, xvec4; | SHUF_SX $0x4e, xvec2, xvec4; | ||||
@@ -1122,7 +1123,7 @@ MOVQ C0, %rax; | |||||
OR ldc, %rax; | OR ldc, %rax; | ||||
TEST $15, %rax; | TEST $15, %rax; | ||||
JNE .L10_loopEx; | JNE .L10_loopEx; | ||||
.align 16 | |||||
ALIGN_4 | |||||
LEAQ (ldc,ldc,2),%rax; | LEAQ (ldc,ldc,2),%rax; | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
ADD_SX 0*SIZE(C0), xvec15; | ADD_SX 0*SIZE(C0), xvec15; | ||||
@@ -1155,7 +1156,7 @@ ADDQ $4, kk | |||||
ADDQ $4*SIZE, C0; | ADDQ $4*SIZE, C0; | ||||
ADDQ $4*SIZE, C1; | ADDQ $4*SIZE, C1; | ||||
JMP .L5_loopE; | JMP .L5_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L10_loopEx: | .L10_loopEx: | ||||
LEAQ (ldc,ldc,2),%rax; | LEAQ (ldc,ldc,2),%rax; | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
@@ -1215,7 +1216,7 @@ ADDQ $4*SIZE, C1; | |||||
.L5_loopE: | .L5_loopE: | ||||
TEST $2, bm; | TEST $2, bm; | ||||
JLE .L6_loopE; | JLE .L6_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L6_bodyB: | .L6_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -1249,7 +1250,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L11_loopE; | JLE .L11_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L11_bodyB: | .L11_bodyB: | ||||
#### Computing kernel | #### Computing kernel | ||||
LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 | LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 | ||||
@@ -1318,7 +1319,7 @@ ADDQ $8*SIZE, ptrba; | |||||
ADDQ $32*SIZE, ptrbb; | ADDQ $32*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L11_bodyB; | JG .L11_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L11_loopE: | .L11_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -1326,7 +1327,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L12_loopE; | JLE .L12_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L12_bodyB: | .L12_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 | LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 | ||||
SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 | SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 | ||||
@@ -1368,7 +1369,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L13_loopE; | JLE .L13_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L13_bodyB: | .L13_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 | LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 | ||||
SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 | SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 | ||||
@@ -1433,7 +1434,7 @@ ADDQ $2*SIZE, C1; | |||||
.L6_loopE: | .L6_loopE: | ||||
TEST $1, bm; | TEST $1, bm; | ||||
JLE .L7_loopE; | JLE .L7_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L7_bodyB: | .L7_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -1465,7 +1466,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L14_loopE; | JLE .L14_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L14_bodyB: | .L14_bodyB: | ||||
BROAD_SX 0*SIZE(ptrba), xvec0; | BROAD_SX 0*SIZE(ptrba), xvec0; | ||||
LD_SX 0*SIZE(ptrbb), xvec2; | LD_SX 0*SIZE(ptrbb), xvec2; | ||||
@@ -1503,7 +1504,7 @@ ADDQ $4*SIZE, ptrba; | |||||
ADDQ $32*SIZE, ptrbb; | ADDQ $32*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L14_bodyB; | JG .L14_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L14_loopE: | .L14_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -1511,7 +1512,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L15_loopE; | JLE .L15_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L15_bodyB: | .L15_bodyB: | ||||
BROAD_SX 0*SIZE(ptrba), xvec0; | BROAD_SX 0*SIZE(ptrba), xvec0; | ||||
LD_SX 0*SIZE(ptrbb), xvec2; | LD_SX 0*SIZE(ptrbb), xvec2; | ||||
@@ -1538,7 +1539,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L16_loopE; | JLE .L16_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L16_bodyB: | .L16_bodyB: | ||||
BROAD_SX 0*SIZE(ptrba), xvec0; | BROAD_SX 0*SIZE(ptrba), xvec0; | ||||
LD_SX 0*SIZE(ptrbb), xvec2; | LD_SX 0*SIZE(ptrbb), xvec2; | ||||
@@ -1611,11 +1612,11 @@ LEAQ (C,ldc,8),C; | |||||
.L0_bodyE:; | .L0_bodyE:; | ||||
DECQ j; | DECQ j; | ||||
JG .L0_bodyB; | JG .L0_bodyB; | ||||
.align 16; | |||||
ALIGN_4; | |||||
.L0_loopE:; | .L0_loopE:; | ||||
TEST $4, bn; # Rn = 4 | TEST $4, bn; # Rn = 4 | ||||
JLE .L20_loopE; | JLE .L20_loopE; | ||||
.align 16; | |||||
ALIGN_4; | |||||
.L20_bodyB: | .L20_bodyB: | ||||
#if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
MOVQ OFFSET, %rax; | MOVQ OFFSET, %rax; | ||||
@@ -1628,7 +1629,7 @@ MOVQ ba, ptrba; | |||||
MOVQ bm, i; | MOVQ bm, i; | ||||
SARQ $3, i; | SARQ $3, i; | ||||
JLE .L21_loopE; | JLE .L21_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L21_bodyB: | .L21_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -1668,7 +1669,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2,k; | SARQ $2,k; | ||||
JLE .L211_loopE; | JLE .L211_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L211_bodyB: | .L211_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
ODUP_SX 0*SIZE(ptrbb), xvec3; | ODUP_SX 0*SIZE(ptrbb), xvec3; | ||||
@@ -1800,7 +1801,7 @@ ADD_SX xvec7, xvec8; | |||||
LD_SX 4*SIZE(ptrba), xvec1; | LD_SX 4*SIZE(ptrba), xvec1; | ||||
DECQ k; | DECQ k; | ||||
JG .L211_bodyB; | JG .L211_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L211_loopE: | .L211_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk | TEST $2, bk | ||||
@@ -1808,7 +1809,7 @@ TEST $2, bk | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L212_loopE; | JLE .L212_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L212_bodyB: | .L212_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
ODUP_SX 0*SIZE(ptrbb), xvec3; | ODUP_SX 0*SIZE(ptrbb), xvec3; | ||||
@@ -1882,7 +1883,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L213_loopE; | JLE .L213_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L213_bodyB: | .L213_bodyB: | ||||
ODUP_SX 0*SIZE(ptrbb), xvec3; | ODUP_SX 0*SIZE(ptrbb), xvec3; | ||||
SHUF_SX $0x4e, xvec2, xvec4; | SHUF_SX $0x4e, xvec2, xvec4; | ||||
@@ -1982,11 +1983,11 @@ ADDQ $8*SIZE, C0; | |||||
ADDQ $8*SIZE, C1; | ADDQ $8*SIZE, C1; | ||||
DECQ i; | DECQ i; | ||||
JG .L21_bodyB; | JG .L21_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L21_loopE: | .L21_loopE: | ||||
TEST $4, bm; | TEST $4, bm; | ||||
JLE .L22_loopE; | JLE .L22_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L22_bodyB: | .L22_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -2019,7 +2020,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L221_loopE; | JLE .L221_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L221_bodyB: | .L221_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; | LD_SX 0*SIZE(ptrba), xvec0; | ||||
EDUP_SX 0*SIZE(ptrbb), xvec2; | EDUP_SX 0*SIZE(ptrbb), xvec2; | ||||
@@ -2089,7 +2090,7 @@ ADDQ $16*SIZE, ptrbb; | |||||
DECQ k; | DECQ k; | ||||
JG .L221_bodyB; | JG .L221_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L221_loopE: | .L221_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -2097,7 +2098,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L222_loopE; | JLE .L222_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L222_bodyB: | .L222_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; | LD_SX 0*SIZE(ptrba), xvec0; | ||||
EDUP_SX 0*SIZE(ptrbb), xvec2; | EDUP_SX 0*SIZE(ptrbb), xvec2; | ||||
@@ -2139,7 +2140,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L223_loopE; | JLE .L223_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L223_bodyB: | .L223_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; | LD_SX 0*SIZE(ptrba), xvec0; | ||||
EDUP_SX 0*SIZE(ptrbb), xvec2; | EDUP_SX 0*SIZE(ptrbb), xvec2; | ||||
@@ -2203,7 +2204,7 @@ ADDQ $4*SIZE, C1; | |||||
.L22_loopE: | .L22_loopE: | ||||
TEST $2, bm; | TEST $2, bm; | ||||
JLE .L23_loopE; | JLE .L23_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L23_bodyB: | .L23_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -2234,7 +2235,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L231_loopE; | JLE .L231_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L231_bodyB: | .L231_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; | LD_SX 0*SIZE(ptrba), xvec0; | ||||
EDUP_SX 0*SIZE(ptrbb), xvec4; | EDUP_SX 0*SIZE(ptrbb), xvec4; | ||||
@@ -2274,7 +2275,7 @@ ADDQ $8*SIZE, ptrba; | |||||
ADDQ $16*SIZE, ptrbb; | ADDQ $16*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L231_bodyB; | JG .L231_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L231_loopE: | .L231_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -2282,7 +2283,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L232_loopE; | JLE .L232_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L232_bodyB: | .L232_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; | LD_SX 0*SIZE(ptrba), xvec0; | ||||
EDUP_SX 0*SIZE(ptrbb), xvec4; | EDUP_SX 0*SIZE(ptrbb), xvec4; | ||||
@@ -2310,7 +2311,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L233_loopE; | JLE .L233_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L233_bodyB: | .L233_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; | LD_SX 0*SIZE(ptrba), xvec0; | ||||
EDUP_SX 0*SIZE(ptrbb), xvec4; | EDUP_SX 0*SIZE(ptrbb), xvec4; | ||||
@@ -2356,7 +2357,7 @@ ADDQ $2*SIZE, C1; | |||||
.L23_loopE: | .L23_loopE: | ||||
TEST $1, bm; | TEST $1, bm; | ||||
JLE .L24_loopE; | JLE .L24_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L24_bodyB: | .L24_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -2386,7 +2387,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L241_loopE; | JLE .L241_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L241_bodyB: | .L241_bodyB: | ||||
BROAD_SX 0*SIZE(ptrba), xvec0; | BROAD_SX 0*SIZE(ptrba), xvec0; | ||||
LD_SX 0*SIZE(ptrbb), xvec1; | LD_SX 0*SIZE(ptrbb), xvec1; | ||||
@@ -2419,7 +2420,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L242_loopE; | JLE .L242_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L242_bodyB: | .L242_bodyB: | ||||
BROAD_SX 0*SIZE(ptrba), xvec0; | BROAD_SX 0*SIZE(ptrba), xvec0; | ||||
LD_SX 0*SIZE(ptrbb), xvec1; | LD_SX 0*SIZE(ptrbb), xvec1; | ||||
@@ -2440,7 +2441,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L243_loopE; | JLE .L243_loopE; | ||||
.align 16; | |||||
ALIGN_4; | |||||
.L243_bodyB: | .L243_bodyB: | ||||
BROAD_SX 0*SIZE(ptrba), xvec0; | BROAD_SX 0*SIZE(ptrba), xvec0; | ||||
LD_SX 0*SIZE(ptrbb), xvec1; | LD_SX 0*SIZE(ptrbb), xvec1; | ||||
@@ -2491,7 +2492,7 @@ LEAQ (C, ldc, 4), C; | |||||
.L20_loopE: | .L20_loopE: | ||||
TEST $2, bn; | TEST $2, bn; | ||||
JLE .L30_loopE; | JLE .L30_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L30_bodyB: | .L30_bodyB: | ||||
#if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
MOVQ OFFSET, %rax; | MOVQ OFFSET, %rax; | ||||
@@ -2503,7 +2504,7 @@ MOVQ ba, ptrba; | |||||
MOVQ bm, i; | MOVQ bm, i; | ||||
SARQ $3, i; | SARQ $3, i; | ||||
JLE .L31_loopE; | JLE .L31_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L31_bodyB: | .L31_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -2536,7 +2537,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L311_loopE; | JLE .L311_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L311_bodyB: | .L311_bodyB: | ||||
LD_SX 0*SIZE(ptrbb), xvec2; | LD_SX 0*SIZE(ptrbb), xvec2; | ||||
SHUF_SX $0x50, xvec2, xvec3; | SHUF_SX $0x50, xvec2, xvec3; | ||||
@@ -2612,7 +2613,7 @@ ADDQ $32*SIZE, ptrba; | |||||
ADDQ $8*SIZE, ptrbb; | ADDQ $8*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L311_bodyB; | JG .L311_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L311_loopE: | .L311_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -2620,7 +2621,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L312_loopE; | JLE .L312_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L312_bodyB: | .L312_bodyB: | ||||
LD_SX 0*SIZE(ptrbb), xvec2; | LD_SX 0*SIZE(ptrbb), xvec2; | ||||
SHUF_SX $0x50, xvec2, xvec3; | SHUF_SX $0x50, xvec2, xvec3; | ||||
@@ -2666,7 +2667,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L313_loopE; | JLE .L313_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L313_bodyB: | .L313_bodyB: | ||||
LD_SX 0*SIZE(ptrbb), xvec2; | LD_SX 0*SIZE(ptrbb), xvec2; | ||||
SHUF_SX $0x50, xvec2, xvec3; | SHUF_SX $0x50, xvec2, xvec3; | ||||
@@ -2731,11 +2732,11 @@ ADDQ $8*SIZE, C0; | |||||
ADDQ $8*SIZE, C1; | ADDQ $8*SIZE, C1; | ||||
DECQ i; | DECQ i; | ||||
JG .L31_bodyB; | JG .L31_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L31_loopE: | .L31_loopE: | ||||
TEST $4, bm; | TEST $4, bm; | ||||
JLE .L32_loopE; | JLE .L32_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L32_bodyB: | .L32_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -2766,7 +2767,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L321_loopE; | JLE .L321_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L321_bodyB: | .L321_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; | LD_SX 0*SIZE(ptrba), xvec0; | ||||
LD_SX 0*SIZE(ptrbb), xvec2; | LD_SX 0*SIZE(ptrbb), xvec2; | ||||
@@ -2806,7 +2807,7 @@ ADDQ $16*SIZE, ptrba; | |||||
ADDQ $8*SIZE, ptrbb; | ADDQ $8*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L321_bodyB; | JG .L321_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L321_loopE: | .L321_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -2814,7 +2815,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L322_loopE; | JLE .L322_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L322_bodyB: | .L322_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; | LD_SX 0*SIZE(ptrba), xvec0; | ||||
LD_SX 0*SIZE(ptrbb), xvec2; | LD_SX 0*SIZE(ptrbb), xvec2; | ||||
@@ -2842,7 +2843,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L323_loopE; | JLE .L323_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L323_bodyB: | .L323_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; | LD_SX 0*SIZE(ptrba), xvec0; | ||||
LD_SX 0*SIZE(ptrbb), xvec2; | LD_SX 0*SIZE(ptrbb), xvec2; | ||||
@@ -2887,7 +2888,7 @@ ADDQ $4*SIZE, C1; | |||||
.L32_loopE: | .L32_loopE: | ||||
TEST $2, bm; | TEST $2, bm; | ||||
JLE .L33_loopE; | JLE .L33_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L33_bodyB: | .L33_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -2920,7 +2921,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L331_loopE; | JLE .L331_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L331_bodyB: | .L331_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 | LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 | ||||
EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 | EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 | ||||
@@ -2943,7 +2944,7 @@ ADDQ $8*SIZE, ptrba; | |||||
ADDQ $8*SIZE, ptrbb; | ADDQ $8*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L331_bodyB; | JG .L331_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L331_loopE: | .L331_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -2951,7 +2952,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L332_loopE; | JLE .L332_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L332_bodyB: | .L332_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 | LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 | ||||
EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 | EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 | ||||
@@ -2972,7 +2973,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L333_loopE; | JLE .L333_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L333_bodyB: | .L333_bodyB: | ||||
movss 0*SIZE(ptrba), xvec0; | movss 0*SIZE(ptrba), xvec0; | ||||
movss 1*SIZE(ptrba), xvec1; | movss 1*SIZE(ptrba), xvec1; | ||||
@@ -3031,7 +3032,7 @@ ADDQ $2*SIZE, C1; | |||||
.L33_loopE: | .L33_loopE: | ||||
TEST $1, bm; | TEST $1, bm; | ||||
JLE .L34_loopE; | JLE .L34_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L34_bodyB: | .L34_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -3062,7 +3063,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L341_loopE; | JLE .L341_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L341_bodyB: | .L341_bodyB: | ||||
movss 0*SIZE(ptrba), xvec0; | movss 0*SIZE(ptrba), xvec0; | ||||
movss 0*SIZE(ptrbb), xvec1; | movss 0*SIZE(ptrbb), xvec1; | ||||
@@ -3104,7 +3105,7 @@ addq $4*SIZE, ptrba; | |||||
addq $8*SIZE, ptrbb; | addq $8*SIZE, ptrbb; | ||||
decq k; | decq k; | ||||
jg .L341_bodyB; | jg .L341_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L341_loopE: | .L341_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -3112,7 +3113,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L342_loopE; | JLE .L342_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L342_bodyB: | .L342_bodyB: | ||||
movss 0*SIZE(ptrba), xvec0; | movss 0*SIZE(ptrba), xvec0; | ||||
movss 0*SIZE(ptrbb), xvec1; | movss 0*SIZE(ptrbb), xvec1; | ||||
@@ -3140,7 +3141,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L343_loopE; | JLE .L343_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L343_bodyB: | .L343_bodyB: | ||||
movss 0*SIZE(ptrba), xvec0; | movss 0*SIZE(ptrba), xvec0; | ||||
movss 0*SIZE(ptrbb), xvec1; | movss 0*SIZE(ptrbb), xvec1; | ||||
@@ -3189,7 +3190,7 @@ LEAQ (C, ldc, 2), C; | |||||
.L30_loopE: | .L30_loopE: | ||||
TEST $1, bn; | TEST $1, bn; | ||||
JLE .L40_loopE; | JLE .L40_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L40_bodyB: | .L40_bodyB: | ||||
#if defined(TRMMKERNEL)&&defined(LEFT) | #if defined(TRMMKERNEL)&&defined(LEFT) | ||||
MOVQ OFFSET, %rax; | MOVQ OFFSET, %rax; | ||||
@@ -3200,7 +3201,7 @@ MOVQ ba, ptrba; | |||||
MOVQ bm, i; | MOVQ bm, i; | ||||
SARQ $3, i; | SARQ $3, i; | ||||
JLE .L41_loopE; | JLE .L41_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L41_bodyB: | .L41_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -3230,7 +3231,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L411_loopE; | JLE .L411_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L411_bodyB: | .L411_bodyB: | ||||
LD_SY 0*SIZE(ptrba), yvec0; | LD_SY 0*SIZE(ptrba), yvec0; | ||||
BROAD_SY 0*SIZE(ptrbb), yvec1; | BROAD_SY 0*SIZE(ptrbb), yvec1; | ||||
@@ -3256,7 +3257,7 @@ ADDQ $32*SIZE, ptrba; | |||||
ADDQ $4*SIZE, ptrbb; | ADDQ $4*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L411_bodyB; | JG .L411_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L411_loopE: | .L411_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -3264,7 +3265,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L412_loopE; | JLE .L412_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L412_bodyB: | .L412_bodyB: | ||||
LD_SY 0*SIZE(ptrba), yvec0; | LD_SY 0*SIZE(ptrba), yvec0; | ||||
BROAD_SY 0*SIZE(ptrbb), yvec1; | BROAD_SY 0*SIZE(ptrbb), yvec1; | ||||
@@ -3285,7 +3286,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L413_loopE; | JLE .L413_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L413_bodyB: | .L413_bodyB: | ||||
LD_SY 0*SIZE(ptrba), yvec0; | LD_SY 0*SIZE(ptrba), yvec0; | ||||
BROAD_SY 0*SIZE(ptrbb), yvec1; | BROAD_SY 0*SIZE(ptrbb), yvec1; | ||||
@@ -3329,11 +3330,11 @@ ADDQ $8, kk; | |||||
ADDQ $8*SIZE, C0; | ADDQ $8*SIZE, C0; | ||||
DECQ i; | DECQ i; | ||||
JG .L41_bodyB; | JG .L41_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L41_loopE: | .L41_loopE: | ||||
TEST $4, bm; | TEST $4, bm; | ||||
JLE .L42_loopE; | JLE .L42_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L42_bodyB: | .L42_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -3362,7 +3363,7 @@ MOVQ %rax, kkk | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L421_loopE; | JLE .L421_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L421_bodyB: | .L421_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; | LD_SX 0*SIZE(ptrba), xvec0; | ||||
BROAD_SX 0*SIZE(ptrbb), xvec1; | BROAD_SX 0*SIZE(ptrbb), xvec1; | ||||
@@ -3387,7 +3388,7 @@ ADDQ $16*SIZE, ptrba; | |||||
ADDQ $4*SIZE, ptrbb; | ADDQ $4*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L421_bodyB; | JG .L421_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L421_loopE: | .L421_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -3395,7 +3396,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L422_loopE; | JLE .L422_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L422_bodyB: | .L422_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; | LD_SX 0*SIZE(ptrba), xvec0; | ||||
BROAD_SX 0*SIZE(ptrbb), xvec1; | BROAD_SX 0*SIZE(ptrbb), xvec1; | ||||
@@ -3416,7 +3417,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L423_loopE; | JLE .L423_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L423_bodyB: | .L423_bodyB: | ||||
LD_SX 0*SIZE(ptrba), xvec0; | LD_SX 0*SIZE(ptrba), xvec0; | ||||
BROAD_SX 0*SIZE(ptrbb), xvec1; | BROAD_SX 0*SIZE(ptrbb), xvec1; | ||||
@@ -3451,7 +3452,7 @@ ADDQ $4*SIZE, C0; | |||||
.L42_loopE: | .L42_loopE: | ||||
TEST $2, bm; | TEST $2, bm; | ||||
JLE .L43_loopE; | JLE .L43_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L43_bodyB: | .L43_bodyB: | ||||
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -3481,7 +3482,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L431_loopE; | JLE .L431_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L431_bodyB: | .L431_bodyB: | ||||
movss 0*SIZE(ptrba), xvec0; | movss 0*SIZE(ptrba), xvec0; | ||||
movss 1*SIZE(ptrba), xvec1; | movss 1*SIZE(ptrba), xvec1; | ||||
@@ -3518,7 +3519,7 @@ addq $8*SIZE, ptrba; | |||||
addq $4*SIZE, ptrbb; | addq $4*SIZE, ptrbb; | ||||
decq k; | decq k; | ||||
JG .L431_bodyB; | JG .L431_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L431_loopE: | .L431_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -3526,7 +3527,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L432_loopE; | JLE .L432_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L432_bodyB: | .L432_bodyB: | ||||
movss 0*SIZE(ptrba), xvec0; | movss 0*SIZE(ptrba), xvec0; | ||||
movss 1*SIZE(ptrba), xvec1; | movss 1*SIZE(ptrba), xvec1; | ||||
@@ -3553,7 +3554,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L433_loopE; | JLE .L433_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L433_bodyB: | .L433_bodyB: | ||||
movss 0*SIZE(ptrba), xvec0; | movss 0*SIZE(ptrba), xvec0; | ||||
movss 1*SIZE(ptrba), xvec1; | movss 1*SIZE(ptrba), xvec1; | ||||
@@ -3592,7 +3593,7 @@ addq $2*SIZE, C0; | |||||
.L43_loopE: | .L43_loopE: | ||||
TEST $1, bm; | TEST $1, bm; | ||||
JLE .L44_loopE; | JLE .L44_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L44_bodyB: | .L44_bodyB: | ||||
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) | ||||
MOVQ bb, ptrbb; | MOVQ bb, ptrbb; | ||||
@@ -3621,7 +3622,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L441_loopE; | JLE .L441_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L441_bodyB: | .L441_bodyB: | ||||
movss 0*SIZE(ptrba), xvec0; | movss 0*SIZE(ptrba), xvec0; | ||||
movss 0*SIZE(ptrbb), xvec1; | movss 0*SIZE(ptrbb), xvec1; | ||||
@@ -3646,7 +3647,7 @@ addq $4*SIZE, ptrba; | |||||
addq $4*SIZE, ptrbb; | addq $4*SIZE, ptrbb; | ||||
decq k; | decq k; | ||||
JG .L441_bodyB; | JG .L441_bodyB; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L441_loopE: | .L441_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -3654,7 +3655,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L442_loopE; | JLE .L442_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L442_bodyB: | .L442_bodyB: | ||||
movss 0*SIZE(ptrba), xvec0; | movss 0*SIZE(ptrba), xvec0; | ||||
movss 0*SIZE(ptrbb), xvec1; | movss 0*SIZE(ptrbb), xvec1; | ||||
@@ -3675,7 +3676,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L443_loopE; | JLE .L443_loopE; | ||||
.align 16 | |||||
ALIGN_4 | |||||
.L443_bodyB: | .L443_bodyB: | ||||
movss 0*SIZE(ptrba), xvec0; | movss 0*SIZE(ptrba), xvec0; | ||||
movss 0*SIZE(ptrbb), xvec1; | movss 0*SIZE(ptrbb), xvec1; | ||||
@@ -145,6 +145,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define JMP jmp | #define JMP jmp | ||||
#define NOP | #define NOP | ||||
#define XOR xorpd | #define XOR xorpd | ||||
#undef MOVQ | |||||
#define MOVQ movq | |||||
#define XOR_SY vxorps | #define XOR_SY vxorps | ||||
#define XOR_DY vxorpd | #define XOR_DY vxorpd | ||||
@@ -297,7 +299,7 @@ movq %r11, kk; | |||||
MOVQ bn,j; | MOVQ bn,j; | ||||
SARQ $2,j; # Rn = 4 | SARQ $2,j; # Rn = 4 | ||||
JLE .L0_loopE; | JLE .L0_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L0_bodyB:; | .L0_bodyB:; | ||||
#if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
MOVQ OFFSET, %rax; | MOVQ OFFSET, %rax; | ||||
@@ -312,7 +314,7 @@ MOVQ ba,ptrba; | |||||
MOVQ bm,i; | MOVQ bm,i; | ||||
SARQ $2,i; # Rm = 4 | SARQ $2,i; # Rm = 4 | ||||
JLE .L1_loopE; | JLE .L1_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L1_bodyB:; | .L1_bodyB:; | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -361,7 +363,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2,k; # Unroll 4 times | SARQ $2,k; # Unroll 4 times | ||||
JLE .L2_loopE; | JLE .L2_loopE; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L2_bodyB:; | .L2_bodyB:; | ||||
#### Computing kernel #### | #### Computing kernel #### | ||||
@@ -584,7 +586,7 @@ ADD2_DY yvec6, yvec12, yvec12; | |||||
ADD2_DY yvec7, yvec8, yvec8; | ADD2_DY yvec7, yvec8, yvec8; | ||||
DECQ k; | DECQ k; | ||||
JG .L2_bodyB; | JG .L2_bodyB; | ||||
.align 64; | |||||
ALIGN_5 | |||||
.L2_loopE:; | .L2_loopE:; | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -592,7 +594,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L3_loopE; | JLE .L3_loopE; | ||||
.align 64 | |||||
ALIGN_5 | |||||
.L3_bodyB: | .L3_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
LD_DY 4*SIZE(ptrba), yvec1; | LD_DY 4*SIZE(ptrba), yvec1; | ||||
@@ -710,7 +712,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L4_loopE; | JLE .L4_loopE; | ||||
.align 64 | |||||
ALIGN_5 | |||||
.L4_loopB:; | .L4_loopB:; | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
PREFETCH0 PRESIZE*SIZE(ptrba); | PREFETCH0 PRESIZE*SIZE(ptrba); | ||||
@@ -852,7 +854,7 @@ MOVQ C0, %rax; | |||||
OR ldc, %rax; | OR ldc, %rax; | ||||
TEST $15, %rax; | TEST $15, %rax; | ||||
JNE .L4_loopEx; | JNE .L4_loopEx; | ||||
.align 32 | |||||
ALIGN_5 | |||||
#### Store Back #### | #### Store Back #### | ||||
EXTRA_DY $1,yvec15,xvec7; | EXTRA_DY $1,yvec15,xvec7; | ||||
EXTRA_DY $1,yvec14,xvec6; | EXTRA_DY $1,yvec14,xvec6; | ||||
@@ -912,7 +914,7 @@ ADDQ $8*SIZE,C1; | |||||
DECQ i; | DECQ i; | ||||
JG .L1_bodyB; | JG .L1_bodyB; | ||||
JMP .L1_loopE; | JMP .L1_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L4_loopEx: | .L4_loopEx: | ||||
EXTRA_DY $1, yvec15, xvec7; | EXTRA_DY $1, yvec15, xvec7; | ||||
EXTRA_DY $1, yvec14, xvec6; | EXTRA_DY $1, yvec14, xvec6; | ||||
@@ -1024,11 +1026,11 @@ ADDQ $8*SIZE, C0; | |||||
ADDQ $8*SIZE, C1; | ADDQ $8*SIZE, C1; | ||||
DECQ i; | DECQ i; | ||||
JG .L1_bodyB; | JG .L1_bodyB; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L1_loopE:; | .L1_loopE:; | ||||
TEST $2, bm; | TEST $2, bm; | ||||
JLE .L5_loopE; | JLE .L5_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L5_bodyB: | .L5_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -1060,7 +1062,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L7_loopE; | JLE .L7_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L7_bodyB: | .L7_bodyB: | ||||
#### Compute kernel #### | #### Compute kernel #### | ||||
#### Unroll times 1 #### | #### Unroll times 1 #### | ||||
@@ -1194,7 +1196,7 @@ ADD2_DY yvec7, yvec12, yvec12; | |||||
ADDQ $32*SIZE, ptrbb; | ADDQ $32*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L7_bodyB; | JG .L7_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L7_loopE: | .L7_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -1202,7 +1204,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L8_loopE; | JLE .L8_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L8_bodyB: | .L8_bodyB: | ||||
#### Unroll times 1 #### | #### Unroll times 1 #### | ||||
LD_DY 0*SIZE(ptrba), yvec0; | LD_DY 0*SIZE(ptrba), yvec0; | ||||
@@ -1276,7 +1278,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L9_loopE; | JLE .L9_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L9_bodyB: | .L9_bodyB: | ||||
#### Unroll times 1 #### | #### Unroll times 1 #### | ||||
LD_DY 0*SIZE(ptrba), yvec0; | LD_DY 0*SIZE(ptrba), yvec0; | ||||
@@ -1364,7 +1366,7 @@ MOVQ C0, %rax; | |||||
OR ldc, %rax; | OR ldc, %rax; | ||||
TEST $15, %rax; | TEST $15, %rax; | ||||
JNE .L9_loopEx; | JNE .L9_loopEx; | ||||
.align 32 | |||||
ALIGN_5 | |||||
#### Writing back #### | #### Writing back #### | ||||
EXTRA_DY $1, yvec15, xvec7; | EXTRA_DY $1, yvec15, xvec7; | ||||
EXTRA_DY $1, yvec14, xvec6; | EXTRA_DY $1, yvec14, xvec6; | ||||
@@ -1401,7 +1403,7 @@ ADDQ $2, kk; | |||||
ADDQ $4*SIZE, C0; | ADDQ $4*SIZE, C0; | ||||
ADDQ $4*SIZE, C1; | ADDQ $4*SIZE, C1; | ||||
JMP .L5_loopE; | JMP .L5_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L9_loopEx: | .L9_loopEx: | ||||
EXTRA_DY $1, yvec15, xvec7; | EXTRA_DY $1, yvec15, xvec7; | ||||
EXTRA_DY $1, yvec14, xvec6; | EXTRA_DY $1, yvec14, xvec6; | ||||
@@ -1466,7 +1468,7 @@ ADDQ $4*SIZE, C1; | |||||
.L5_loopE: | .L5_loopE: | ||||
TEST $1, bm; | TEST $1, bm; | ||||
JLE .L6_loopE; | JLE .L6_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L6_bodyB: | .L6_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -1496,7 +1498,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L10_loopE; | JLE .L10_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L10_bodyB: | .L10_bodyB: | ||||
LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | ||||
EDUP_DY 0*SIZE(ptrbb), yvec2; | EDUP_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -1570,7 +1572,7 @@ ADDQ $8*SIZE, ptrba; | |||||
ADDQ $32*SIZE, ptrbb; | ADDQ $32*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L10_bodyB; | JG .L10_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L10_loopE: | .L10_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -1578,7 +1580,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L11_loopE; | JLE .L11_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L11_bodyB: | .L11_bodyB: | ||||
LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | ||||
EDUP_DY 0*SIZE(ptrbb), yvec2; | EDUP_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -1624,7 +1626,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L12_loopE; | JLE .L12_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L12_bodyB: | .L12_bodyB: | ||||
LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | ||||
EDUP_DY 0*SIZE(ptrbb), yvec2; | EDUP_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -1722,11 +1724,11 @@ LEAQ (C,ldc,4),C; | |||||
.L0_bodyE:; | .L0_bodyE:; | ||||
DECQ j; | DECQ j; | ||||
JG .L0_bodyB; | JG .L0_bodyB; | ||||
.align 32; | |||||
ALIGN_5; | |||||
.L0_loopE:; | .L0_loopE:; | ||||
TEST $2, bn; | TEST $2, bn; | ||||
JLE .L20_loopE; | JLE .L20_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L20_bodyB: | .L20_bodyB: | ||||
#if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
MOVQ OFFSET, %rax; | MOVQ OFFSET, %rax; | ||||
@@ -1738,7 +1740,7 @@ MOVQ ba, ptrba; | |||||
MOVQ bm, i; | MOVQ bm, i; | ||||
SARQ $2, i; | SARQ $2, i; | ||||
JLE .L21_loopE; | JLE .L21_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L21_bodyB: | .L21_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -1770,7 +1772,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L211_loopE; | JLE .L211_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L211_bodyB: | .L211_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
EDUP_DY 0*SIZE(ptrbb), yvec2; | EDUP_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -1891,7 +1893,7 @@ ADD2_DY yvec7, yvec12, yvec12; | |||||
ADDQ $32*SIZE, ptrba; | ADDQ $32*SIZE, ptrba; | ||||
DECQ k; | DECQ k; | ||||
JG .L211_bodyB; | JG .L211_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L211_loopE: | .L211_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -1899,7 +1901,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L212_loopE; | JLE .L212_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L212_bodyB: | .L212_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
EDUP_DY 0*SIZE(ptrbb), yvec2; | EDUP_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -1969,7 +1971,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L213_loopE; | JLE .L213_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L213_bodyB: | .L213_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
EDUP_DY 0*SIZE(ptrbb), yvec2; | EDUP_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -2058,7 +2060,7 @@ MOVQ C0, %rax; | |||||
OR ldc, %rax; | OR ldc, %rax; | ||||
TEST $15, %rax; | TEST $15, %rax; | ||||
JNE .L213_loopEx; | JNE .L213_loopEx; | ||||
.align 32 | |||||
ALIGN_5 | |||||
#### Writing back #### | #### Writing back #### | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
ADD_DX 0*SIZE(C0),xvec15; | ADD_DX 0*SIZE(C0),xvec15; | ||||
@@ -2093,7 +2095,7 @@ ADDQ $8*SIZE, C1; | |||||
DECQ i; | DECQ i; | ||||
JG .L21_bodyB; | JG .L21_bodyB; | ||||
JMP .L21_loopE; | JMP .L21_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L213_loopEx: | .L213_loopEx: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
LDL_DX 0*SIZE(C0), xvec0; | LDL_DX 0*SIZE(C0), xvec0; | ||||
@@ -2153,11 +2155,11 @@ ADDQ $8*SIZE, C0; | |||||
ADDQ $8*SIZE, C1; | ADDQ $8*SIZE, C1; | ||||
DECQ i; | DECQ i; | ||||
JG .L21_bodyB; | JG .L21_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L21_loopE: | .L21_loopE: | ||||
TEST $2, bm; | TEST $2, bm; | ||||
JLE .L22_loopE; | JLE .L22_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L22_bodyB: | .L22_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -2187,7 +2189,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L221_loopE; | JLE .L221_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L221_bodyB: | .L221_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
EDUP_DY 0*SIZE(ptrbb), yvec2; | EDUP_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -2268,7 +2270,7 @@ ADD2_DY yvec6, yvec13, yvec13; | |||||
ADDQ $16*SIZE, ptrba; | ADDQ $16*SIZE, ptrba; | ||||
DECQ k; | DECQ k; | ||||
JG .L221_bodyB; | JG .L221_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L221_loopE: | .L221_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -2276,7 +2278,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L222_loopE; | JLE .L222_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L222_bodyB: | .L222_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
EDUP_DY 0*SIZE(ptrbb), yvec2; | EDUP_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -2325,7 +2327,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L223_loopE; | JLE .L223_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L223_bodyB: | .L223_bodyB: | ||||
#### Unroll time 1 #### | #### Unroll time 1 #### | ||||
EDUP_DY 0*SIZE(ptrbb), yvec2; | EDUP_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -2419,7 +2421,7 @@ ADDQ $4*SIZE, C1; | |||||
.L22_loopE: | .L22_loopE: | ||||
TEST $1, bm; | TEST $1, bm; | ||||
JLE .L23_loopE; | JLE .L23_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L23_bodyB: | .L23_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -2448,7 +2450,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L231_loopE; | JLE .L231_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L231_bodyB: | .L231_bodyB: | ||||
LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | ||||
EDUP_DY 0*SIZE(ptrbb), yvec2; | EDUP_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -2498,7 +2500,7 @@ ADDQ $8*SIZE, ptrba; | |||||
ADDQ $16*SIZE, ptrbb; | ADDQ $16*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L231_bodyB; | JG .L231_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L231_loopE: | .L231_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -2506,7 +2508,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L232_loopE; | JLE .L232_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L232_bodyB: | .L232_bodyB: | ||||
LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | ||||
EDUP_DY 0*SIZE(ptrbb), yvec2; | EDUP_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -2540,7 +2542,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L233_loopE; | JLE .L233_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L233_bodyB: | .L233_bodyB: | ||||
LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i | ||||
EDUP_DY 0*SIZE(ptrbb), yvec2; | EDUP_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -2614,7 +2616,7 @@ LEAQ (C, ldc, 2), C; | |||||
.L20_loopE: | .L20_loopE: | ||||
TEST $1, bn; | TEST $1, bn; | ||||
JLE .L30_loopE; | JLE .L30_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L30_bodyB: | .L30_bodyB: | ||||
#if defined(TRMMKERNEL) && defined(LEFT) | #if defined(TRMMKERNEL) && defined(LEFT) | ||||
MOVQ OFFSET, %rax; | MOVQ OFFSET, %rax; | ||||
@@ -2625,7 +2627,7 @@ MOVQ C, C0; | |||||
MOVQ bm, i; | MOVQ bm, i; | ||||
SARQ $2, i; | SARQ $2, i; | ||||
JLE .L31_loopE; | JLE .L31_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L31_bodyB: | .L31_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -2655,7 +2657,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L311_loopE; | JLE .L311_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L311_bodyB: | .L311_bodyB: | ||||
LD_DY 0*SIZE(ptrba), yvec0; | LD_DY 0*SIZE(ptrba), yvec0; | ||||
BROAD_DY 0*SIZE(ptrbb), yvec2; | BROAD_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -2732,7 +2734,7 @@ ADDQ $32*SIZE, ptrba; | |||||
ADDQ $8*SIZE, ptrbb; | ADDQ $8*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L311_bodyB; | JG .L311_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L311_loopE: | .L311_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -2740,7 +2742,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L312_loopE; | JLE .L312_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L312_bodyB: | .L312_bodyB: | ||||
LD_DY 0*SIZE(ptrba), yvec0; | LD_DY 0*SIZE(ptrba), yvec0; | ||||
BROAD_DY 0*SIZE(ptrbb), yvec2; | BROAD_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -2787,7 +2789,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L313_loopE; | JLE .L313_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L313_bodyB: | .L313_bodyB: | ||||
LD_DY 0*SIZE(ptrba), yvec0; | LD_DY 0*SIZE(ptrba), yvec0; | ||||
BROAD_DY 0*SIZE(ptrbb), yvec2; | BROAD_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -2877,11 +2879,11 @@ ADDQ $4, kk; | |||||
ADDQ $8*SIZE, C0; | ADDQ $8*SIZE, C0; | ||||
DECQ i; | DECQ i; | ||||
JG .L31_bodyB; | JG .L31_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L31_loopE: | .L31_loopE: | ||||
TEST $2, bm; | TEST $2, bm; | ||||
JLE .L32_loopE; | JLE .L32_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L32_bodyB: | .L32_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -2910,7 +2912,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L321_loopE; | JLE .L321_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L321_bodyB: | .L321_bodyB: | ||||
LD_DY 0*SIZE(ptrba), yvec0; | LD_DY 0*SIZE(ptrba), yvec0; | ||||
BROAD_DY 0*SIZE(ptrbb), yvec2; | BROAD_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -2951,7 +2953,7 @@ ADDQ $16*SIZE, ptrba; | |||||
ADDQ $8*SIZE, ptrbb; | ADDQ $8*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L321_bodyB; | JG .L321_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L321_loopE: | .L321_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -2959,7 +2961,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L322_loopE; | JLE .L322_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L322_bodyB: | .L322_bodyB: | ||||
LD_DY 0*SIZE(ptrba), yvec0; | LD_DY 0*SIZE(ptrba), yvec0; | ||||
BROAD_DY 0*SIZE(ptrbb), yvec2; | BROAD_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -2988,7 +2990,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L323_loopE; | JLE .L323_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L323_bodyB: | .L323_bodyB: | ||||
LD_DY 0*SIZE(ptrba), yvec0; | LD_DY 0*SIZE(ptrba), yvec0; | ||||
BROAD_DY 0*SIZE(ptrbb), yvec2; | BROAD_DY 0*SIZE(ptrbb), yvec2; | ||||
@@ -3049,7 +3051,7 @@ ADDQ $4*SIZE, C0; | |||||
.L32_loopE: | .L32_loopE: | ||||
TEST $1, bm; | TEST $1, bm; | ||||
JLE .L33_loopE; | JLE .L33_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L33_bodyB: | .L33_bodyB: | ||||
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) | ||||
MOVQ bb,ptrbb; | MOVQ bb,ptrbb; | ||||
@@ -3078,7 +3080,7 @@ MOVQ %rax, kkk; | |||||
#endif | #endif | ||||
SARQ $2, k; | SARQ $2, k; | ||||
JLE .L331_loopE; | JLE .L331_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L331_bodyB: | .L331_bodyB: | ||||
LD_DX 0*SIZE(ptrba), xvec0; | LD_DX 0*SIZE(ptrba), xvec0; | ||||
BROAD_DX 0*SIZE(ptrbb), xvec2; | BROAD_DX 0*SIZE(ptrbb), xvec2; | ||||
@@ -3123,7 +3125,7 @@ ADDQ $8*SIZE, ptrba; | |||||
ADDQ $8*SIZE, ptrbb; | ADDQ $8*SIZE, ptrbb; | ||||
DECQ k; | DECQ k; | ||||
JG .L331_bodyB; | JG .L331_bodyB; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L331_loopE: | .L331_loopE: | ||||
#ifndef TRMMKERNEL | #ifndef TRMMKERNEL | ||||
TEST $2, bk; | TEST $2, bk; | ||||
@@ -3131,7 +3133,7 @@ TEST $2, bk; | |||||
TEST $2, kkk; | TEST $2, kkk; | ||||
#endif | #endif | ||||
JLE .L332_loopE; | JLE .L332_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L332_bodyB: | .L332_bodyB: | ||||
LD_DX 0*SIZE(ptrba), xvec0; | LD_DX 0*SIZE(ptrba), xvec0; | ||||
BROAD_DX 0*SIZE(ptrbb), xvec2; | BROAD_DX 0*SIZE(ptrbb), xvec2; | ||||
@@ -3162,7 +3164,7 @@ TEST $1, bk; | |||||
TEST $1, kkk; | TEST $1, kkk; | ||||
#endif | #endif | ||||
JLE .L333_loopE; | JLE .L333_loopE; | ||||
.align 32 | |||||
ALIGN_5 | |||||
.L333_bodyB: | .L333_bodyB: | ||||
LD_DX 0*SIZE(ptrba), xvec0; | LD_DX 0*SIZE(ptrba), xvec0; | ||||
BROAD_DX 0*SIZE(ptrbb), xvec2; | BROAD_DX 0*SIZE(ptrbb), xvec2; | ||||