|
@@ -47,7 +47,7 @@ |
|
|
|
|
|
|
|
|
#ifndef WINDOWS_ABI |
|
|
#ifndef WINDOWS_ABI |
|
|
|
|
|
|
|
|
#define STACKSIZE 64 |
|
|
|
|
|
|
|
|
#define STACKSIZE 128 |
|
|
|
|
|
|
|
|
#define OLD_M %rdi |
|
|
#define OLD_M %rdi |
|
|
#define OLD_N %rsi |
|
|
#define OLD_N %rsi |
|
@@ -57,7 +57,10 @@ |
|
|
#define STACK_Y 16 + STACKSIZE(%rsp) |
|
|
#define STACK_Y 16 + STACKSIZE(%rsp) |
|
|
#define STACK_INCY 24 + STACKSIZE(%rsp) |
|
|
#define STACK_INCY 24 + STACKSIZE(%rsp) |
|
|
#define STACK_BUFFER 32 + STACKSIZE(%rsp) |
|
|
#define STACK_BUFFER 32 + STACKSIZE(%rsp) |
|
|
|
|
|
|
|
|
|
|
|
#define MMM 56(%rsp) |
|
|
|
|
|
#define NN 64(%rsp) |
|
|
|
|
|
#define AA 72(%rsp) |
|
|
|
|
|
#define LDAX 80(%rsp) |
|
|
#else |
|
|
#else |
|
|
|
|
|
|
|
|
#define STACKSIZE 256 |
|
|
#define STACKSIZE 256 |
|
@@ -132,12 +135,44 @@ |
|
|
movq OLD_LDA, LDA |
|
|
movq OLD_LDA, LDA |
|
|
movq OLD_X, X |
|
|
movq OLD_X, X |
|
|
#else |
|
|
#else |
|
|
movq OLD_M, M |
|
|
|
|
|
movq OLD_N, N |
|
|
|
|
|
movq OLD_A, A |
|
|
|
|
|
movq OLD_LDA, LDA |
|
|
|
|
|
|
|
|
movq OLD_M, MMM |
|
|
|
|
|
movq OLD_N, NN |
|
|
|
|
|
movq OLD_A, AA |
|
|
|
|
|
movq OLD_LDA, LDAX |
|
|
|
|
|
#endif |
|
|
|
|
|
#ifdef HAVE_SSE3 |
|
|
|
|
|
#ifndef WINDOWS_ABI |
|
|
|
|
|
movddup %xmm0, ALPHA |
|
|
|
|
|
#else |
|
|
|
|
|
movddup %xmm3, ALPHA |
|
|
#endif |
|
|
#endif |
|
|
|
|
|
#else |
|
|
|
|
|
#ifndef WINDOWS_ABI |
|
|
|
|
|
movapd %xmm0, ALPHA |
|
|
|
|
|
#else |
|
|
|
|
|
movapd %xmm3, ALPHA |
|
|
|
|
|
#endif |
|
|
|
|
|
unpcklpd ALPHA, ALPHA |
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.L0x: |
|
|
|
|
|
xorq M,M |
|
|
|
|
|
addq $1,M |
|
|
|
|
|
salq $22,M |
|
|
|
|
|
subq M,MMM |
|
|
|
|
|
jge .L00 |
|
|
|
|
|
|
|
|
|
|
|
movq MMM,%rax |
|
|
|
|
|
addq M,%rax |
|
|
|
|
|
jle .L999x |
|
|
|
|
|
movq %rax,M |
|
|
|
|
|
|
|
|
|
|
|
.L00: |
|
|
|
|
|
movq LDAX,LDA |
|
|
|
|
|
movq NN,N |
|
|
|
|
|
movq AA,A |
|
|
movq STACK_INCX, INCX |
|
|
movq STACK_INCX, INCX |
|
|
movq STACK_Y, Y |
|
|
movq STACK_Y, Y |
|
|
movq STACK_INCY, INCY |
|
|
movq STACK_INCY, INCY |
|
@@ -153,21 +188,6 @@ |
|
|
|
|
|
|
|
|
subq $-16 * SIZE, A |
|
|
subq $-16 * SIZE, A |
|
|
|
|
|
|
|
|
#ifdef HAVE_SSE3 |
|
|
|
|
|
#ifndef WINDOWS_ABI |
|
|
|
|
|
movddup %xmm0, ALPHA |
|
|
|
|
|
#else |
|
|
|
|
|
movddup %xmm3, ALPHA |
|
|
|
|
|
#endif |
|
|
|
|
|
#else |
|
|
|
|
|
#ifndef WINDOWS_ABI |
|
|
|
|
|
movapd %xmm0, ALPHA |
|
|
|
|
|
#else |
|
|
|
|
|
movapd %xmm3, ALPHA |
|
|
|
|
|
#endif |
|
|
|
|
|
unpcklpd ALPHA, ALPHA |
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
|
|
|
|
testq M, M |
|
|
testq M, M |
|
|
jle .L999 |
|
|
jle .L999 |
|
|
testq N, N |
|
|
testq N, N |
|
@@ -854,7 +874,6 @@ |
|
|
|
|
|
|
|
|
.L21: |
|
|
.L21: |
|
|
#endif |
|
|
#endif |
|
|
|
|
|
|
|
|
subq $4, N |
|
|
subq $4, N |
|
|
|
|
|
|
|
|
leaq 16 * SIZE(BUFFER), X1 |
|
|
leaq 16 * SIZE(BUFFER), X1 |
|
@@ -2461,6 +2480,12 @@ |
|
|
ALIGN_4 |
|
|
ALIGN_4 |
|
|
|
|
|
|
|
|
.L999: |
|
|
.L999: |
|
|
|
|
|
leaq (, M, SIZE), %rax |
|
|
|
|
|
addq %rax,AA |
|
|
|
|
|
jmp .L0x; |
|
|
|
|
|
ALIGN_4 |
|
|
|
|
|
|
|
|
|
|
|
.L999x: |
|
|
movq 0(%rsp), %rbx |
|
|
movq 0(%rsp), %rbx |
|
|
movq 8(%rsp), %rbp |
|
|
movq 8(%rsp), %rbp |
|
|
movq 16(%rsp), %r12 |
|
|
movq 16(%rsp), %r12 |
|
|