Browse Source

Refs #154. Fixed a segmentation fault in dgemv_t when m is very large.

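With a very large m, dgemv_t overflowed its internal buffer. Thus, we now split vector x into blocks (at most 2^22 elements per block) and process them one block at a time when m is very large.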

Thanks to @wangqian for this patch.
tags/v0.2.5
Zhang Xianyi 13 years ago
parent
commit
5f0117385e
1 changed file with 47 additions and 22 deletions
  1. +47
    -22
      kernel/x86_64/dgemv_t.S

+ 47
- 22
kernel/x86_64/dgemv_t.S View File

@@ -47,7 +47,7 @@


#ifndef WINDOWS_ABI #ifndef WINDOWS_ABI


#define STACKSIZE 64
#define STACKSIZE 128
#define OLD_M %rdi #define OLD_M %rdi
#define OLD_N %rsi #define OLD_N %rsi
@@ -57,7 +57,10 @@
#define STACK_Y 16 + STACKSIZE(%rsp) #define STACK_Y 16 + STACKSIZE(%rsp)
#define STACK_INCY 24 + STACKSIZE(%rsp) #define STACK_INCY 24 + STACKSIZE(%rsp)
#define STACK_BUFFER 32 + STACKSIZE(%rsp) #define STACK_BUFFER 32 + STACKSIZE(%rsp)

#define MMM 56(%rsp)
#define NN 64(%rsp)
#define AA 72(%rsp)
#define LDAX 80(%rsp)
#else #else


#define STACKSIZE 256 #define STACKSIZE 256
@@ -132,12 +135,44 @@
movq OLD_LDA, LDA movq OLD_LDA, LDA
movq OLD_X, X movq OLD_X, X
#else #else
movq OLD_M, M
movq OLD_N, N
movq OLD_A, A
movq OLD_LDA, LDA
movq OLD_M, MMM
movq OLD_N, NN
movq OLD_A, AA
movq OLD_LDA, LDAX
#endif
#ifdef HAVE_SSE3
#ifndef WINDOWS_ABI
movddup %xmm0, ALPHA
#else
movddup %xmm3, ALPHA
#endif #endif
#else
#ifndef WINDOWS_ABI
movapd %xmm0, ALPHA
#else
movapd %xmm3, ALPHA
#endif
unpcklpd ALPHA, ALPHA
#endif




.L0x:
xorq M,M
addq $1,M
salq $22,M
subq M,MMM
jge .L00

movq MMM,%rax
addq M,%rax
jle .L999x
movq %rax,M

.L00:
movq LDAX,LDA
movq NN,N
movq AA,A
movq STACK_INCX, INCX movq STACK_INCX, INCX
movq STACK_Y, Y movq STACK_Y, Y
movq STACK_INCY, INCY movq STACK_INCY, INCY
@@ -153,21 +188,6 @@


subq $-16 * SIZE, A subq $-16 * SIZE, A


#ifdef HAVE_SSE3
#ifndef WINDOWS_ABI
movddup %xmm0, ALPHA
#else
movddup %xmm3, ALPHA
#endif
#else
#ifndef WINDOWS_ABI
movapd %xmm0, ALPHA
#else
movapd %xmm3, ALPHA
#endif
unpcklpd ALPHA, ALPHA
#endif

testq M, M testq M, M
jle .L999 jle .L999
testq N, N testq N, N
@@ -854,7 +874,6 @@


.L21: .L21:
#endif #endif

subq $4, N subq $4, N


leaq 16 * SIZE(BUFFER), X1 leaq 16 * SIZE(BUFFER), X1
@@ -2461,6 +2480,12 @@
ALIGN_4 ALIGN_4


.L999: .L999:
leaq (, M, SIZE), %rax
addq %rax,AA
jmp .L0x;
ALIGN_4

.L999x:
movq 0(%rsp), %rbx movq 0(%rsp), %rbx
movq 8(%rsp), %rbp movq 8(%rsp), %rbp
movq 16(%rsp), %r12 movq 16(%rsp), %r12


Loading…
Cancel
Save