Browse Source

Fixed #7. Modified the axpy kernel code to avoid the unrolled loop when incx==0 or incy==0 on 32-bit x86.

tags/v0.1alpha1
Xianyi 14 years ago
parent
commit
12214e1d0f
4 changed files with 61 additions and 0 deletions
  1. +6
    -0
      kernel/x86/axpy_sse.S
  2. +6
    -0
      kernel/x86/axpy_sse2.S
  3. +38
    -0
      kernel/x86/zaxpy_sse.S
  4. +11
    -0
      kernel/x86/zaxpy_sse2.S

+ 6
- 0
kernel/x86/axpy_sse.S View File

@@ -1440,6 +1440,12 @@
.L50:
movl M, %eax
movl Y, YY
//If incx==0 || incy==0, avoid the unrolled loop.
cmpl $0, INCX
je .L56
cmpl $0, INCY
je .L56

sarl $3, %eax
jle .L55
ALIGN_3


+ 6
- 0
kernel/x86/axpy_sse2.S View File

@@ -698,6 +698,12 @@
.L40:
movl Y, YY
movl M, %eax
//If incx==0 || incy==0, avoid the unrolled loop.
cmpl $0, INCX
je .L46
cmpl $0, INCY
je .L46

sarl $3, %eax
jle .L45
ALIGN_3


+ 38
- 0
kernel/x86/zaxpy_sse.S View File

@@ -2857,6 +2857,11 @@
unpcklps ALPHA_I, ALPHA_R
unpcklps %xmm5, ALPHA_I
#endif
//If incx==0 || incy==0, avoid the unrolled loop and jump to the scalar loop.
cmpl $0, INCX
je .L200
cmpl $0, INCY
je .L200

movl Y, YY

@@ -3090,8 +3095,41 @@
addps %xmm1, %xmm4

movsd %xmm4, (Y)
jmp .L999
ALIGN_3

.L200:
movl M, %eax
cmpl $0, %eax
jle .L999
ALIGN_3

.L201:
movsd (X), %xmm0

#ifdef HAVE_SSE3
movshdup %xmm0, %xmm1
movsldup %xmm0, %xmm0
#else
movaps %xmm0, %xmm1
shufps $0xa0, %xmm0, %xmm0
shufps $0xf5, %xmm1, %xmm1
#endif

mulps ALPHA_R, %xmm0
mulps ALPHA_I, %xmm1

movsd (Y), %xmm4

addps %xmm0, %xmm4
addps %xmm1, %xmm4

movsd %xmm4, (Y)
decl %eax
jg .L201

ALIGN_3
.L999:
popl %ebp
popl %ebx


+ 11
- 0
kernel/x86/zaxpy_sse2.S View File

@@ -1318,6 +1318,12 @@

movl Y, YY
movl M, %eax
//If incx==0 || incy==0, avoid the unrolled loop and jump to the scalar loop.
cmpl $0, INCX
je .L58
cmpl $0, INCY
je .L58

sarl $2, %eax
jle .L55

@@ -1498,6 +1504,7 @@
andl $1, %eax
jle .L999

.L58:
MOVDDUP( 0 * SIZE, X, %xmm0)
MOVDDUP( 1 * SIZE, X, %xmm1)

@@ -1510,6 +1517,10 @@

movlpd %xmm4, 0 * SIZE(YY)
movhpd %xmm4, 1 * SIZE(YY)

decl %eax
jg .L58
ALIGN_3

.L999:


Loading…
Cancel
Save