Browse Source

Fixed #7: 1) disable multi-threading and 2) modified the kernel code to skip the unrolled loop in the axpy functions when incx==0 or incy==0.

tags/v0.1alpha1
Xianyi Zhang 14 years ago
parent
commit
0cfd29a819
6 changed files with 72 additions and 0 deletions
  1. +5
    -0
      interface/axpy.c
  2. +5
    -0
      interface/zaxpy.c
  3. +6
    -0
      kernel/x86_64/axpy_sse.S
  4. +6
    -0
      kernel/x86_64/axpy_sse2.S
  5. +40
    -0
      kernel/x86_64/zaxpy_sse.S
  6. +10
    -0
      kernel/x86_64/zaxpy_sse2.S

+ 5
- 0
interface/axpy.c View File

@@ -81,6 +81,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
#ifdef SMP
nthreads = num_cpu_avail(1);

//Disable multi-threading when incx==0 or incy==0.
//In that case, the threads would all read/write the same element and be interdependent.
if (incx == 0 || incy == 0)
nthreads = 1;

if (nthreads == 1) {
#endif



+ 5
- 0
interface/zaxpy.c View File

@@ -83,6 +83,11 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
#ifdef SMP
nthreads = num_cpu_avail(1);

//Disable multi-threading when incx==0 or incy==0.
//In that case, the threads would all read/write the same element and be interdependent.
if (incx == 0 || incy == 0)
nthreads = 1;

if (nthreads == 1) {
#endif



+ 6
- 0
kernel/x86_64/axpy_sse.S View File

@@ -1463,6 +1463,12 @@
.L50:
movq M, %rax
movq Y, YY
//If incx==0 || incy==0, skip the unrolled loop.
cmpq $0, INCX
je .L56
cmpq $0, INCY
je .L56

sarq $3, %rax
jle .L55
ALIGN_3


+ 6
- 0
kernel/x86_64/axpy_sse2.S View File

@@ -805,6 +805,12 @@
.L40:
movq Y, YY
movq M, %rax
//If incx==0 || incy==0, skip the unrolled loop.
cmpq $0, INCX
je .L46
cmpq $0, INCY
je .L46
sarq $3, %rax
jle .L45
ALIGN_3


+ 40
- 0
kernel/x86_64/zaxpy_sse.S View File

@@ -2893,6 +2893,12 @@
unpcklps %xmm13, %xmm15
#endif

//If incx==0 || incy==0, skip the unrolled loop and jump to the scalar tail.
cmpq $0, INCX
je .L200
cmpq $0, INCY
je .L200

movq Y, YY

movq M, %rax
@@ -3105,8 +3111,42 @@
addps %xmm1, %xmm8

movsd %xmm8, (Y)
jmp .L999
ALIGN_3
.L200:
movq M, %rax
cmpq $0, %rax
jle .L999
ALIGN_3

.L201:
movsd (X), %xmm0
addq INCX, X

#ifdef HAVE_SSE3
movshdup %xmm0, %xmm1
movsldup %xmm0, %xmm0
#else
pshufd $0xf5, %xmm0, %xmm1
shufps $0xa0, %xmm0, %xmm0
#endif

mulps %xmm14, %xmm0
mulps %xmm15, %xmm1

movsd (Y), %xmm8

addps %xmm0, %xmm8
addps %xmm1, %xmm8

movsd %xmm8, (Y)
addq INCY, Y
decq %rax
jg .L201
ALIGN_3
.L999:
xorq %rax, %rax



+ 10
- 0
kernel/x86_64/zaxpy_sse2.S View File

@@ -1416,6 +1416,12 @@

movq Y, YY
movq M, %rax
//If incx==0 || incy==0, skip the unrolled loop and jump to the scalar tail.
cmpq $0, INCX
je .L58
cmpq $0, INCY
je .L58
sarq $3, %rax
jle .L55

@@ -1769,6 +1775,7 @@
andq $1, %rax
jle .L999

.L58:
MOVDDUP( 0 * SIZE, X, %xmm0)
MOVDDUP( 1 * SIZE, X, %xmm1)

@@ -1781,6 +1788,9 @@

movlpd %xmm8, 0 * SIZE(YY)
movhpd %xmm8, 1 * SIZE(YY)
decq %rax
jg .L58
ALIGN_3

.L999:


Loading…
Cancel
Save