Browse Source

Improve performance of ztrmv on small matrices

* Use stack allocation
* Disable multi-threading
* Ref #727
tags/v0.2.16.rc1
Jerome Robert 9 years ago
parent
commit
78dcf5c3d5
3 changed files with 26 additions and 9 deletions
  1. +2
    -2
      driver/level2/trmv_thread.c
  2. +1
    -1
      driver/level2/ztrmv_U.c
  3. +23
    -6
      interface/ztrmv.c

+ 2
- 2
driver/level2/trmv_thread.c View File

@@ -119,7 +119,7 @@ static int trmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F
#endif

x = buffer;
buffer += ((COMPSIZE * args -> m + 1023) & ~1023);
buffer += ((COMPSIZE * args -> m + 3) & ~3);
}

#ifndef TRANS
@@ -403,7 +403,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu

if (num_cpu) {
queue[0].sa = NULL;
queue[0].sb = buffer + num_cpu * (((m + 255) & ~255) + 16) * COMPSIZE;
queue[0].sb = buffer + num_cpu * (((m + 3) & ~3) + 16) * COMPSIZE;

queue[num_cpu - 1].next = NULL;



+ 1
- 1
driver/level2/ztrmv_U.c View File

@@ -56,7 +56,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *b, BLASLONG incb, FLOAT *bu

if (incb != 1) {
B = buffer;
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 4095) & ~4095);
gemvbuffer = (FLOAT *)(((BLASLONG)buffer + m * sizeof(FLOAT) * 2 + 15) & ~15);
COPY_K(m, b, incb, buffer, 1);
}



+ 23
- 6
interface/ztrmv.c View File

@@ -107,7 +107,7 @@ void NAME(char *UPLO, char *TRANS, char *DIAG,
blasint info;
int uplo;
int unit;
int trans;
int trans, buffer_size;
FLOAT *buffer;
#ifdef SMP
int nthreads;
@@ -154,7 +154,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag,
blasint n, FLOAT *a, blasint lda, FLOAT *x, blasint incx) {

int trans, uplo, unit;
int trans, uplo, unit, buffer_size;
blasint info;
FLOAT *buffer;
#ifdef SMP
@@ -227,11 +227,28 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,

if (incx < 0 ) x -= (n - 1) * incx * 2;

buffer = (FLOAT *)blas_memory_alloc(1);

#ifdef SMP
nthreads = num_cpu_avail(2);
// Calibrated on a Xeon E5-2630
if(1L * n * n > 36L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD) {
nthreads = num_cpu_avail(2);
if(nthreads > 2 && 1L * n * n < 64L * sizeof(FLOAT) * sizeof(FLOAT) * GEMM_MULTITHREAD_THRESHOLD)
nthreads = 2;
} else
nthreads = 1;

if(nthreads > 1) {
buffer_size = n > 16 ? 0 : n * 4 + 40;
}
else
#endif
{
buffer_size = ((n - 1) / DTB_ENTRIES) * 2 * DTB_ENTRIES + 32 / sizeof(FLOAT);
if(incx != 1)
buffer_size += n * 2;
}
STACK_ALLOC(buffer_size, FLOAT, buffer);

#ifdef SMP
if (nthreads == 1) {
#endif

@@ -245,7 +262,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
}
#endif

blas_memory_free(buffer);
STACK_FREE(buffer);

FUNCTION_PROFILE_END(4, n * n / 2 + n, n * n);



Loading…
Cancel
Save