Browse Source

Merge pull request #1419 from brada4/develop

Initialize uninitialized values for repeated calls
tags/v0.3.0
Martin Kroeker GitHub 7 years ago
parent
commit
e388459a27
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 137 additions and 225 deletions
  1. +1
    -0
      driver/others/init.c
  2. +1
    -1
      driver/others/memory.c
  3. +4
    -2
      kernel/generic/laswp_ncopy_2.c
  4. +7
    -19
      kernel/generic/trmm_ltcopy_2.c
  5. +9
    -28
      kernel/generic/trmm_ltcopy_4.c
  6. +19
    -26
      kernel/generic/trmm_utcopy_16.c
  7. +10
    -19
      kernel/generic/trmm_utcopy_2.c
  8. +10
    -15
      kernel/generic/trmm_utcopy_4.c
  9. +2
    -0
      kernel/generic/trmmkernel_16x2.c
  10. +2
    -0
      kernel/generic/trmmkernel_2x2.c
  11. +2
    -0
      kernel/generic/trmmkernel_8x2.c
  12. +9
    -9
      kernel/generic/zlaswp_ncopy_2.c
  13. +4
    -2
      kernel/generic/zlaswp_ncopy_4.c
  14. +7
    -37
      kernel/generic/ztrmm_ltcopy_2.c
  15. +11
    -20
      kernel/generic/ztrmm_utcopy_1.c
  16. +11
    -22
      kernel/generic/ztrmm_utcopy_2.c
  17. +9
    -12
      kernel/generic/ztrmm_utcopy_8.c
  18. +2
    -0
      kernel/generic/ztrmmkernel_2x2.c
  19. +2
    -0
      kernel/generic/ztrmmkernel_4x4.c
  20. +1
    -1
      kernel/mips/cgemv_n_msa.c
  21. +2
    -2
      kernel/mips/dgemv_n_msa.c
  22. +2
    -2
      kernel/mips/sgemv_n_msa.c
  23. +6
    -6
      kernel/x86_64/dtrmm_kernel_4x8_haswell.c
  24. +4
    -2
      lapack/laswp/generic/laswp_k_4.c

+ 1
- 0
driver/others/init.c View File

@@ -903,6 +903,7 @@ void gotoblas_affinity_init(void) {
} }
#else #else
common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp); common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
}
#endif #endif


#endif #endif


+ 1
- 1
driver/others/memory.c View File

@@ -177,7 +177,7 @@ int get_num_procs(void) {
cpu_set_t *cpusetp; cpu_set_t *cpusetp;
size_t size; size_t size;
int ret; int ret;
int i,n;
// int i,n;


if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(OS_LINUX) #if !defined(OS_LINUX)


+ 4
- 2
kernel/generic/laswp_ncopy_2.c View File

@@ -116,7 +116,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*b2 = A2; *b2 = A2;
*b4 = A4; *b4 = A4;
} }
} else
} else {
if (b1 == a2) { if (b1 == a2) {
if (b2 == a2) { if (b2 == a2) {
*(buffer + 0) = A2; *(buffer + 0) = A2;
@@ -139,7 +139,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(buffer + 3) = A4; *(buffer + 3) = A4;
*b1 = A1; *b1 = A1;
*b3 = A3; *b3 = A3;
} else
} else {
if (b2 == b1) { if (b2 == b1) {
*(buffer + 0) = B1; *(buffer + 0) = B1;
*(buffer + 1) = B3; *(buffer + 1) = B3;
@@ -157,6 +157,8 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*b3 = A3; *b3 = A3;
*b4 = A4; *b4 = A4;
} }
}
}
} }


buffer += 4; buffer += 4;


+ 7
- 19
kernel/generic/trmm_ltcopy_2.c View File

@@ -120,30 +120,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
ao2 += 1; */ ao2 += 1; */
b += 2; b += 2;
} else } else
#ifdef UNIT
if (X < posY) { if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);

b[ 0] = data01;
b[ 1] = data02;
// ao1 += lda;
b += 2;
} else {
#endif
b[ 0] = *(ao1 + 0);
#ifdef UNIT #ifdef UNIT
data02 = *(ao1 + 1);
} else {


b[ 0] = ONE; b[ 0] = ONE;
b[ 1] = data02;
#else
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);

b[ 0] = data01;
b[ 1] = data02;
#endif
// ao1 += 2;
b += 2;
} }
#endif
b[ 1] = *(ao1 + 1);
b += 2;
} }


posY += 2; posY += 2;


+ 9
- 28
kernel/generic/trmm_ltcopy_4.c View File

@@ -410,36 +410,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON


i = (m & 1); i = (m & 1);
if (i) { if (i) {

if (X > posY) {
/* ao1 += 1;
ao2 += 1; */

b += 2;
} else
if (X < posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);

b[ 0] = data01;
b[ 1] = data02;
// ao1 += lda;
b += 2;
} else {
#ifdef UNIT #ifdef UNIT
data02 = *(ao1 + 1);

b[ 0] = ONE;
b[ 1] = data02;
#else
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);

b[ 0] = data01;
b[ 1] = data02;
if (X < posY) {
#endif #endif
b += 2;
}
b[ 0] = *(ao1 + 0);
#ifdef UNIT
} else {
b[ 0] = ONE;
}
#endif
b[ 1] = *(ao1 + 1);
b += 2;
} }
posY += 2; posY += 2;
} }


+ 19
- 26
kernel/generic/trmm_utcopy_16.c View File

@@ -1487,23 +1487,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (X < posY) { if (X < posY) {
a01 ++; a01 ++;
a02 ++; a02 ++;
b += 2;
} else
} else {
#ifdef UNIT
if (X > posY) { if (X > posY) {
#endif
b[ 0] = *(a01 + 0); b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
// a01 += lda;
b += 2;
} else {
#ifdef UNIT #ifdef UNIT
} else {
b[ 0] = ONE; b[ 0] = ONE;
b[ 1] = *(a01 + 1);
#else
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
#endif
b += 2;
} }
#endif
b[ 1] = *(a01 + 1);
}
b += 2;
} }
posY += 2; posY += 2;
} }
@@ -1522,25 +1518,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (i > 0) { if (i > 0) {
do { do {
if (X < posY) { if (X < posY) {
a01 += 1;
b ++;
} else
a01 ++;
} else {
#ifdef UNIT
if (X > posY) { if (X > posY) {
#endif
b[ 0] = *(a01 + 0); b[ 0] = *(a01 + 0);
a01 += lda;
b ++;
} else {
#ifdef UNIT #ifdef UNIT
} else {
b[ 0] = ONE; b[ 0] = ONE;
#else
b[ 0] = *(a01 + 0);
#endif
a01 += lda;
b ++;
} }

X += 1;
i --;
#endif
a01 += lda;
}
b ++;
X ++;
i --;
} while (i > 0); } while (i > 0);
} }
// posY += 1; // posY += 1;


+ 10
- 19
kernel/generic/trmm_utcopy_2.c View File

@@ -161,27 +161,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = m; i = m;
if (m > 0) { if (m > 0) {
do { do {
if (X < posY) {
b += 1;
ao1 += 1;
} else
if (X > posY) {
data01 = *(ao1 + 0);
b[ 0] = data01;
b += 1;
ao1 += lda;
} else {
#ifdef UNIT #ifdef UNIT
b[ 0] = ONE;
#else
data01 = *(ao1 + 0);
b[ 0] = data01;
if (X > posY) {
#endif #endif
b += 1;
ao1 += lda;
}

X += 1;
b[ 0] = *(ao1 + 0);
#ifdef UNIT
} else {
b[ 0] = ONE;
}
#endif
b ++;
ao1 += lda;
X ++;
i --; i --;
} while (i > 0); } while (i > 0);
} }


+ 10
- 15
kernel/generic/trmm_utcopy_4.c View File

@@ -443,26 +443,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
do { do {


if (X < posY) { if (X < posY) {
b += 1;
ao1 += 1; ao1 += 1;
} else
} else {
#ifdef UNIT
if (X > posY) { if (X > posY) {
data01 = *(ao1 + 0);
b[ 0] = data01;
ao1 += lda;
b += 1;
} else {
#endif
b[ 0] = *(ao1 + 0);
#ifdef UNIT #ifdef UNIT
} else {
b[ 0] = ONE; b[ 0] = ONE;
#else
data01 = *(ao1 + 0);
b[ 0] = data01;
#endif
ao1 += lda;
b += 1;
} }

X += 1;
#endif
ao1 += lda;
}
b ++;
X ++;
i --; i --;
} while (i > 0); } while (i > 0);
} }


+ 2
- 0
kernel/generic/trmmkernel_16x2.c View File

@@ -52,6 +52,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL


#if !defined(LEFT) #if !defined(LEFT)
off = -offset; off = -offset;
#else
off = 0;
#endif #endif






+ 2
- 0
kernel/generic/trmmkernel_2x2.c View File

@@ -11,6 +11,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
BLASLONG off, temp; BLASLONG off, temp;
#if defined(TRMMKERNEL) && !defined(LEFT) #if defined(TRMMKERNEL) && !defined(LEFT)
off = -offset; off = -offset;
#else
off = 0;
#endif #endif
for (j=0; j<bn/2; j+=1) for (j=0; j<bn/2; j+=1)
{ {


+ 2
- 0
kernel/generic/trmmkernel_8x2.c View File

@@ -34,6 +34,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL


#if !defined(LEFT) #if !defined(LEFT)
off = -offset; off = -offset;
#else
off = 0;
#endif #endif






+ 9
- 9
kernel/generic/zlaswp_ncopy_2.c View File

@@ -204,20 +204,20 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(b4 + 0) = A7; *(b4 + 0) = A7;
*(b4 + 1) = A8; *(b4 + 1) = A8;
} }
}
}


buffer += 8;
buffer += 8;


b1 = a + ip1;
b2 = a + ip2;
b1 = a + ip1;
b2 = a + ip2;


b3 = b1 + lda;
b4 = b2 + lda;
b3 = b1 + lda;
b4 = b2 + lda;


a1 += 4;
a3 += 4;
a1 += 4;
a3 += 4;


i --;
i --;
} while (i > 0); } while (i > 0);
} }




+ 4
- 2
kernel/generic/zlaswp_ncopy_4.c View File

@@ -462,7 +462,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(b4 + 0) = A7; *(b4 + 0) = A7;
*(b4 + 1) = A8; *(b4 + 1) = A8;
} }
} else
} else {
if (b1 == a2) { if (b1 == a2) {
if (b2 == a2) { if (b2 == a2) {
*(buffer + 0) = A3; *(buffer + 0) = A3;
@@ -503,7 +503,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(b1 + 1) = A2; *(b1 + 1) = A2;
*(b3 + 0) = A5; *(b3 + 0) = A5;
*(b3 + 1) = A6; *(b3 + 1) = A6;
} else
} else {
if (b2 == b1) { if (b2 == b1) {
*(buffer + 0) = B1; *(buffer + 0) = B1;
*(buffer + 1) = B2; *(buffer + 1) = B2;
@@ -536,6 +536,8 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT *a, BLASLONG lda, blasint
*(b4 + 0) = A7; *(b4 + 0) = A7;
*(b4 + 1) = A8; *(b4 + 1) = A8;
} }
}
}
} }


buffer += 8; buffer += 8;


+ 7
- 37
kernel/generic/ztrmm_ltcopy_2.c View File

@@ -139,48 +139,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} }


if (m & 1) { if (m & 1) {

if (X > posY) {
/* ao1 += 2;
ao2 += 2; */
b += 4;

} else
#ifdef UNIT
if (X < posY) { if (X < posY) {
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);

b[ 0] = data1;
b[ 1] = data2;
b[ 2] = data3;
b[ 3] = data4;

// ao1 += lda;
b += 4;
} else {
#endif
b[ 0] = *(ao1 + 0);
b[ 1] = *(ao1 + 1);
#ifdef UNIT #ifdef UNIT
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);

} else {
b[ 0] = ONE; b[ 0] = ONE;
b[ 1] = ZERO; b[ 1] = ZERO;
b[ 2] = data3;
b[ 3] = data4;
#else
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);

b[ 0] = data1;
b[ 1] = data2;
b[ 2] = data3;
b[ 3] = data4;
#endif
b += 4;
} }
#endif
b += 4;
} }


posY += 2; posY += 2;


+ 11
- 20
kernel/generic/ztrmm_utcopy_1.c View File

@@ -44,7 +44,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
BLASLONG i, js; BLASLONG i, js;
BLASLONG X; BLASLONG X;


FLOAT data01, data02;
// FLOAT data01, data02;
FLOAT *ao1; FLOAT *ao1;


lda += lda; lda += lda;
@@ -65,30 +65,21 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
do { do {
if (X < posY) { if (X < posY) {
ao1 += 2; ao1 += 2;
b += 2;
} else
} else {
#ifdef UNIT
if (X > posY) { if (X > posY) {
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
b[ 0] = data01;
b[ 1] = data02;
ao1 += lda;
b += 2;

} else {
#endif
b[ 0] = *(ao1 + 0);
b[ 1] = *(ao1 + 1);
#ifdef UNIT #ifdef UNIT
} else {
b[ 0] = ONE; b[ 0] = ONE;
b[ 1] = ZERO; b[ 1] = ZERO;
#else
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);
b[ 0] = data01;
b[ 1] = data02;
#endif
ao1 += lda;
b += 2;
} }

#endif
ao1 += lda;
}
b += 2;
X ++; X ++;
i --; i --;
} while (i > 0); } while (i > 0);


+ 11
- 22
kernel/generic/ztrmm_utcopy_2.c View File

@@ -203,33 +203,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
do { do {
if (X < posY) { if (X < posY) {
ao1 += 2; ao1 += 2;
b += 2;
} else
} else {
#ifdef UNIT
if (X > posY) { if (X > posY) {
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);

b[ 0] = data1;
b[ 1] = data2;

ao1 += lda;
b += 2;
} else {
#endif
b[ 0] = *(ao1 + 0);
b[ 1] = *(ao1 + 1);
#ifdef UNIT #ifdef UNIT
} else {
b[ 0] = ONE; b[ 0] = ONE;
b[ 1] = ZERO; b[ 1] = ZERO;
#else
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);

b[ 0] = data1;
b[ 1] = data2;
#endif
ao1 += lda;
b += 2;
} }

X += 1;
#endif
ao1 += lda;
}
b += 2;
X ++;
i --; i --;
} while (i > 0); } while (i > 0);
} }


+ 9
- 12
kernel/generic/ztrmm_utcopy_8.c View File

@@ -856,25 +856,22 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
do { do {
if (X < posY) { if (X < posY) {
a01 += 2; a01 += 2;
b += 2;
} else
} else {
#ifdef UNIT
if (X > posY) { if (X > posY) {
#endif
b[ 0] = *(a01 + 0); b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1); b[ 1] = *(a01 + 1);
a01 += lda;
b += 2;
} else {
#ifdef UNIT #ifdef UNIT
} else {
b[ 0] = ONE; b[ 0] = ONE;
b[ 1] = ZERO; b[ 1] = ZERO;
#else
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
#endif
a01 += lda;
b += 2;
} }
X += 1;
#endif
a01 += lda;
}
b += 2;
X ++;
i --; i --;
} while (i > 0); } while (i > 0);
} }


+ 2
- 0
kernel/generic/ztrmmkernel_2x2.c View File

@@ -15,6 +15,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* b


#if defined(TRMMKERNEL) && !defined(LEFT) #if defined(TRMMKERNEL) && !defined(LEFT)
off = -offset; off = -offset;
#else
off = 0;
#endif #endif
for (j=0; j<bn/2; j+=1) for (j=0; j<bn/2; j+=1)
{ {


+ 2
- 0
kernel/generic/ztrmmkernel_4x4.c View File

@@ -53,6 +53,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha_r, FLOAT alpha_i,FLOAT


#if defined(TRMMKERNEL) && !defined(LEFT) #if defined(TRMMKERNEL) && !defined(LEFT)
off = -offset; off = -offset;
#else
off = 0;
#endif #endif


for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops


+ 1
- 1
kernel/mips/cgemv_n_msa.c View File

@@ -511,7 +511,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
BLASLONG i, j, k, k_pref, pref_offset; BLASLONG i, j, k, k_pref, pref_offset;
FLOAT *y_org = y; FLOAT *y_org = y;
FLOAT *pa0, *pa1, *pa2, *pa3; FLOAT *pa0, *pa1, *pa2, *pa3;
FLOAT temp_r, temp_i, res0, res1, temp0_r;
FLOAT temp_r = 0.0, temp_i = 0.0, res0, res1, temp0_r;
FLOAT temp0_i, temp1_r, temp1_i, temp2_r, temp2_i, temp3_r, temp3_i; FLOAT temp0_i, temp1_r, temp1_i, temp2_r, temp2_i, temp3_r, temp3_i;
v4f32 alphar, alphai; v4f32 alphar, alphai;
v4f32 x0, x1, y0, y1, y2, y3, x0r, x0i, y0r, y1r, y0i, y1i; v4f32 x0, x1, y0, y1, y2, y3, x0r, x0i, y0r, y1r, y0i, y1i;


+ 2
- 2
kernel/mips/dgemv_n_msa.c View File

@@ -484,10 +484,10 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
v2f64 v_alpha; v2f64 v_alpha;
v2f64 x0, x1, x2, x3, y0, y1, y2, y3;
v2f64 x0, x1, x2, x3, y0 = 0.0, y1 = 0.0, y2 = 0.0, y3 = 0.0;
v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
v2f64 t30, t31, tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0;


v_alpha = COPY_DOUBLE_TO_VECTOR(alpha); v_alpha = COPY_DOUBLE_TO_VECTOR(alpha);




+ 2
- 2
kernel/mips/sgemv_n_msa.c View File

@@ -423,9 +423,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
FLOAT *y_org = y; FLOAT *y_org = y;
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7; FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
v4f32 v_alpha, x0, x1, y0, y1;
v4f32 v_alpha, x0, x1, y0 = 0.0, y1 = 0.0;
v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15; v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
v4f32 tp0 = 0.0, tp1 = 0.0, tp2 = 0.0, tp3 = 0.0, tp4 = 0.0, tp5 = 0.0, tp6 = 0.0, tp7 = 0.0;


v_alpha = COPY_FLOAT_TO_VECTOR(alpha); v_alpha = COPY_FLOAT_TO_VECTOR(alpha);




+ 6
- 6
kernel/x86_64/dtrmm_kernel_4x8_haswell.c View File

@@ -777,9 +777,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res3_2 = 0; res3_2 = 0;
res3_3 = 0; res3_3 = 0;


temp = backwards ? bk-off :
left ? off + 4 : // number of values in A
off + 4; // number of values in B
temp = backwards ? bk-off : off + 4;
/* left ? off + 4 : // number of values in A
off + 4; // number of values in B */


for (k=0; k<temp; k++) for (k=0; k<temp; k++)
{ {
@@ -857,9 +857,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C3[3] = res3_3; C3[3] = res3_3;


if (!backwards) { if (!backwards) {
temp = bk-off;
temp = left ? temp - 4 : // number of values in A
temp - 4; // number of values in B
temp = bk-off - 4;
/* temp = left ? temp - 4 : // number of values in A
temp - 4; // number of values in B */


ptrba += temp*4; // number of values in A ptrba += temp*4; // number of values in A
ptrbb += temp*4; // number of values in B ptrbb += temp*4; // number of values in B


+ 4
- 2
lapack/laswp/generic/laswp_k_4.c View File

@@ -174,7 +174,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*a8 = B8; *a8 = B8;
*b8 = A8; *b8 = A8;
} }
} else
} else {
if (b1 == a2) { if (b1 == a2) {
if (b2 != a1) { if (b2 != a1) {
if (b2 == a2) { if (b2 == a2) {
@@ -225,7 +225,7 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b5 = A5; *b5 = A5;
*a7 = B7; *a7 = B7;
*b7 = A7; *b7 = A7;
} else
} else {
if (b2 == b1) { if (b2 == b1) {
*a1 = B1; *a1 = B1;
*a2 = A1; *a2 = A1;
@@ -257,6 +257,8 @@ int CNAME(BLASLONG n, BLASLONG k1, BLASLONG k2, FLOAT dummy1, FLOAT *a, BLASLONG
*b7 = A7; *b7 = A7;
*b8 = A8; *b8 = A8;
} }
}
}
} }


b1 = a + ip1; b1 = a + ip1;


Loading…
Cancel
Save