Browse Source

Merge pull request #1564 from martin-frbg/issue1563

Revert changes from PR#1419
tags/v0.3.0^2
Martin Kroeker GitHub 7 years ago
parent
commit
5082fe4306
No known key found for this signature in database. GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 147 additions and 84 deletions
  1. kernel/generic/trmm_ltcopy_2.c (+22, -10)
  2. kernel/generic/trmm_utcopy_16.c (+37, -30)
  3. kernel/generic/trmm_utcopy_2.c (+23, -14)
  4. kernel/generic/trmm_utcopy_4.c (+23, -18)
  5. kernel/generic/trsm_ltcopy_4.c (+1, -1)
  6. kernel/generic/ztrmm_ltcopy_2.c (+38, -8)
  7. kernel/generic/ztrsm_utcopy_1.c (+1, -1)
  8. kernel/generic/ztrsm_utcopy_2.c (+2, -2)

+ 22
- 10
kernel/generic/trmm_ltcopy_2.c View File

@@ -116,22 +116,34 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (m & 1) {

if (X > posY) {
/* ao1 += 1;
ao2 += 1; */
ao1 += 1;
ao2 += 1;
b += 2;
} else
#ifdef UNIT
if (X < posY) {
#endif
b[ 0] = *(ao1 + 0);
#ifdef UNIT
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);

b[ 0] = data01;
b[ 1] = data02;
ao1 += lda;
b += 2;
} else {
#ifdef UNIT
data02 = *(ao1 + 1);

b[ 0] = ONE;
b[ 1] = data02;
#else
data01 = *(ao1 + 0);
data02 = *(ao1 + 1);

b[ 0] = data01;
b[ 1] = data02;
#endif
ao1 += 2;
b += 2;
}
#endif
b[ 1] = *(ao1 + 1);
b += 2;
}

posY += 2;
@@ -178,7 +190,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} while (i > 0);
}

// posY += 1;
posY += 1;
}

return 0;


+ 37
- 30
kernel/generic/trmm_utcopy_16.c View File

@@ -518,7 +518,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 15);
if (i > 0) {
if (X < posY) {
/* a01 += i;
a01 += i;
a02 += i;
a03 += i;
a04 += i;
@@ -533,7 +533,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a13 += i;
a14 += i;
a15 += i;
a16 += i; */
a16 += i;
b += 16 * i;
} else
if (X > posY) {
@@ -1130,14 +1130,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 7);
if (i > 0) {
if (X < posY) {
/* a01 += i;
a01 += i;
a02 += i;
a03 += i;
a04 += i;
a05 += i;
a06 += i;
a07 += i;
a08 += i; */
a08 += i;
b += 8 * i;
} else
if (X > posY) {
@@ -1156,13 +1156,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += 8;
}

/* a02 += i * lda;
a02 += i * lda;
a03 += i * lda;
a04 += i * lda;
a05 += i * lda;
a06 += i * lda;
a07 += i * lda;
a08 += i * lda; */
a08 += i * lda;
} else {
#ifdef UNIT
b[ 0] = ONE;
@@ -1371,10 +1371,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = (m & 3);
if (i > 0) {
if (X < posY) {
/* a01 += i;
a01 += i;
a02 += i;
a03 += i;
a04 += i; */
a04 += i;
b += 4 * i;
} else
if (X > posY) {
@@ -1387,9 +1387,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
a01 += lda;
b += 4;
}
/* a02 += lda;
a02 += lda;
a03 += lda;
a04 += lda; */
a04 += lda;
} else {

#ifdef UNIT
@@ -1487,19 +1487,23 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (X < posY) {
a01 ++;
a02 ++;
} else {
#ifdef UNIT
b += 2;
} else
if (X > posY) {
#endif
b[ 0] = *(a01 + 0);
#ifdef UNIT
b[ 1] = *(a01 + 1);
a01 += lda;
b += 2;
} else {
#ifdef UNIT
b[ 0] = ONE;
}
b[ 1] = *(a01 + 1);
#else
b[ 0] = *(a01 + 0);
b[ 1] = *(a01 + 1);
#endif
b[ 1] = *(a01 + 1);
}
b += 2;
b += 2;
}
}
posY += 2;
}
@@ -1518,25 +1522,28 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (i > 0) {
do {
if (X < posY) {
a01 ++;
} else {
#ifdef UNIT
a01 += 1;
b ++;
} else
if (X > posY) {
#endif
b[ 0] = *(a01 + 0);
#ifdef UNIT
a01 += lda;
b ++;
} else {
#ifdef UNIT
b[ 0] = ONE;
}
#else
b[ 0] = *(a01 + 0);
#endif
a01 += lda;
}
b ++;
X ++;
i --;
a01 += lda;
b ++;
}

X += 1;
i --;
} while (i > 0);
}
// posY += 1;
posY += 1;
}

return 0;


+ 23
- 14
kernel/generic/trmm_utcopy_2.c View File

@@ -117,8 +117,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (m & 1) {

if (X < posY) {
/* ao1 += 1;
ao2 += 1; */
ao1 += 1;
ao2 += 1;
b += 2;
} else
if (X > posY) {
@@ -127,7 +127,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON

b[ 0] = data01;
b[ 1] = data02;
// ao1 += lda;
ao1 += lda;
b += 2;
} else {
#ifdef UNIT
@@ -139,7 +139,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = data01;
b[ 1] = ZERO;
#endif
// ao1 += lda;
ao1 += lda;
b += 2;
}
}
@@ -161,18 +161,27 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
i = m;
if (m > 0) {
do {
if (X < posY) {
b += 1;
ao1 += 1;
} else
if (X > posY) {
data01 = *(ao1 + 0);
b[ 0] = data01;
b += 1;
ao1 += lda;
} else {
#ifdef UNIT
if (X > posY) {
#endif
b[ 0] = *(ao1 + 0);
#ifdef UNIT
} else {
b[ 0] = ONE;
}
b[ 0] = ONE;
#else
data01 = *(ao1 + 0);
b[ 0] = data01;
#endif
b ++;
ao1 += lda;
X ++;
b += 1;
ao1 += lda;
}

X += 1;
i --;
} while (i > 0);
}


+ 23
- 18
kernel/generic/trmm_utcopy_4.c View File

@@ -201,18 +201,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (X < posY) {

if (m & 2) {
/* ao1 += 2;
ao1 += 2;
ao2 += 2;
ao3 += 2;
ao4 += 2; */
ao4 += 2;
b += 8;
}

if (m & 1) {
/* ao1 += 1;
ao1 += 1;
ao2 += 1;
ao3 += 1;
ao4 += 1; */
ao4 += 1;
b += 4;
}

@@ -238,7 +238,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 7] = data08;

ao1 += 2 * lda;
// ao2 += 2 * lda;
ao2 += 2 * lda;
b += 8;
}

@@ -253,7 +253,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 2] = data03;
b[ 3] = data04;

// ao1 += lda;
ao1 += lda;
b += 4;
}

@@ -401,7 +401,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
if (i) {

if (X < posY) {
// ao1 += 2;
ao1 += 2;
b += 2;
} else
if (X > posY) {
@@ -411,7 +411,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b[ 0] = data01;
b[ 1] = data02;

// ao1 += lda;
ao1 += lda;
b += 2;
} else {
#ifdef UNIT
@@ -443,21 +443,26 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
do {

if (X < posY) {
b += 1;
ao1 += 1;
} else {
#ifdef UNIT
} else
if (X > posY) {
#endif
b[ 0] = *(ao1 + 0);
#ifdef UNIT
data01 = *(ao1 + 0);
b[ 0] = data01;
ao1 += lda;
b += 1;
} else {
#ifdef UNIT
b[ 0] = ONE;
}
#else
data01 = *(ao1 + 0);
b[ 0] = data01;
#endif
ao1 += lda;
}
b ++;
X ++;
ao1 += lda;
b += 1;
}

X += 1;
i --;
} while (i > 0);
}


+ 1
- 1
kernel/generic/trsm_ltcopy_4.c View File

@@ -206,7 +206,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT
}

a1 += 2 * lda;
// a2 += 2 * lda;
a2 += 2 * lda;
b += 8;

ii += 2;


+ 38
- 8
kernel/generic/ztrmm_ltcopy_2.c View File

@@ -139,18 +139,48 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}

if (m & 1) {
#ifdef UNIT

if (X > posY) {
ao1 += 2;
ao2 += 2;
b += 4;

} else
if (X < posY) {
#endif
b[ 0] = *(ao1 + 0);
b[ 1] = *(ao1 + 1);
#ifdef UNIT
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);

b[ 0] = data1;
b[ 1] = data2;
b[ 2] = data3;
b[ 3] = data4;

ao1 += lda;
b += 4;
} else {
#ifdef UNIT
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);

b[ 0] = ONE;
b[ 1] = ZERO;
}
b[ 2] = data3;
b[ 3] = data4;
#else
data1 = *(ao1 + 0);
data2 = *(ao1 + 1);
data3 = *(ao1 + 2);
data4 = *(ao1 + 3);

b[ 0] = data1;
b[ 1] = data2;
b[ 2] = data3;
b[ 3] = data4;
#endif
b += 4;
b += 4;
}
}

posY += 2;
@@ -203,7 +233,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
} while (i > 0);
}

// posY += 1;
posY += 1;
}

return 0;


+ 1
- 1
kernel/generic/ztrsm_utcopy_1.c View File

@@ -43,7 +43,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT

BLASLONG i, ii, j, jj;

FLOAT data01 = 0.0, data02 = 0.0;
FLOAT data01, data02;
FLOAT *a1;

lda *= 2;


+ 2
- 2
kernel/generic/ztrsm_utcopy_2.c View File

@@ -43,8 +43,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG offset, FLOAT

BLASLONG i, ii, j, jj;

FLOAT data01 = 0.0, data02 = 0.0, data03, data04;
FLOAT data05, data06, data07 = 0.0, data08 = 0.0;
FLOAT data01, data02, data03, data04;
FLOAT data05, data06, data07, data08;
FLOAT *a1, *a2;

lda *= 2;


Loading…
Cancel
Save