
some clean-up & commentary

tags/v0.3.19
Bine Brank, 3 years ago
parent commit b58d4f31ab
10 changed files with 104 additions and 107 deletions:

  1. kernel/arm64/KERNEL.ARMV8SVE (+1 -1)
  2. kernel/arm64/dgemm_kernel_sve_v1x8.S (+8 -7)
  3. kernel/arm64/dgemm_kernel_sve_v2x8.S (+28 -10)
  4. kernel/arm64/dgemm_ncopy_sve_v1.c (+25 -25)
  5. kernel/arm64/dgemm_tcopy_sve_v1.c (+24 -24)
  6. kernel/arm64/dtrmm_kernel_sve_v1x8.S (+9 -8)
  7. kernel/arm64/trmm_lncopy_sve_v1.c (+2 -7)
  8. kernel/arm64/trmm_ltcopy_sve_v1.c (+2 -7)
  9. kernel/arm64/trmm_uncopy_sve_v1.c (+2 -10)
  10. kernel/arm64/trmm_utcopy_sve_v1.c (+3 -8)

kernel/arm64/KERNEL.ARMV8SVE (+1 -1)

@@ -143,7 +143,7 @@ endif
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

-DGEMMKERNEL = dgemm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
+DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S

DGEMMINCOPY = dgemm_ncopy_sve_v1.c


kernel/arm64/dgemm_kernel_sve_v1x8.S (+8 -7)

@@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha0 d10
#define alphaZ z2.d

-#define A_PRE_SIZE 2560
+#define A_PRE_SIZE 1536
#define B_PRE_SIZE 512
#define C_PRE_SIZE 128

@@ -134,7 +134,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNELv1x8_I
ld1d z0.d, p1/z, [pA]
ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
-//incb pA, all, mul #2
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8

ld1rd z8.d, p0/z, [pB]
@@ -476,13 +475,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ptrue p0.d // create true predicate

mov pB, origPB
+// Loop over N
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble .Ldgemm_kernel_L4_BEGIN

/******************************************************************************/
+/* Repeat this as long as there are 8 left in N */

.align 5
.Ldgemm_kernel_L8_BEGIN:
@@ -494,8 +494,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.Ldgemm_kernel_L8_Mv1_BEGIN:

+/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
mov counterI, #0
-whilelt p1.d, counterI, origM //SVE instruction
+whilelt p1.d, counterI, origM
cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension

.align 5
@@ -607,7 +608,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
bgt .Ldgemm_kernel_L8_BEGIN

/******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 4 left in N */

.align 5
.Ldgemm_kernel_L4_BEGIN:
@@ -692,7 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add origPB, origPB, temp // B = B + K * 4 * 8

/******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 2 left in N */

.align 5
.Ldgemm_kernel_L2_BEGIN:
@@ -773,7 +774,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8

/******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 1 left in N */

.align 5
.Ldgemm_kernel_L1_BEGIN:
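
A sketch of the whilelt/cntp pattern behind the "Loop over M is done in an SVE fashion" comment added in this file, written in C with ACLE intrinsics. This is an illustration only; process_tile is a hypothetical stand-in for the v1x8 micro-kernel, not a symbol from this source.

#include <arm_sve.h>
#include <stdint.h>

extern void process_tile(svbool_t p1, uint64_t lanes);

void loop_over_m(int64_t m)
{
    svbool_t p0 = svptrue_b64();              /* ptrue p0.d */
    int64_t i = 0;                            /* mov counterI, #0 */
    while (i < m) {
        svbool_t p1 = svwhilelt_b64(i, m);    /* whilelt p1.d, counterI, origM */
        uint64_t lanes = svcntp_b64(p0, p1);  /* cntp lanes, p0, p1.d */
        process_tile(p1, lanes);              /* tail (M % SVE_LEN) runs here too,
                                                 predicated, in a single sweep */
        i += lanes;
    }
}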


kernel/arm64/dgemm_kernel_sve_v2x8.S (+28 -10)

@@ -25,6 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

+/* This is an SVE dgemm kernel with size 2*SVE_LEN x 8.
+However, the data layout is the same as for the kernel 1*SVE_LEN x 8.
+This means that we sweep two panels of packed A when iterating in a loop over K.
+With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */

#define ASSEMBLER
#include "common.h"

@@ -57,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha0 d10
#define alphaZ z7.d

-#define A_PRE_SIZE 2560
+#define A_PRE_SIZE 1536
#define B_PRE_SIZE 512
#define C_PRE_SIZE 128

@@ -96,8 +101,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

//v00 ALPHA -> pA10_0
//v01 pA10_1
-//v02
-//v03
+//v02 pA20_0
+//v03 pA20_1
//v04
//v05
//v06
@@ -118,6 +123,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v21 must save C5
//v22 must save C6
//v23 must save C7
+//v24 must save C8
+//v25 must save C9
+//v26 must save C10
+//v27 must save C11
+//v28 must save C12
+//v29 must save C13
+//v30 must save C14
+//v31 must save C15

/*******************************************************************************
* Macro definitions
@@ -583,7 +596,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNELv1x8_I
ld1d z0.d, p1/z, [pA1]
ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // next one
-//incb pA1, all, mul #2
add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8

ld1rd z8.d, p0/z, [pB]
@@ -928,13 +940,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ptrue p0.d // create true predicate

mov pB, origPB
+// Loop over N
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble .Ldgemm_kernel_L4_BEGIN

/******************************************************************************/
+/* Repeat this as long as there are 8 left in N */

.align 5
.Ldgemm_kernel_L8_BEGIN:
@@ -947,11 +960,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.Ldgemm_kernel_L8_Mv2_BEGIN:

mov counterI, #0
-cmp origM, vec_lenx2
+cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN
blt .Ldgemm_kernel_L8_Mv1_BEGIN

mov counterI, origM

+/* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */
mul temp, vec_len, origK // generate address of pA2
add pA2, pA1, temp, lsl #3 // pA1 = start of A array
prfm PLDL1KEEP, [pA2]
@@ -1063,7 +1077,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp counterI, origM
beq .Ldgemm_kernel_L8_END

-//////////////////////////////////
+//////////////////////////////////////////
+// We have less than 2*SVE_LEN left. We do this with V1x8 kernel.
.Ldgemm_kernel_L8_Mv1_BEGIN:

whilelt p1.d, counterI, origM //SVE instruction
@@ -1178,7 +1193,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
bgt .Ldgemm_kernel_L8_BEGIN

/******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 4 left in N */

.align 5
.Ldgemm_kernel_L4_BEGIN:
@@ -1270,6 +1285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
beq .Ldgemm_kernel_L4_END

//////////////////////////////////
+// We have less than 2*SVE_LEN left. We do this with V1x4 kernel.
.Ldgemm_kernel_L4_Mv1_BEGIN:

whilelt p1.d, counterI, origM //SVE instruction
@@ -1338,7 +1354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add origPB, origPB, temp // B = B + K * 4 * 8

/******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 2 left in N */

.align 5
.Ldgemm_kernel_L2_BEGIN:
@@ -1428,6 +1444,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


//////////////////////////////////
+// We have less than 2*SVE_LEN left. We do this with V1x2 kernel.
.Ldgemm_kernel_L2_Mv1_BEGIN:

whilelt p1.d, counterI, origM //SVE instruction
@@ -1493,7 +1510,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8

/******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 1 left in N */

.align 5
.Ldgemm_kernel_L1_BEGIN:
@@ -1581,6 +1598,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


//////////////////////////////////
+// We have less than 2*SVE_LEN left. We do this with V1x1 kernel.
.Ldgemm_kernel_L1_Mv1_BEGIN:

whilelt p1.d, counterI, origM //SVE instruction
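
Per the new header comment, the v2x8 kernel keeps the v1 data layout and simply sweeps two packed panels of A. A hedged C sketch of how the second panel pointer is derived, mirroring the `mul temp, vec_len, origK` / `add pA2, pA1, temp, lsl #3` sequence above (second_panel is an illustrative name, not from the source):

#include <arm_sve.h>
#include <stdint.h>

static double *second_panel(double *pA1, int64_t K)
{
    uint64_t vec_len = svcntd();       /* doubles per SVE vector */
    /* v1-packed A stores SVE_LEN-wide panels back to back, each K
       columns deep, so the next panel begins vec_len * K doubles
       after pA1 (the lsl #3 in the assembly scales to bytes). */
    return pA1 + vec_len * (uint64_t)K;
}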


kernel/arm64/dgemm_ncopy_sve_v1.c (+25 -25)

@@ -40,40 +40,40 @@
#include "common.h"
#include <arm_sve.h>

-// TODO: write in assembly with proper unrolling
+// TODO: write in assembly with proper unrolling of inner loop
(the remainder of this hunk is a whitespace-only re-indentation of the function body; the de-duplicated result is shown once)
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){

  BLASLONG j;
  IFLOAT *aoffset, *aoffset1, *boffset;

  svint64_t lda_vec = svindex_s64(0LL, lda);
  uint64_t sve_size = svcntd();

  aoffset = a;
  boffset = b;

  j = 0;
  svbool_t pg = svwhilelt_b64(j, n);
  uint64_t active = svcntp_b64(svptrue_b64(), pg);
  do {

    aoffset1 = aoffset;

    uint64_t i_cnt = m;
    while (i_cnt--) {
      svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec);
      svst1_f64(pg, (double *) boffset, a_vec);
      aoffset1++;
      boffset += active;
    }
    aoffset += sve_size * lda;

    j += svcntd();
    pg = svwhilelt_b64(j, n);
    active = svcntp_b64(svptrue_b64(), pg);

  } while (svptest_any(svptrue_b64(), pg));

  return 0;
}
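
For clarity, a scalar model of what one pass of the gather loop above stores into b (a sketch only; ncopy_panel_scalar is an illustrative name, not part of the source):

#include <stdint.h>

static void ncopy_panel_scalar(int64_t m, uint64_t active, int64_t lda,
                               const double *a_panel, double **b)
{
    for (int64_t i = 0; i < m; i++)            /* while (i_cnt--) */
        for (uint64_t k = 0; k < active; k++)  /* one svld1_gather_index */
            *(*b)++ = a_panel[i + k * lda];    /* lane k strides by lda */
}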

kernel/arm64/dgemm_tcopy_sve_v1.c (+24 -24)

@@ -40,38 +40,38 @@
#include "common.h"
#include <arm_sve.h>

-// TODO: write in assembly with proper unrolling
+// TODO: write in assembly with proper unrolling of inner loop
(the remainder of this hunk is a whitespace-only re-indentation of the function body; the de-duplicated result is shown once)
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){

  BLASLONG j;
  IFLOAT *aoffset, *aoffset1, *boffset;

  uint64_t sve_size = svcntd();

  aoffset = a;
  boffset = b;

  j = 0;
  svbool_t pg = svwhilelt_b64(j, n);
  uint64_t active = svcntp_b64(svptrue_b64(), pg);
  do {

    aoffset1 = aoffset;

    uint64_t i_cnt = m;
    while (i_cnt--) {
      svfloat64_t a_vec = svld1(pg, (double *)aoffset1);
      svst1_f64(pg, (double *) boffset, a_vec);
      aoffset1 += lda;
      boffset += active;
    }
    aoffset += sve_size;

    j += svcntd();
    pg = svwhilelt_b64(j, n);
    active = svcntp_b64(svptrue_b64(), pg);

  } while (svptest_any(svptrue_b64(), pg));

  return 0;
}
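
The transposed copy differs from ncopy only in addressing: a contiguous svld1 replaces the gather because the packed dimension is now unit-stride. A scalar model of the loop above (sketch; tcopy_panel_scalar is an illustrative name):

#include <stdint.h>

static void tcopy_panel_scalar(int64_t m, uint64_t active, int64_t lda,
                               const double *a_panel, double **b)
{
    for (int64_t i = 0; i < m; i++)            /* while (i_cnt--), aoffset1 += lda */
        for (uint64_t k = 0; k < active; k++)  /* one contiguous svld1 */
            *(*b)++ = a_panel[i * lda + k];
}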

kernel/arm64/dtrmm_kernel_sve_v1x8.S (+9 -8)

@@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha0 d10
#define alphaZ z2.d

-#define A_PRE_SIZE 2560
+#define A_PRE_SIZE 1536
#define B_PRE_SIZE 512
#define C_PRE_SIZE 128

@@ -138,7 +138,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNELv1x8_I
ld1d z0.d, p1/z, [pA]
ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
-//incb pA, all, mul #2
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8

ld1rd z8.d, p0/z, [pB]
@@ -469,13 +468,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif

mov pB, origPB
+// Loop over N
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble .Ldtrmm_kernel_L4_BEGIN

/******************************************************************************/
+/* Repeat this as long as there are 8 left in N */

.align 5
.Ldtrmm_kernel_L8_BEGIN:
@@ -491,9 +491,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

.Ldtrmm_kernel_L8_Mv1_BEGIN:

+/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
mov counterI, #0
-whilelt p1.d, counterI, origM //SVE instruction
-cntp lanes, p0, p1.d
+whilelt p1.d, counterI, origM
+cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension

.align 5
.Ldtrmm_kernel_L8_Mv1_20:
@@ -641,7 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
bgt .Ldtrmm_kernel_L8_BEGIN

/******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 4 left in N */

.align 5
.Ldtrmm_kernel_L4_BEGIN:
@@ -757,7 +758,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif

/******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 2 left in N */

.align 5
.Ldtrmm_kernel_L2_BEGIN:
@@ -873,7 +874,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif

/******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 1 left in N */

.align 5
.Ldtrmm_kernel_L1_BEGIN:


kernel/arm64/trmm_lncopy_sve_v1.c (+2 -7)

@@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON

BLASLONG i, js;
BLASLONG X;
-//printf("Using trmm_ln.\n");

int sve_len = svcntd();
svint64_t index = svindex_s64(0LL, lda);
@@ -67,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}

i = 0;
-/* svbool_t pm = svwhilelt_b64(i, m); */
-/* int m_active = svcntp_b64(svptrue_b64(), pm); */
do
{
-if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+if (X > posY) {
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
svst1(pn, b, aj_vec);
ao ++;
@@ -85,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
X ++;
i ++;
} else {
+/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
@@ -114,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
} while (i < m);

-//printf("\n");


posY += n_active;
js += n_active;
pn = svwhilelt_b64(js, n);
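
All four trmm copy routines share the same vector-length-agnostic outer loop over N, visible in the trailing context above. A minimal sketch of just that loop shape, assuming ACLE intrinsics (pack_panel is a hypothetical placeholder for the per-panel body):

#include <arm_sve.h>
#include <stdint.h>

extern void pack_panel(svbool_t pn, int n_active);

void loop_over_n(int64_t n)
{
    int64_t js = 0;
    svbool_t pn = svwhilelt_b64(js, n);
    int n_active = (int)svcntp_b64(svptrue_b64(), pn);
    do {
        pack_panel(pn, n_active);   /* one n_active-wide panel, i sweeping m */
        js += n_active;             /* posY advances by the same amount */
        pn = svwhilelt_b64(js, n);
        n_active = (int)svcntp_b64(svptrue_b64(), pn);
    } while (svptest_any(svptrue_b64(), pn));
}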


kernel/arm64/trmm_ltcopy_sve_v1.c (+2 -7)

@@ -48,8 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
BLASLONG i, js;
BLASLONG X;

-//printf("Using trmm_lt.\n");

int sve_len = svcntd();

FLOAT *ao;
@@ -67,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}

i = 0;
-/* svbool_t pm = svwhilelt_b64(i, m); */
-/* int m_active = svcntp_b64(svptrue_b64(), pm); */
do
{
-if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+if (X > posY) {
ao ++;
b += n_active;
X ++;
@@ -85,6 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
X ++;
i ++;
} else {
+/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
@@ -114,8 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
} while (i < m);

-//printf("\n");


posY += n_active;
js += n_active;


kernel/arm64/trmm_uncopy_sve_v1.c (+2 -10)

@@ -47,10 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON

BLASLONG i, js;
BLASLONG X;
-//printf("Using trmm_un.\n");
-//printf("Using m %ld, n %ld.\n", m, n);
-//printf("Using lda %ld.\n", lda);
-//printf("Using posX %ld, posY %ld.\n", posX, posY);

int sve_len = svcntd();
svint64_t index = svindex_s64(0LL, lda);
@@ -70,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}

i = 0;
-/* svbool_t pm = svwhilelt_b64(i, m); */
-/* int m_active = svcntp_b64(svptrue_b64(), pm); */
do
{
-if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+if (X < posY) {
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
svst1(pn, b, aj_vec);
ao ++;
@@ -88,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
X ++;
i ++;
} else {
+/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
@@ -117,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
} while (i < m);

-//printf("\n");


posY += n_active;
js += n_active;
pn = svwhilelt_b64(js, n);


kernel/arm64/trmm_utcopy_sve_v1.c (+3 -8)

@@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON

BLASLONG i, js;
BLASLONG X;
-//printf("Using trmm_ut.\n");

int sve_len = svcntd();

@@ -66,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}

i = 0;
-/* svbool_t pm = svwhilelt_b64(i, m); */
-/* int m_active = svcntp_b64(svptrue_b64(), pm); */
do
{
-if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+if (X < posY) {
ao ++;
b += n_active;
X ++;
@@ -83,7 +80,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += n_active;
X ++;
i ++;
-} else {
+} else {
+/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
@@ -113,9 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
} while (i < m);

-//printf("\n");


posY += n_active;
js += n_active;
pn = svwhilelt_b64(js, n);

