@@ -143,7 +143,7 @@ endif
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
DGEMMINCOPY = dgemm_ncopy_sve_v1.c
@@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha0 d10
#define alphaZ z2.d
#define A_PRE_SIZE 2560
#define A_PRE_SIZE 1536
#define B_PRE_SIZE 512
#define C_PRE_SIZE 128
@@ -134,7 +134,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNELv1x8_I
ld1d z0.d, p1/z, [pA]
ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
//incb pA, all, mul #2
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
ld1rd z8.d, p0/z, [pB]
@@ -476,13 +475,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ptrue p0.d // create true predicate
mov pB, origPB
// Loop over N
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
/* Repeat this as long as there are 8 left in N */
.align 5
.Ldgemm_kernel_L8_BEGIN:
@@ -494,8 +494,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.Ldgemm_kernel_L8_Mv1_BEGIN:
/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
mov counterI, #0
whilelt p1.d, counterI, origM //SVE instruction
whilelt p1.d, counterI, origM
cntp lanes, p0, p1.d // lanes contains the number of active SVE lanes in the M dimension
.align 5
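The comment above describes the predicated loop over M: `whilelt` builds a predicate for the rows that remain and `cntp` counts the active lanes, so the final M%SVE_LEN rows need no scalar tail. A minimal C-intrinsics sketch of that pattern (hypothetical function name, kernel body omitted):

```c
#include <arm_sve.h>
#include <stdint.h>

/* Sketch only: process M rows in SVE-sized chunks, the last one predicated. */
void m_loop_sketch(int64_t M)
{
    int64_t i = 0;
    while (i < M) {
        svbool_t p1 = svwhilelt_b64(i, M);               /* like "whilelt p1.d, counterI, origM" */
        uint64_t lanes = svcntp_b64(svptrue_b64(), p1);  /* like "cntp lanes, p0, p1.d"          */
        /* ... run the V1x8 micro-kernel on `lanes` rows under predicate p1 ... */
        i += (int64_t)lanes;
    }
}
```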
@@ -607,7 +608,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
bgt .Ldgemm_kernel_L8_BEGIN
/******************************************************************************/
/******************************************************************************/
/* Repeat the same thing if 4 left in N */
.align 5
.Ldgemm_kernel_L4_BEGIN:
@@ -692,7 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add origPB, origPB, temp // B = B + K * 4 * 8
/******************************************************************************/
/******************************************************************************/
/* Repeat the same thing if 2 left in N */
.align 5
.Ldgemm_kernel_L2_BEGIN:
@@ -773,7 +774,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
/******************************************************************************/
/* Repeat the same thing if 1 left in N */
.align 5
.Ldgemm_kernel_L1_BEGIN:
@@ -25,6 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
/* This is an SVE dgemm kernel with size 2*SVE_LEN x 8.
However, the data layout is the same as for the kernel 1*SVE_LEN x 8.
This means that we sweep two panels of packed A when iterating in the loop over K.
With this approach, we can reuse the dgemm_n|tcopy_sve_v1.c packing functions. */
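To make the data-layout remark concrete, here is a hedged C-intrinsics sketch of what the kernel does for one of its eight C columns: pA1 walks the first packed SVE_LEN-row panel of A, pA2 the second, and both panels use the unchanged v1 packing. The real kernel keeps all sixteen accumulators (C0..C15) live in z16-z31, unrolls K, and handles alpha; names here are hypothetical.

```c
#include <arm_sve.h>
#include <stdint.h>

/* Sketch of the idea only: accumulate one C column for a 2*SVE_LEN-row block,
   reading A from two v1-packed panels (pA1, pA2). Alpha handling omitted. */
static void v2x8_column_sketch(const double *pA1, const double *pA2,
                               const double *pB, int64_t K, svbool_t pg,
                               double *c_lo, double *c_hi)
{
    uint64_t lanes = svcntp_b64(svptrue_b64(), pg);   /* active rows per panel   */
    svfloat64_t acc_lo = svld1_f64(pg, c_lo);         /* rows [i, i+VL)          */
    svfloat64_t acc_hi = svld1_f64(pg, c_hi);         /* rows [i+VL, i+2*VL)     */
    for (int64_t k = 0; k < K; k++) {
        svfloat64_t a_lo = svld1_f64(pg, pA1);        /* panel 1, step k         */
        svfloat64_t a_hi = svld1_f64(pg, pA2);        /* panel 2, step k         */
        svfloat64_t b = svdup_n_f64(pB[0]);           /* like "ld1rd z8.d, [pB]" */
        acc_lo = svmla_f64_m(pg, acc_lo, a_lo, b);
        acc_hi = svmla_f64_m(pg, acc_hi, a_hi, b);
        pA1 += lanes;                                 /* both panels advance by  */
        pA2 += lanes;                                 /* the active-lane count   */
        pB  += 8;                                     /* eight B values per k    */
    }
    svst1_f64(pg, c_lo, acc_lo);
    svst1_f64(pg, c_hi, acc_hi);
}
```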
#define ASSEMBLER
#include "common.h"
@@ -57,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha0 d10
#define alphaZ z7.d
#define A_PRE_SIZE 2560
#define A_PRE_SIZE 1536
#define B_PRE_SIZE 512
#define C_PRE_SIZE 128
@@ -96,8 +101,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v00 ALPHA -> pA10_0
//v01 pA10_1
//v02
//v03
//v02 pA20_0
//v03 pA20_1
//v04
//v05
//v06
@@ -118,6 +123,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v21 must save C5
//v22 must save C6
//v23 must save C7
//v24 must save C8
//v25 must save C9
//v26 must save C10
//v27 must save C11
//v28 must save C12
//v29 must save C13
//v30 must save C14
//v31 must save C15
/*******************************************************************************
* Macro definitions
@@ -583,7 +596,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNELv1x8_I
ld1d z0.d, p1/z, [pA1]
ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // next one
//incb pA1, all, mul #2
add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8
ld1rd z8.d, p0/z, [pB]
@@ -928,13 +940,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ptrue p0.d // create true predicate
mov pB, origPB
// Loop over N
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble .Ldgemm_kernel_L4_BEGIN
/******************************************************************************/
/* Repeat this as long as there are 8 left in N */
.align 5
.Ldgemm_kernel_L8_BEGIN:
@@ -947,11 +960,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.Ldgemm_kernel_L8_Mv2_BEGIN:
mov counterI, #0
cmp origM, vec_lenx2
cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN
blt .Ldgemm_kernel_L8_Mv1_BEGIN
mov counterI, origM
/* As long as there are at least 2*SVE_LEN iterations left in M, we do them with the V2x8 kernel */
mul temp, vec_len, origK // generate address of pA2
add pA2, pA1, temp, lsl #3 // pA2 = pA1 + vec_len * K * 8 (pA1 = start of A array)
prfm PLDL1KEEP, [pA2]
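The two instructions above compute where the second packed A panel starts. Because the v1 packing writes each SVE_LEN-row panel contiguously for the full depth K, the offset is simply SVE_LEN*K doubles; a hedged C equivalent (hypothetical helper name):

```c
#include <arm_sve.h>
#include <stddef.h>

/* Sketch: start of the second packed A panel, given the start of the first.
   Each panel stores svcntd() rows for all K columns back to back, so the
   second panel begins svcntd()*K doubles later (the assembly shifts the
   element count left by 3 to convert it to bytes). */
static double *second_panel(double *pA1, size_t K)
{
    return pA1 + svcntd() * K;
}
```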
@@ -1063,7 +1077,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
cmp counterI, origM
beq .Ldgemm_kernel_L8_END
//////////////////////////////////
//////////////////////////////////////////
// We have fewer than 2*SVE_LEN rows left. We do this with the V1x8 kernel.
.Ldgemm_kernel_L8_Mv1_BEGIN:
whilelt p1.d, counterI, origM //SVE instruction
@@ -1178,7 +1193,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
bgt .Ldgemm_kernel_L8_BEGIN
/******************************************************************************/
/******************************************************************************/
/* Repeat the same thing if 4 left in N */
.align 5
.Ldgemm_kernel_L4_BEGIN:
@@ -1270,6 +1285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
beq .Ldgemm_kernel_L4_END
//////////////////////////////////
// We have fewer than 2*SVE_LEN rows left. We do this with the V1x4 kernel.
.Ldgemm_kernel_L4_Mv1_BEGIN:
whilelt p1.d, counterI, origM //SVE instruction
@@ -1338,7 +1354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add origPB, origPB, temp // B = B + K * 4 * 8
/******************************************************************************/
/******************************************************************************/
/* Repeat the same thing if 2 left in N */
.align 5
.Ldgemm_kernel_L2_BEGIN:
@@ -1428,6 +1444,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//////////////////////////////////
// We have fewer than 2*SVE_LEN rows left. We do this with the V1x2 kernel.
.Ldgemm_kernel_L2_Mv1_BEGIN:
whilelt p1.d, counterI, origM //SVE instruction
@@ -1493,7 +1510,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
/******************************************************************************/
/******************************************************************************/
/* Repeat the same thing if 1 left in N */
.align 5
.Ldgemm_kernel_L1_BEGIN:
@@ -1581,6 +1598,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//////////////////////////////////
// We have fewer than 2*SVE_LEN rows left. We do this with the V1x1 kernel.
.Ldgemm_kernel_L1_Mv1_BEGIN:
whilelt p1.d, counterI, origM //SVE instruction
@@ -40,40 +40,40 @@
#include "common.h"
#include <arm_sve.h>

// TODO: write in assembly with proper unrolling of inner loop
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    BLASLONG j;
    IFLOAT *aoffset, *aoffset1, *boffset;

    svint64_t lda_vec = svindex_s64(0LL, lda);   // gather offsets 0, lda, 2*lda, ...
    uint64_t sve_size = svcntd();                // number of 64-bit elements in an SVE vector

    aoffset = a;
    boffset = b;

    j = 0;
    svbool_t pg = svwhilelt_b64(j, n);
    uint64_t active = svcntp_b64(svptrue_b64(), pg);
    do {
        aoffset1 = aoffset;

        uint64_t i_cnt = m;
        while (i_cnt--) {
            svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec);
            svst1_f64(pg, (double *) boffset, a_vec);
            aoffset1++;
            boffset += active;
        }
        aoffset += sve_size * lda;

        j += svcntd();
        pg = svwhilelt_b64(j, n);
        active = svcntp_b64(svptrue_b64(), pg);
    } while (svptest_any(svptrue_b64(), pg));

    return 0;
}
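For reference, a hedged scalar sketch of what the gather-based packing above produces (hypothetical name; BLASLONG/IFLOAT replaced by int64_t/double to keep it self-contained): each group of up to svcntd() values along the lda-strided direction is emitted contiguously while walking the unit-stride direction. The companion routine in the next hunk is the same loop with a contiguous svld1 and an lda step in place of the gather.

```c
#include <stdint.h>

/* Scalar reference for the packing loop above (sketch, not the actual kernel).
   vl plays the role of svcntd(). */
static void pack_reference(int64_t m, int64_t n, const double *a, int64_t lda,
                           double *b, int64_t vl)
{
    for (int64_t j = 0; j < n; j += vl) {
        int64_t active = (n - j < vl) ? (n - j) : vl;    /* svcntp_b64 on the predicate */
        const double *a1 = a + j * lda;                   /* aoffset advances by vl*lda  */
        for (int64_t i = 0; i < m; i++) {
            for (int64_t k = 0; k < active; k++)
                *b++ = a1[k * lda];                       /* svld1_gather_index + svst1  */
            a1++;                                         /* aoffset1++                  */
        }
    }
}
```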
@@ -40,38 +40,38 @@
#include "common.h"
#include <arm_sve.h>

// TODO: write in assembly with proper unrolling of inner loop
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
    BLASLONG j;
    IFLOAT *aoffset, *aoffset1, *boffset;

    uint64_t sve_size = svcntd();                // number of 64-bit elements in an SVE vector

    aoffset = a;
    boffset = b;

    j = 0;
    svbool_t pg = svwhilelt_b64(j, n);
    uint64_t active = svcntp_b64(svptrue_b64(), pg);
    do {
        aoffset1 = aoffset;

        uint64_t i_cnt = m;
        while (i_cnt--) {
            svfloat64_t a_vec = svld1(pg, (double *)aoffset1);   // contiguous load
            svst1_f64(pg, (double *) boffset, a_vec);
            aoffset1 += lda;
            boffset += active;
        }
        aoffset += sve_size;

        j += svcntd();
        pg = svwhilelt_b64(j, n);
        active = svcntp_b64(svptrue_b64(), pg);
    } while (svptest_any(svptrue_b64(), pg));

    return 0;
}
@@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define alpha0 d10
#define alphaZ z2.d
#define A_PRE_SIZE 2560
#define A_PRE_SIZE 1536
#define B_PRE_SIZE 512
#define C_PRE_SIZE 128
@@ -138,7 +138,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNELv1x8_I
ld1d z0.d, p1/z, [pA]
ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
//incb pA, all, mul #2
add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
ld1rd z8.d, p0/z, [pB]
@@ -469,13 +468,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
mov pB, origPB
// Loop over N
mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0
ble .Ldtrmm_kernel_L4_BEGIN
/******************************************************************************/
/* Repeat this as long as there are 8 left in N */
.align 5
.Ldtrmm_kernel_L8_BEGIN:
@@ -491,9 +491,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.Ldtrmm_kernel_L8_Mv1_BEGIN:
/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
mov counterI, #0
whilelt p1.d, counterI, origM //SVE instruction
cntp lanes, p0, p1.d
whilelt p1.d, counterI, origM
cntp lanes, p0, p1.d // lanes contains the number of active SVE lanes in the M dimension
.align 5
.Ldtrmm_kernel_L8_Mv1_20:
@@ -641,7 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
bgt .Ldtrmm_kernel_L8_BEGIN
/******************************************************************************/
/******************************************************************************/
/* Repeat the same thing if 4 left in N */
.align 5
.Ldtrmm_kernel_L4_BEGIN:
@@ -757,7 +758,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
/******************************************************************************/
/******************************************************************************/
/* Repeat the same thing if 2 left in N */
.align 5
.Ldtrmm_kernel_L2_BEGIN:
@@ -873,7 +874,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
/******************************************************************************/
/******************************************************************************/
/* Repeat the same thing if 1 left in N */
.align 5
.Ldtrmm_kernel_L1_BEGIN:
@@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
BLASLONG i, js;
BLASLONG X;
//printf("Using trmm_ln.\n");
int sve_len = svcntd();
svint64_t index = svindex_s64(0LL, lda);
@@ -67,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
i = 0;
/* svbool_t pm = svwhilelt_b64(i, m); */
/* int m_active = svcntp_b64(svptrue_b64(), pm); */
do
{
if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
if (X > posY) {
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
svst1(pn, b, aj_vec);
ao ++;
@@ -85,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
X ++;
i ++;
} else {
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
@@ -114,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
} while (i < m);
//printf("\n");
posY += n_active;
js += n_active;
pn = svwhilelt_b64(js, n);
@@ -48,8 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
BLASLONG i, js;
BLASLONG X;
//printf("Using trmm_lt.\n");
int sve_len = svcntd();
FLOAT *ao;
@@ -67,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
i = 0;
/* svbool_t pm = svwhilelt_b64(i, m); */
/* int m_active = svcntp_b64(svptrue_b64(), pm); */
do
{
if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
if (X > posY) {
ao ++;
b += n_active;
X ++;
@@ -85,6 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
X ++;
i ++;
} else {
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
@@ -114,8 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
} while (i < m);
//printf("\n");
posY += n_active;
js += n_active;
@@ -47,10 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
BLASLONG i, js;
BLASLONG X;
//printf("Using trmm_un.\n");
//printf("Using m %ld, n %ld.\n", m, n);
//printf("Using lda %ld.\n", lda);
//printf("Using posX %ld, posY %ld.\n", posX, posY);
int sve_len = svcntd();
svint64_t index = svindex_s64(0LL, lda);
@@ -70,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
i = 0;
/* svbool_t pm = svwhilelt_b64(i, m); */
/* int m_active = svcntp_b64(svptrue_b64(), pm); */
do
{
if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
if (X < posY) {
svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
svst1(pn, b, aj_vec);
ao ++;
@@ -88,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
X ++;
i ++;
} else {
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
@@ -117,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
} while (i < m);
//printf("\n");
posY += n_active;
js += n_active;
pn = svwhilelt_b64(js, n);
@@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
BLASLONG i, js;
BLASLONG X;
//printf("Using trmm_ut.\n");
int sve_len = svcntd();
@@ -66,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
i = 0;
/* svbool_t pm = svwhilelt_b64(i, m); */
/* int m_active = svcntp_b64(svptrue_b64(), pm); */
do
{
if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
if (X < posY) {
ao ++;
b += n_active;
X ++;
@@ -83,7 +80,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
b += n_active;
X ++;
i ++;
} else {
} else {
/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
#ifdef UNIT
int temp = 0;
for (int j = 0; j < n_active; j++) {
@@ -113,9 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
}
} while (i < m);
//printf("\n");
posY += n_active;
js += n_active;
pn = svwhilelt_b64(js, n); | |||