@@ -143,7 +143,7 @@ endif
 SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-DGEMMKERNEL = dgemm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
+DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
 DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
 DGEMMINCOPY = dgemm_ncopy_sve_v1.c
@@ -54,7 +54,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define alpha0 d10
 #define alphaZ z2.d
-#define A_PRE_SIZE 2560
+#define A_PRE_SIZE 1536
 #define B_PRE_SIZE 512
 #define C_PRE_SIZE 128
@@ -134,7 +134,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNELv1x8_I
 ld1d z0.d, p1/z, [pA]
 ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
-//incb pA, all, mul #2
 add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
 ld1rd z8.d, p0/z, [pB]
@@ -476,13 +475,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ptrue p0.d // create true predicate
 mov pB, origPB
+// Loop over N
 mov counterJ, origN
 asr counterJ, counterJ, #3 // J = J / 8
 cmp counterJ, #0
 ble .Ldgemm_kernel_L4_BEGIN
 /******************************************************************************/
+/* Repeat this as long as there are 8 left in N */
 .align 5
 .Ldgemm_kernel_L8_BEGIN:
@@ -494,8 +494,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .Ldgemm_kernel_L8_Mv1_BEGIN:
+/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
 mov counterI, #0
-whilelt p1.d, counterI, origM //SVE instruction
+whilelt p1.d, counterI, origM
 cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
 .align 5
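
The whilelt/cntp pair above is the heart of the vector-length-agnostic M loop: the predicate covers whatever remains of M, so the final M % SVE_LEN rows need no scalar tail. A C-intrinsics sketch of the same idiom (the loop body and the function name are illustrative assumptions, not taken from the kernel):

    #include <arm_sve.h>

    /* Sketch of the whilelt/cntp idiom; process_rows is a hypothetical name. */
    void process_rows(int64_t m) {
        int64_t i = 0;
        svbool_t p1 = svwhilelt_b64(i, m);          /* lanes with i + lane < m are active */
        while (svptest_any(svptrue_b64(), p1)) {
            uint64_t lanes = svcntp_b64(svptrue_b64(), p1); /* active lane count */
            /* ... process 'lanes' rows; the last partial sweep is handled
               purely by predication, with no scalar remainder loop ... */
            i += (int64_t)lanes;
            p1 = svwhilelt_b64(i, m);
        }
    }
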
@@ -607,7 +608,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 bgt .Ldgemm_kernel_L8_BEGIN
 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 4 left in N */
 .align 5
 .Ldgemm_kernel_L4_BEGIN:
@@ -692,7 +693,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 add origPB, origPB, temp // B = B + K * 4 * 8
 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 2 left in N */
 .align 5
 .Ldgemm_kernel_L2_BEGIN:
@@ -773,7 +774,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 1 left in N */
 .align 5
 .Ldgemm_kernel_L1_BEGIN:
@@ -25,6 +25,11 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *******************************************************************************/
+/* This is an SVE dgemm kernel with size 2*SVE_LEN x 8.
+However, the data layout is the same as for the kernel 1*SVE_LEN x 8.
+This means that we sweep two panels of packed A when iterating in a loop over K.
+With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. */
 #define ASSEMBLER
 #include "common.h"
@@ -57,7 +62,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define alpha0 d10
 #define alphaZ z7.d
-#define A_PRE_SIZE 2560
+#define A_PRE_SIZE 1536
 #define B_PRE_SIZE 512
 #define C_PRE_SIZE 128
@@ -96,8 +101,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //v00 ALPHA -> pA10_0
 //v01 pA10_1
-//v02
-//v03
+//v02 pA20_0
+//v03 pA20_1
 //v04
 //v05
 //v06
@@ -118,6 +123,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //v21 must save C5
 //v22 must save C6
 //v23 must save C7
+//v24 must save C8
+//v25 must save C9
+//v26 must save C10
+//v27 must save C11
+//v28 must save C12
+//v29 must save C13
+//v30 must save C14
+//v31 must save C15
 /*******************************************************************************
 * Macro definitions
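
The extended register map reflects the budget that makes a 2*SVE_LEN x 8 tile fit at all: 2 vectors in M times 8 columns in N gives 2 x 8 = 16 accumulators (C0 through C15, occupying v16-v31), while v0-v3 hold the two double-buffered A panels (pA10_0/1 and pA20_0/1) and the registers from v8 up carry the broadcast B values loaded with ld1rd.
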
@@ -583,7 +596,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNELv1x8_I
 ld1d z0.d, p1/z, [pA1]
 ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // next one
-//incb pA1, all, mul #2
 add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8
 ld1rd z8.d, p0/z, [pB]
@@ -928,13 +940,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ptrue p0.d // create true predicate
 mov pB, origPB
+// Loop over N
 mov counterJ, origN
 asr counterJ, counterJ, #3 // J = J / 8
 cmp counterJ, #0
 ble .Ldgemm_kernel_L4_BEGIN
 /******************************************************************************/
+/* Repeat this as long as there are 8 left in N */
 .align 5
 .Ldgemm_kernel_L8_BEGIN:
@@ -947,11 +960,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .Ldgemm_kernel_L8_Mv2_BEGIN:
 mov counterI, #0
-cmp origM, vec_lenx2
+cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN
 blt .Ldgemm_kernel_L8_Mv1_BEGIN
 mov counterI, origM
+/* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */
 mul temp, vec_len, origK // generate address of pA2
 add pA2, pA1, temp, lsl #3 // pA1 = start of A array
 prfm PLDL1KEEP, [pA2]
@@ -1063,7 +1077,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 cmp counterI, origM
 beq .Ldgemm_kernel_L8_END
-//////////////////////////////////
+//////////////////////////////////////////
+// We have less than 2*SVE_LEN left. We do this with V1x8 kernel.
 .Ldgemm_kernel_L8_Mv1_BEGIN:
 whilelt p1.d, counterI, origM //SVE instruction
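
Taken together, the labels above implement the following M traversal, shown here as a hedged C-style outline (names are illustrative, not the kernel's):

    #include <arm_sve.h>

    /* Outline only: full 2*SVE_LEN-wide tiles first, then one predicated
       sweep at a time for the remainder. */
    void m_loop(int64_t m) {
        int64_t sve_len = (int64_t)svcntd();
        int64_t i = 0;
        while (m - i >= 2 * sve_len) {
            /* V2x8 kernel: two panels of A, 16 accumulators */
            i += 2 * sve_len;
        }
        while (i < m) {
            svbool_t p1 = svwhilelt_b64(i, m);
            int64_t lanes = (int64_t)svcntp_b64(svptrue_b64(), p1);
            /* V1x8 kernel: the tail is handled by the predicate */
            i += lanes;
        }
    }
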
@@ -1178,7 +1193,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 bgt .Ldgemm_kernel_L8_BEGIN
 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 4 left in N */
 .align 5
 .Ldgemm_kernel_L4_BEGIN:
@@ -1270,6 +1285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 beq .Ldgemm_kernel_L4_END
 //////////////////////////////////
+// We have less than 2*SVE_LEN left. We do this with V1x4 kernel.
 .Ldgemm_kernel_L4_Mv1_BEGIN:
 whilelt p1.d, counterI, origM //SVE instruction
@@ -1338,7 +1354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 add origPB, origPB, temp // B = B + K * 4 * 8
 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 2 left in N */
 .align 5
 .Ldgemm_kernel_L2_BEGIN:
@@ -1428,6 +1444,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //////////////////////////////////
+// We have less than 2*SVE_LEN left. We do this with V1x2 kernel.
 .Ldgemm_kernel_L2_Mv1_BEGIN:
 whilelt p1.d, counterI, origM //SVE instruction
@@ -1493,7 +1510,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8
 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 1 left in N */
 .align 5
 .Ldgemm_kernel_L1_BEGIN:
@@ -1581,6 +1598,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 //////////////////////////////////
+// We have less than 2*SVE_LEN left. We do this with V1x1 kernel.
 .Ldgemm_kernel_L1_Mv1_BEGIN:
 whilelt p1.d, counterI, origM //SVE instruction
@@ -40,40 +40,40 @@
 #include "common.h"
 #include <arm_sve.h>
-// TODO: write in assembly with proper unrolling
+// TODO: write in assembly with proper unrolling of inner loop
 int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
   BLASLONG j;
   IFLOAT *aoffset, *aoffset1, *boffset;
   svint64_t lda_vec = svindex_s64(0LL, lda);
   uint64_t sve_size = svcntd();
   aoffset = a;
   boffset = b;
   j = 0;
   svbool_t pg = svwhilelt_b64(j, n);
   uint64_t active = svcntp_b64(svptrue_b64(), pg);
   do {
     aoffset1 = aoffset;
     uint64_t i_cnt = m;
     while (i_cnt--) {
       svfloat64_t a_vec = svld1_gather_index(pg, (double *) aoffset1, lda_vec);
       svst1_f64(pg, (double *) boffset, a_vec);
       aoffset1++;
       boffset += active;
     }
     aoffset += sve_size * lda;
     j += svcntd();
     pg = svwhilelt_b64(j, n);
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));
   return 0;
 }
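
For reference, svindex_s64(0LL, lda) builds the offset vector {0, lda, 2*lda, ...}, so each active lane of svld1_gather_index reads aoffset1[lane * lda], i.e. one element from each of `active` consecutive columns of A. A toy, self-contained illustration (sizes and names are assumptions; two lanes are used so it holds even at the minimum 128-bit vector length):

    #include <arm_sve.h>
    #include <stdio.h>

    int main(void) {
        double src[16];
        for (int k = 0; k < 16; k++) src[k] = (double)k;
        int64_t lda = 4;
        svint64_t lda_vec = svindex_s64(0LL, lda);    /* {0, 4, 8, ...}  */
        int64_t j = 0, n = 2;
        svbool_t pg = svwhilelt_b64(j, n);            /* two active lanes */
        svfloat64_t v = svld1_gather_index(pg, src, lda_vec);
        double out[2] = {0};
        svst1_f64(pg, out, v);                        /* predicated store */
        printf("%g %g\n", out[0], out[1]);            /* prints: 0 4      */
        return 0;
    }
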
@@ -40,38 +40,38 @@
 #include "common.h"
 #include <arm_sve.h>
-// TODO: write in assembly with proper unrolling
+// TODO: write in assembly with proper unrolling of inner loop
 int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){
   BLASLONG j;
   IFLOAT *aoffset, *aoffset1, *boffset;
   uint64_t sve_size = svcntd();
   aoffset = a;
   boffset = b;
   j = 0;
   svbool_t pg = svwhilelt_b64(j, n);
   uint64_t active = svcntp_b64(svptrue_b64(), pg);
   do {
     aoffset1 = aoffset;
     uint64_t i_cnt = m;
     while (i_cnt--) {
       svfloat64_t a_vec = svld1(pg, (double *)aoffset1);
       svst1_f64(pg, (double *) boffset, a_vec);
       aoffset1 += lda;
       boffset += active;
     }
     aoffset += sve_size;
     j += svcntd();
     pg = svwhilelt_b64(j, n);
     active = svcntp_b64(svptrue_b64(), pg);
   } while (svptest_any(svptrue_b64(), pg));
   return 0;
 }
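
This transposed variant differs from the previous routine only in which direction of A is contiguous in memory: here a plain predicated svld1 suffices and the pointer hops by lda between iterations, while the other routine needed a gather. A minimal side-by-side sketch (helper names are hypothetical):

    #include <arm_sve.h>

    /* ncopy path: stride lda across columns via a gather */
    static inline svfloat64_t load_ncopy(svbool_t pg, const double *a, int64_t lda) {
        return svld1_gather_index(pg, a, svindex_s64(0LL, lda));
    }
    /* tcopy path: one contiguous, predicated load */
    static inline svfloat64_t load_tcopy(svbool_t pg, const double *a) {
        return svld1(pg, a);
    }
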
@@ -58,7 +58,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define alpha0 d10
 #define alphaZ z2.d
-#define A_PRE_SIZE 2560
+#define A_PRE_SIZE 1536
 #define B_PRE_SIZE 512
 #define C_PRE_SIZE 128
@@ -138,7 +138,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .macro KERNELv1x8_I
 ld1d z0.d, p1/z, [pA]
 ld1d z1.d, p1/z, [pA, lanes, lsl #3] // next one
-//incb pA, all, mul #2
 add pA, pA, lanes, lsl #4 // pA = pA + lanes * 2 * 8
 ld1rd z8.d, p0/z, [pB]
@@ -469,13 +468,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 mov pB, origPB
+// Loop over N
 mov counterJ, origN
 asr counterJ, counterJ, #3 // J = J / 8
 cmp counterJ, #0
 ble .Ldtrmm_kernel_L4_BEGIN
 /******************************************************************************/
+/* Repeat this as long as there are 8 left in N */
 .align 5
 .Ldtrmm_kernel_L8_BEGIN:
@@ -491,9 +491,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .Ldtrmm_kernel_L8_Mv1_BEGIN:
+/* Loop over M is done in an SVE fashion. This has the benefit of the last M%SVE_LEN iterations being done in a single sweep */
 mov counterI, #0
-whilelt p1.d, counterI, origM //SVE instruction
-cntp lanes, p0, p1.d
+whilelt p1.d, counterI, origM
+cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension
 .align 5
 .Ldtrmm_kernel_L8_Mv1_20:
@@ -641,7 +642,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 bgt .Ldtrmm_kernel_L8_BEGIN
 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 4 left in N */
 .align 5
 .Ldtrmm_kernel_L4_BEGIN:
@@ -757,7 +758,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 2 left in N */
 .align 5
 .Ldtrmm_kernel_L2_BEGIN:
@@ -873,7 +874,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #endif
 /******************************************************************************/
-/******************************************************************************/
+/* Repeat the same thing if 1 left in N */
 .align 5
 .Ldtrmm_kernel_L1_BEGIN:
@@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 BLASLONG i, js;
 BLASLONG X;
-//printf("Using trmm_ln.\n");
 int sve_len = svcntd();
 svint64_t index = svindex_s64(0LL, lda);
@@ -67,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 }
 i = 0;
-/* svbool_t pm = svwhilelt_b64(i, m); */
-/* int m_active = svcntp_b64(svptrue_b64(), pm); */
 do
 {
-if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+if (X > posY) {
 svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
 svst1(pn, b, aj_vec);
 ao ++;
@@ -85,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 X ++;
 i ++;
 } else {
+/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
 #ifdef UNIT
 int temp = 0;
 for (int j = 0; j < n_active; j++) {
@@ -114,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 }
 } while (i < m);
-//printf("\n");
 posY += n_active;
 js += n_active;
 pn = svwhilelt_b64(js, n);
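
Restated as a hedged sketch, the branch this hunk annotates classifies each vector-wide block of columns against the diagonal (names follow the source; copy_block is a hypothetical wrapper, and the diagonal case is elided because most of it lies outside this diff):

    #include <arm_sve.h>

    static double *copy_block(svbool_t pn, int64_t X, int64_t posY,
                              double *ao, double *b, int64_t n_active,
                              svint64_t index) {
        if (X > posY) {
            /* Strictly inside the stored triangle: dense copy, gathering one
               element from each of the n_active columns. */
            svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
            svst1(pn, b, aj_vec);
        } else {
            /* Block touching the diagonal: scalar loop with UNIT handling,
               which resists vector-length-agnostic unrolling (see comment). */
        }
        return b + n_active;
    }
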
@@ -48,8 +48,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 BLASLONG i, js;
 BLASLONG X;
-//printf("Using trmm_lt.\n");
 int sve_len = svcntd();
 FLOAT *ao;
@@ -67,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 }
 i = 0;
-/* svbool_t pm = svwhilelt_b64(i, m); */
-/* int m_active = svcntp_b64(svptrue_b64(), pm); */
 do
 {
-if (X > posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+if (X > posY) {
 ao ++;
 b += n_active;
 X ++;
@@ -85,6 +81,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 X ++;
 i ++;
 } else {
+/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
 #ifdef UNIT
 int temp = 0;
 for (int j = 0; j < n_active; j++) {
@@ -114,8 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 }
 } while (i < m);
-//printf("\n");
 posY += n_active;
 js += n_active;
@@ -47,10 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 BLASLONG i, js;
 BLASLONG X;
-//printf("Using trmm_un.\n");
-//printf("Using m %ld, n %ld.\n", m, n);
-//printf("Using lda %ld.\n", lda);
-//printf("Using posX %ld, posY %ld.\n", posX, posY);
 int sve_len = svcntd();
 svint64_t index = svindex_s64(0LL, lda);
@@ -70,11 +66,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 }
 i = 0;
-/* svbool_t pm = svwhilelt_b64(i, m); */
-/* int m_active = svcntp_b64(svptrue_b64(), pm); */
 do
 {
-if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+if (X < posY) {
 svfloat64_t aj_vec = svld1_gather_index(pn, ao, index);
 svst1(pn, b, aj_vec);
 ao ++;
@@ -88,6 +82,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 X ++;
 i ++;
 } else {
+/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
 #ifdef UNIT
 int temp = 0;
 for (int j = 0; j < n_active; j++) {
@@ -117,9 +112,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 }
 } while (i < m);
-//printf("\n");
 posY += n_active;
 js += n_active;
 pn = svwhilelt_b64(js, n);
@@ -47,7 +47,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 BLASLONG i, js;
 BLASLONG X;
-//printf("Using trmm_ut.\n");
 int sve_len = svcntd();
@@ -66,11 +65,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 }
 i = 0;
-/* svbool_t pm = svwhilelt_b64(i, m); */
-/* int m_active = svcntp_b64(svptrue_b64(), pm); */
 do
 {
-if (X < posY) { // optimize this: unroll over DGEMM_UNROLL_M: vl
+if (X < posY) {
 ao ++;
 b += n_active;
 X ++;
@@ -83,7 +80,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 b += n_active;
 X ++;
 i ++;
-} else {
+} else {
+/* I did not find a way to unroll this while preserving vector-length-agnostic code. */
 #ifdef UNIT
 int temp = 0;
 for (int j = 0; j < n_active; j++) {
@@ -113,9 +111,6 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, BLASLONG posX, BLASLON
 }
 } while (i < m);
-//printf("\n");
 posY += n_active;
 js += n_active;
 pn = svwhilelt_b64(js, n);