@@ -46,19 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pA x15
#define temp x16
#define tempOffset x17
#define tempK x18
#define pCRow3 x15
#define pA x16
#define alpha x17
#define temp x18
#define tempOffset x19
#define tempK x20
#define alpha0 d10
#define alphaV0 v10.d[0]
#define alpha1 d11
#define alphaV1 v11.d[0]
#define alpha2 d14
#define alphaV2 v14.d[0]
#define alpha3 d15
#define alphaV3 v15.d[0]
#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128
// 00 origM
// 01 origN
@@ -101,14 +101,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//v05 pA1_2, pA1_3
//v06 pA1_4, pA1_5
//v07 pA1_6, pA1_7
//v08 must save pB0_0, pB0_1
//v09 must save pB0_2, pB0_3
//v10 must save ALPHA0
//v11 must save ALPHA1
//v12 must save pB1_0, pB1_1
//v13 must save pB1_2, pB1_3
//v14 must save ALPHA2
//v15 must save ALPHA3
//v08 must save pB0_0
//v09 must save pB0_1
//v10 must save pB0_2 --> ALPHA0
//v11 must save pB0_3
//v12 must save pB1_0
//v13 must save pB1_1
//v14 must save pB1_2
//v15 must save pB1_3
//v16 must save C00, C01
//v17 must save C02, C03
//v18 C04, C05
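The register map above captures the heart of this change: the four vector copies of alpha (v10/v11/v14/v15) are gone, alpha is parked in the general-purpose register x17 and moved into d10 only inside the SAVE macros, and the freed vector registers hold packed pB values instead. Everything downstream leans on lane-indexed FMAs; as a reading aid, here is a minimal C sketch (names illustrative, not from the patch) of what `fmla Vd.2d, Vn.2d, Vm.d[0]` computes:

```c
#include <stdio.h>

/* Both accumulator lanes are updated with the same scalar taken from
 * one lane of the B register -- this is why a single d-register load
 * (ldp d8, d9, ...) can feed a whole column of the tile. */
static void fmla_2d_by_element(double acc[2], const double a[2], double b_lane)
{
    acc[0] += a[0] * b_lane;
    acc[1] += a[1] * b_lane;
}

int main(void)
{
    double v16[2] = {0.0, 0.0};        /* accumulator, e.g. C00, C01 */
    double v0[2]  = {1.0, 2.0};        /* two packed A values        */
    fmla_2d_by_element(v16, v0, 3.0);  /* v8.d[0] = 3.0              */
    printf("%g %g\n", v16[0], v16[1]); /* prints: 3 6                */
    return 0;
}
```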
@@ -150,186 +150,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro KERNEL8x4_I
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld1 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
ldp q0, q1, [pA], #32
ldp d8, d9, [pB], #16
fmul v16.2d, v0.2d, v8.d[0]
fmul v20.2d, v0.2d, v9.d[0]
ldp d10, d11, [pB], #16
fmul v17.2d, v1.2d, v8.d[0]
fmul v21.2d, v1.2d, v9.d[0]
ldp q2, q3, [pA], #32
fmul v24.2d, v0.2d, v10.d[0]
fmul v28.2d, v0.2d, v11.d[0]
ldp q4, q5, [pA], #32
fmul v25.2d, v1.2d, v10.d[0]
fmul v29.2d, v1.2d, v11.d[0]
ldp d12, d13, [pB], #16
fmul v18.2d, v2.2d, v8.d[0]
fmul v19.2d, v3.2d, v8.d[0]
fmul v22.2d, v2.2d, v9.d[0]
fmul v20.2d, v0.2d, v8.d[1]
fmul v21.2d, v1.2d, v8.d[1]
fmul v22.2d, v2.2d, v8.d[1]
fmul v23.2d, v3.2d, v8.d[1]
ldp d14, d15, [pB], #16
fmul v24.2d, v0.2d, v9.d[0]
fmul v25.2d, v1.2d, v9.d[0]
fmul v26.2d, v2.2d, v9.d[0]
fmul v27.2d, v3.2d, v9.d[0]
fmul v26.2d, v2.2d, v10.d[0]
fmul v30.2d, v2.2d, v11.d[0]
fmul v28.2d, v0.2d, v9.d[1]
fmul v29.2d, v1.2d, v9.d[1]
fmul v30.2d, v2.2d, v9.d[1]
fmul v31.2d, v3.2d, v9.d[1]
ldp q6, q7, [pA], #32
ld1 {v4.2d, v5.2d}, [pA]
add pA, pA, #32
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
ld1 {v6.2d, v7.2d}, [pA]
add pA, pA, #32
fmul v19.2d, v3.2d, v8.d[0]
fmul v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v31.2d, v3.2d, v11.d[0]
fmul v23.2d, v3.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
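KERNEL8x4_I performs the first k-step of the 8x4 tile: it loads A with paired q-register loads and B with paired d-register loads, then fills all sixteen accumulators v16-v31 with fmul (rather than fmla), so no zeroing pass is needed. A hedged C reference for one such step, with an assumed row-major acc[8][4] layout:

```c
/* One k-step of the 8x4 DGEMM/DTRMM microkernel: a rank-1 update of
 * the accumulator tile from 8 packed A values and 4 packed B values.
 * first != 0 models the _I macro (fmul starts the tile fresh). */
static void kernel8x4_step(double acc[8][4], const double a[8],
                           const double b[4], int first)
{
    for (int j = 0; j < 4; j++)
        for (int i = 0; i < 8; i++)
            acc[i][j] = first ? a[i] * b[j]
                              : acc[i][j] + a[i] * b[j];
}
```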
.macro KERNEL8x4_M1
fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v9.d[0]
ldp q4, q5, [pA], #32
fmla v24.2d, v0.2d, v10.d[0]
fmla v28.2d, v0.2d, v11.d[0]
ldp d12, d13, [pB], #16
fmla v17.2d, v1.2d, v8.d[0]
fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.d[0]
fmla v25.2d, v1.2d, v10.d[0]
fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.d[1]
fmla v23.2d, v3.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
fmla v24.2d, v0.2d, v9.d[0]
fmla v25.2d, v1.2d, v9.d[0]
fmla v26.2d, v2.2d, v9.d[0]
fmla v27.2d, v3.2d, v9.d[0]
fmla v21.2d, v1.2d, v9.d[0]
fmla v29.2d, v1.2d, v11.d[0]
fmla v28.2d, v0.2d, v9.d[1]
fmla v29.2d, v1.2d, v9.d[1]
fmla v30.2d, v2.2d, v9.d[1]
fmla v31.2d, v3.2d, v9.d[1]
ldp d14, d15, [pB], #16
ld1 {v4.2d, v5.2d}, [pA]
add pA, pA, #32
ld1 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
ld1 {v6.2d, v7.2d}, [pA]
add pA, pA, #32
fmla v18.2d, v2.2d, v8.d[0]
fmla v22.2d, v2.2d, v9.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v26.2d, v2.2d, v10.d[0]
fmla v30.2d, v2.2d, v11.d[0]
fmla v19.2d, v3.2d, v8.d[0]
fmla v23.2d, v3.2d, v9.d[0]
ldp q6, q7, [pA], #32
prfm PLDL1KEEP, [pA, #512]
fmla v27.2d, v3.2d, v10.d[0]
fmla v31.2d, v3.2d, v11.d[0]
.endm
.macro KERNEL8x4_M2
fmla v16.2d, v4.2d, v12.d[0]
fmla v20.2d, v4.2d, v13.d[0]
fmla v24.2d, v4.2d, v14.d[0]
fmla v28.2d, v4.2d, v15.d[0]
ldp q0, q1, [pA], #32
fmla v17.2d, v5.2d, v12.d[0]
fmla v25.2d, v5.2d, v14.d[0]
ldp d8, d9, [pB], #16
fmla v21.2d, v5.2d, v13.d[0]
fmla v29.2d, v5.2d, v15.d[0]
ldp d10, d11, [pB], #16
fmla v18.2d, v6.2d, v12.d[0]
fmla v19.2d, v7.2d, v12.d[0]
fmla v22.2d, v6.2d, v13.d[0]
fmla v20.2d, v4.2d, v12.d[1]
fmla v21.2d, v5.2d, v12.d[1]
fmla v22.2d, v6.2d, v12.d[1]
fmla v23.2d, v7.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v24.2d, v4.2d, v13.d[0]
fmla v25.2d, v5.2d, v13.d[0]
fmla v26.2d, v6.2d, v13.d[0]
fmla v27.2d, v7.2d, v13.d[0]
fmla v26.2d, v6.2d, v14.d[0]
fmla v30.2d, v6.2d, v15.d[0]
fmla v28.2d, v4.2d, v13.d[1]
fmla v29.2d, v5.2d, v13.d[1]
fmla v30.2d, v6.2d, v13.d[1]
fmla v31.2d, v7.2d, v13.d[1]
fmla v19.2d, v7.2d, v12.d[0]
fmla v23.2d, v7.2d, v13.d[0]
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld1 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
ldp q2, q3, [pA], #32
prfm PLDL1KEEP, [pB, #512]
fmla v27.2d, v7.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.d[0]
.endm
.macro KERNEL8x4_E
fmla v16.2d, v4.2d, v12.d[0]
fmla v20.2d, v4.2d, v13.d[0]
fmla v24.2d, v4.2d, v14.d[0]
fmla v28.2d, v4.2d, v15.d[0]
fmla v17.2d, v5.2d, v12.d[0]
fmla v18.2d, v6.2d, v12.d[0]
fmla v19.2d, v7.2d, v12.d[0]
fmla v25.2d, v5.2d, v14.d[0]
fmla v21.2d, v5.2d, v13.d[0]
fmla v29.2d, v5.2d, v15.d[0]
fmla v20.2d, v4.2d, v12.d[1]
fmla v21.2d, v5.2d, v12.d[1]
fmla v22.2d, v6.2d, v12.d[1]
fmla v23.2d, v7.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v24.2d, v4.2d, v13.d[0]
fmla v25.2d, v5.2d, v13.d[0]
fmla v26.2d, v6.2d, v13.d[0]
fmla v27.2d, v7.2d, v13.d[0]
fmla v18.2d, v6.2d, v12.d[0]
fmla v22.2d, v6.2d, v13.d[0]
fmla v26.2d, v6.2d, v14.d[0]
fmla v30.2d, v6.2d, v15.d[0]
fmla v28.2d, v4.2d, v13.d[1]
fmla v29.2d, v5.2d, v13.d[1]
fmla v30.2d, v6.2d, v13.d[1]
fmla v31.2d, v7.2d, v13.d[1]
fmla v19.2d, v7.2d, v12.d[0]
fmla v23.2d, v7.2d, v13.d[0]
fmla v27.2d, v7.2d, v14.d[0]
fmla v31.2d, v7.2d, v15.d[0]
.endm
.macro KERNEL8x4_SUB
ld1 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld1 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld1 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
ldp q0, q1, [pA], #32
ldp d8, d9, [pB], #16
fmla v16.2d, v0.2d, v8.d[0]
fmla v20.2d, v0.2d, v9.d[0]
ldp d10, d11, [pB], #16
fmla v17.2d, v1.2d, v8.d[0]
fmla v21.2d, v1.2d, v9.d[0]
ldp q2, q3, [pA], #32
fmla v24.2d, v0.2d, v10.d[0]
fmla v28.2d, v0.2d, v11.d[0]
fmla v25.2d, v1.2d, v10.d[0]
fmla v29.2d, v1.2d, v11.d[0]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmla v18.2d, v2.2d, v8.d[0]
fmla v19.2d, v3.2d, v8.d[0]
fmla v22.2d, v2.2d, v9.d[0]
fmla v20.2d, v0.2d, v8.d[1]
fmla v21.2d, v1.2d, v8.d[1]
fmla v22.2d, v2.2d, v8.d[1]
fmla v23.2d, v3.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
fmla v24.2d, v0.2d, v9.d[0]
fmla v25.2d, v1.2d, v9.d[0]
fmla v26.2d, v2.2d, v9.d[0]
fmla v27.2d, v3.2d, v9.d[0]
fmla v26.2d, v2.2d, v10.d[0]
fmla v30.2d, v2.2d, v11.d[0]
fmla v28.2d, v0.2d, v9.d[1]
fmla v29.2d, v1.2d, v9.d[1]
fmla v30.2d, v2.2d, v9.d[1]
fmla v31.2d, v3.2d, v9.d[1]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
fmla v19.2d, v3.2d, v8.d[0]
fmla v27.2d, v3.2d, v10.d[0]
fmla v31.2d, v3.2d, v11.d[0]
fmla v23.2d, v3.2d, v9.d[0]
.endm
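KERNEL8x4_SUB is the self-contained single-step variant used for the K-remainder: unlike M1/M2 it both loads and consumes the same register set, so it carries no cross-iteration state. A sketch under the same assumed layout, with the pointer advances the `ldp ..., #32/#16` post-increments imply:

```c
/* One _SUB invocation: load 8 A values and 4 B values, accumulate,
 * and advance both panel pointers (pA by 64 bytes, pB by 32 bytes). */
static void kernel8x4_sub(double acc[8][4],
                          const double **pA, const double **pB)
{
    const double *a = *pA, *b = *pB;
    for (int j = 0; j < 4; j++)
        for (int i = 0; i < 8; i++)
            acc[i][j] += a[i] * b[j];
    *pA += 8;
    *pB += 4;
}
```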
.macro SAVE8x4
add pCRow1, pCRow0, LDC
fmov alpha0, alpha
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v0.2d, v16.2d, alphaV0
fmul v1.2d, v17.2d, alphaV1
fmul v2.2d, v18.2d, alphaV2
fmul v3.2d, v19.2d, alphaV3
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmul v1.2d, v17.2d, alphaV0
stp q0, q1, [pCRow0]
add pCRow2, pCRow1, LDC
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v2.2d, v18.2d, alphaV0
fmul v3.2d, v19.2d, alphaV0
stp q2, q3, [pCRow0]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
fmul v4.2d, v20.2d, alphaV0
fmul v5.2d, v21.2d, alphaV1
fmul v6.2d, v22.2d, alphaV2
fmul v7.2d, v23.2d, alphaV3
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
fmul v5.2d, v21.2d, alphaV0
stp q4, q5, [pCRow1]
add pCRow1, pCRow2, LDC
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
fmul v6.2d, v22.2d, alphaV0
fmul v7.2d, v23.2d, alphaV0
stp q6, q7, [pCRow1]
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
fmul v0.2d, v24.2d, alphaV0
fmul v1.2d, v25.2d, alphaV1
fmul v2.2d, v26.2d, alphaV2
fmul v3.2d, v27.2d, alphaV3
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2]
fmul v1.2d, v25.2d, alphaV0
stp q0, q1, [pCRow2]
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
fmul v2.2d, v26.2d, alphaV0
fmul v3.2d, v27.2d, alphaV0
stp q2, q3, [pCRow2]
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v4.2d, v28.2d, alphaV0
fmul v5.2d, v29.2d, alphaV1
fmul v6.2d, v30.2d, alphaV2
fmul v7.2d, v31.2d, alphaV3
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
fmul v5.2d, v29.2d, alphaV0
stp q4, q5, [pCRow3]
add pCRow0, pCRow0, #64
add pCRow3, pCRow3, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v6.2d, v30.2d, alphaV0
fmul v7.2d, v31.2d, alphaV0
stp q6, q7, [pCRow3]
add pCRow3, pCRow3, #32
.endm
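SAVE8x4 first copies alpha from x17 into d10 (`fmov alpha0, alpha`), then scales each accumulator pair with fmul and stores it with stp; there is no load of the old C tile because TRMM overwrites its output. The interleaved `add`/`prfm` lines walk the four precomputed row pointers and prefetch the next C lines into L2. A hedged C equivalent of the whole save:

```c
#include <stddef.h>

/* row[0..3] stand for pCRow0..pCRow3; the assembly pre-scales LDC to
 * bytes with "lsl LDC, LDC, #3", so the pointers already point at the
 * right rows here. TRMM semantics: overwrite, no load/accumulate. */
static void save8x4(double *row[4], const double acc[8][4], double alpha)
{
    for (size_t j = 0; j < 4; j++)
        for (size_t i = 0; i < 8; i++)
            row[j][i] = alpha * acc[i][j];   /* fmul + stp, no C load */
}
```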
/******************************************************************************/
@@ -365,26 +428,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
fmul v9.2d, v17.2d, alphaV1
fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2d, v20.2d, alphaV2
fmul v13.2d, v21.2d, alphaV3
fmul v12.2d, v20.2d, alphaV0
fmul v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
fmul v8.2d, v24.2d, alphaV0
fmul v9.2d, v25.2d, alphaV1
fmul v9.2d, v25.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
fmul v12.2d, v28.2d, alphaV2
fmul v13.2d, v29.2d, alphaV3
fmul v12.2d, v28.2d, alphaV0
fmul v13.2d, v29.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -413,22 +477,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2d, v20.2d, alphaV1
fmul v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow2, pCRow1, LDC
fmul v8.2d, v24.2d, alphaV2
fmul v8.2d, v24.2d, alphaV0
st1 {v8.2d}, [pCRow2]
add pCRow1, pCRow2, LDC
fmul v12.2d, v28.2d, alphaV3
fmul v12.2d, v28.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -453,6 +518,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
fmov alpha0, alpha
add pCRow1, pCRow0, LDC
fmul v8.2d, v16.2d, alphaV0
@@ -462,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
add pCRow2, pCRow1, LDC
add pCRow1, pCRow2, LDC
fmul v12.2d, v20.2d, alphaV1
fmul v12.2d, v20.2d, alphaV0
st1 {v12.d}[0], [pCRow2]
st1 {v12.d}[1], [pCRow1]
@@ -502,18 +569,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x2
fmov alpha0, alpha
add pCRow1, pCRow0, LDC
fmul v0.2d, v16.2d, alphaV0
fmul v1.2d, v17.2d, alphaV1
fmul v2.2d, v18.2d, alphaV2
fmul v3.2d, v19.2d, alphaV3
fmul v1.2d, v17.2d, alphaV0
fmul v2.2d, v18.2d, alphaV0
fmul v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
fmul v4.2d, v20.2d, alphaV0
fmul v5.2d, v21.2d, alphaV1
fmul v6.2d, v22.2d, alphaV2
fmul v7.2d, v23.2d, alphaV3
fmul v5.2d, v21.2d, alphaV0
fmul v6.2d, v22.2d, alphaV0
fmul v7.2d, v23.2d, alphaV0
st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1]
add pCRow0, pCRow0, #64
@@ -541,14 +609,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
fmul v9.2d, v17.2d, alphaV1
fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow1, pCRow0, LDC
fmul v12.2d, v20.2d, alphaV2
fmul v13.2d, v21.2d, alphaV3
fmul v12.2d, v20.2d, alphaV0
fmul v13.2d, v21.2d, alphaV0
st1 {v12.2d, v13.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -573,12 +642,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
add pCRow1 , pCRow0, LDC
fmul v12.2d, v20.2d, alphaV1
fmul v12.2d, v20.2d, alphaV0
st1 {v12.2d}, [pCRow1]
add pCRow0, pCRow0, #16
@@ -601,6 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
fmov alpha0, alpha
add pCRow1 , pCRow0, LDC
fmul v8.2d, v16.2d, alphaV0
@@ -636,10 +707,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE8x1
fmov alpha0, alpha
fmul v0.2d, v16.2d, alphaV0
fmul v1.2d, v17.2d, alphaV1
fmul v2.2d, v18.2d, alphaV2
fmul v3.2d, v19.2d, alphaV3
fmul v1.2d, v17.2d, alphaV0
fmul v2.2d, v18.2d, alphaV0
fmul v3.2d, v19.2d, alphaV0
st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #64
@@ -665,8 +737,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
fmul v9.2d, v17.2d, alphaV1
fmul v9.2d, v17.2d, alphaV0
st1 {v8.2d, v9.2d}, [pCRow0]
add pCRow0, pCRow0, #32
@@ -690,6 +763,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
fmov alpha0, alpha
fmul v8.2d, v16.2d, alphaV0
st1 {v8.2d}, [pCRow0]
@@ -713,6 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
fmov alpha0, alpha
fmul d8, d16, alpha0
str d8, [pCRow0]
@@ -739,10 +814,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
fmov alpha0, d0
fmov alpha1, d0
fmov alpha2, d0
fmov alpha3, d0
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alpha, d0
lsl LDC, LDC, #3 // ldc = ldc * 8
@@ -759,8 +834,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/******************************************************************************/
dtrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC
add pC, pCRow3, LDC
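The block above replaces the per-SAVE pointer arithmetic with a once-per-J-block setup: four C row pointers are derived from pC up front, and pC is bumped past the whole 4-column block. A sketch, assuming ldc is already scaled to the element stride:

```c
/* Precompute the four output row pointers for one 4-column block. */
static void setup_rows(double **pC, long ldc, double *row[4])
{
    row[0] = *pC;             /* mov pCRow0, pC          */
    row[1] = row[0] + ldc;    /* add pCRow1, pCRow0, LDC */
    row[2] = row[1] + ldc;    /* add pCRow2, pCRow1, LDC */
    row[3] = row[2] + ldc;    /* add pCRow3, pCRow2, LDC */
    *pC    = row[3] + ldc;    /* add pC, pCRow3, LDC     */
}
```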
#if defined(LEFT)
mov tempOffset, offset
@@ -774,6 +854,7 @@ dtrmm_kernel_L4_M8_BEGIN:
cmp counterI, #0
ble dtrmm_kernel_L4_M4_BEGIN
.align 5
dtrmm_kernel_L4_M8_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -794,40 +875,64 @@ dtrmm_kernel_L4_M8_20:
add tempK, tempOffset, #4
#endif
asr counterL , tempK, #1 // L = K / 2
asr counterL , tempK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M8_32
KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #2 // subtract 2
ble dtrmm_kernel_L4_M8_22a
.align 5
.align 5
dtrmm_kernel_L4_M8_22:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M8_22
.align 5
dtrmm_kernel_L4_M8_22a:
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b dtrmm_kernel_L4_M8_44
.align 5
dtrmm_kernel_L4_M8_32:
tst counterL, #1
ble dtrmm_kernel_L4_M8_40
KERNEL8x4_I
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_M2
KERNEL8x4_M1
KERNEL8x4_E
b dtrmm_kernel_L4_M8_44
@@ -838,13 +943,17 @@ dtrmm_kernel_L4_M8_40:
dtrmm_kernel_L4_M8_44:
ands counterL , tempK, #1
ands counterL , tempK, #7
ble dtrmm_kernel_L4_M8_100
.align 5
dtrmm_kernel_L4_M8_46:
KERNEL8x4_SUB
subs counterL, counterL, #1
bne dtrmm_kernel_L4_M8_46
dtrmm_kernel_L4_M8_100:
SAVE8x4
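The branch structure above implements the new K = 8*q + r decomposition: counterL now holds K/8, each pipelined unit covers eight k-steps, and the tail loop retires the remaining K & 7 steps with KERNEL8x4_SUB. A hedged C model of the dispatch (the kernel_* functions are empty stubs standing in for the macros):

```c
static void kernel_I(void)   {}  /* stubs standing in for the macros */
static void kernel_M1(void)  {}
static void kernel_M2(void)  {}
static void kernel_E(void)   {}
static void kernel_SUB(void) {}

static void run_k_loop(long K)
{
    long q = K >> 3;                       /* asr counterL, tempK, #3  */
    long r = K & 7;                        /* ands counterL, tempK, #7 */

    if (q >= 2) {
        kernel_I(); kernel_M2();           /* prime the pipeline       */
        for (int i = 0; i < 3; i++) { kernel_M1(); kernel_M2(); }
        for (long l = q - 2; l > 0; l--)   /* _22 steady-state loop    */
            for (int i = 0; i < 4; i++) { kernel_M1(); kernel_M2(); }
        for (int i = 0; i < 3; i++) { kernel_M1(); kernel_M2(); } /* _22a */
        kernel_M1(); kernel_E();           /* drain                    */
    } else if (q == 1) {                   /* _32 path                 */
        kernel_I(); kernel_M2();
        for (int i = 0; i < 2; i++) { kernel_M1(); kernel_M2(); }
        kernel_M1(); kernel_E();
    }
    while (r-- > 0)
        kernel_SUB();                      /* _46 remainder loop       */
}
```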
@@ -864,6 +973,9 @@ dtrmm_kernel_L4_M8_100:
#if defined(LEFT)
add tempOffset, tempOffset, #8
#endif
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
dtrmm_kernel_L4_M8_END:
subs counterI, counterI, #1
@@ -46,20 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pA x15
#define alpha_save_R x16
#define alpha_save_I x17
#define pCRow3 x15
#define pA x16
#define alphaR x17
#define alphaI x18
#define alpha0_R d10
#define alphaV0_R v10.d[0]
#define alpha0_I d11
#define alphaV0_I v11.d[0]
#define alpha1_R d14
#define alphaV1_R v14.d[0]
#define alpha1_I d15
#define alphaV1_I v15.d[0]
#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define OP_rr fmla
@@ -98,10 +97,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pA
// 16 alpha_save_R
// 17 alpha_save_I
// 18 must save
// 15 pCRow3
// 16 pA
// 17 alpha_save_R
// 18 must save alpha_save_I
// 19 must save
// 20 must save
// 21 must save
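For the complex kernels, the OP_rr/OP_ii/OP_ri/OP_ir aliases (selected by the #if ladder whose first case appears above) expand one complex multiply-accumulate into four real FMAs, with the signs of the imaginary-part terms flipped per conjugation case; in KERNEL4x4_I the conjugate cases additionally zero the target with `eor` and start with fmls, because the first contribution must enter negated. A hedged C reference of the scheme:

```c
/* One complex multiply-accumulate built from four real FMAs, as the
 * OP_* macros do. conj_a / conj_b model the NN/NR/RN/RR-style cases;
 * in the assembly the sign flips are folded into fmla-vs-fmls choices. */
static void cmadd(double *cr, double *ci,
                  double ar, double ai, double br, double bi,
                  int conj_a, int conj_b)
{
    if (conj_a) ai = -ai;
    if (conj_b) bi = -bi;
    *cr += ar * br;        /* OP_rr */
    *cr -= ai * bi;        /* OP_ii (fmls in the NN case) */
    *ci += ar * bi;        /* OP_ri */
    *ci += ai * br;        /* OP_ir */
}
```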
@@ -175,12 +174,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_I
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmul v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
@@ -193,16 +188,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.d[0]
#else
fmul v19.2d, v2.2d, v9.d[0]
#endif
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmul v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
@@ -215,6 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
fmul v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -226,6 +216,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.d[0]
#else
fmul v19.2d, v2.2d, v9.d[0]
#endif
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
fmul v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -237,6 +244,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v25.2d, v1.2d, v10.d[0]
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
fmul v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -248,6 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v27.2d, v3.2d, v10.d[0]
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
fmul v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -259,6 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v29.2d, v1.2d, v10.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -270,14 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v31.2d, v3.2d, v10.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
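Throughout the complex kernels, `ld2 {vX.2d, vY.2d}` de-interleaves two (re, im) pairs on the fly: one vector receives both real parts, the other both imaginary parts, which is what lets the OP_* FMAs work on whole lanes. A minimal sketch:

```c
/* What one "ld2 {vX.2d, vY.2d}, [p]" does for double-complex data. */
static void ld2_2d(const double *p, double re[2], double im[2])
{
    re[0] = p[0]; im[0] = p[1];   /* first complex element  */
    re[1] = p[2]; im[1] = p[3];   /* second complex element */
}
```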
.macro KERNEL4x4_M1
@@ -286,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
ld2 {v12.2d, v13.2d}, [pB] // For next round
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
@@ -294,15 +302,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v14.2d, v15.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v4.2d, v5.2d} , [pA] // For next round
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
OP_rr v22.2d, v2.2d, v8.d[1]
@@ -310,22 +318,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v6.2d, v7.2d} , [pA] // For next round
add pA, pA, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
@@ -344,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.d[0]
ld2 {v8.2d, v9.2d}, [pB] // For next round
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v6.2d, v12.d[0]
@@ -352,15 +360,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.d[0]
ld2 {v10.2d, v11.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
ld2 {v0.2d, v1.2d}, [pA] // For next round
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v22.2d, v6.2d, v12.d[1]
@@ -368,22 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.d[1]
ld2 {v2.2d, v3.2d}, [pA] // For next round
add pA, pA, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.d[1]
@@ -412,6 +420,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.d[1]
@@ -422,6 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
@@ -441,33 +453,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_SUB
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
@@ -490,74 +509,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
ld2 {v0.2d, v1.2d}, [pCRow1]
ld2 {v0.2d, v1.2d}, [pCRow0]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow0]
add pCRow0, pCRow0, #32
ld2 {v2.2d, v3.2d}, [pCRow0]
fmla v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmla v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
st2 {v2.2d, v3.2d}, [pCRow2]
fmla v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow1, pCRow1, LDC
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmla v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmla v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v6.2d, v7.2d}, [pCRow2]
add pCRow1, pCRow1, #32
ld2 {v6.2d, v7.2d}, [pCRow1]
fmla v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmla v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmla v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
ld2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
ld2 {v0.2d, v1.2d}, [pCRow2]
fmla v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmla v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmla v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow2]
add pCRow2, pCRow2, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v2.2d, v26.2d, alphaV0_R
fmls v2.2d, v27.2d, alphaV0_I
fmla v3.2d, v26.2d, alphaV1_I
fmla v3.2d, v27.2d, alphaV1_R
fmla v3.2d, v26.2d, alphaV0_I
fmla v3.2d, v27.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
ld2 {v4.2d, v5.2d}, [pCRow1]
ld2 {v4.2d, v5.2d}, [pCRow3]
fmla v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmla v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v6.2d, v7.2d}, [pCRow2]
fmla v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow3]
add pCRow3, pCRow3, #32
ld2 {v6.2d, v7.2d}, [pCRow3]
fmla v6.2d, v30.2d, alphaV0_R
fmls v6.2d, v31.2d, alphaV0_I
fmla v7.2d, v30.2d, alphaV1_I
fmla v7.2d, v31.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmla v7.2d, v30.2d, alphaV0_I
fmla v7.2d, v31.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow3]
add pCRow0, pCRow0, #64
add pCRow3, pCRow3, #32
.endm
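This ZGEMM save reads the existing C tile with ld2, folds in alpha*acc with the fmla/fmls pairs, and stores the result back while walking the four precomputed row pointers. Per element, the update is the usual complex scale-and-add; a hedged C reference:

```c
/* One C element of the ZGEMM SAVE4x4 update: C += alpha * acc,
 * expanded into the four real FMAs used above. */
static void zgemm_save_elem(double *c_re, double *c_im,
                            double acc_re, double acc_im,
                            double alpha_re, double alpha_im)
{
    *c_re += acc_re * alpha_re;   /* fmla v0.2d, v16.2d, alphaV0_R */
    *c_re -= acc_im * alpha_im;   /* fmls v0.2d, v17.2d, alphaV0_I */
    *c_im += acc_re * alpha_im;   /* fmla v1.2d, v16.2d, alphaV0_I */
    *c_im += acc_im * alpha_re;   /* fmla v1.2d, v17.2d, alphaV0_R */
}
```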
/******************************************************************************/
@@ -604,18 +634,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -623,8 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmla v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmla v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -632,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmla v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
fmla v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -641,8 +669,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmla v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
fmla v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -691,18 +719,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmla d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmla d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@@ -710,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.d, v5.d}[0], [pCRow1]
fmla d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmla d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmla d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@@ -719,8 +745,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d24, alphaV0_R
fmls d0, d25, alphaV0_I
fmla d1, d24, alphaV1_I
fmla d1, d25, alphaV1_R
fmla d1, d24, alphaV0_I
fmla d1, d25, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@@ -728,8 +754,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.d, v5.d}[0], [pCRow1]
fmla d4, d28, alphaV0_R
fmls d4, d29, alphaV0_I
fmla d5, d28, alphaV1_I
fmla d5, d29, alphaV1_R
fmla d5, d28, alphaV0_I
fmla d5, d29, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@@ -778,25 +804,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmla v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmla v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
@@ -804,15 +828,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmla v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmla v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v6.2d, v7.2d}, [pCRow2]
fmla v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmla v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
fmla v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@@ -845,18 +869,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
@@ -864,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.2d, v5.2d}, [pCRow1]
fmla v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmla v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmla v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -898,18 +920,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmla d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmla d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
@@ -917,8 +937,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ld2 {v4.d, v5.d}[0], [pCRow1]
fmla d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmla d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmla d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@@ -953,25 +973,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
ld2 {v2.2d, v3.2d}, [pCRow2]
fmla v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmla v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmla v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@@ -997,18 +1015,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE2x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.2d, v1.2d}, [pCRow1]
fmla v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmla v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmla v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -1035,18 +1051,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE1x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
ld2 {v0.d, v1.d}[0], [pCRow1]
fmla d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmla d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmla d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@@ -1072,8 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
fmov alpha_save_R, d0
fmov alpha_save_I, d1
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alphaR, d0
fmov alphaI, d1
lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
@@ -1085,8 +1102,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
ble zgemm_kernel_L2_BEGIN
zgemm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC
add pC, pCRow3, LDC
mov pA, origPA // pA = start of A array
zgemm_kernel_L4_M4_BEGIN:
@@ -1096,42 +1118,68 @@ zgemm_kernel_L4_M4_BEGIN:
cmp counterI, #0
ble zgemm_kernel_L4_M2_BEGIN
.align 5
zgemm_kernel_L4_M4_20:
mov pB, origPB
asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
asr counterL , origK, #3
cmp counterL , #2
blt zgemm_kernel_L4_M4_32
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #2 // subtract 2
ble zgemm_kernel_L4_M4_22a
.align 5
.align 5
zgemm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt zgemm_kernel_L4_M4_22
.align 5
zgemm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
.align 5
zgemm_kernel_L4_M4_32:
tst counterL, #1
ble zgemm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b zgemm_kernel_L4_M4_44
@@ -1143,13 +1191,20 @@ zgemm_kernel_L4_M4_40:
zgemm_kernel_L4_M4_44:
ands counterL , origK, #1
ands counterL , origK, #7
ble zgemm_kernel_L4_M4_100
.align 5
zgemm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs counterL, counterL, #1
bne zgemm_kernel_L4_M4_46
zgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
SAVE4x4
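Hoisting the three prefetches ahead of SAVE4x4 lets the save's loads and stores overlap with fetching the start of the next A panel (and of B), so the first loads of the next M-iteration hit L1. A rough C analogue using the GCC/Clang builtin; the locality hint is only an approximation of PLDL1KEEP:

```c
/* Request the next panels while the save macro drains accumulators. */
static inline void prefetch_next_panels(const double *pA,
                                        const double *origPB)
{
    __builtin_prefetch(pA,     0, 3);   /* prfm PLDL1KEEP, [pA]      */
    __builtin_prefetch(pA + 8, 0, 3);   /* prfm PLDL1KEEP, [pA, #64] */
    __builtin_prefetch(origPB, 0, 3);   /* prfm PLDL1KEEP, [origPB]  */
}
```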
@@ -46,23 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define pCRow0 x12
#define pCRow1 x13
#define pCRow2 x14
#define pA x15
#define alpha_save_R x16
#define alpha_save_I x17
#define temp x18
#define tempOffset x19
#define tempK x20
#define pCRow3 x15
#define pA x16
#define alphaR x17
#define alphaI x18
#define temp x19
#define tempOffset x20
#define tempK x21
#define alpha0_R d10
#define alphaV0_R v10.d[0]
#define alpha0_I d11
#define alphaV0_I v11.d[0]
#define alpha1_R d14
#define alphaV1_R v14.d[0]
#define alpha1_I d15
#define alphaV1_I v15.d[0]
#define A_PRE_SIZE 2560
#define B_PRE_SIZE 448
#define C_PRE_SIZE 128
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define OP_rr fmla
@@ -93,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 offset
// 07 offset -> temp
// 08 counterL
// 09 counterI
// 10 counterJ
@@ -101,13 +100,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pA
// 16 alpha_save_R
// 17 alpha_save_I
// 18 must save temp
// 19 must save tempOffset
// 20 must save tempK
// 21 must save
// 15 pCRow3
// 16 pA
// 17 alpha_save_R
// 18 must save alpha_save_I
// 19 must save temp
// 20 must save tempOffset
// 21 must save tempK
// 22 must save
// 23 must save
// 24 must save
@@ -178,12 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_I
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmul v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
@@ -196,16 +191,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v17.2d, v1.2d, v8.d[0]
fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.d[0]
#else
fmul v19.2d, v2.2d, v9.d[0]
#endif
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
fmul v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
@@ -218,6 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
fmul v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -229,6 +219,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
fmul v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
defined(RR) || defined(RC) || defined(CR) || defined(CC)
eor v19.16b, v19.16b, v19.16b
fmls v19.2d, v2.2d, v9.d[0]
#else
fmul v19.2d, v2.2d, v9.d[0]
#endif
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
fmul v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -240,6 +247,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v25.2d, v1.2d, v10.d[0]
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
fmul v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -251,6 +261,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v27.2d, v3.2d, v10.d[0]
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
fmul v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -262,6 +275,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v29.2d, v1.2d, v10.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
fmul v30.2d, v2.2d, v10.d[1]
OP_ii v30.2d, v3.2d, v11.d[1]
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
@@ -273,14 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
OP_ir v31.2d, v3.2d, v10.d[1]
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
.endm
.macro KERNEL4x4_M1
@@ -289,7 +297,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
ld2 {v12.2d, v13.2d}, [pB] // For next round
ld2 {v12.2d, v13.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
@@ -297,15 +305,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v14.2d, v15.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v4.2d, v5.2d} , [pA]
add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v4.2d, v5.2d} , [pA] // For next round
ld2 {v6.2d, v7.2d} , [pA]
add pA, pA, #32
OP_rr v22.2d, v2.2d, v8.d[1]
@@ -313,22 +321,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
ld2 {v6.2d, v7.2d} , [pA] // For next round
add pA, pA, #32
ld2 {v14.2d, v15.2d}, [pB]
add pB, pB, #32
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
OP_ir v25.2d, v1.2d, v10.d[0]
prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v26.2d, v2.2d, v10.d[0]
OP_ii v26.2d, v3.2d, v11.d[0]
OP_ri v27.2d, v2.2d, v11.d[0]
OP_ir v27.2d, v3.2d, v10.d[0]
prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
OP_rr v28.2d, v0.2d, v10.d[1]
OP_ii v28.2d, v1.2d, v11.d[1]
@@ -347,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v17.2d, v4.2d, v13.d[0]
OP_ir v17.2d, v5.2d, v12.d[0]
ld2 {v8.2d, v9.2d}, [pB] // For next round
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v6.2d, v12.d[0]
@@ -355,15 +363,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v19.2d, v6.2d, v13.d[0]
OP_ir v19.2d, v7.2d, v12.d[0]
ld2 {v10.2d, v11.2d}, [pB] // For next round
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
OP_rr v20.2d, v4.2d, v12.d[1]
OP_ii v20.2d, v5.2d, v13.d[1]
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
ld2 {v0.2d, v1.2d}, [pA] // For next round
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v22.2d, v6.2d, v12.d[1]
@@ -371,22 +379,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v23.2d, v6.2d, v13.d[1]
OP_ir v23.2d, v7.2d, v12.d[1]
ld2 {v2.2d, v3.2d}, [pA] // For next round
add pA, pA, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
OP_rr v24.2d, v4.2d, v14.d[0]
OP_ii v24.2d, v5.2d, v15.d[0]
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pA, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
OP_ir v27.2d, v7.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #512]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v28.2d, v4.2d, v14.d[1]
OP_ii v28.2d, v5.2d, v15.d[1]
@@ -415,6 +423,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v21.2d, v4.2d, v13.d[1]
OP_ir v21.2d, v5.2d, v12.d[1]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v22.2d, v6.2d, v12.d[1]
OP_ii v22.2d, v7.2d, v13.d[1]
OP_ri v23.2d, v6.2d, v13.d[1]
@@ -425,6 +435,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
OP_ri v25.2d, v4.2d, v15.d[0]
OP_ir v25.2d, v5.2d, v14.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
OP_rr v26.2d, v6.2d, v14.d[0]
OP_ii v26.2d, v7.2d, v15.d[0]
OP_ri v27.2d, v6.2d, v15.d[0]
@@ -444,33 +456,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro KERNEL4x4_SUB
ld2 {v8.2d, v9.2d}, [pB]
add pB, pB, #32
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
ld2 {v0.2d, v1.2d}, [pA]
add pA, pA, #32
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v16.2d, v0.2d, v8.d[0]
OP_ii v16.2d, v1.2d, v9.d[0]
OP_ri v17.2d, v0.2d, v9.d[0]
OP_ir v17.2d, v1.2d, v8.d[0]
OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
ld2 {v2.2d, v3.2d}, [pA]
add pA, pA, #32
OP_rr v20.2d, v0.2d, v8.d[1]
OP_ii v20.2d, v1.2d, v9.d[1]
OP_ri v21.2d, v0.2d, v9.d[1]
OP_ir v21.2d, v1.2d, v8.d[1]
ld2 {v10.2d, v11.2d}, [pB]
add pB, pB, #32
OP_rr v18.2d, v2.2d, v8.d[0]
OP_ii v18.2d, v3.2d, v9.d[0]
OP_ri v19.2d, v2.2d, v9.d[0]
OP_ir v19.2d, v3.2d, v8.d[0]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
OP_rr v22.2d, v2.2d, v8.d[1]
OP_ii v22.2d, v3.2d, v9.d[1]
OP_ri v23.2d, v2.2d, v9.d[1]
OP_ir v23.2d, v3.2d, v8.d[1]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
OP_rr v24.2d, v0.2d, v10.d[0]
OP_ii v24.2d, v1.2d, v11.d[0]
OP_ri v25.2d, v0.2d, v11.d[0]
@@ -493,66 +512,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endm
.macro SAVE4x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE]
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow0]
add pCRow0, pCRow0, #32
fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmul v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
st2 {v2.2d, v3.2d}, [pCRow2]
fmul v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow0]
add pCRow0, pCRow0, #32
prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
add pCRow1, pCRow1, #32
fmul v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmul v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmul v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow1]
add pCRow1, pCRow1, #32
prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE]
add pCRow1, pCRow1, LDC
fmul v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmul v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow2]
add pCRow2, pCRow2, #32
fmul v2.2d, v26.2d, alphaV0_R
fmls v2.2d, v27.2d, alphaV0_I
fmul v3.2d, v26.2d, alphaV1_I
fmla v3.2d, v27.2d, alphaV1_R
fmul v3.2d, v26.2d, alphaV0_I
fmla v3.2d, v27.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
add pCRow2, pCRow2, #32
prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE]
fmul v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmul v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow3]
add pCRow3, pCRow3, #32
fmul v6.2d, v30.2d, alphaV0_R
fmls v6.2d, v31.2d, alphaV0_I
fmul v7.2d, v30.2d, alphaV1_I
fmla v7.2d, v31.2d, alphaV1_R
st2 {v6.2d, v7.2d}, [pCRow2]
fmul v7.2d, v30.2d, alphaV0_I
fmla v7.2d, v31.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow3]
add pCRow0, pCRow0, #64
add pCRow3, pCRow3, #32
.endm
/******************************************************************************/
@@ -599,41 +629,39 @@
.endm
.macro SAVE2x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v0.2d, v24.2d, alphaV0_R
fmls v0.2d, v25.2d, alphaV0_I
fmul v1.2d, v24.2d, alphaV1_I
fmla v1.2d, v25.2d, alphaV1_R
fmul v1.2d, v24.2d, alphaV0_I
fmla v1.2d, v25.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v4.2d, v28.2d, alphaV0_R
fmls v4.2d, v29.2d, alphaV0_I
fmul v5.2d, v28.2d, alphaV1_I
fmla v5.2d, v29.2d, alphaV1_R
fmul v5.2d, v28.2d, alphaV0_I
fmla v5.2d, v29.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -682,41 +710,39 @@
.endm
.macro SAVE1x4
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmul d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmul d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmul d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmul d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d0, d24, alphaV0_R
fmls d0, d25, alphaV0_I
fmul d1, d24, alphaV1_I
fmla d1, d25, alphaV1_R
fmul d1, d24, alphaV0_I
fmla d1, d25, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d4, d28, alphaV0_R
fmls d4, d29, alphaV0_I
fmul d5, d28, alphaV1_I
fmla d5, d29, alphaV1_R
fmul d5, d28, alphaV0_I
fmla d5, d29, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@@ -765,37 +791,35 @@
.endm
.macro SAVE4x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmul v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmul v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v6.2d, v22.2d, alphaV0_R
fmls v6.2d, v23.2d, alphaV0_I
fmul v7.2d, v22.2d, alphaV1_I
fmla v7.2d, v23.2d, alphaV1_R
fmul v7.2d, v22.2d, alphaV0_I
fmla v7.2d, v23.2d, alphaV0_R
st2 {v6.2d, v7.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@@ -828,25 +852,23 @@
.endm
.macro SAVE2x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow1, pCRow1, LDC
fmul v4.2d, v20.2d, alphaV0_R
fmls v4.2d, v21.2d, alphaV0_I
fmul v5.2d, v20.2d, alphaV1_I
fmla v5.2d, v21.2d, alphaV1_R
fmul v5.2d, v20.2d, alphaV0_I
fmla v5.2d, v21.2d, alphaV0_R
st2 {v4.2d, v5.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -879,25 +901,23 @@
.endm
.macro SAVE1x2
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmul d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmul d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow1, pCRow1, LDC
fmul d4, d20, alphaV0_R
fmls d4, d21, alphaV0_I
fmul d5, d20, alphaV1_I
fmla d5, d21, alphaV1_R
fmul d5, d20, alphaV0_I
fmla d5, d21, alphaV0_R
st2 {v4.d, v5.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@@ -932,23 +952,21 @@
.endm
.macro SAVE4x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow2, pCRow1, #32
fmul v2.2d, v18.2d, alphaV0_R
fmls v2.2d, v19.2d, alphaV0_I
fmul v3.2d, v18.2d, alphaV1_I
fmla v3.2d, v19.2d, alphaV1_R
fmul v3.2d, v18.2d, alphaV0_I
fmla v3.2d, v19.2d, alphaV0_R
st2 {v2.2d, v3.2d}, [pCRow2]
add pCRow0, pCRow0, #64
@@ -974,17 +992,15 @@
.endm
.macro SAVE2x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul v0.2d, v16.2d, alphaV0_R
fmls v0.2d, v17.2d, alphaV0_I
fmul v1.2d, v16.2d, alphaV1_I
fmla v1.2d, v17.2d, alphaV1_R
fmul v1.2d, v16.2d, alphaV0_I
fmla v1.2d, v17.2d, alphaV0_R
st2 {v0.2d, v1.2d}, [pCRow1]
add pCRow0, pCRow0, #32
@@ -1011,17 +1027,15 @@
.endm
.macro SAVE1x1
fmov alpha0_R, alpha_save_R
fmov alpha0_I, alpha_save_I
fmov alpha1_R, alpha0_R
fmov alpha1_I, alpha0_I
fmov alpha0_R, alphaR
fmov alpha0_I, alphaI
mov pCRow1, pCRow0
fmul d0, d16, alphaV0_R
fmls d0, d17, alphaV0_I
fmul d1, d16, alphaV1_I
fmla d1, d17, alphaV1_R
fmul d1, d16, alphaV0_I
fmla d1, d17, alphaV0_R
st2 {v0.d, v1.d}[0], [pCRow1]
add pCRow0, pCRow0, #16
@@ -1047,8 +1061,11 @@
stp x26, x27, [sp, #(9 * 16)]
str x28, [sp, #(10 * 16)]
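// Incoming alpha = (d0, d1); it is copied into the registers used by the
// SAVE macros, and the first A and B panels are prefetched up front.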
fmov alpha_save_R, d0
fmov alpha_save_I, d1
prfm PLDL1KEEP, [origPB]
prfm PLDL1KEEP, [origPA]
fmov alphaR, d0
fmov alphaI, d1
lsl LDC, LDC, #4 // ldc = ldc * 2 * 8
@@ -1064,8 +1081,13 @@
ble ztrmm_kernel_L2_BEGIN
ztrmm_kernel_L4_BEGIN:
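// One pass of this loop covers four columns of C: pCRow0..pCRow3 address
// consecutive columns (stride LDC), and pC advances past all four.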
mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2
mov pCRow0, pC
add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC
add pCRow3, pCRow2, LDC
add pC, pCRow3, LDC
#if defined(LEFT)
mov tempOffset, offset
@@ -1079,6 +1101,7 @@ ztrmm_kernel_L4_M4_BEGIN:
cmp counterI, #0
ble ztrmm_kernel_L4_M2_BEGIN
.align 5
ztrmm_kernel_L4_M4_20:
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1098,39 +1121,64 @@ ztrmm_kernel_L4_M4_20:
add tempK, tempOffset, #4
#endif
asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do?
asr counterL , tempK, #3 // L = K / 8
cmp counterL , #2 // is there at least 16 to do?
blt ztrmm_kernel_L4_M4_32
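// Software-pipelined main loop, 8 K-steps per pass: KERNEL4x4_I primes the
// pipeline, M1/M2 ping-pong between the two register sets, KERNEL4x4_E
// drains it, and any K % 8 leftovers run through KERNEL4x4_SUB below.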
KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #2
ble ztrmm_kernel_L4_M4_22a
.align 5
ztrmm_kernel_L4_M4_22:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M4_22
.align 5
ztrmm_kernel_L4_M4_22a:
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b ztrmm_kernel_L4_M4_44
.align 5
ztrmm_kernel_L4_M4_32:
tst counterL, #1
ble ztrmm_kernel_L4_M4_40
KERNEL4x4_I
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_M2
KERNEL4x4_M1
KERNEL4x4_E
b ztrmm_kernel_L4_M4_44
@@ -1142,12 +1190,16 @@ ztrmm_kernel_L4_M4_40:
ztrmm_kernel_L4_M4_44:
ands counterL , tempK, #1
ands counterL , tempK, #7 // counterL = K % 8 (tail iterations)
ble ztrmm_kernel_L4_M4_100
.align 5
ztrmm_kernel_L4_M4_46:
KERNEL4x4_SUB
subs counterL, counterL, #1
bne ztrmm_kernel_L4_M4_46
ztrmm_kernel_L4_M4_100:
SAVE4x4
@@ -1167,6 +1219,10 @@ ztrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4
#endif
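// Prefetch the next A panel and the start of B before the next M iteration.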
prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB]
ztrmm_kernel_L4_M4_END:
subs counterI, counterI, #1
bne ztrmm_kernel_L4_M4_20
@@ -2341,13 +2341,13 @@
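/* GEMM_DEFAULT_* build parameters below (likely the target's param.h entry):
   unroll-M is raised to match the wider S/D/C kernels. */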
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_M 16
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 4
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 4