| @@ -46,19 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define pCRow0 x12 | |||
| #define pCRow1 x13 | |||
| #define pCRow2 x14 | |||
| #define pA x15 | |||
| #define temp x16 | |||
| #define tempOffset x17 | |||
| #define tempK x18 | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| #define alpha x17 | |||
| #define temp x18 | |||
| #define tempOffset x19 | |||
| #define tempK x20 | |||
| #define alpha0 d10 | |||
| #define alphaV0 v10.d[0] | |||
| #define alpha1 d11 | |||
| #define alphaV1 v11.d[0] | |||
| #define alpha2 d14 | |||
| #define alphaV2 v14.d[0] | |||
| #define alpha3 d15 | |||
| #define alphaV3 v15.d[0] | |||
| #define A_PRE_SIZE 2560 | |||
| #define B_PRE_SIZE 448 | |||
| #define C_PRE_SIZE 128 | |||
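| // A_PRE_SIZE/B_PRE_SIZE/C_PRE_SIZE are prefetch distances in bytes, applied with prfm | |||
| // relative to the current pA/pB/pC pointers (presumably tuned for this core's caches). | |||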
| // 00 origM | |||
| // 01 origN | |||
| @@ -101,14 +101,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //v05 pA1_2, pA1_3 | |||
| //v06 pA1_4, pA1_5 | |||
| //v07 pA1_6, pA1_7 | |||
| //v08 must save pB0_0, pB0_1 | |||
| //v09 must save pB0_2, pB0_3 | |||
| //v10 must save ALPHA0 | |||
| //v11 must save ALPHA1 | |||
| //v12 must save pB1_0, pB1_1 | |||
| //v13 must save pB1_2, pB1_3 | |||
| //v14 must save ALPHA2 | |||
| //v15 must save ALPHA3 | |||
| //v08 must save pB0_0 | |||
| //v09 must save pB0_1 | |||
| //v10 must save pB0_2 --> ALPHA0 | |||
| //v11 must save pB0_3 | |||
| //v12 must save pB1_0 | |||
| //v13 must save pB1_1 | |||
| //v14 must save pB1_2 | |||
| //v15 must save pB1_3 | |||
| //v16 must save C00, C01 | |||
| //v17 must save C02, C03 | |||
| //v18 C04, C05 | |||
| @@ -150,186 +150,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
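| // KERNEL8x4_I/_M1/_M2/_E appear to form a software-pipelined 8x4 inner step: | |||
| // _I computes the first K step with fmul (initializing v16-v31) while preloading | |||
| // the second step's operands, _M1 works on v0-v3/v8-v11 while loading v4-v7/v12-v15, | |||
| // _M2 does the reverse, and _E drains the final step without issuing further loads. | |||
| // KERNEL8x4_SUB handles a single, non-pipelined K step for the leftover iterations. | |||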
| .macro KERNEL8x4_I | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ldp q0, q1, [pA], #32 | |||
| ldp d8, d9, [pB], #16 | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| fmul v20.2d, v0.2d, v9.d[0] | |||
| ldp d10, d11, [pB], #16 | |||
| fmul v17.2d, v1.2d, v8.d[0] | |||
| fmul v21.2d, v1.2d, v9.d[0] | |||
| ldp q2, q3, [pA], #32 | |||
| fmul v24.2d, v0.2d, v10.d[0] | |||
| fmul v28.2d, v0.2d, v11.d[0] | |||
| ldp q4, q5, [pA], #32 | |||
| fmul v25.2d, v1.2d, v10.d[0] | |||
| fmul v29.2d, v1.2d, v11.d[0] | |||
| ldp d12, d13, [pB], #16 | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| fmul v19.2d, v3.2d, v8.d[0] | |||
| fmul v22.2d, v2.2d, v9.d[0] | |||
| fmul v20.2d, v0.2d, v8.d[1] | |||
| fmul v21.2d, v1.2d, v8.d[1] | |||
| fmul v22.2d, v2.2d, v8.d[1] | |||
| fmul v23.2d, v3.2d, v8.d[1] | |||
| ldp d14, d15, [pB], #16 | |||
| fmul v24.2d, v0.2d, v9.d[0] | |||
| fmul v25.2d, v1.2d, v9.d[0] | |||
| fmul v26.2d, v2.2d, v9.d[0] | |||
| fmul v27.2d, v3.2d, v9.d[0] | |||
| fmul v26.2d, v2.2d, v10.d[0] | |||
| fmul v30.2d, v2.2d, v11.d[0] | |||
| fmul v28.2d, v0.2d, v9.d[1] | |||
| fmul v29.2d, v1.2d, v9.d[1] | |||
| fmul v30.2d, v2.2d, v9.d[1] | |||
| fmul v31.2d, v3.2d, v9.d[1] | |||
| ldp q6, q7, [pA], #32 | |||
| ld1 {v4.2d, v5.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld1 {v6.2d, v7.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v19.2d, v3.2d, v8.d[0] | |||
| fmul v27.2d, v3.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmul v31.2d, v3.2d, v11.d[0] | |||
| fmul v23.2d, v3.2d, v9.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNEL8x4_M1 | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v9.d[0] | |||
| ldp q4, q5, [pA], #32 | |||
| fmla v24.2d, v0.2d, v10.d[0] | |||
| fmla v28.2d, v0.2d, v11.d[0] | |||
| ldp d12, d13, [pB], #16 | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v25.2d, v1.2d, v10.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| fmla v22.2d, v2.2d, v8.d[1] | |||
| fmla v23.2d, v3.2d, v8.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v25.2d, v1.2d, v9.d[0] | |||
| fmla v26.2d, v2.2d, v9.d[0] | |||
| fmla v27.2d, v3.2d, v9.d[0] | |||
| fmla v21.2d, v1.2d, v9.d[0] | |||
| fmla v29.2d, v1.2d, v11.d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| fmla v29.2d, v1.2d, v9.d[1] | |||
| fmla v30.2d, v2.2d, v9.d[1] | |||
| fmla v31.2d, v3.2d, v9.d[1] | |||
| ldp d14, d15, [pB], #16 | |||
| ld1 {v4.2d, v5.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld1 {v6.2d, v7.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v22.2d, v2.2d, v9.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla v26.2d, v2.2d, v10.d[0] | |||
| fmla v30.2d, v2.2d, v11.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v23.2d, v3.2d, v9.d[0] | |||
| ldp q6, q7, [pA], #32 | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| fmla v27.2d, v3.2d, v10.d[0] | |||
| fmla v31.2d, v3.2d, v11.d[0] | |||
| .endm | |||
| .macro KERNEL8x4_M2 | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v20.2d, v4.2d, v13.d[0] | |||
| fmla v24.2d, v4.2d, v14.d[0] | |||
| fmla v28.2d, v4.2d, v15.d[0] | |||
| ldp q0, q1, [pA], #32 | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| fmla v25.2d, v5.2d, v14.d[0] | |||
| ldp d8, d9, [pB], #16 | |||
| fmla v21.2d, v5.2d, v13.d[0] | |||
| fmla v29.2d, v5.2d, v15.d[0] | |||
| ldp d10, d11, [pB], #16 | |||
| fmla v18.2d, v6.2d, v12.d[0] | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| fmla v22.2d, v6.2d, v13.d[0] | |||
| fmla v20.2d, v4.2d, v12.d[1] | |||
| fmla v21.2d, v5.2d, v12.d[1] | |||
| fmla v22.2d, v6.2d, v12.d[1] | |||
| fmla v23.2d, v7.2d, v12.d[1] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla v24.2d, v4.2d, v13.d[0] | |||
| fmla v25.2d, v5.2d, v13.d[0] | |||
| fmla v26.2d, v6.2d, v13.d[0] | |||
| fmla v27.2d, v7.2d, v13.d[0] | |||
| fmla v26.2d, v6.2d, v14.d[0] | |||
| fmla v30.2d, v6.2d, v15.d[0] | |||
| fmla v28.2d, v4.2d, v13.d[1] | |||
| fmla v29.2d, v5.2d, v13.d[1] | |||
| fmla v30.2d, v6.2d, v13.d[1] | |||
| fmla v31.2d, v7.2d, v13.d[1] | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| fmla v23.2d, v7.2d, v13.d[0] | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ldp q2, q3, [pA], #32 | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| fmla v27.2d, v7.2d, v14.d[0] | |||
| fmla v31.2d, v7.2d, v15.d[0] | |||
| .endm | |||
| .macro KERNEL8x4_E | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v20.2d, v4.2d, v13.d[0] | |||
| fmla v24.2d, v4.2d, v14.d[0] | |||
| fmla v28.2d, v4.2d, v15.d[0] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| fmla v18.2d, v6.2d, v12.d[0] | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| fmla v25.2d, v5.2d, v14.d[0] | |||
| fmla v21.2d, v5.2d, v13.d[0] | |||
| fmla v29.2d, v5.2d, v15.d[0] | |||
| fmla v20.2d, v4.2d, v12.d[1] | |||
| fmla v21.2d, v5.2d, v12.d[1] | |||
| fmla v22.2d, v6.2d, v12.d[1] | |||
| fmla v23.2d, v7.2d, v12.d[1] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla v24.2d, v4.2d, v13.d[0] | |||
| fmla v25.2d, v5.2d, v13.d[0] | |||
| fmla v26.2d, v6.2d, v13.d[0] | |||
| fmla v27.2d, v7.2d, v13.d[0] | |||
| fmla v18.2d, v6.2d, v12.d[0] | |||
| fmla v22.2d, v6.2d, v13.d[0] | |||
| fmla v26.2d, v6.2d, v14.d[0] | |||
| fmla v30.2d, v6.2d, v15.d[0] | |||
| fmla v28.2d, v4.2d, v13.d[1] | |||
| fmla v29.2d, v5.2d, v13.d[1] | |||
| fmla v30.2d, v6.2d, v13.d[1] | |||
| fmla v31.2d, v7.2d, v13.d[1] | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| fmla v23.2d, v7.2d, v13.d[0] | |||
| fmla v27.2d, v7.2d, v14.d[0] | |||
| fmla v31.2d, v7.2d, v15.d[0] | |||
| .endm | |||
| .macro KERNEL8x4_SUB | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ldp q0, q1, [pA], #32 | |||
| ldp d8, d9, [pB], #16 | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v9.d[0] | |||
| ldp d10, d11, [pB], #16 | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v21.2d, v1.2d, v9.d[0] | |||
| ldp q2, q3, [pA], #32 | |||
| fmla v24.2d, v0.2d, v10.d[0] | |||
| fmla v28.2d, v0.2d, v11.d[0] | |||
| fmla v25.2d, v1.2d, v10.d[0] | |||
| fmla v29.2d, v1.2d, v11.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v22.2d, v2.2d, v9.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| fmla v22.2d, v2.2d, v8.d[1] | |||
| fmla v23.2d, v3.2d, v8.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v25.2d, v1.2d, v9.d[0] | |||
| fmla v26.2d, v2.2d, v9.d[0] | |||
| fmla v27.2d, v3.2d, v9.d[0] | |||
| fmla v26.2d, v2.2d, v10.d[0] | |||
| fmla v30.2d, v2.2d, v11.d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| fmla v29.2d, v1.2d, v9.d[1] | |||
| fmla v30.2d, v2.2d, v9.d[1] | |||
| fmla v31.2d, v3.2d, v9.d[1] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v27.2d, v3.2d, v10.d[0] | |||
| fmla v31.2d, v3.2d, v11.d[0] | |||
| fmla v23.2d, v3.2d, v9.d[0] | |||
| .endm | |||
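| // SAVE8x4 scales the 8x4 accumulator tile by alpha and writes it out over four | |||
| // LDC-spaced output vectors (pCRow0..pCRow3); the result overwrites C rather than | |||
| // accumulating into it, as the trmm kernel requires. | |||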
| .macro SAVE8x4 | |||
| add pCRow1, pCRow0, LDC | |||
| fmov alpha0, alpha | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| fmul v0.2d, v16.2d, alphaV0 | |||
| fmul v1.2d, v17.2d, alphaV1 | |||
| fmul v2.2d, v18.2d, alphaV2 | |||
| fmul v3.2d, v19.2d, alphaV3 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| fmul v1.2d, v17.2d, alphaV0 | |||
| stp q0, q1, [pCRow0] | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow0, pCRow0, #32 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| fmul v2.2d, v18.2d, alphaV0 | |||
| fmul v3.2d, v19.2d, alphaV0 | |||
| stp q2, q3, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| fmul v4.2d, v20.2d, alphaV0 | |||
| fmul v5.2d, v21.2d, alphaV1 | |||
| fmul v6.2d, v22.2d, alphaV2 | |||
| fmul v7.2d, v23.2d, alphaV3 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||
| fmul v5.2d, v21.2d, alphaV0 | |||
| stp q4, q5, [pCRow1] | |||
| add pCRow1, pCRow2, LDC | |||
| add pCRow1, pCRow1, #32 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| fmul v6.2d, v22.2d, alphaV0 | |||
| fmul v7.2d, v23.2d, alphaV0 | |||
| stp q6, q7, [pCRow1] | |||
| add pCRow1, pCRow1, #32 | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| fmul v0.2d, v24.2d, alphaV0 | |||
| fmul v1.2d, v25.2d, alphaV1 | |||
| fmul v2.2d, v26.2d, alphaV2 | |||
| fmul v3.2d, v27.2d, alphaV3 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2] | |||
| fmul v1.2d, v25.2d, alphaV0 | |||
| stp q0, q1, [pCRow2] | |||
| add pCRow2, pCRow2, #32 | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| fmul v2.2d, v26.2d, alphaV0 | |||
| fmul v3.2d, v27.2d, alphaV0 | |||
| stp q2, q3, [pCRow2] | |||
| add pCRow2, pCRow2, #32 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| fmul v4.2d, v28.2d, alphaV0 | |||
| fmul v5.2d, v29.2d, alphaV1 | |||
| fmul v6.2d, v30.2d, alphaV2 | |||
| fmul v7.2d, v31.2d, alphaV3 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||
| fmul v5.2d, v29.2d, alphaV0 | |||
| stp q4, q5, [pCRow3] | |||
| add pCRow0, pCRow0, #64 | |||
| add pCRow3, pCRow3, #32 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| fmul v6.2d, v30.2d, alphaV0 | |||
| fmul v7.2d, v31.2d, alphaV0 | |||
| stp q6, q7, [pCRow3] | |||
| add pCRow3, pCRow3, #32 | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -365,26 +428,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x4 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| fmul v9.2d, v17.2d, alphaV1 | |||
| fmul v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| fmul v12.2d, v20.2d, alphaV2 | |||
| fmul v13.2d, v21.2d, alphaV3 | |||
| fmul v12.2d, v20.2d, alphaV0 | |||
| fmul v13.2d, v21.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, LDC | |||
| fmul v8.2d, v24.2d, alphaV0 | |||
| fmul v9.2d, v25.2d, alphaV1 | |||
| fmul v9.2d, v25.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow2] | |||
| add pCRow1, pCRow2, LDC | |||
| fmul v12.2d, v28.2d, alphaV2 | |||
| fmul v13.2d, v29.2d, alphaV3 | |||
| fmul v12.2d, v28.2d, alphaV0 | |||
| fmul v13.2d, v29.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -413,22 +477,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x4 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| fmul v12.2d, v20.2d, alphaV1 | |||
| fmul v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, LDC | |||
| fmul v8.2d, v24.2d, alphaV2 | |||
| fmul v8.2d, v24.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow2] | |||
| add pCRow1, pCRow2, LDC | |||
| fmul v12.2d, v28.2d, alphaV3 | |||
| fmul v12.2d, v28.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -453,6 +518,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x4 | |||
| fmov alpha0, alpha | |||
| add pCRow1, pCRow0, LDC | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| @@ -462,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow1, pCRow2, LDC | |||
| fmul v12.2d, v20.2d, alphaV1 | |||
| fmul v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.d}[0], [pCRow2] | |||
| st1 {v12.d}[1], [pCRow1] | |||
| @@ -502,18 +569,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE8x2 | |||
| fmov alpha0, alpha | |||
| add pCRow1, pCRow0, LDC | |||
| fmul v0.2d, v16.2d, alphaV0 | |||
| fmul v1.2d, v17.2d, alphaV1 | |||
| fmul v2.2d, v18.2d, alphaV2 | |||
| fmul v3.2d, v19.2d, alphaV3 | |||
| fmul v1.2d, v17.2d, alphaV0 | |||
| fmul v2.2d, v18.2d, alphaV0 | |||
| fmul v3.2d, v19.2d, alphaV0 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| fmul v4.2d, v20.2d, alphaV0 | |||
| fmul v5.2d, v21.2d, alphaV1 | |||
| fmul v6.2d, v22.2d, alphaV2 | |||
| fmul v7.2d, v23.2d, alphaV3 | |||
| fmul v5.2d, v21.2d, alphaV0 | |||
| fmul v6.2d, v22.2d, alphaV0 | |||
| fmul v7.2d, v23.2d, alphaV0 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -541,14 +609,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x2 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| fmul v9.2d, v17.2d, alphaV1 | |||
| fmul v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| fmul v12.2d, v20.2d, alphaV2 | |||
| fmul v13.2d, v21.2d, alphaV3 | |||
| fmul v12.2d, v20.2d, alphaV0 | |||
| fmul v13.2d, v21.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -573,12 +642,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x2 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| add pCRow1 , pCRow0, LDC | |||
| fmul v12.2d, v20.2d, alphaV1 | |||
| fmul v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -601,6 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x2 | |||
| fmov alpha0, alpha | |||
| add pCRow1 , pCRow0, LDC | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| @@ -636,10 +707,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE8x1 | |||
| fmov alpha0, alpha | |||
| fmul v0.2d, v16.2d, alphaV0 | |||
| fmul v1.2d, v17.2d, alphaV1 | |||
| fmul v2.2d, v18.2d, alphaV2 | |||
| fmul v3.2d, v19.2d, alphaV3 | |||
| fmul v1.2d, v17.2d, alphaV0 | |||
| fmul v2.2d, v18.2d, alphaV0 | |||
| fmul v3.2d, v19.2d, alphaV0 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -665,8 +737,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x1 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| fmul v9.2d, v17.2d, alphaV1 | |||
| fmul v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -690,6 +763,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x1 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| @@ -713,6 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x1 | |||
| fmov alpha0, alpha | |||
| fmul d8, d16, alpha0 | |||
| str d8, [pCRow0] | |||
| @@ -739,10 +814,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| fmov alpha0, d0 | |||
| fmov alpha1, d0 | |||
| fmov alpha2, d0 | |||
| fmov alpha3, d0 | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alpha, d0 | |||
| lsl LDC, LDC, #3 // ldc = ldc * 8 | |||
| @@ -759,8 +834,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /******************************************************************************/ | |||
| dtrmm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #2 | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow3, pCRow2, LDC | |||
| add pC, pCRow3, LDC | |||
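| // pCRow0..pCRow3 now address the four LDC-spaced output columns of this block, | |||
| // and pC has been advanced past the whole 4-column panel for the next J iteration. | |||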
| #if defined(LEFT) | |||
| mov tempOffset, offset | |||
| @@ -774,6 +854,7 @@ dtrmm_kernel_L4_M8_BEGIN: | |||
| cmp counterI, #0 | |||
| ble dtrmm_kernel_L4_M4_BEGIN | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_20: | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| @@ -794,40 +875,64 @@ dtrmm_kernel_L4_M8_20: | |||
| add tempK, tempOffset, #4 | |||
| #endif | |||
| asr counterL , tempK, #1 // L = K / 2 | |||
| asr counterL , tempK, #3 // L = K / 8 | |||
| cmp counterL , #2 // is there at least 16 to do? | |||
| blt dtrmm_kernel_L4_M8_32 | |||
| KERNEL8x4_I // do one in the K | |||
| KERNEL8x4_M2 // do another in the K | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble dtrmm_kernel_L4_M8_22a | |||
| .align 5 | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_22: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M8_22 | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b dtrmm_kernel_L4_M8_44 | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_32: | |||
| tst counterL, #1 | |||
| ble dtrmm_kernel_L4_M8_40 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b dtrmm_kernel_L4_M8_44 | |||
| @@ -838,13 +943,17 @@ dtrmm_kernel_L4_M8_40: | |||
| dtrmm_kernel_L4_M8_44: | |||
| ands counterL , tempK, #1 | |||
| ands counterL , tempK, #7 | |||
| ble dtrmm_kernel_L4_M8_100 | |||
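| // the K % 8 leftover iterations are handled one at a time via KERNEL8x4_SUB | |||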
| .align 5 | |||
| dtrmm_kernel_L4_M8_46: | |||
| KERNEL8x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne dtrmm_kernel_L4_M8_46 | |||
| dtrmm_kernel_L4_M8_100: | |||
| SAVE8x4 | |||
| @@ -864,6 +973,9 @@ dtrmm_kernel_L4_M8_100: | |||
| #if defined(LEFT) | |||
| add tempOffset, tempOffset, #8 | |||
| #endif | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| dtrmm_kernel_L4_M8_END: | |||
| subs counterI, counterI, #1 | |||
| @@ -46,20 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define pCRow0 x12 | |||
| #define pCRow1 x13 | |||
| #define pCRow2 x14 | |||
| #define pA x15 | |||
| #define alpha_save_R x16 | |||
| #define alpha_save_I x17 | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| #define alphaR x17 | |||
| #define alphaI x18 | |||
| #define alpha0_R d10 | |||
| #define alphaV0_R v10.d[0] | |||
| #define alpha0_I d11 | |||
| #define alphaV0_I v11.d[0] | |||
| #define alpha1_R d14 | |||
| #define alphaV1_R v14.d[0] | |||
| #define alpha1_I d15 | |||
| #define alphaV1_I v15.d[0] | |||
| #define A_PRE_SIZE 2560 | |||
| #define B_PRE_SIZE 448 | |||
| #define C_PRE_SIZE 128 | |||
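| // OP_rr/OP_ii/OP_ri/OP_ir appear to select fmla or fmls for the real*real, imag*imag, | |||
| // real*imag and imag*real partial products, so one kernel body covers every | |||
| // transpose/conjugate variant. | |||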
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define OP_rr fmla | |||
| @@ -98,10 +97,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 12 pCRow0 | |||
| // 13 pCRow1 | |||
| // 14 pCRow2 | |||
| // 15 pA | |||
| // 16 alpha_save_R | |||
| // 17 alpha_save_I | |||
| // 18 must save | |||
| // 15 pCRow3 | |||
| // 16 pA | |||
| // 17 alphaR | |||
| // 18 must save alphaI | |||
| // 19 must save | |||
| // 20 must save | |||
| // 21 must save | |||
| @@ -175,12 +174,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL4x4_I | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| @@ -193,16 +188,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v19.16b, v19.16b, v19.16b | |||
| fmls v19.2d, v2.2d, v9.d[0] | |||
| #else | |||
| fmul v19.2d, v2.2d, v9.d[0] | |||
| #endif | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| @@ -215,6 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -226,6 +216,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v19.16b, v19.16b, v19.16b | |||
| fmls v19.2d, v2.2d, v9.d[0] | |||
| #else | |||
| fmul v19.2d, v2.2d, v9.d[0] | |||
| #endif | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| fmul v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -237,6 +244,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| fmul v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -248,6 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -259,6 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v29.2d, v1.2d, v10.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmul v30.2d, v2.2d, v10.d[1] | |||
| OP_ii v30.2d, v3.2d, v11.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -270,14 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v31.2d, v3.2d, v10.d[1] | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
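| // KERNEL4x4_M1/_M2 below ping-pong between the v0-v3/v8-v11 and v4-v7/v12-v15 | |||
| // operand sets, each issuing the ld2 loads for the other set so that the loads | |||
| // overlap the complex fmla/fmls work; KERNEL4x4_E drains without further loads. | |||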
| .macro KERNEL4x4_M1 | |||
| @@ -286,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| ld2 {v12.2d, v13.2d}, [pB] // For next round | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| @@ -294,15 +302,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v14.2d, v15.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v4.2d, v5.2d} , [pA] // For next round | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| @@ -310,22 +318,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| ld2 {v6.2d, v7.2d} , [pA] // For next round | |||
| add pA, pA, #32 | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| OP_rr v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| OP_ri v27.2d, v2.2d, v11.d[0] | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| OP_rr v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| @@ -344,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v17.2d, v4.2d, v13.d[0] | |||
| OP_ir v17.2d, v5.2d, v12.d[0] | |||
| ld2 {v8.2d, v9.2d}, [pB] // For next round | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v6.2d, v12.d[0] | |||
| @@ -352,15 +360,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v19.2d, v6.2d, v13.d[0] | |||
| OP_ir v19.2d, v7.2d, v12.d[0] | |||
| ld2 {v10.2d, v11.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v4.2d, v12.d[1] | |||
| OP_ii v20.2d, v5.2d, v13.d[1] | |||
| OP_ri v21.2d, v4.2d, v13.d[1] | |||
| OP_ir v21.2d, v5.2d, v12.d[1] | |||
| ld2 {v0.2d, v1.2d}, [pA] // For next round | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v22.2d, v6.2d, v12.d[1] | |||
| @@ -368,22 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v23.2d, v6.2d, v13.d[1] | |||
| OP_ir v23.2d, v7.2d, v12.d[1] | |||
| ld2 {v2.2d, v3.2d}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v24.2d, v4.2d, v14.d[0] | |||
| OP_ii v24.2d, v5.2d, v15.d[0] | |||
| OP_ri v25.2d, v4.2d, v15.d[0] | |||
| OP_ir v25.2d, v5.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v26.2d, v6.2d, v14.d[0] | |||
| OP_ii v26.2d, v7.2d, v15.d[0] | |||
| OP_ri v27.2d, v6.2d, v15.d[0] | |||
| OP_ir v27.2d, v7.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| OP_rr v28.2d, v4.2d, v14.d[1] | |||
| OP_ii v28.2d, v5.2d, v15.d[1] | |||
| @@ -412,6 +420,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v21.2d, v4.2d, v13.d[1] | |||
| OP_ir v21.2d, v5.2d, v12.d[1] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v22.2d, v6.2d, v12.d[1] | |||
| OP_ii v22.2d, v7.2d, v13.d[1] | |||
| OP_ri v23.2d, v6.2d, v13.d[1] | |||
| @@ -422,6 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v25.2d, v4.2d, v15.d[0] | |||
| OP_ir v25.2d, v5.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| OP_rr v26.2d, v6.2d, v14.d[0] | |||
| OP_ii v26.2d, v7.2d, v15.d[0] | |||
| OP_ri v27.2d, v6.2d, v15.d[0] | |||
| @@ -441,33 +453,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL4x4_SUB | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| @@ -490,74 +509,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
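| // SAVE4x4 applies the complex alpha to each accumulated (re, im) pair and adds it | |||
| // into C: Cre += re*alphaR - im*alphaI, Cim += re*alphaI + im*alphaR. | |||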
| .macro SAVE4x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| ld2 {v0.2d, v1.2d}, [pCRow0] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| ld2 {v2.2d, v3.2d}, [pCRow0] | |||
| fmla v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmla v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmla v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow1, pCRow1, LDC | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmla v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmla v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v6.2d, v7.2d}, [pCRow2] | |||
| add pCRow1, pCRow1, #32 | |||
| ld2 {v6.2d, v7.2d}, [pCRow1] | |||
| fmla v6.2d, v22.2d, alphaV0_R | |||
| fmls v6.2d, v23.2d, alphaV0_I | |||
| fmla v7.2d, v22.2d, alphaV1_I | |||
| fmla v7.2d, v23.2d, alphaV1_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmla v7.2d, v22.2d, alphaV0_I | |||
| fmla v7.2d, v23.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, #32 | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| ld2 {v0.2d, v1.2d}, [pCRow2] | |||
| fmla v0.2d, v24.2d, alphaV0_R | |||
| fmls v0.2d, v25.2d, alphaV0_I | |||
| fmla v1.2d, v24.2d, alphaV1_I | |||
| fmla v1.2d, v25.2d, alphaV1_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmla v1.2d, v24.2d, alphaV0_I | |||
| fmla v1.2d, v25.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow2] | |||
| add pCRow2, pCRow2, #32 | |||
| ld2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmla v2.2d, v26.2d, alphaV0_R | |||
| fmls v2.2d, v27.2d, alphaV0_I | |||
| fmla v3.2d, v26.2d, alphaV1_I | |||
| fmla v3.2d, v27.2d, alphaV1_R | |||
| fmla v3.2d, v26.2d, alphaV0_I | |||
| fmla v3.2d, v27.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow1, pCRow1, LDC | |||
| add pCRow2, pCRow2, #32 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| ld2 {v4.2d, v5.2d}, [pCRow3] | |||
| fmla v4.2d, v28.2d, alphaV0_R | |||
| fmls v4.2d, v29.2d, alphaV0_I | |||
| fmla v5.2d, v28.2d, alphaV1_I | |||
| fmla v5.2d, v29.2d, alphaV1_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmla v5.2d, v28.2d, alphaV0_I | |||
| fmla v5.2d, v29.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow3] | |||
| add pCRow3, pCRow3, #32 | |||
| ld2 {v6.2d, v7.2d}, [pCRow3] | |||
| fmla v6.2d, v30.2d, alphaV0_R | |||
| fmls v6.2d, v31.2d, alphaV0_I | |||
| fmla v7.2d, v30.2d, alphaV1_I | |||
| fmla v7.2d, v31.2d, alphaV1_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmla v7.2d, v30.2d, alphaV0_I | |||
| fmla v7.2d, v31.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow3] | |||
| add pCRow0, pCRow0, #64 | |||
| add pCRow3, pCRow3, #32 | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -604,18 +634,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -623,8 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmla v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmla v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -632,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v24.2d, alphaV0_R | |||
| fmls v0.2d, v25.2d, alphaV0_I | |||
| fmla v1.2d, v24.2d, alphaV1_I | |||
| fmla v1.2d, v25.2d, alphaV1_R | |||
| fmla v1.2d, v24.2d, alphaV0_I | |||
| fmla v1.2d, v25.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -641,8 +669,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| fmla v4.2d, v28.2d, alphaV0_R | |||
| fmls v4.2d, v29.2d, alphaV0_I | |||
| fmla v5.2d, v28.2d, alphaV1_I | |||
| fmla v5.2d, v29.2d, alphaV1_R | |||
| fmla v5.2d, v28.2d, alphaV0_I | |||
| fmla v5.2d, v29.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -691,18 +719,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.d, v1.d}[0], [pCRow1] | |||
| fmla d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmla d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmla d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -710,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.d, v5.d}[0], [pCRow1] | |||
| fmla d4, d20, alphaV0_R | |||
| fmls d4, d21, alphaV0_I | |||
| fmla d5, d20, alphaV1_I | |||
| fmla d5, d21, alphaV1_R | |||
| fmla d5, d20, alphaV0_I | |||
| fmla d5, d21, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -719,8 +745,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.d, v1.d}[0], [pCRow1] | |||
| fmla d0, d24, alphaV0_R | |||
| fmls d0, d25, alphaV0_I | |||
| fmla d1, d24, alphaV1_I | |||
| fmla d1, d25, alphaV1_R | |||
| fmla d1, d24, alphaV0_I | |||
| fmla d1, d25, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -728,8 +754,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.d, v5.d}[0], [pCRow1] | |||
| fmla d4, d28, alphaV0_R | |||
| fmls d4, d29, alphaV0_I | |||
| fmla d5, d28, alphaV1_I | |||
| fmla d5, d29, alphaV1_R | |||
| fmla d5, d28, alphaV0_I | |||
| fmla d5, d29, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -778,25 +804,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmla v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmla v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| fmla v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -804,15 +828,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmla v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmla v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmla v6.2d, v22.2d, alphaV0_R | |||
| fmls v6.2d, v23.2d, alphaV0_I | |||
| fmla v7.2d, v22.2d, alphaV1_I | |||
| fmla v7.2d, v23.2d, alphaV1_R | |||
| fmla v7.2d, v22.2d, alphaV0_I | |||
| fmla v7.2d, v23.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -845,18 +869,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -864,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmla v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmla v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -898,18 +920,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.d, v1.d}[0], [pCRow1] | |||
| fmla d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmla d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmla d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -917,8 +937,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.d, v5.d}[0], [pCRow1] | |||
| fmla d4, d20, alphaV0_R | |||
| fmls d4, d21, alphaV0_I | |||
| fmla d5, d20, alphaV1_I | |||
| fmla d5, d21, alphaV1_R | |||
| fmla d5, d20, alphaV0_I | |||
| fmla d5, d21, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -953,25 +973,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmla v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmla v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| fmla v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -997,18 +1015,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -1035,18 +1051,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.d, v1.d}[0], [pCRow1] | |||
| fmla d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmla d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmla d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -1072,8 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| fmov alpha_save_R, d0 | |||
| fmov alpha_save_I, d1 | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alphaR, d0 | |||
| fmov alphaI, d1 | |||
| lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 | |||
| @@ -1085,8 +1102,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ble zgemm_kernel_L2_BEGIN | |||
| zgemm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #2 | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow3, pCRow2, LDC | |||
| add pC, pCRow3, LDC | |||
| mov pA, origPA // pA = start of A array | |||
| zgemm_kernel_L4_M4_BEGIN: | |||
| @@ -1096,42 +1118,68 @@ zgemm_kernel_L4_M4_BEGIN: | |||
| cmp counterI, #0 | |||
| ble zgemm_kernel_L4_M2_BEGIN | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_20: | |||
| mov pB, origPB | |||
| asr counterL , origK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #2 | |||
| blt zgemm_kernel_L4_M4_32 | |||
| KERNEL4x4_I // do one in the K | |||
| KERNEL4x4_M2 // do another in the K | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble zgemm_kernel_L4_M4_22a | |||
| .align 5 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L4_M4_22 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b zgemm_kernel_L4_M4_44 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_32: | |||
| tst counterL, #1 | |||
| ble zgemm_kernel_L4_M4_40 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b zgemm_kernel_L4_M4_44 | |||
| @@ -1143,13 +1191,20 @@ zgemm_kernel_L4_M4_40: | |||
| zgemm_kernel_L4_M4_44: | |||
| ands counterL , origK, #1 | |||
| ands counterL , origK, #7 | |||
| ble zgemm_kernel_L4_M4_100 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_46: | |||
| KERNEL4x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne zgemm_kernel_L4_M4_46 | |||
| zgemm_kernel_L4_M4_100: | |||
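| // prefetch the next A micro-panel and the start of B before saving this tile | |||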
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVE4x4 | |||
| @@ -46,23 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define pCRow0 x12 | |||
| #define pCRow1 x13 | |||
| #define pCRow2 x14 | |||
| #define pA x15 | |||
| #define alpha_save_R x16 | |||
| #define alpha_save_I x17 | |||
| #define temp x18 | |||
| #define tempOffset x19 | |||
| #define tempK x20 | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| #define alphaR x17 | |||
| #define alphaI x18 | |||
| #define temp x19 | |||
| #define tempOffset x20 | |||
| #define tempK x21 | |||
| #define alpha0_R d10 | |||
| #define alphaV0_R v10.d[0] | |||
| #define alpha0_I d11 | |||
| #define alphaV0_I v11.d[0] | |||
| #define alpha1_R d14 | |||
| #define alphaV1_R v14.d[0] | |||
| #define alpha1_I d15 | |||
| #define alphaV1_I v15.d[0] | |||
| #define A_PRE_SIZE 2560 | |||
| #define B_PRE_SIZE 448 | |||
| #define C_PRE_SIZE 128 | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define OP_rr fmla | |||
| @@ -93,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 04 origPB | |||
| // 05 pC | |||
| // 06 origLDC -> LDC | |||
| // 07 offset | |||
| // 07 offset -> temp | |||
| // 08 counterL | |||
| // 09 counterI | |||
| // 10 counterJ | |||
| @@ -101,13 +100,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 12 pCRow0 | |||
| // 13 pCRow1 | |||
| // 14 pCRow2 | |||
| // 15 pA | |||
| // 16 alpha_save_R | |||
| // 17 alpha_save_I | |||
| // 18 must save temp | |||
| // 19 must save tempOffset | |||
| // 20 must save tempK | |||
| // 21 must save | |||
| // 15 pCRow3 | |||
| // 16 pA | |||
| // 17 alphaR | |||
| // 18 must save alphaI | |||
| // 19 must save temp | |||
| // 20 must save tempOffset | |||
| // 21 must save tempK | |||
| // 22 must save | |||
| // 23 must save | |||
| // 24 must save | |||
| @@ -178,12 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL4x4_I | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| @@ -196,16 +191,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v19.16b, v19.16b, v19.16b | |||
| fmls v19.2d, v2.2d, v9.d[0] | |||
| #else | |||
| fmul v19.2d, v2.2d, v9.d[0] | |||
| #endif | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| @@ -218,6 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -229,6 +219,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v19.16b, v19.16b, v19.16b | |||
| fmls v19.2d, v2.2d, v9.d[0] | |||
| #else | |||
| fmul v19.2d, v2.2d, v9.d[0] | |||
| #endif | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| fmul v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -240,6 +247,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| fmul v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -251,6 +261,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -262,6 +275,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v29.2d, v1.2d, v10.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmul v30.2d, v2.2d, v10.d[1] | |||
| OP_ii v30.2d, v3.2d, v11.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -273,14 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v31.2d, v3.2d, v10.d[1] | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNEL4x4_M1 | |||
| @@ -289,7 +297,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| ld2 {v12.2d, v13.2d}, [pB] // For next round | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| @@ -297,15 +305,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v14.2d, v15.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v4.2d, v5.2d} , [pA] // For next round | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| @@ -313,22 +321,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| ld2 {v6.2d, v7.2d} , [pA] // For next round | |||
| add pA, pA, #32 | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| OP_rr v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| OP_ri v27.2d, v2.2d, v11.d[0] | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| OP_rr v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| @@ -347,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v17.2d, v4.2d, v13.d[0] | |||
| OP_ir v17.2d, v5.2d, v12.d[0] | |||
| ld2 {v8.2d, v9.2d}, [pB] // For next round | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v6.2d, v12.d[0] | |||
| @@ -355,15 +363,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v19.2d, v6.2d, v13.d[0] | |||
| OP_ir v19.2d, v7.2d, v12.d[0] | |||
| ld2 {v10.2d, v11.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v4.2d, v12.d[1] | |||
| OP_ii v20.2d, v5.2d, v13.d[1] | |||
| OP_ri v21.2d, v4.2d, v13.d[1] | |||
| OP_ir v21.2d, v5.2d, v12.d[1] | |||
| ld2 {v0.2d, v1.2d}, [pA] // For next round | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v22.2d, v6.2d, v12.d[1] | |||
| @@ -371,22 +379,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v23.2d, v6.2d, v13.d[1] | |||
| OP_ir v23.2d, v7.2d, v12.d[1] | |||
| ld2 {v2.2d, v3.2d}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v24.2d, v4.2d, v14.d[0] | |||
| OP_ii v24.2d, v5.2d, v15.d[0] | |||
| OP_ri v25.2d, v4.2d, v15.d[0] | |||
| OP_ir v25.2d, v5.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v26.2d, v6.2d, v14.d[0] | |||
| OP_ii v26.2d, v7.2d, v15.d[0] | |||
| OP_ri v27.2d, v6.2d, v15.d[0] | |||
| OP_ir v27.2d, v7.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| OP_rr v28.2d, v4.2d, v14.d[1] | |||
| OP_ii v28.2d, v5.2d, v15.d[1] | |||
| @@ -415,6 +423,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v21.2d, v4.2d, v13.d[1] | |||
| OP_ir v21.2d, v5.2d, v12.d[1] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v22.2d, v6.2d, v12.d[1] | |||
| OP_ii v22.2d, v7.2d, v13.d[1] | |||
| OP_ri v23.2d, v6.2d, v13.d[1] | |||
| @@ -425,6 +435,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v25.2d, v4.2d, v15.d[0] | |||
| OP_ir v25.2d, v5.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| OP_rr v26.2d, v6.2d, v14.d[0] | |||
| OP_ii v26.2d, v7.2d, v15.d[0] | |||
| OP_ri v27.2d, v6.2d, v15.d[0] | |||
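
Another recurring change in these hunks is that the hard-coded #512 prefetch offsets are replaced by the A_PRE_SIZE / B_PRE_SIZE (and, in the SAVE macros, C_PRE_SIZE) constants, presumably so the look-ahead distance can be tuned per core in one place. The same idea in C, using the GCC/Clang prefetch builtin; PRE_BYTES below is an illustrative placeholder, not the kernel's tuned value.

    #include <stdio.h>
    #include <stddef.h>

    #define PRE_BYTES 1024   /* placeholder look-ahead distance */

    /* Stream over an array while prefetching PRE_BYTES ahead of the
     * current element (read prefetch, low temporal locality). */
    static double sum_with_prefetch(const double *a, size_t n)
    {
        double s = 0.0;
        for (size_t i = 0; i < n; i++) {
            __builtin_prefetch((const char *)(a + i) + PRE_BYTES, 0, 1);
            s += a[i];
        }
        return s;
    }

    int main(void)
    {
        double v[256];
        for (size_t i = 0; i < 256; i++) v[i] = 1.0;
        printf("%g\n", sum_with_prefetch(v, 256));   /* 256 */
        return 0;
    }
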
| @@ -444,33 +456,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL4x4_SUB | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| @@ -493,66 +512,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| fmul v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmul v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmul v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmul v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmul v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| add pCRow1, pCRow1, #32 | |||
| fmul v6.2d, v22.2d, alphaV0_R | |||
| fmls v6.2d, v23.2d, alphaV0_I | |||
| fmul v7.2d, v22.2d, alphaV1_I | |||
| fmla v7.2d, v23.2d, alphaV1_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmul v7.2d, v22.2d, alphaV0_I | |||
| fmla v7.2d, v23.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, #32 | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v0.2d, v24.2d, alphaV0_R | |||
| fmls v0.2d, v25.2d, alphaV0_I | |||
| fmul v1.2d, v24.2d, alphaV1_I | |||
| fmla v1.2d, v25.2d, alphaV1_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v1.2d, v24.2d, alphaV0_I | |||
| fmla v1.2d, v25.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow2] | |||
| add pCRow2, pCRow2, #32 | |||
| fmul v2.2d, v26.2d, alphaV0_R | |||
| fmls v2.2d, v27.2d, alphaV0_I | |||
| fmul v3.2d, v26.2d, alphaV1_I | |||
| fmla v3.2d, v27.2d, alphaV1_R | |||
| fmul v3.2d, v26.2d, alphaV0_I | |||
| fmla v3.2d, v27.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow1, pCRow1, LDC | |||
| add pCRow2, pCRow2, #32 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| fmul v4.2d, v28.2d, alphaV0_R | |||
| fmls v4.2d, v29.2d, alphaV0_I | |||
| fmul v5.2d, v28.2d, alphaV1_I | |||
| fmla v5.2d, v29.2d, alphaV1_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v5.2d, v28.2d, alphaV0_I | |||
| fmla v5.2d, v29.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow3] | |||
| add pCRow3, pCRow3, #32 | |||
| fmul v6.2d, v30.2d, alphaV0_R | |||
| fmls v6.2d, v31.2d, alphaV0_I | |||
| fmul v7.2d, v30.2d, alphaV1_I | |||
| fmla v7.2d, v31.2d, alphaV1_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmul v7.2d, v30.2d, alphaV0_I | |||
| fmla v7.2d, v31.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow3] | |||
| add pCRow0, pCRow0, #64 | |||
| add pCRow3, pCRow3, #32 | |||
| .endm | |||
| /******************************************************************************/ | |||
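
The SAVE macros below this point all follow the pattern visible in SAVE4x4: each accumulator pair is scaled by the complex alpha (fmul plus fmls for the real part, fmul plus fmla for the imaginary part) and written out with st2, which re-interleaves real and imaginary parts in memory. The change in this diff is that a single alpha register pair loaded from alphaR/alphaI replaces the duplicated alpha0/alpha1 copies, and SAVE4x4 now writes through the four row pointers pCRow0..pCRow3 with C_PRE_SIZE prefetches instead of re-deriving pCRow1/pCRow2 offsets per row. A small C sketch of the scaling itself, with illustrative names (acc_re/acc_im for a pair such as v16/v17, alpha_r/alpha_i for alphaR/alphaI):

    #include <stdio.h>

    /* out = alpha * acc for one double-complex value (sketch only). */
    static void scale_by_alpha(double acc_re, double acc_im,
                               double alpha_r, double alpha_i,
                               double *out_re, double *out_im)
    {
        *out_re = acc_re * alpha_r - acc_im * alpha_i;  /* fmul + fmls */
        *out_im = acc_re * alpha_i + acc_im * alpha_r;  /* fmul + fmla */
    }

    int main(void)
    {
        double re, im;
        scale_by_alpha(-5.0, 10.0, 2.0, 0.5, &re, &im); /* (-5+10i)*(2+0.5i) */
        printf("%g %g\n", re, im);                      /* -15 17.5 */
        return 0;
    }

Since this is a TRMM kernel, no beta term is applied and C is not read back before the store.
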
| @@ -599,41 +629,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmul v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmul v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v0.2d, v24.2d, alphaV0_R | |||
| fmls v0.2d, v25.2d, alphaV0_I | |||
| fmul v1.2d, v24.2d, alphaV1_I | |||
| fmla v1.2d, v25.2d, alphaV1_R | |||
| fmul v1.2d, v24.2d, alphaV0_I | |||
| fmla v1.2d, v25.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v4.2d, v28.2d, alphaV0_R | |||
| fmls v4.2d, v29.2d, alphaV0_I | |||
| fmul v5.2d, v28.2d, alphaV1_I | |||
| fmla v5.2d, v29.2d, alphaV1_R | |||
| fmul v5.2d, v28.2d, alphaV0_I | |||
| fmla v5.2d, v29.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -682,41 +710,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmul d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmul d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul d4, d20, alphaV0_R | |||
| fmls d4, d21, alphaV0_I | |||
| fmul d5, d20, alphaV1_I | |||
| fmla d5, d21, alphaV1_R | |||
| fmul d5, d20, alphaV0_I | |||
| fmla d5, d21, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul d0, d24, alphaV0_R | |||
| fmls d0, d25, alphaV0_I | |||
| fmul d1, d24, alphaV1_I | |||
| fmla d1, d25, alphaV1_R | |||
| fmul d1, d24, alphaV0_I | |||
| fmla d1, d25, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul d4, d28, alphaV0_R | |||
| fmls d4, d29, alphaV0_I | |||
| fmul d5, d28, alphaV1_I | |||
| fmla d5, d29, alphaV1_R | |||
| fmul d5, d28, alphaV0_I | |||
| fmla d5, d29, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -765,37 +791,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmul v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| fmul v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmul v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmul v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v6.2d, v22.2d, alphaV0_R | |||
| fmls v6.2d, v23.2d, alphaV0_I | |||
| fmul v7.2d, v22.2d, alphaV1_I | |||
| fmla v7.2d, v23.2d, alphaV1_R | |||
| fmul v7.2d, v22.2d, alphaV0_I | |||
| fmla v7.2d, v23.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -828,25 +852,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmul v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmul v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -879,25 +901,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmul d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmul d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul d4, d20, alphaV0_R | |||
| fmls d4, d21, alphaV0_I | |||
| fmul d5, d20, alphaV1_I | |||
| fmla d5, d21, alphaV1_R | |||
| fmul d5, d20, alphaV0_I | |||
| fmla d5, d21, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -932,23 +952,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmul v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| fmul v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -974,17 +992,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -1011,17 +1027,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmul d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmul d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -1047,8 +1061,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| fmov alpha_save_R, d0 | |||
| fmov alpha_save_I, d1 | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alphaR, d0 | |||
| fmov alphaI, d1 | |||
| lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 | |||
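
In the prologue, alpha arrives split across d0/d1 and is copied into alphaR/alphaI (previously alpha_save_R/alpha_save_I), and LDC is turned from an element count into a byte stride: one double-complex element is 2 * 8 = 16 bytes, which is what the lsl by 4 and its comment spell out. A short C illustration of the stride arithmetic, with illustrative names:

    #include <stdio.h>

    /* Byte distance between consecutive columns of a column-major
     * double-complex matrix with leading dimension ldc in elements;
     * mirrors "lsl LDC, LDC, #4". */
    static long column_stride_bytes(long ldc)
    {
        return ldc << 4;                    /* ldc * 16 */
    }

    int main(void)
    {
        printf("%zu %ld\n", sizeof(double _Complex), column_stride_bytes(100));
        return 0;                           /* typically prints: 16 1600 */
    }
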
| @@ -1064,8 +1081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ble ztrmm_kernel_L2_BEGIN | |||
| ztrmm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #2 | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow3, pCRow2, LDC | |||
| add pC, pCRow3, LDC | |||
| #if defined(LEFT) | |||
| mov tempOffset, offset | |||
| @@ -1079,6 +1101,7 @@ ztrmm_kernel_L4_M4_BEGIN: | |||
| cmp counterI, #0 | |||
| ble ztrmm_kernel_L4_M2_BEGIN | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_20: | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| @@ -1098,39 +1121,64 @@ ztrmm_kernel_L4_M4_20: | |||
| add tempK, tempOffset, #4 | |||
| #endif | |||
| asr counterL , tempK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| asr counterL , tempK, #3 | |||
| cmp counterL , #2 | |||
| blt ztrmm_kernel_L4_M4_32 | |||
| KERNEL4x4_I // do one in the K | |||
| KERNEL4x4_M2 // do another in the K | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #2 | |||
| ble ztrmm_kernel_L4_M4_22a | |||
| .align 5 | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_22: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L4_M4_22 | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b ztrmm_kernel_L4_M4_44 | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_32: | |||
| tst counterL, #1 | |||
| ble ztrmm_kernel_L4_M4_40 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b ztrmm_kernel_L4_M4_44 | |||
| @@ -1142,12 +1190,16 @@ ztrmm_kernel_L4_M4_40: | |||
| ztrmm_kernel_L4_M4_44: | |||
| ands counterL , tempK, #1 | |||
| ands counterL , tempK, #7 | |||
| ble ztrmm_kernel_L4_M4_100 | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_46: | |||
| KERNEL4x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne ztrmm_kernel_L4_M4_46 | |||
| ztrmm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| @@ -1167,6 +1219,10 @@ ztrmm_kernel_L4_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| ztrmm_kernel_L4_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bne ztrmm_kernel_L4_M4_20 | |||
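
The loop restructuring in this hunk widens the main K loop: the old code consumed two K iterations per block (asr counterL, tempK, #1 and a #1 remainder mask), while the new code consumes eight (asr by #3, remainder mask #7), software-pipelining the KERNEL4x4_I/_M1/_M2/_E variants so the loads for the next step overlap the arithmetic of the current one, with KERNEL4x4_SUB handling the 0..7 leftover iterations. A schematic C sketch of that control flow; the kernel_* stubs are placeholders for the macros, not real OpenBLAS symbols.

    #include <stdio.h>

    static long steps_done = 0;                 /* counts K iterations consumed */

    static void kernel_i(void)   { steps_done++; }  /* first step, fmul only   */
    static void kernel_m1(void)  { steps_done++; }  /* steady-state step       */
    static void kernel_m2(void)  { steps_done++; }  /* steady-state step       */
    static void kernel_e(void)   { steps_done++; }  /* last pipelined step     */
    static void kernel_sub(void) { steps_done++; }  /* simple tail step        */

    static void k_loop(long K)
    {
        long blocks = K >> 3;                   /* asr counterL, tempK, #3 */
        if (blocks > 0) {
            kernel_i();  kernel_m2();           /* fill the pipeline        */
            for (long i = 0; i < 4 * blocks - 2; i++) {
                kernel_m1(); kernel_m2();       /* two K steps per pass     */
            }
            kernel_m1(); kernel_e();            /* drain the pipeline       */
        }
        for (long r = K & 7; r > 0; r--)        /* ands counterL, tempK, #7 */
            kernel_sub();
    }

    int main(void)
    {
        k_loop(19);                             /* 2 blocks of 8 plus 3 leftover */
        printf("%ld\n", steps_done);            /* prints 19 */
        return 0;
    }
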
| @@ -2341,13 +2341,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM_DEFAULT_UNROLL_M 4 | |||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||
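
The last hunk appears to come from the architecture parameter header rather than the kernel file: the default register-blocking (unroll) factors are raised, e.g. SGEMM_DEFAULT_UNROLL_M from 4 to 16 and DGEMM/CGEMM from 4 to 8 (the new ZGEMM value is cut off in this excerpt). UNROLL_M x UNROLL_N is the tile of C that one micro-kernel call produces; a rough C sketch of how a blocked driver steps over C in such tiles (illustrative only; micro_kernel is a placeholder, not an OpenBLAS symbol):

    #include <stdio.h>

    #define UNROLL_M 8
    #define UNROLL_N 4

    static long tiles = 0;

    static void micro_kernel(long i, long j)    /* would compute an MxN tile of C */
    {
        (void)i; (void)j;
        tiles++;
    }

    static void tile_c(long M, long N)
    {
        for (long j = 0; j + UNROLL_N <= N; j += UNROLL_N)
            for (long i = 0; i + UNROLL_M <= M; i += UNROLL_M)
                micro_kernel(i, j);
        /* leftover rows/columns fall back to the narrower kernels,
         * e.g. the 2x4, 1x4, 4x2 SAVE paths seen above */
    }

    int main(void)
    {
        tile_c(37, 10);
        printf("%ld full tiles\n", tiles);      /* 4*2 = 8 */
        return 0;
    }
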