| @@ -46,19 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define pCRow0 x12 | #define pCRow0 x12 | ||||
| #define pCRow1 x13 | #define pCRow1 x13 | ||||
| #define pCRow2 x14 | #define pCRow2 x14 | ||||
| #define pA x15 | |||||
| #define temp x16 | |||||
| #define tempOffset x17 | |||||
| #define tempK x18 | |||||
| #define pCRow3 x15 | |||||
| #define pA x16 | |||||
| #define alpha x17 | |||||
| #define temp x18 | |||||
| #define tempOffset x19 | |||||
| #define tempK x20 | |||||
| #define alpha0 d10 | #define alpha0 d10 | ||||
| #define alphaV0 v10.d[0] | #define alphaV0 v10.d[0] | ||||
| #define alpha1 d11 | |||||
| #define alphaV1 v11.d[0] | |||||
| #define alpha2 d14 | |||||
| #define alphaV2 v14.d[0] | |||||
| #define alpha3 d15 | |||||
| #define alphaV3 v15.d[0] | |||||
| #define A_PRE_SIZE 2560 | |||||
| #define B_PRE_SIZE 448 | |||||
| #define C_PRE_SIZE 128 | |||||
| // 00 origM | // 00 origM | ||||
| // 01 origN | // 01 origN | ||||
| @@ -101,14 +101,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| //v05 pA1_2, pA1_3 | //v05 pA1_2, pA1_3 | ||||
| //v06 pA1_4, pA1_5 | //v06 pA1_4, pA1_5 | ||||
| //v07 pA1_6, pA1_7 | //v07 pA1_6, pA1_7 | ||||
| //v08 must save pB0_0, pB0_1 | |||||
| //v09 must save pB0_2, pB0_3 | |||||
| //v10 must save ALPHA0 | |||||
| //v11 must save ALPHA1 | |||||
| //v12 must save pB1_0, pB1_1 | |||||
| //v13 must save pB1_2, pB1_3 | |||||
| //v14 must save ALPHA2 | |||||
| //v15 must save ALPHA3 | |||||
| //v08 must save pB0_0 | |||||
| //v09 must save pB0_1 | |||||
| //v10 must save pB0_2 --> ALPHA0 | |||||
| //v11 must save pB0_3 | |||||
| //v12 must save pB1_0 | |||||
| //v13 must save pB1_1 | |||||
| //v14 must save pB1_2 | |||||
| //v15 must save pB1_3 | |||||
| //v16 must save C00, C01 | //v16 must save C00, C01 | ||||
| //v17 must save C02, C03 | //v17 must save C02, C03 | ||||
| //v18 C04, C05 | //v18 C04, C05 | ||||
| @@ -150,186 +150,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro KERNEL8x4_I | .macro KERNEL8x4_I | ||||
| ld1 {v0.2d, v1.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| ld1 {v8.2d, v9.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld1 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| ldp q0, q1, [pA], #32 | |||||
| ldp d8, d9, [pB], #16 | |||||
| fmul v16.2d, v0.2d, v8.d[0] | fmul v16.2d, v0.2d, v8.d[0] | ||||
| fmul v20.2d, v0.2d, v9.d[0] | |||||
| ldp d10, d11, [pB], #16 | |||||
| fmul v17.2d, v1.2d, v8.d[0] | fmul v17.2d, v1.2d, v8.d[0] | ||||
| fmul v21.2d, v1.2d, v9.d[0] | |||||
| ldp q2, q3, [pA], #32 | |||||
| fmul v24.2d, v0.2d, v10.d[0] | |||||
| fmul v28.2d, v0.2d, v11.d[0] | |||||
| ldp q4, q5, [pA], #32 | |||||
| fmul v25.2d, v1.2d, v10.d[0] | |||||
| fmul v29.2d, v1.2d, v11.d[0] | |||||
| ldp d12, d13, [pB], #16 | |||||
| fmul v18.2d, v2.2d, v8.d[0] | fmul v18.2d, v2.2d, v8.d[0] | ||||
| fmul v19.2d, v3.2d, v8.d[0] | |||||
| fmul v22.2d, v2.2d, v9.d[0] | |||||
| fmul v20.2d, v0.2d, v8.d[1] | |||||
| fmul v21.2d, v1.2d, v8.d[1] | |||||
| fmul v22.2d, v2.2d, v8.d[1] | |||||
| fmul v23.2d, v3.2d, v8.d[1] | |||||
| ldp d14, d15, [pB], #16 | |||||
| fmul v24.2d, v0.2d, v9.d[0] | |||||
| fmul v25.2d, v1.2d, v9.d[0] | |||||
| fmul v26.2d, v2.2d, v9.d[0] | |||||
| fmul v27.2d, v3.2d, v9.d[0] | |||||
| fmul v26.2d, v2.2d, v10.d[0] | |||||
| fmul v30.2d, v2.2d, v11.d[0] | |||||
| fmul v28.2d, v0.2d, v9.d[1] | |||||
| fmul v29.2d, v1.2d, v9.d[1] | |||||
| fmul v30.2d, v2.2d, v9.d[1] | |||||
| fmul v31.2d, v3.2d, v9.d[1] | |||||
| ldp q6, q7, [pA], #32 | |||||
| ld1 {v4.2d, v5.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| ld1 {v12.2d, v13.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld1 {v6.2d, v7.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v19.2d, v3.2d, v8.d[0] | |||||
| fmul v27.2d, v3.2d, v10.d[0] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| fmul v31.2d, v3.2d, v11.d[0] | |||||
| fmul v23.2d, v3.2d, v9.d[0] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| .endm | .endm | ||||
| .macro KERNEL8x4_M1 | .macro KERNEL8x4_M1 | ||||
| fmla v16.2d, v0.2d, v8.d[0] | fmla v16.2d, v0.2d, v8.d[0] | ||||
| fmla v20.2d, v0.2d, v9.d[0] | |||||
| ldp q4, q5, [pA], #32 | |||||
| fmla v24.2d, v0.2d, v10.d[0] | |||||
| fmla v28.2d, v0.2d, v11.d[0] | |||||
| ldp d12, d13, [pB], #16 | |||||
| fmla v17.2d, v1.2d, v8.d[0] | fmla v17.2d, v1.2d, v8.d[0] | ||||
| fmla v18.2d, v2.2d, v8.d[0] | |||||
| fmla v19.2d, v3.2d, v8.d[0] | |||||
| fmla v25.2d, v1.2d, v10.d[0] | |||||
| fmla v20.2d, v0.2d, v8.d[1] | |||||
| fmla v21.2d, v1.2d, v8.d[1] | |||||
| fmla v22.2d, v2.2d, v8.d[1] | |||||
| fmla v23.2d, v3.2d, v8.d[1] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| fmla v24.2d, v0.2d, v9.d[0] | |||||
| fmla v25.2d, v1.2d, v9.d[0] | |||||
| fmla v26.2d, v2.2d, v9.d[0] | |||||
| fmla v27.2d, v3.2d, v9.d[0] | |||||
| fmla v21.2d, v1.2d, v9.d[0] | |||||
| fmla v29.2d, v1.2d, v11.d[0] | |||||
| fmla v28.2d, v0.2d, v9.d[1] | |||||
| fmla v29.2d, v1.2d, v9.d[1] | |||||
| fmla v30.2d, v2.2d, v9.d[1] | |||||
| fmla v31.2d, v3.2d, v9.d[1] | |||||
| ldp d14, d15, [pB], #16 | |||||
| ld1 {v4.2d, v5.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| ld1 {v12.2d, v13.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld1 {v6.2d, v7.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| fmla v18.2d, v2.2d, v8.d[0] | |||||
| fmla v22.2d, v2.2d, v9.d[0] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| fmla v26.2d, v2.2d, v10.d[0] | |||||
| fmla v30.2d, v2.2d, v11.d[0] | |||||
| fmla v19.2d, v3.2d, v8.d[0] | |||||
| fmla v23.2d, v3.2d, v9.d[0] | |||||
| ldp q6, q7, [pA], #32 | |||||
| prfm PLDL1KEEP, [pA, #512] | |||||
| fmla v27.2d, v3.2d, v10.d[0] | |||||
| fmla v31.2d, v3.2d, v11.d[0] | |||||
| .endm | .endm | ||||
| .macro KERNEL8x4_M2 | .macro KERNEL8x4_M2 | ||||
| fmla v16.2d, v4.2d, v12.d[0] | fmla v16.2d, v4.2d, v12.d[0] | ||||
| fmla v20.2d, v4.2d, v13.d[0] | |||||
| fmla v24.2d, v4.2d, v14.d[0] | |||||
| fmla v28.2d, v4.2d, v15.d[0] | |||||
| ldp q0, q1, [pA], #32 | |||||
| fmla v17.2d, v5.2d, v12.d[0] | fmla v17.2d, v5.2d, v12.d[0] | ||||
| fmla v25.2d, v5.2d, v14.d[0] | |||||
| ldp d8, d9, [pB], #16 | |||||
| fmla v21.2d, v5.2d, v13.d[0] | |||||
| fmla v29.2d, v5.2d, v15.d[0] | |||||
| ldp d10, d11, [pB], #16 | |||||
| fmla v18.2d, v6.2d, v12.d[0] | fmla v18.2d, v6.2d, v12.d[0] | ||||
| fmla v19.2d, v7.2d, v12.d[0] | |||||
| fmla v22.2d, v6.2d, v13.d[0] | |||||
| fmla v20.2d, v4.2d, v12.d[1] | |||||
| fmla v21.2d, v5.2d, v12.d[1] | |||||
| fmla v22.2d, v6.2d, v12.d[1] | |||||
| fmla v23.2d, v7.2d, v12.d[1] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| fmla v24.2d, v4.2d, v13.d[0] | |||||
| fmla v25.2d, v5.2d, v13.d[0] | |||||
| fmla v26.2d, v6.2d, v13.d[0] | |||||
| fmla v27.2d, v7.2d, v13.d[0] | |||||
| fmla v26.2d, v6.2d, v14.d[0] | |||||
| fmla v30.2d, v6.2d, v15.d[0] | |||||
| fmla v28.2d, v4.2d, v13.d[1] | |||||
| fmla v29.2d, v5.2d, v13.d[1] | |||||
| fmla v30.2d, v6.2d, v13.d[1] | |||||
| fmla v31.2d, v7.2d, v13.d[1] | |||||
| fmla v19.2d, v7.2d, v12.d[0] | |||||
| fmla v23.2d, v7.2d, v13.d[0] | |||||
| ld1 {v0.2d, v1.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| ld1 {v8.2d, v9.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld1 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| ldp q2, q3, [pA], #32 | |||||
| prfm PLDL1KEEP, [pB, #512] | |||||
| fmla v27.2d, v7.2d, v14.d[0] | |||||
| fmla v31.2d, v7.2d, v15.d[0] | |||||
| .endm | .endm | ||||
| .macro KERNEL8x4_E | .macro KERNEL8x4_E | ||||
| fmla v16.2d, v4.2d, v12.d[0] | fmla v16.2d, v4.2d, v12.d[0] | ||||
| fmla v20.2d, v4.2d, v13.d[0] | |||||
| fmla v24.2d, v4.2d, v14.d[0] | |||||
| fmla v28.2d, v4.2d, v15.d[0] | |||||
| fmla v17.2d, v5.2d, v12.d[0] | fmla v17.2d, v5.2d, v12.d[0] | ||||
| fmla v18.2d, v6.2d, v12.d[0] | |||||
| fmla v19.2d, v7.2d, v12.d[0] | |||||
| fmla v25.2d, v5.2d, v14.d[0] | |||||
| fmla v21.2d, v5.2d, v13.d[0] | |||||
| fmla v29.2d, v5.2d, v15.d[0] | |||||
| fmla v20.2d, v4.2d, v12.d[1] | |||||
| fmla v21.2d, v5.2d, v12.d[1] | |||||
| fmla v22.2d, v6.2d, v12.d[1] | |||||
| fmla v23.2d, v7.2d, v12.d[1] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| fmla v24.2d, v4.2d, v13.d[0] | |||||
| fmla v25.2d, v5.2d, v13.d[0] | |||||
| fmla v26.2d, v6.2d, v13.d[0] | |||||
| fmla v27.2d, v7.2d, v13.d[0] | |||||
| fmla v18.2d, v6.2d, v12.d[0] | |||||
| fmla v22.2d, v6.2d, v13.d[0] | |||||
| fmla v26.2d, v6.2d, v14.d[0] | |||||
| fmla v30.2d, v6.2d, v15.d[0] | |||||
| fmla v28.2d, v4.2d, v13.d[1] | |||||
| fmla v29.2d, v5.2d, v13.d[1] | |||||
| fmla v30.2d, v6.2d, v13.d[1] | |||||
| fmla v31.2d, v7.2d, v13.d[1] | |||||
| fmla v19.2d, v7.2d, v12.d[0] | |||||
| fmla v23.2d, v7.2d, v13.d[0] | |||||
| fmla v27.2d, v7.2d, v14.d[0] | |||||
| fmla v31.2d, v7.2d, v15.d[0] | |||||
| .endm | .endm | ||||
| .macro KERNEL8x4_SUB | .macro KERNEL8x4_SUB | ||||
| ld1 {v0.2d, v1.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| ld1 {v8.2d, v9.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld1 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| ldp q0, q1, [pA], #32 | |||||
| ldp d8, d9, [pB], #16 | |||||
| fmla v16.2d, v0.2d, v8.d[0] | fmla v16.2d, v0.2d, v8.d[0] | ||||
| fmla v20.2d, v0.2d, v9.d[0] | |||||
| ldp d10, d11, [pB], #16 | |||||
| fmla v17.2d, v1.2d, v8.d[0] | fmla v17.2d, v1.2d, v8.d[0] | ||||
| fmla v21.2d, v1.2d, v9.d[0] | |||||
| ldp q2, q3, [pA], #32 | |||||
| fmla v24.2d, v0.2d, v10.d[0] | |||||
| fmla v28.2d, v0.2d, v11.d[0] | |||||
| fmla v25.2d, v1.2d, v10.d[0] | |||||
| fmla v29.2d, v1.2d, v11.d[0] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| fmla v18.2d, v2.2d, v8.d[0] | fmla v18.2d, v2.2d, v8.d[0] | ||||
| fmla v19.2d, v3.2d, v8.d[0] | |||||
| fmla v22.2d, v2.2d, v9.d[0] | |||||
| fmla v20.2d, v0.2d, v8.d[1] | |||||
| fmla v21.2d, v1.2d, v8.d[1] | |||||
| fmla v22.2d, v2.2d, v8.d[1] | |||||
| fmla v23.2d, v3.2d, v8.d[1] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| fmla v24.2d, v0.2d, v9.d[0] | |||||
| fmla v25.2d, v1.2d, v9.d[0] | |||||
| fmla v26.2d, v2.2d, v9.d[0] | |||||
| fmla v27.2d, v3.2d, v9.d[0] | |||||
| fmla v26.2d, v2.2d, v10.d[0] | |||||
| fmla v30.2d, v2.2d, v11.d[0] | |||||
| fmla v28.2d, v0.2d, v9.d[1] | |||||
| fmla v29.2d, v1.2d, v9.d[1] | |||||
| fmla v30.2d, v2.2d, v9.d[1] | |||||
| fmla v31.2d, v3.2d, v9.d[1] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| fmla v19.2d, v3.2d, v8.d[0] | |||||
| fmla v27.2d, v3.2d, v10.d[0] | |||||
| fmla v31.2d, v3.2d, v11.d[0] | |||||
| fmla v23.2d, v3.2d, v9.d[0] | |||||
| .endm | .endm | ||||
| .macro SAVE8x4 | .macro SAVE8x4 | ||||
| add pCRow1, pCRow0, LDC | |||||
| fmov alpha0, alpha | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| fmul v0.2d, v16.2d, alphaV0 | fmul v0.2d, v16.2d, alphaV0 | ||||
| fmul v1.2d, v17.2d, alphaV1 | |||||
| fmul v2.2d, v18.2d, alphaV2 | |||||
| fmul v3.2d, v19.2d, alphaV3 | |||||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||||
| fmul v1.2d, v17.2d, alphaV0 | |||||
| stp q0, q1, [pCRow0] | |||||
| add pCRow2, pCRow1, LDC | |||||
| add pCRow0, pCRow0, #32 | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| fmul v2.2d, v18.2d, alphaV0 | |||||
| fmul v3.2d, v19.2d, alphaV0 | |||||
| stp q2, q3, [pCRow0] | |||||
| add pCRow0, pCRow0, #32 | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| fmul v4.2d, v20.2d, alphaV0 | fmul v4.2d, v20.2d, alphaV0 | ||||
| fmul v5.2d, v21.2d, alphaV1 | |||||
| fmul v6.2d, v22.2d, alphaV2 | |||||
| fmul v7.2d, v23.2d, alphaV3 | |||||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||||
| fmul v5.2d, v21.2d, alphaV0 | |||||
| stp q4, q5, [pCRow1] | |||||
| add pCRow1, pCRow2, LDC | |||||
| add pCRow1, pCRow1, #32 | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| fmul v6.2d, v22.2d, alphaV0 | |||||
| fmul v7.2d, v23.2d, alphaV0 | |||||
| stp q6, q7, [pCRow1] | |||||
| add pCRow1, pCRow1, #32 | |||||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||||
| fmul v0.2d, v24.2d, alphaV0 | fmul v0.2d, v24.2d, alphaV0 | ||||
| fmul v1.2d, v25.2d, alphaV1 | |||||
| fmul v2.2d, v26.2d, alphaV2 | |||||
| fmul v3.2d, v27.2d, alphaV3 | |||||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2] | |||||
| fmul v1.2d, v25.2d, alphaV0 | |||||
| stp q0, q1, [pCRow2] | |||||
| add pCRow2, pCRow2, #32 | |||||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||||
| fmul v2.2d, v26.2d, alphaV0 | |||||
| fmul v3.2d, v27.2d, alphaV0 | |||||
| stp q2, q3, [pCRow2] | |||||
| add pCRow2, pCRow2, #32 | |||||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||||
| fmul v4.2d, v28.2d, alphaV0 | fmul v4.2d, v28.2d, alphaV0 | ||||
| fmul v5.2d, v29.2d, alphaV1 | |||||
| fmul v6.2d, v30.2d, alphaV2 | |||||
| fmul v7.2d, v31.2d, alphaV3 | |||||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||||
| fmul v5.2d, v29.2d, alphaV0 | |||||
| stp q4, q5, [pCRow3] | |||||
| add pCRow0, pCRow0, #64 | |||||
| add pCRow3, pCRow3, #32 | |||||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||||
| fmul v6.2d, v30.2d, alphaV0 | |||||
| fmul v7.2d, v31.2d, alphaV0 | |||||
| stp q6, q7, [pCRow3] | |||||
| add pCRow3, pCRow3, #32 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -365,26 +428,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x4 | .macro SAVE4x4 | ||||
| fmov alpha0, alpha | |||||
| fmul v8.2d, v16.2d, alphaV0 | fmul v8.2d, v16.2d, alphaV0 | ||||
| fmul v9.2d, v17.2d, alphaV1 | |||||
| fmul v9.2d, v17.2d, alphaV0 | |||||
| st1 {v8.2d, v9.2d}, [pCRow0] | st1 {v8.2d, v9.2d}, [pCRow0] | ||||
| add pCRow1, pCRow0, LDC | add pCRow1, pCRow0, LDC | ||||
| fmul v12.2d, v20.2d, alphaV2 | |||||
| fmul v13.2d, v21.2d, alphaV3 | |||||
| fmul v12.2d, v20.2d, alphaV0 | |||||
| fmul v13.2d, v21.2d, alphaV0 | |||||
| st1 {v12.2d, v13.2d}, [pCRow1] | st1 {v12.2d, v13.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, LDC | add pCRow2, pCRow1, LDC | ||||
| fmul v8.2d, v24.2d, alphaV0 | fmul v8.2d, v24.2d, alphaV0 | ||||
| fmul v9.2d, v25.2d, alphaV1 | |||||
| fmul v9.2d, v25.2d, alphaV0 | |||||
| st1 {v8.2d, v9.2d}, [pCRow2] | st1 {v8.2d, v9.2d}, [pCRow2] | ||||
| add pCRow1, pCRow2, LDC | add pCRow1, pCRow2, LDC | ||||
| fmul v12.2d, v28.2d, alphaV2 | |||||
| fmul v13.2d, v29.2d, alphaV3 | |||||
| fmul v12.2d, v28.2d, alphaV0 | |||||
| fmul v13.2d, v29.2d, alphaV0 | |||||
| st1 {v12.2d, v13.2d}, [pCRow1] | st1 {v12.2d, v13.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -413,22 +477,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x4 | .macro SAVE2x4 | ||||
| fmov alpha0, alpha | |||||
| fmul v8.2d, v16.2d, alphaV0 | fmul v8.2d, v16.2d, alphaV0 | ||||
| st1 {v8.2d}, [pCRow0] | st1 {v8.2d}, [pCRow0] | ||||
| add pCRow1, pCRow0, LDC | add pCRow1, pCRow0, LDC | ||||
| fmul v12.2d, v20.2d, alphaV1 | |||||
| fmul v12.2d, v20.2d, alphaV0 | |||||
| st1 {v12.2d}, [pCRow1] | st1 {v12.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, LDC | add pCRow2, pCRow1, LDC | ||||
| fmul v8.2d, v24.2d, alphaV2 | |||||
| fmul v8.2d, v24.2d, alphaV0 | |||||
| st1 {v8.2d}, [pCRow2] | st1 {v8.2d}, [pCRow2] | ||||
| add pCRow1, pCRow2, LDC | add pCRow1, pCRow2, LDC | ||||
| fmul v12.2d, v28.2d, alphaV3 | |||||
| fmul v12.2d, v28.2d, alphaV0 | |||||
| st1 {v12.2d}, [pCRow1] | st1 {v12.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| @@ -453,6 +518,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x4 | .macro SAVE1x4 | ||||
| fmov alpha0, alpha | |||||
| add pCRow1, pCRow0, LDC | add pCRow1, pCRow0, LDC | ||||
| fmul v8.2d, v16.2d, alphaV0 | fmul v8.2d, v16.2d, alphaV0 | ||||
| @@ -462,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| add pCRow2, pCRow1, LDC | add pCRow2, pCRow1, LDC | ||||
| add pCRow1, pCRow2, LDC | add pCRow1, pCRow2, LDC | ||||
| fmul v12.2d, v20.2d, alphaV1 | |||||
| fmul v12.2d, v20.2d, alphaV0 | |||||
| st1 {v12.d}[0], [pCRow2] | st1 {v12.d}[0], [pCRow2] | ||||
| st1 {v12.d}[1], [pCRow1] | st1 {v12.d}[1], [pCRow1] | ||||
| @@ -502,18 +569,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE8x2 | .macro SAVE8x2 | ||||
| fmov alpha0, alpha | |||||
| add pCRow1, pCRow0, LDC | add pCRow1, pCRow0, LDC | ||||
| fmul v0.2d, v16.2d, alphaV0 | fmul v0.2d, v16.2d, alphaV0 | ||||
| fmul v1.2d, v17.2d, alphaV1 | |||||
| fmul v2.2d, v18.2d, alphaV2 | |||||
| fmul v3.2d, v19.2d, alphaV3 | |||||
| fmul v1.2d, v17.2d, alphaV0 | |||||
| fmul v2.2d, v18.2d, alphaV0 | |||||
| fmul v3.2d, v19.2d, alphaV0 | |||||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | ||||
| fmul v4.2d, v20.2d, alphaV0 | fmul v4.2d, v20.2d, alphaV0 | ||||
| fmul v5.2d, v21.2d, alphaV1 | |||||
| fmul v6.2d, v22.2d, alphaV2 | |||||
| fmul v7.2d, v23.2d, alphaV3 | |||||
| fmul v5.2d, v21.2d, alphaV0 | |||||
| fmul v6.2d, v22.2d, alphaV0 | |||||
| fmul v7.2d, v23.2d, alphaV0 | |||||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #64 | add pCRow0, pCRow0, #64 | ||||
| @@ -541,14 +609,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x2 | .macro SAVE4x2 | ||||
| fmov alpha0, alpha | |||||
| fmul v8.2d, v16.2d, alphaV0 | fmul v8.2d, v16.2d, alphaV0 | ||||
| fmul v9.2d, v17.2d, alphaV1 | |||||
| fmul v9.2d, v17.2d, alphaV0 | |||||
| st1 {v8.2d, v9.2d}, [pCRow0] | st1 {v8.2d, v9.2d}, [pCRow0] | ||||
| add pCRow1, pCRow0, LDC | add pCRow1, pCRow0, LDC | ||||
| fmul v12.2d, v20.2d, alphaV2 | |||||
| fmul v13.2d, v21.2d, alphaV3 | |||||
| fmul v12.2d, v20.2d, alphaV0 | |||||
| fmul v13.2d, v21.2d, alphaV0 | |||||
| st1 {v12.2d, v13.2d}, [pCRow1] | st1 {v12.2d, v13.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -573,12 +642,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x2 | .macro SAVE2x2 | ||||
| fmov alpha0, alpha | |||||
| fmul v8.2d, v16.2d, alphaV0 | fmul v8.2d, v16.2d, alphaV0 | ||||
| st1 {v8.2d}, [pCRow0] | st1 {v8.2d}, [pCRow0] | ||||
| add pCRow1 , pCRow0, LDC | add pCRow1 , pCRow0, LDC | ||||
| fmul v12.2d, v20.2d, alphaV1 | |||||
| fmul v12.2d, v20.2d, alphaV0 | |||||
| st1 {v12.2d}, [pCRow1] | st1 {v12.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| @@ -601,6 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x2 | .macro SAVE1x2 | ||||
| fmov alpha0, alpha | |||||
| add pCRow1 , pCRow0, LDC | add pCRow1 , pCRow0, LDC | ||||
| fmul v8.2d, v16.2d, alphaV0 | fmul v8.2d, v16.2d, alphaV0 | ||||
| @@ -636,10 +707,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE8x1 | .macro SAVE8x1 | ||||
| fmov alpha0, alpha | |||||
| fmul v0.2d, v16.2d, alphaV0 | fmul v0.2d, v16.2d, alphaV0 | ||||
| fmul v1.2d, v17.2d, alphaV1 | |||||
| fmul v2.2d, v18.2d, alphaV2 | |||||
| fmul v3.2d, v19.2d, alphaV3 | |||||
| fmul v1.2d, v17.2d, alphaV0 | |||||
| fmul v2.2d, v18.2d, alphaV0 | |||||
| fmul v3.2d, v19.2d, alphaV0 | |||||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | ||||
| add pCRow0, pCRow0, #64 | add pCRow0, pCRow0, #64 | ||||
| @@ -665,8 +737,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x1 | .macro SAVE4x1 | ||||
| fmov alpha0, alpha | |||||
| fmul v8.2d, v16.2d, alphaV0 | fmul v8.2d, v16.2d, alphaV0 | ||||
| fmul v9.2d, v17.2d, alphaV1 | |||||
| fmul v9.2d, v17.2d, alphaV0 | |||||
| st1 {v8.2d, v9.2d}, [pCRow0] | st1 {v8.2d, v9.2d}, [pCRow0] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -690,6 +763,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x1 | .macro SAVE2x1 | ||||
| fmov alpha0, alpha | |||||
| fmul v8.2d, v16.2d, alphaV0 | fmul v8.2d, v16.2d, alphaV0 | ||||
| st1 {v8.2d}, [pCRow0] | st1 {v8.2d}, [pCRow0] | ||||
| @@ -713,6 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x1 | .macro SAVE1x1 | ||||
| fmov alpha0, alpha | |||||
| fmul d8, d16, alpha0 | fmul d8, d16, alpha0 | ||||
| str d8, [pCRow0] | str d8, [pCRow0] | ||||
| @@ -739,10 +814,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stp x26, x27, [sp, #(9 * 16)] | stp x26, x27, [sp, #(9 * 16)] | ||||
| str x28, [sp, #(10 * 16)] | str x28, [sp, #(10 * 16)] | ||||
| fmov alpha0, d0 | |||||
| fmov alpha1, d0 | |||||
| fmov alpha2, d0 | |||||
| fmov alpha3, d0 | |||||
| prfm PLDL1KEEP, [origPB] | |||||
| prfm PLDL1KEEP, [origPA] | |||||
| fmov alpha, d0 | |||||
| lsl LDC, LDC, #3 // ldc = ldc * 8 | lsl LDC, LDC, #3 // ldc = ldc * 8 | ||||
| @@ -759,8 +834,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| dtrmm_kernel_L4_BEGIN: | dtrmm_kernel_L4_BEGIN: | ||||
| mov pCRow0, pC // pCRow0 = C | |||||
| add pC, pC, LDC, lsl #2 | |||||
| mov pCRow0, pC | |||||
| add pCRow1, pCRow0, LDC | |||||
| add pCRow2, pCRow1, LDC | |||||
| add pCRow3, pCRow2, LDC | |||||
| add pC, pCRow3, LDC | |||||
| #if defined(LEFT) | #if defined(LEFT) | ||||
| mov tempOffset, offset | mov tempOffset, offset | ||||
| @@ -774,6 +854,7 @@ dtrmm_kernel_L4_M8_BEGIN: | |||||
| cmp counterI, #0 | cmp counterI, #0 | ||||
| ble dtrmm_kernel_L4_M4_BEGIN | ble dtrmm_kernel_L4_M4_BEGIN | ||||
| .align 5 | |||||
| dtrmm_kernel_L4_M8_20: | dtrmm_kernel_L4_M8_20: | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -794,40 +875,64 @@ dtrmm_kernel_L4_M8_20: | |||||
| add tempK, tempOffset, #4 | add tempK, tempOffset, #4 | ||||
| #endif | #endif | ||||
| asr counterL , tempK, #1 // L = K / 2 | |||||
| asr counterL , tempK, #3 // L = K / 8 | |||||
| cmp counterL , #2 // is there at least 4 to do? | cmp counterL , #2 // is there at least 4 to do? | ||||
| blt dtrmm_kernel_L4_M8_32 | blt dtrmm_kernel_L4_M8_32 | ||||
| KERNEL8x4_I // do one in the K | KERNEL8x4_I // do one in the K | ||||
| KERNEL8x4_M2 // do another in the K | KERNEL8x4_M2 // do another in the K | ||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| subs counterL, counterL, #2 // subtract 2 | subs counterL, counterL, #2 // subtract 2 | ||||
| ble dtrmm_kernel_L4_M8_22a | ble dtrmm_kernel_L4_M8_22a | ||||
| .align 5 | |||||
| .align 5 | |||||
| dtrmm_kernel_L4_M8_22: | dtrmm_kernel_L4_M8_22: | ||||
| KERNEL8x4_M1 | KERNEL8x4_M1 | ||||
| KERNEL8x4_M2 | KERNEL8x4_M2 | ||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dtrmm_kernel_L4_M8_22 | bgt dtrmm_kernel_L4_M8_22 | ||||
| .align 5 | |||||
| dtrmm_kernel_L4_M8_22a: | dtrmm_kernel_L4_M8_22a: | ||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | KERNEL8x4_M1 | ||||
| KERNEL8x4_E | KERNEL8x4_E | ||||
| b dtrmm_kernel_L4_M8_44 | b dtrmm_kernel_L4_M8_44 | ||||
| .align 5 | |||||
| dtrmm_kernel_L4_M8_32: | dtrmm_kernel_L4_M8_32: | ||||
| tst counterL, #1 | tst counterL, #1 | ||||
| ble dtrmm_kernel_L4_M8_40 | ble dtrmm_kernel_L4_M8_40 | ||||
| KERNEL8x4_I | KERNEL8x4_I | ||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_M2 | |||||
| KERNEL8x4_M1 | |||||
| KERNEL8x4_E | KERNEL8x4_E | ||||
| b dtrmm_kernel_L4_M8_44 | b dtrmm_kernel_L4_M8_44 | ||||
| @@ -838,13 +943,17 @@ dtrmm_kernel_L4_M8_40: | |||||
| dtrmm_kernel_L4_M8_44: | dtrmm_kernel_L4_M8_44: | ||||
| ands counterL , tempK, #1 | |||||
| ands counterL , tempK, #7 | |||||
| ble dtrmm_kernel_L4_M8_100 | ble dtrmm_kernel_L4_M8_100 | ||||
| .align 5 | |||||
| dtrmm_kernel_L4_M8_46: | dtrmm_kernel_L4_M8_46: | ||||
| KERNEL8x4_SUB | KERNEL8x4_SUB | ||||
| subs counterL, counterL, #1 | |||||
| bne dtrmm_kernel_L4_M8_46 | |||||
| dtrmm_kernel_L4_M8_100: | dtrmm_kernel_L4_M8_100: | ||||
| SAVE8x4 | SAVE8x4 | ||||
| @@ -864,6 +973,9 @@ dtrmm_kernel_L4_M8_100: | |||||
| #if defined(LEFT) | #if defined(LEFT) | ||||
| add tempOffset, tempOffset, #8 | add tempOffset, tempOffset, #8 | ||||
| #endif | #endif | ||||
| prfm PLDL1KEEP, [pA] | |||||
| prfm PLDL1KEEP, [pA, #64] | |||||
| prfm PLDL1KEEP, [origPB] | |||||
| dtrmm_kernel_L4_M8_END: | dtrmm_kernel_L4_M8_END: | ||||
| subs counterI, counterI, #1 | subs counterI, counterI, #1 | ||||
| @@ -46,20 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define pCRow0 x12 | #define pCRow0 x12 | ||||
| #define pCRow1 x13 | #define pCRow1 x13 | ||||
| #define pCRow2 x14 | #define pCRow2 x14 | ||||
| #define pA x15 | |||||
| #define alpha_save_R x16 | |||||
| #define alpha_save_I x17 | |||||
| #define pCRow3 x15 | |||||
| #define pA x16 | |||||
| #define alphaR x17 | |||||
| #define alphaI x18 | |||||
| #define alpha0_R d10 | #define alpha0_R d10 | ||||
| #define alphaV0_R v10.d[0] | #define alphaV0_R v10.d[0] | ||||
| #define alpha0_I d11 | #define alpha0_I d11 | ||||
| #define alphaV0_I v11.d[0] | #define alphaV0_I v11.d[0] | ||||
| #define alpha1_R d14 | |||||
| #define alphaV1_R v14.d[0] | |||||
| #define alpha1_I d15 | |||||
| #define alphaV1_I v15.d[0] | |||||
| #define A_PRE_SIZE 2560 | |||||
| #define B_PRE_SIZE 448 | |||||
| #define C_PRE_SIZE 128 | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | ||||
| #define OP_rr fmla | #define OP_rr fmla | ||||
| @@ -98,10 +97,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| // 12 pCRow0 | // 12 pCRow0 | ||||
| // 13 pCRow1 | // 13 pCRow1 | ||||
| // 14 pCRow2 | // 14 pCRow2 | ||||
| // 15 pA | |||||
| // 16 alpha_save_R | |||||
| // 17 alpha_save_I | |||||
| // 18 must save | |||||
| // 15 pCRow3 | |||||
| // 16 pA | |||||
| // 17 alphaR | |||||
| // 18 must save alphaI | |||||
| // 19 must save | // 19 must save | ||||
| // 20 must save | // 20 must save | ||||
| // 21 must save | // 21 must save | ||||
| @@ -175,12 +174,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro KERNEL4x4_I | .macro KERNEL4x4_I | ||||
| ld2 {v8.2d, v9.2d}, [pB] | ld2 {v8.2d, v9.2d}, [pB] | ||||
| add pB, pB, #32 | add pB, pB, #32 | ||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld2 {v0.2d, v1.2d}, [pA] | ld2 {v0.2d, v1.2d}, [pA] | ||||
| add pA, pA, #32 | add pA, pA, #32 | ||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v16.2d, v0.2d, v8.d[0] | fmul v16.2d, v0.2d, v8.d[0] | ||||
| OP_ii v16.2d, v1.2d, v9.d[0] | OP_ii v16.2d, v1.2d, v9.d[0] | ||||
| @@ -193,16 +188,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v17.2d, v1.2d, v8.d[0] | OP_ir v17.2d, v1.2d, v8.d[0] | ||||
| fmul v18.2d, v2.2d, v8.d[0] | |||||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| eor v19.16b, v19.16b, v19.16b | |||||
| fmls v19.2d, v2.2d, v9.d[0] | |||||
| #else | |||||
| fmul v19.2d, v2.2d, v9.d[0] | |||||
| #endif | |||||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v20.2d, v0.2d, v8.d[1] | fmul v20.2d, v0.2d, v8.d[1] | ||||
| OP_ii v20.2d, v1.2d, v9.d[1] | OP_ii v20.2d, v1.2d, v9.d[1] | ||||
| @@ -215,6 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v21.2d, v1.2d, v8.d[1] | OP_ir v21.2d, v1.2d, v8.d[1] | ||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| fmul v22.2d, v2.2d, v8.d[1] | fmul v22.2d, v2.2d, v8.d[1] | ||||
| OP_ii v22.2d, v3.2d, v9.d[1] | OP_ii v22.2d, v3.2d, v9.d[1] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -226,6 +216,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v23.2d, v3.2d, v8.d[1] | OP_ir v23.2d, v3.2d, v8.d[1] | ||||
| ld2 {v12.2d, v13.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| fmul v18.2d, v2.2d, v8.d[0] | |||||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| eor v19.16b, v19.16b, v19.16b | |||||
| fmls v19.2d, v2.2d, v9.d[0] | |||||
| #else | |||||
| fmul v19.2d, v2.2d, v9.d[0] | |||||
| #endif | |||||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||||
| ld2 {v4.2d, v5.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v24.2d, v0.2d, v10.d[0] | fmul v24.2d, v0.2d, v10.d[0] | ||||
| OP_ii v24.2d, v1.2d, v11.d[0] | OP_ii v24.2d, v1.2d, v11.d[0] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -237,6 +244,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v25.2d, v1.2d, v10.d[0] | OP_ir v25.2d, v1.2d, v10.d[0] | ||||
| ld2 {v6.2d, v7.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v26.2d, v2.2d, v10.d[0] | fmul v26.2d, v2.2d, v10.d[0] | ||||
| OP_ii v26.2d, v3.2d, v11.d[0] | OP_ii v26.2d, v3.2d, v11.d[0] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -248,6 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v27.2d, v3.2d, v10.d[0] | OP_ir v27.2d, v3.2d, v10.d[0] | ||||
| ld2 {v14.2d, v15.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| fmul v28.2d, v0.2d, v10.d[1] | fmul v28.2d, v0.2d, v10.d[1] | ||||
| OP_ii v28.2d, v1.2d, v11.d[1] | OP_ii v28.2d, v1.2d, v11.d[1] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -259,6 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v29.2d, v1.2d, v10.d[1] | OP_ir v29.2d, v1.2d, v10.d[1] | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| fmul v30.2d, v2.2d, v10.d[1] | fmul v30.2d, v2.2d, v10.d[1] | ||||
| OP_ii v30.2d, v3.2d, v11.d[1] | OP_ii v30.2d, v3.2d, v11.d[1] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -270,14 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v31.2d, v3.2d, v10.d[1] | OP_ir v31.2d, v3.2d, v10.d[1] | ||||
| ld2 {v12.2d, v13.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld2 {v14.2d, v15.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld2 {v4.2d, v5.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| ld2 {v6.2d, v7.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| .endm | .endm | ||||
| .macro KERNEL4x4_M1 | .macro KERNEL4x4_M1 | ||||
| @@ -286,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v17.2d, v0.2d, v9.d[0] | OP_ri v17.2d, v0.2d, v9.d[0] | ||||
| OP_ir v17.2d, v1.2d, v8.d[0] | OP_ir v17.2d, v1.2d, v8.d[0] | ||||
| ld2 {v12.2d, v13.2d}, [pB] // For next round | |||||
| ld2 {v12.2d, v13.2d}, [pB] | |||||
| add pB, pB, #32 | add pB, pB, #32 | ||||
| OP_rr v18.2d, v2.2d, v8.d[0] | OP_rr v18.2d, v2.2d, v8.d[0] | ||||
| @@ -294,15 +302,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v19.2d, v2.2d, v9.d[0] | OP_ri v19.2d, v2.2d, v9.d[0] | ||||
| OP_ir v19.2d, v3.2d, v8.d[0] | OP_ir v19.2d, v3.2d, v8.d[0] | ||||
| ld2 {v14.2d, v15.2d}, [pB] // For next round | |||||
| add pB, pB, #32 | |||||
| ld2 {v4.2d, v5.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| OP_rr v20.2d, v0.2d, v8.d[1] | OP_rr v20.2d, v0.2d, v8.d[1] | ||||
| OP_ii v20.2d, v1.2d, v9.d[1] | OP_ii v20.2d, v1.2d, v9.d[1] | ||||
| OP_ri v21.2d, v0.2d, v9.d[1] | OP_ri v21.2d, v0.2d, v9.d[1] | ||||
| OP_ir v21.2d, v1.2d, v8.d[1] | OP_ir v21.2d, v1.2d, v8.d[1] | ||||
| ld2 {v4.2d, v5.2d} , [pA] // For next round | |||||
| ld2 {v6.2d, v7.2d} , [pA] | |||||
| add pA, pA, #32 | add pA, pA, #32 | ||||
| OP_rr v22.2d, v2.2d, v8.d[1] | OP_rr v22.2d, v2.2d, v8.d[1] | ||||
| @@ -310,22 +318,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v23.2d, v2.2d, v9.d[1] | OP_ri v23.2d, v2.2d, v9.d[1] | ||||
| OP_ir v23.2d, v3.2d, v8.d[1] | OP_ir v23.2d, v3.2d, v8.d[1] | ||||
| ld2 {v6.2d, v7.2d} , [pA] // For next round | |||||
| add pA, pA, #32 | |||||
| ld2 {v14.2d, v15.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| OP_rr v24.2d, v0.2d, v10.d[0] | OP_rr v24.2d, v0.2d, v10.d[0] | ||||
| OP_ii v24.2d, v1.2d, v11.d[0] | OP_ii v24.2d, v1.2d, v11.d[0] | ||||
| OP_ri v25.2d, v0.2d, v11.d[0] | OP_ri v25.2d, v0.2d, v11.d[0] | ||||
| OP_ir v25.2d, v1.2d, v10.d[0] | OP_ir v25.2d, v1.2d, v10.d[0] | ||||
| prfm PLDL1KEEP, [pA, #512] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| OP_rr v26.2d, v2.2d, v10.d[0] | OP_rr v26.2d, v2.2d, v10.d[0] | ||||
| OP_ii v26.2d, v3.2d, v11.d[0] | OP_ii v26.2d, v3.2d, v11.d[0] | ||||
| OP_ri v27.2d, v2.2d, v11.d[0] | OP_ri v27.2d, v2.2d, v11.d[0] | ||||
| OP_ir v27.2d, v3.2d, v10.d[0] | OP_ir v27.2d, v3.2d, v10.d[0] | ||||
| prfm PLDL1KEEP, [pB, #512] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| OP_rr v28.2d, v0.2d, v10.d[1] | OP_rr v28.2d, v0.2d, v10.d[1] | ||||
| OP_ii v28.2d, v1.2d, v11.d[1] | OP_ii v28.2d, v1.2d, v11.d[1] | ||||
| @@ -344,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v17.2d, v4.2d, v13.d[0] | OP_ri v17.2d, v4.2d, v13.d[0] | ||||
| OP_ir v17.2d, v5.2d, v12.d[0] | OP_ir v17.2d, v5.2d, v12.d[0] | ||||
| ld2 {v8.2d, v9.2d}, [pB] // For next round | |||||
| ld2 {v8.2d, v9.2d}, [pB] | |||||
| add pB, pB, #32 | add pB, pB, #32 | ||||
| OP_rr v18.2d, v6.2d, v12.d[0] | OP_rr v18.2d, v6.2d, v12.d[0] | ||||
| @@ -352,15 +360,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v19.2d, v6.2d, v13.d[0] | OP_ri v19.2d, v6.2d, v13.d[0] | ||||
| OP_ir v19.2d, v7.2d, v12.d[0] | OP_ir v19.2d, v7.2d, v12.d[0] | ||||
| ld2 {v10.2d, v11.2d}, [pB] // For next round | |||||
| add pB, pB, #32 | |||||
| ld2 {v0.2d, v1.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| OP_rr v20.2d, v4.2d, v12.d[1] | OP_rr v20.2d, v4.2d, v12.d[1] | ||||
| OP_ii v20.2d, v5.2d, v13.d[1] | OP_ii v20.2d, v5.2d, v13.d[1] | ||||
| OP_ri v21.2d, v4.2d, v13.d[1] | OP_ri v21.2d, v4.2d, v13.d[1] | ||||
| OP_ir v21.2d, v5.2d, v12.d[1] | OP_ir v21.2d, v5.2d, v12.d[1] | ||||
| ld2 {v0.2d, v1.2d}, [pA] // For next round | |||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | add pA, pA, #32 | ||||
| OP_rr v22.2d, v6.2d, v12.d[1] | OP_rr v22.2d, v6.2d, v12.d[1] | ||||
| @@ -368,22 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v23.2d, v6.2d, v13.d[1] | OP_ri v23.2d, v6.2d, v13.d[1] | ||||
| OP_ir v23.2d, v7.2d, v12.d[1] | OP_ir v23.2d, v7.2d, v12.d[1] | ||||
| ld2 {v2.2d, v3.2d}, [pA] // For next round | |||||
| add pA, pA, #32 | |||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| OP_rr v24.2d, v4.2d, v14.d[0] | OP_rr v24.2d, v4.2d, v14.d[0] | ||||
| OP_ii v24.2d, v5.2d, v15.d[0] | OP_ii v24.2d, v5.2d, v15.d[0] | ||||
| OP_ri v25.2d, v4.2d, v15.d[0] | OP_ri v25.2d, v4.2d, v15.d[0] | ||||
| OP_ir v25.2d, v5.2d, v14.d[0] | OP_ir v25.2d, v5.2d, v14.d[0] | ||||
| prfm PLDL1KEEP, [pA, #512] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| OP_rr v26.2d, v6.2d, v14.d[0] | OP_rr v26.2d, v6.2d, v14.d[0] | ||||
| OP_ii v26.2d, v7.2d, v15.d[0] | OP_ii v26.2d, v7.2d, v15.d[0] | ||||
| OP_ri v27.2d, v6.2d, v15.d[0] | OP_ri v27.2d, v6.2d, v15.d[0] | ||||
| OP_ir v27.2d, v7.2d, v14.d[0] | OP_ir v27.2d, v7.2d, v14.d[0] | ||||
| prfm PLDL1KEEP, [pB, #512] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||||
| OP_rr v28.2d, v4.2d, v14.d[1] | OP_rr v28.2d, v4.2d, v14.d[1] | ||||
| OP_ii v28.2d, v5.2d, v15.d[1] | OP_ii v28.2d, v5.2d, v15.d[1] | ||||
| @@ -412,6 +420,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v21.2d, v4.2d, v13.d[1] | OP_ri v21.2d, v4.2d, v13.d[1] | ||||
| OP_ir v21.2d, v5.2d, v12.d[1] | OP_ir v21.2d, v5.2d, v12.d[1] | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| OP_rr v22.2d, v6.2d, v12.d[1] | OP_rr v22.2d, v6.2d, v12.d[1] | ||||
| OP_ii v22.2d, v7.2d, v13.d[1] | OP_ii v22.2d, v7.2d, v13.d[1] | ||||
| OP_ri v23.2d, v6.2d, v13.d[1] | OP_ri v23.2d, v6.2d, v13.d[1] | ||||
| @@ -422,6 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v25.2d, v4.2d, v15.d[0] | OP_ri v25.2d, v4.2d, v15.d[0] | ||||
| OP_ir v25.2d, v5.2d, v14.d[0] | OP_ir v25.2d, v5.2d, v14.d[0] | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||||
| OP_rr v26.2d, v6.2d, v14.d[0] | OP_rr v26.2d, v6.2d, v14.d[0] | ||||
| OP_ii v26.2d, v7.2d, v15.d[0] | OP_ii v26.2d, v7.2d, v15.d[0] | ||||
| OP_ri v27.2d, v6.2d, v15.d[0] | OP_ri v27.2d, v6.2d, v15.d[0] | ||||
| @@ -441,33 +453,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro KERNEL4x4_SUB | .macro KERNEL4x4_SUB | ||||
| ld2 {v8.2d, v9.2d}, [pB] | ld2 {v8.2d, v9.2d}, [pB] | ||||
| add pB, pB, #32 | add pB, pB, #32 | ||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld2 {v0.2d, v1.2d}, [pA] | ld2 {v0.2d, v1.2d}, [pA] | ||||
| add pA, pA, #32 | add pA, pA, #32 | ||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| OP_rr v16.2d, v0.2d, v8.d[0] | OP_rr v16.2d, v0.2d, v8.d[0] | ||||
| OP_ii v16.2d, v1.2d, v9.d[0] | OP_ii v16.2d, v1.2d, v9.d[0] | ||||
| OP_ri v17.2d, v0.2d, v9.d[0] | OP_ri v17.2d, v0.2d, v9.d[0] | ||||
| OP_ir v17.2d, v1.2d, v8.d[0] | OP_ir v17.2d, v1.2d, v8.d[0] | ||||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| OP_rr v20.2d, v0.2d, v8.d[1] | OP_rr v20.2d, v0.2d, v8.d[1] | ||||
| OP_ii v20.2d, v1.2d, v9.d[1] | OP_ii v20.2d, v1.2d, v9.d[1] | ||||
| OP_ri v21.2d, v0.2d, v9.d[1] | OP_ri v21.2d, v0.2d, v9.d[1] | ||||
| OP_ir v21.2d, v1.2d, v8.d[1] | OP_ir v21.2d, v1.2d, v8.d[1] | ||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| OP_rr v22.2d, v2.2d, v8.d[1] | OP_rr v22.2d, v2.2d, v8.d[1] | ||||
| OP_ii v22.2d, v3.2d, v9.d[1] | OP_ii v22.2d, v3.2d, v9.d[1] | ||||
| OP_ri v23.2d, v2.2d, v9.d[1] | OP_ri v23.2d, v2.2d, v9.d[1] | ||||
| OP_ir v23.2d, v3.2d, v8.d[1] | OP_ir v23.2d, v3.2d, v8.d[1] | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| OP_rr v24.2d, v0.2d, v10.d[0] | OP_rr v24.2d, v0.2d, v10.d[0] | ||||
| OP_ii v24.2d, v1.2d, v11.d[0] | OP_ii v24.2d, v1.2d, v11.d[0] | ||||
| OP_ri v25.2d, v0.2d, v11.d[0] | OP_ri v25.2d, v0.2d, v11.d[0] | ||||
| @@ -490,74 +509,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x4 | .macro SAVE4x4 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||||
| ld2 {v0.2d, v1.2d}, [pCRow0] | |||||
| fmla v0.2d, v16.2d, alphaV0_R | fmla v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmla v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||||
| add pCRow2, pCRow1, #32 | |||||
| ld2 {v2.2d, v3.2d}, [pCRow2] | |||||
| fmla v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow0] | |||||
| add pCRow0, pCRow0, #32 | |||||
| ld2 {v2.2d, v3.2d}, [pCRow0] | |||||
| fmla v2.2d, v18.2d, alphaV0_R | fmla v2.2d, v18.2d, alphaV0_R | ||||
| fmls v2.2d, v19.2d, alphaV0_I | fmls v2.2d, v19.2d, alphaV0_I | ||||
| fmla v3.2d, v18.2d, alphaV1_I | |||||
| fmla v3.2d, v19.2d, alphaV1_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||||
| fmla v3.2d, v18.2d, alphaV0_I | |||||
| fmla v3.2d, v19.2d, alphaV0_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow0] | |||||
| add pCRow0, pCRow0, #32 | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, LDC | |||||
| ld2 {v4.2d, v5.2d}, [pCRow1] | ld2 {v4.2d, v5.2d}, [pCRow1] | ||||
| fmla v4.2d, v20.2d, alphaV0_R | fmla v4.2d, v20.2d, alphaV0_R | ||||
| fmls v4.2d, v21.2d, alphaV0_I | fmls v4.2d, v21.2d, alphaV0_I | ||||
| fmla v5.2d, v20.2d, alphaV1_I | |||||
| fmla v5.2d, v21.2d, alphaV1_R | |||||
| fmla v5.2d, v20.2d, alphaV0_I | |||||
| fmla v5.2d, v21.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, #32 | |||||
| ld2 {v6.2d, v7.2d}, [pCRow2] | |||||
| add pCRow1, pCRow1, #32 | |||||
| ld2 {v6.2d, v7.2d}, [pCRow1] | |||||
| fmla v6.2d, v22.2d, alphaV0_R | fmla v6.2d, v22.2d, alphaV0_R | ||||
| fmls v6.2d, v23.2d, alphaV0_I | fmls v6.2d, v23.2d, alphaV0_I | ||||
| fmla v7.2d, v22.2d, alphaV1_I | |||||
| fmla v7.2d, v23.2d, alphaV1_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||||
| fmla v7.2d, v22.2d, alphaV0_I | |||||
| fmla v7.2d, v23.2d, alphaV0_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow1] | |||||
| add pCRow1, pCRow1, LDC | |||||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||||
| add pCRow1, pCRow1, #32 | |||||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||||
| ld2 {v0.2d, v1.2d}, [pCRow2] | |||||
| fmla v0.2d, v24.2d, alphaV0_R | fmla v0.2d, v24.2d, alphaV0_R | ||||
| fmls v0.2d, v25.2d, alphaV0_I | fmls v0.2d, v25.2d, alphaV0_I | ||||
| fmla v1.2d, v24.2d, alphaV1_I | |||||
| fmla v1.2d, v25.2d, alphaV1_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||||
| add pCRow2, pCRow1, #32 | |||||
| fmla v1.2d, v24.2d, alphaV0_I | |||||
| fmla v1.2d, v25.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow2] | |||||
| add pCRow2, pCRow2, #32 | |||||
| ld2 {v2.2d, v3.2d}, [pCRow2] | ld2 {v2.2d, v3.2d}, [pCRow2] | ||||
| fmla v2.2d, v26.2d, alphaV0_R | fmla v2.2d, v26.2d, alphaV0_R | ||||
| fmls v2.2d, v27.2d, alphaV0_I | fmls v2.2d, v27.2d, alphaV0_I | ||||
| fmla v3.2d, v26.2d, alphaV1_I | |||||
| fmla v3.2d, v27.2d, alphaV1_R | |||||
| fmla v3.2d, v26.2d, alphaV0_I | |||||
| fmla v3.2d, v27.2d, alphaV0_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow2] | st2 {v2.2d, v3.2d}, [pCRow2] | ||||
| add pCRow1, pCRow1, LDC | |||||
| add pCRow2, pCRow2, #32 | |||||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||||
| ld2 {v4.2d, v5.2d}, [pCRow3] | |||||
| fmla v4.2d, v28.2d, alphaV0_R | fmla v4.2d, v28.2d, alphaV0_R | ||||
| fmls v4.2d, v29.2d, alphaV0_I | fmls v4.2d, v29.2d, alphaV0_I | ||||
| fmla v5.2d, v28.2d, alphaV1_I | |||||
| fmla v5.2d, v29.2d, alphaV1_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||||
| add pCRow2, pCRow1, #32 | |||||
| ld2 {v6.2d, v7.2d}, [pCRow2] | |||||
| fmla v5.2d, v28.2d, alphaV0_I | |||||
| fmla v5.2d, v29.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow3] | |||||
| add pCRow3, pCRow3, #32 | |||||
| ld2 {v6.2d, v7.2d}, [pCRow3] | |||||
| fmla v6.2d, v30.2d, alphaV0_R | fmla v6.2d, v30.2d, alphaV0_R | ||||
| fmls v6.2d, v31.2d, alphaV0_I | fmls v6.2d, v31.2d, alphaV0_I | ||||
| fmla v7.2d, v30.2d, alphaV1_I | |||||
| fmla v7.2d, v31.2d, alphaV1_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||||
| fmla v7.2d, v30.2d, alphaV0_I | |||||
| fmla v7.2d, v31.2d, alphaV0_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow3] | |||||
| add pCRow0, pCRow0, #64 | |||||
| add pCRow3, pCRow3, #32 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -604,18 +634,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x4 | .macro SAVE2x4 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| ld2 {v0.2d, v1.2d}, [pCRow1] | ld2 {v0.2d, v1.2d}, [pCRow1] | ||||
| fmla v0.2d, v16.2d, alphaV0_R | fmla v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmla v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmla v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -623,8 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v4.2d, v5.2d}, [pCRow1] | ld2 {v4.2d, v5.2d}, [pCRow1] | ||||
| fmla v4.2d, v20.2d, alphaV0_R | fmla v4.2d, v20.2d, alphaV0_R | ||||
| fmls v4.2d, v21.2d, alphaV0_I | fmls v4.2d, v21.2d, alphaV0_I | ||||
| fmla v5.2d, v20.2d, alphaV1_I | |||||
| fmla v5.2d, v21.2d, alphaV1_R | |||||
| fmla v5.2d, v20.2d, alphaV0_I | |||||
| fmla v5.2d, v21.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -632,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v0.2d, v1.2d}, [pCRow1] | ld2 {v0.2d, v1.2d}, [pCRow1] | ||||
| fmla v0.2d, v24.2d, alphaV0_R | fmla v0.2d, v24.2d, alphaV0_R | ||||
| fmls v0.2d, v25.2d, alphaV0_I | fmls v0.2d, v25.2d, alphaV0_I | ||||
| fmla v1.2d, v24.2d, alphaV1_I | |||||
| fmla v1.2d, v25.2d, alphaV1_R | |||||
| fmla v1.2d, v24.2d, alphaV0_I | |||||
| fmla v1.2d, v25.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -641,8 +669,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v4.2d, v5.2d}, [pCRow1] | ld2 {v4.2d, v5.2d}, [pCRow1] | ||||
| fmla v4.2d, v28.2d, alphaV0_R | fmla v4.2d, v28.2d, alphaV0_R | ||||
| fmls v4.2d, v29.2d, alphaV0_I | fmls v4.2d, v29.2d, alphaV0_I | ||||
| fmla v5.2d, v28.2d, alphaV1_I | |||||
| fmla v5.2d, v29.2d, alphaV1_R | |||||
| fmla v5.2d, v28.2d, alphaV0_I | |||||
| fmla v5.2d, v29.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -691,18 +719,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x4 | .macro SAVE1x4 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| ld2 {v0.d, v1.d}[0], [pCRow1] | ld2 {v0.d, v1.d}[0], [pCRow1] | ||||
| fmla d0, d16, alphaV0_R | fmla d0, d16, alphaV0_R | ||||
| fmls d0, d17, alphaV0_I | fmls d0, d17, alphaV0_I | ||||
| fmla d1, d16, alphaV1_I | |||||
| fmla d1, d17, alphaV1_R | |||||
| fmla d1, d16, alphaV0_I | |||||
| fmla d1, d17, alphaV0_R | |||||
| st2 {v0.d, v1.d}[0], [pCRow1] | st2 {v0.d, v1.d}[0], [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -710,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v4.d, v5.d}[0], [pCRow1] | ld2 {v4.d, v5.d}[0], [pCRow1] | ||||
| fmla d4, d20, alphaV0_R | fmla d4, d20, alphaV0_R | ||||
| fmls d4, d21, alphaV0_I | fmls d4, d21, alphaV0_I | ||||
| fmla d5, d20, alphaV1_I | |||||
| fmla d5, d21, alphaV1_R | |||||
| fmla d5, d20, alphaV0_I | |||||
| fmla d5, d21, alphaV0_R | |||||
| st2 {v4.d, v5.d}[0], [pCRow1] | st2 {v4.d, v5.d}[0], [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -719,8 +745,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v0.d, v1.d}[0], [pCRow1] | ld2 {v0.d, v1.d}[0], [pCRow1] | ||||
| fmla d0, d24, alphaV0_R | fmla d0, d24, alphaV0_R | ||||
| fmls d0, d25, alphaV0_I | fmls d0, d25, alphaV0_I | ||||
| fmla d1, d24, alphaV1_I | |||||
| fmla d1, d25, alphaV1_R | |||||
| fmla d1, d24, alphaV0_I | |||||
| fmla d1, d25, alphaV0_R | |||||
| st2 {v0.d, v1.d}[0], [pCRow1] | st2 {v0.d, v1.d}[0], [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -728,8 +754,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v4.d, v5.d}[0], [pCRow1] | ld2 {v4.d, v5.d}[0], [pCRow1] | ||||
| fmla d4, d28, alphaV0_R | fmla d4, d28, alphaV0_R | ||||
| fmls d4, d29, alphaV0_I | fmls d4, d29, alphaV0_I | ||||
| fmla d5, d28, alphaV1_I | |||||
| fmla d5, d29, alphaV1_R | |||||
| fmla d5, d28, alphaV0_I | |||||
| fmla d5, d29, alphaV0_R | |||||
| st2 {v4.d, v5.d}[0], [pCRow1] | st2 {v4.d, v5.d}[0], [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| @@ -778,25 +804,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x2 | .macro SAVE4x2 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| ld2 {v0.2d, v1.2d}, [pCRow1] | ld2 {v0.2d, v1.2d}, [pCRow1] | ||||
| fmla v0.2d, v16.2d, alphaV0_R | fmla v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmla v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmla v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, #32 | add pCRow2, pCRow1, #32 | ||||
| ld2 {v2.2d, v3.2d}, [pCRow2] | ld2 {v2.2d, v3.2d}, [pCRow2] | ||||
| fmla v2.2d, v18.2d, alphaV0_R | fmla v2.2d, v18.2d, alphaV0_R | ||||
| fmls v2.2d, v19.2d, alphaV0_I | fmls v2.2d, v19.2d, alphaV0_I | ||||
| fmla v3.2d, v18.2d, alphaV1_I | |||||
| fmla v3.2d, v19.2d, alphaV1_R | |||||
| fmla v3.2d, v18.2d, alphaV0_I | |||||
| fmla v3.2d, v19.2d, alphaV0_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow2] | st2 {v2.2d, v3.2d}, [pCRow2] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -804,15 +828,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v4.2d, v5.2d}, [pCRow1] | ld2 {v4.2d, v5.2d}, [pCRow1] | ||||
| fmla v4.2d, v20.2d, alphaV0_R | fmla v4.2d, v20.2d, alphaV0_R | ||||
| fmls v4.2d, v21.2d, alphaV0_I | fmls v4.2d, v21.2d, alphaV0_I | ||||
| fmla v5.2d, v20.2d, alphaV1_I | |||||
| fmla v5.2d, v21.2d, alphaV1_R | |||||
| fmla v5.2d, v20.2d, alphaV0_I | |||||
| fmla v5.2d, v21.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, #32 | add pCRow2, pCRow1, #32 | ||||
| ld2 {v6.2d, v7.2d}, [pCRow2] | ld2 {v6.2d, v7.2d}, [pCRow2] | ||||
| fmla v6.2d, v22.2d, alphaV0_R | fmla v6.2d, v22.2d, alphaV0_R | ||||
| fmls v6.2d, v23.2d, alphaV0_I | fmls v6.2d, v23.2d, alphaV0_I | ||||
| fmla v7.2d, v22.2d, alphaV1_I | |||||
| fmla v7.2d, v23.2d, alphaV1_R | |||||
| fmla v7.2d, v22.2d, alphaV0_I | |||||
| fmla v7.2d, v23.2d, alphaV0_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow2] | st2 {v6.2d, v7.2d}, [pCRow2] | ||||
| add pCRow0, pCRow0, #64 | add pCRow0, pCRow0, #64 | ||||
| @@ -845,18 +869,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x2 | .macro SAVE2x2 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| ld2 {v0.2d, v1.2d}, [pCRow1] | ld2 {v0.2d, v1.2d}, [pCRow1] | ||||
| fmla v0.2d, v16.2d, alphaV0_R | fmla v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmla v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmla v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -864,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v4.2d, v5.2d}, [pCRow1] | ld2 {v4.2d, v5.2d}, [pCRow1] | ||||
| fmla v4.2d, v20.2d, alphaV0_R | fmla v4.2d, v20.2d, alphaV0_R | ||||
| fmls v4.2d, v21.2d, alphaV0_I | fmls v4.2d, v21.2d, alphaV0_I | ||||
| fmla v5.2d, v20.2d, alphaV1_I | |||||
| fmla v5.2d, v21.2d, alphaV1_R | |||||
| fmla v5.2d, v20.2d, alphaV0_I | |||||
| fmla v5.2d, v21.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -898,18 +920,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x2 | .macro SAVE1x2 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| ld2 {v0.d, v1.d}[0], [pCRow1] | ld2 {v0.d, v1.d}[0], [pCRow1] | ||||
| fmla d0, d16, alphaV0_R | fmla d0, d16, alphaV0_R | ||||
| fmls d0, d17, alphaV0_I | fmls d0, d17, alphaV0_I | ||||
| fmla d1, d16, alphaV1_I | |||||
| fmla d1, d17, alphaV1_R | |||||
| fmla d1, d16, alphaV0_I | |||||
| fmla d1, d17, alphaV0_R | |||||
| st2 {v0.d, v1.d}[0], [pCRow1] | st2 {v0.d, v1.d}[0], [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| @@ -917,8 +937,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ld2 {v4.d, v5.d}[0], [pCRow1] | ld2 {v4.d, v5.d}[0], [pCRow1] | ||||
| fmla d4, d20, alphaV0_R | fmla d4, d20, alphaV0_R | ||||
| fmls d4, d21, alphaV0_I | fmls d4, d21, alphaV0_I | ||||
| fmla d5, d20, alphaV1_I | |||||
| fmla d5, d21, alphaV1_R | |||||
| fmla d5, d20, alphaV0_I | |||||
| fmla d5, d21, alphaV0_R | |||||
| st2 {v4.d, v5.d}[0], [pCRow1] | st2 {v4.d, v5.d}[0], [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| @@ -953,25 +973,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x1 | .macro SAVE4x1 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| ld2 {v0.2d, v1.2d}, [pCRow1] | ld2 {v0.2d, v1.2d}, [pCRow1] | ||||
| fmla v0.2d, v16.2d, alphaV0_R | fmla v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmla v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmla v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, #32 | add pCRow2, pCRow1, #32 | ||||
| ld2 {v2.2d, v3.2d}, [pCRow2] | ld2 {v2.2d, v3.2d}, [pCRow2] | ||||
| fmla v2.2d, v18.2d, alphaV0_R | fmla v2.2d, v18.2d, alphaV0_R | ||||
| fmls v2.2d, v19.2d, alphaV0_I | fmls v2.2d, v19.2d, alphaV0_I | ||||
| fmla v3.2d, v18.2d, alphaV1_I | |||||
| fmla v3.2d, v19.2d, alphaV1_R | |||||
| fmla v3.2d, v18.2d, alphaV0_I | |||||
| fmla v3.2d, v19.2d, alphaV0_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow2] | st2 {v2.2d, v3.2d}, [pCRow2] | ||||
| add pCRow0, pCRow0, #64 | add pCRow0, pCRow0, #64 | ||||
| @@ -997,18 +1015,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x1 | .macro SAVE2x1 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| ld2 {v0.2d, v1.2d}, [pCRow1] | ld2 {v0.2d, v1.2d}, [pCRow1] | ||||
| fmla v0.2d, v16.2d, alphaV0_R | fmla v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmla v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmla v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -1035,18 +1051,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x1 | .macro SAVE1x1 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| ld2 {v0.d, v1.d}[0], [pCRow1] | ld2 {v0.d, v1.d}[0], [pCRow1] | ||||
| fmla d0, d16, alphaV0_R | fmla d0, d16, alphaV0_R | ||||
| fmls d0, d17, alphaV0_I | fmls d0, d17, alphaV0_I | ||||
| fmla d1, d16, alphaV1_I | |||||
| fmla d1, d17, alphaV1_R | |||||
| fmla d1, d16, alphaV0_I | |||||
| fmla d1, d17, alphaV0_R | |||||
| st2 {v0.d, v1.d}[0], [pCRow1] | st2 {v0.d, v1.d}[0], [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| @@ -1072,8 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stp x26, x27, [sp, #(9 * 16)] | stp x26, x27, [sp, #(9 * 16)] | ||||
| str x28, [sp, #(10 * 16)] | str x28, [sp, #(10 * 16)] | ||||
| fmov alpha_save_R, d0 | |||||
| fmov alpha_save_I, d1 | |||||
| prfm PLDL1KEEP, [origPB] | |||||
| prfm PLDL1KEEP, [origPA] | |||||
| fmov alphaR, d0 | |||||
| fmov alphaI, d1 | |||||
| lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 | lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 | ||||
| @@ -1085,8 +1102,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ble zgemm_kernel_L2_BEGIN | ble zgemm_kernel_L2_BEGIN | ||||
| zgemm_kernel_L4_BEGIN: | zgemm_kernel_L4_BEGIN: | ||||
| mov pCRow0, pC // pCRow0 = C | |||||
| add pC, pC, LDC, lsl #2 | |||||
| mov pCRow0, pC | |||||
| add pCRow1, pCRow0, LDC | |||||
| add pCRow2, pCRow1, LDC | |||||
| add pCRow3, pCRow2, LDC | |||||
| add pC, pCRow3, LDC | |||||
| mov pA, origPA // pA = start of A array | mov pA, origPA // pA = start of A array | ||||
| zgemm_kernel_L4_M4_BEGIN: | zgemm_kernel_L4_M4_BEGIN: | ||||
| @@ -1096,42 +1118,68 @@ zgemm_kernel_L4_M4_BEGIN: | |||||
| cmp counterI, #0 | cmp counterI, #0 | ||||
| ble zgemm_kernel_L4_M2_BEGIN | ble zgemm_kernel_L4_M2_BEGIN | ||||
| .align 5 | |||||
| zgemm_kernel_L4_M4_20: | zgemm_kernel_L4_M4_20: | ||||
| mov pB, origPB | mov pB, origPB | ||||
| asr counterL , origK, #1 // L = K / 2 | |||||
| cmp counterL , #2 // is there at least 4 to do? | |||||
| asr counterL , origK, #3 | |||||
| cmp counterL , #2 | |||||
| blt zgemm_kernel_L4_M4_32 | blt zgemm_kernel_L4_M4_32 | ||||
| KERNEL4x4_I // do one in the K | |||||
| KERNEL4x4_M2 // do another in the K | |||||
| KERNEL4x4_I | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| subs counterL, counterL, #2 // subtract 2 | subs counterL, counterL, #2 // subtract 2 | ||||
| ble zgemm_kernel_L4_M4_22a | ble zgemm_kernel_L4_M4_22a | ||||
| .align 5 | |||||
| .align 5 | |||||
| zgemm_kernel_L4_M4_22: | zgemm_kernel_L4_M4_22: | ||||
| KERNEL4x4_M1 | KERNEL4x4_M1 | ||||
| KERNEL4x4_M2 | KERNEL4x4_M2 | ||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt zgemm_kernel_L4_M4_22 | bgt zgemm_kernel_L4_M4_22 | ||||
| .align 5 | |||||
| zgemm_kernel_L4_M4_22a: | zgemm_kernel_L4_M4_22a: | ||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | KERNEL4x4_M1 | ||||
| KERNEL4x4_E | KERNEL4x4_E | ||||
| b zgemm_kernel_L4_M4_44 | b zgemm_kernel_L4_M4_44 | ||||
| .align 5 | |||||
| zgemm_kernel_L4_M4_32: | zgemm_kernel_L4_M4_32: | ||||
| tst counterL, #1 | tst counterL, #1 | ||||
| ble zgemm_kernel_L4_M4_40 | ble zgemm_kernel_L4_M4_40 | ||||
| KERNEL4x4_I | KERNEL4x4_I | ||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_E | KERNEL4x4_E | ||||
| b zgemm_kernel_L4_M4_44 | b zgemm_kernel_L4_M4_44 | ||||
| @@ -1143,13 +1191,20 @@ zgemm_kernel_L4_M4_40: | |||||
| zgemm_kernel_L4_M4_44: | zgemm_kernel_L4_M4_44: | ||||
| ands counterL , origK, #1 | |||||
| ands counterL , origK, #7 | |||||
| ble zgemm_kernel_L4_M4_100 | ble zgemm_kernel_L4_M4_100 | ||||
| .align 5 | |||||
| zgemm_kernel_L4_M4_46: | zgemm_kernel_L4_M4_46: | ||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| subs counterL, counterL, #1 | |||||
| bne zgemm_kernel_L4_M4_46 | |||||
| zgemm_kernel_L4_M4_100: | zgemm_kernel_L4_M4_100: | ||||
| prfm PLDL1KEEP, [pA] | |||||
| prfm PLDL1KEEP, [pA, #64] | |||||
| prfm PLDL1KEEP, [origPB] | |||||
| SAVE4x4 | SAVE4x4 | ||||
| @@ -46,23 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define pCRow0 x12 | #define pCRow0 x12 | ||||
| #define pCRow1 x13 | #define pCRow1 x13 | ||||
| #define pCRow2 x14 | #define pCRow2 x14 | ||||
| #define pA x15 | |||||
| #define alpha_save_R x16 | |||||
| #define alpha_save_I x17 | |||||
| #define temp x18 | |||||
| #define tempOffset x19 | |||||
| #define tempK x20 | |||||
| #define pCRow3 x15 | |||||
| #define pA x16 | |||||
| #define alphaR x17 | |||||
| #define alphaI x18 | |||||
| #define temp x19 | |||||
| #define tempOffset x20 | |||||
| #define tempK x21 | |||||
| #define alpha0_R d10 | #define alpha0_R d10 | ||||
| #define alphaV0_R v10.d[0] | #define alphaV0_R v10.d[0] | ||||
| #define alpha0_I d11 | #define alpha0_I d11 | ||||
| #define alphaV0_I v11.d[0] | #define alphaV0_I v11.d[0] | ||||
| #define alpha1_R d14 | |||||
| #define alphaV1_R v14.d[0] | |||||
| #define alpha1_I d15 | |||||
| #define alphaV1_I v15.d[0] | |||||
| #define A_PRE_SIZE 2560 | |||||
| #define B_PRE_SIZE 448 | |||||
| #define C_PRE_SIZE 128 | |||||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | ||||
| #define OP_rr fmla | #define OP_rr fmla | ||||
| @@ -93,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| // 04 origPB | // 04 origPB | ||||
| // 05 pC | // 05 pC | ||||
| // 06 origLDC -> LDC | // 06 origLDC -> LDC | ||||
| // 07 offset | |||||
| // 07 offset -> temp | |||||
| // 08 counterL | // 08 counterL | ||||
| // 09 counterI | // 09 counterI | ||||
| // 10 counterJ | // 10 counterJ | ||||
| @@ -101,13 +100,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| // 12 pCRow0 | // 12 pCRow0 | ||||
| // 13 pCRow1 | // 13 pCRow1 | ||||
| // 14 pCRow2 | // 14 pCRow2 | ||||
| // 15 pA | |||||
| // 16 alpha_save_R | |||||
| // 17 alpha_save_I | |||||
| // 18 must save temp | |||||
| // 19 must save tempOffset | |||||
| // 20 must save tempK | |||||
| // 21 must save | |||||
| // 15 pCRow3 | |||||
| // 16 pA | |||||
| // 17 alpha_save_R | |||||
| // 18 must save alpha_save_I | |||||
| // 19 must save temp | |||||
| // 20 must save tempOffset | |||||
| // 21 must save tempK | |||||
| // 22 must save | // 22 must save | ||||
| // 23 must save | // 23 must save | ||||
| // 24 must save | // 24 must save | ||||
| @@ -178,12 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro KERNEL4x4_I | .macro KERNEL4x4_I | ||||
| ld2 {v8.2d, v9.2d}, [pB] | ld2 {v8.2d, v9.2d}, [pB] | ||||
| add pB, pB, #32 | add pB, pB, #32 | ||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld2 {v0.2d, v1.2d}, [pA] | ld2 {v0.2d, v1.2d}, [pA] | ||||
| add pA, pA, #32 | add pA, pA, #32 | ||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v16.2d, v0.2d, v8.d[0] | fmul v16.2d, v0.2d, v8.d[0] | ||||
| OP_ii v16.2d, v1.2d, v9.d[0] | OP_ii v16.2d, v1.2d, v9.d[0] | ||||
| @@ -196,16 +191,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v17.2d, v1.2d, v8.d[0] | OP_ir v17.2d, v1.2d, v8.d[0] | ||||
| fmul v18.2d, v2.2d, v8.d[0] | |||||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| eor v19.16b, v19.16b, v19.16b | |||||
| fmls v19.2d, v2.2d, v9.d[0] | |||||
| #else | |||||
| fmul v19.2d, v2.2d, v9.d[0] | |||||
| #endif | |||||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v20.2d, v0.2d, v8.d[1] | fmul v20.2d, v0.2d, v8.d[1] | ||||
| OP_ii v20.2d, v1.2d, v9.d[1] | OP_ii v20.2d, v1.2d, v9.d[1] | ||||
| @@ -218,6 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v21.2d, v1.2d, v8.d[1] | OP_ir v21.2d, v1.2d, v8.d[1] | ||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| fmul v22.2d, v2.2d, v8.d[1] | fmul v22.2d, v2.2d, v8.d[1] | ||||
| OP_ii v22.2d, v3.2d, v9.d[1] | OP_ii v22.2d, v3.2d, v9.d[1] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -229,6 +219,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v23.2d, v3.2d, v8.d[1] | OP_ir v23.2d, v3.2d, v8.d[1] | ||||
| ld2 {v12.2d, v13.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| fmul v18.2d, v2.2d, v8.d[0] | |||||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||||
| eor v19.16b, v19.16b, v19.16b | |||||
| fmls v19.2d, v2.2d, v9.d[0] | |||||
| #else | |||||
| fmul v19.2d, v2.2d, v9.d[0] | |||||
| #endif | |||||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||||
| ld2 {v4.2d, v5.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v24.2d, v0.2d, v10.d[0] | fmul v24.2d, v0.2d, v10.d[0] | ||||
| OP_ii v24.2d, v1.2d, v11.d[0] | OP_ii v24.2d, v1.2d, v11.d[0] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -240,6 +247,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v25.2d, v1.2d, v10.d[0] | OP_ir v25.2d, v1.2d, v10.d[0] | ||||
| ld2 {v6.2d, v7.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| fmul v26.2d, v2.2d, v10.d[0] | fmul v26.2d, v2.2d, v10.d[0] | ||||
| OP_ii v26.2d, v3.2d, v11.d[0] | OP_ii v26.2d, v3.2d, v11.d[0] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -251,6 +261,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v27.2d, v3.2d, v10.d[0] | OP_ir v27.2d, v3.2d, v10.d[0] | ||||
| ld2 {v14.2d, v15.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| fmul v28.2d, v0.2d, v10.d[1] | fmul v28.2d, v0.2d, v10.d[1] | ||||
| OP_ii v28.2d, v1.2d, v11.d[1] | OP_ii v28.2d, v1.2d, v11.d[1] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -262,6 +275,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v29.2d, v1.2d, v10.d[1] | OP_ir v29.2d, v1.2d, v10.d[1] | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| fmul v30.2d, v2.2d, v10.d[1] | fmul v30.2d, v2.2d, v10.d[1] | ||||
| OP_ii v30.2d, v3.2d, v11.d[1] | OP_ii v30.2d, v3.2d, v11.d[1] | ||||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
| @@ -273,14 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #endif | #endif | ||||
| OP_ir v31.2d, v3.2d, v10.d[1] | OP_ir v31.2d, v3.2d, v10.d[1] | ||||
| ld2 {v12.2d, v13.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld2 {v14.2d, v15.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld2 {v4.2d, v5.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| ld2 {v6.2d, v7.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| .endm | .endm | ||||
| .macro KERNEL4x4_M1 | .macro KERNEL4x4_M1 | ||||
| @@ -289,7 +297,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v17.2d, v0.2d, v9.d[0] | OP_ri v17.2d, v0.2d, v9.d[0] | ||||
| OP_ir v17.2d, v1.2d, v8.d[0] | OP_ir v17.2d, v1.2d, v8.d[0] | ||||
| ld2 {v12.2d, v13.2d}, [pB] // For next round | |||||
| ld2 {v12.2d, v13.2d}, [pB] | |||||
| add pB, pB, #32 | add pB, pB, #32 | ||||
| OP_rr v18.2d, v2.2d, v8.d[0] | OP_rr v18.2d, v2.2d, v8.d[0] | ||||
| @@ -297,15 +305,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v19.2d, v2.2d, v9.d[0] | OP_ri v19.2d, v2.2d, v9.d[0] | ||||
| OP_ir v19.2d, v3.2d, v8.d[0] | OP_ir v19.2d, v3.2d, v8.d[0] | ||||
| ld2 {v14.2d, v15.2d}, [pB] // For next round | |||||
| add pB, pB, #32 | |||||
| ld2 {v4.2d, v5.2d} , [pA] | |||||
| add pA, pA, #32 | |||||
| OP_rr v20.2d, v0.2d, v8.d[1] | OP_rr v20.2d, v0.2d, v8.d[1] | ||||
| OP_ii v20.2d, v1.2d, v9.d[1] | OP_ii v20.2d, v1.2d, v9.d[1] | ||||
| OP_ri v21.2d, v0.2d, v9.d[1] | OP_ri v21.2d, v0.2d, v9.d[1] | ||||
| OP_ir v21.2d, v1.2d, v8.d[1] | OP_ir v21.2d, v1.2d, v8.d[1] | ||||
| ld2 {v4.2d, v5.2d} , [pA] // For next round | |||||
| ld2 {v6.2d, v7.2d} , [pA] | |||||
| add pA, pA, #32 | add pA, pA, #32 | ||||
| OP_rr v22.2d, v2.2d, v8.d[1] | OP_rr v22.2d, v2.2d, v8.d[1] | ||||
| @@ -313,22 +321,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v23.2d, v2.2d, v9.d[1] | OP_ri v23.2d, v2.2d, v9.d[1] | ||||
| OP_ir v23.2d, v3.2d, v8.d[1] | OP_ir v23.2d, v3.2d, v8.d[1] | ||||
| ld2 {v6.2d, v7.2d} , [pA] // For next round | |||||
| add pA, pA, #32 | |||||
| ld2 {v14.2d, v15.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| OP_rr v24.2d, v0.2d, v10.d[0] | OP_rr v24.2d, v0.2d, v10.d[0] | ||||
| OP_ii v24.2d, v1.2d, v11.d[0] | OP_ii v24.2d, v1.2d, v11.d[0] | ||||
| OP_ri v25.2d, v0.2d, v11.d[0] | OP_ri v25.2d, v0.2d, v11.d[0] | ||||
| OP_ir v25.2d, v1.2d, v10.d[0] | OP_ir v25.2d, v1.2d, v10.d[0] | ||||
| prfm PLDL1KEEP, [pA, #512] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| OP_rr v26.2d, v2.2d, v10.d[0] | OP_rr v26.2d, v2.2d, v10.d[0] | ||||
| OP_ii v26.2d, v3.2d, v11.d[0] | OP_ii v26.2d, v3.2d, v11.d[0] | ||||
| OP_ri v27.2d, v2.2d, v11.d[0] | OP_ri v27.2d, v2.2d, v11.d[0] | ||||
| OP_ir v27.2d, v3.2d, v10.d[0] | OP_ir v27.2d, v3.2d, v10.d[0] | ||||
| prfm PLDL1KEEP, [pB, #512] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| OP_rr v28.2d, v0.2d, v10.d[1] | OP_rr v28.2d, v0.2d, v10.d[1] | ||||
| OP_ii v28.2d, v1.2d, v11.d[1] | OP_ii v28.2d, v1.2d, v11.d[1] | ||||
| @@ -347,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v17.2d, v4.2d, v13.d[0] | OP_ri v17.2d, v4.2d, v13.d[0] | ||||
| OP_ir v17.2d, v5.2d, v12.d[0] | OP_ir v17.2d, v5.2d, v12.d[0] | ||||
| ld2 {v8.2d, v9.2d}, [pB] // For next round | |||||
| ld2 {v8.2d, v9.2d}, [pB] | |||||
| add pB, pB, #32 | add pB, pB, #32 | ||||
| OP_rr v18.2d, v6.2d, v12.d[0] | OP_rr v18.2d, v6.2d, v12.d[0] | ||||
| @@ -355,15 +363,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v19.2d, v6.2d, v13.d[0] | OP_ri v19.2d, v6.2d, v13.d[0] | ||||
| OP_ir v19.2d, v7.2d, v12.d[0] | OP_ir v19.2d, v7.2d, v12.d[0] | ||||
| ld2 {v10.2d, v11.2d}, [pB] // For next round | |||||
| add pB, pB, #32 | |||||
| ld2 {v0.2d, v1.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| OP_rr v20.2d, v4.2d, v12.d[1] | OP_rr v20.2d, v4.2d, v12.d[1] | ||||
| OP_ii v20.2d, v5.2d, v13.d[1] | OP_ii v20.2d, v5.2d, v13.d[1] | ||||
| OP_ri v21.2d, v4.2d, v13.d[1] | OP_ri v21.2d, v4.2d, v13.d[1] | ||||
| OP_ir v21.2d, v5.2d, v12.d[1] | OP_ir v21.2d, v5.2d, v12.d[1] | ||||
| ld2 {v0.2d, v1.2d}, [pA] // For next round | |||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | add pA, pA, #32 | ||||
| OP_rr v22.2d, v6.2d, v12.d[1] | OP_rr v22.2d, v6.2d, v12.d[1] | ||||
| @@ -371,22 +379,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v23.2d, v6.2d, v13.d[1] | OP_ri v23.2d, v6.2d, v13.d[1] | ||||
| OP_ir v23.2d, v7.2d, v12.d[1] | OP_ir v23.2d, v7.2d, v12.d[1] | ||||
| ld2 {v2.2d, v3.2d}, [pA] // For next round | |||||
| add pA, pA, #32 | |||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| OP_rr v24.2d, v4.2d, v14.d[0] | OP_rr v24.2d, v4.2d, v14.d[0] | ||||
| OP_ii v24.2d, v5.2d, v15.d[0] | OP_ii v24.2d, v5.2d, v15.d[0] | ||||
| OP_ri v25.2d, v4.2d, v15.d[0] | OP_ri v25.2d, v4.2d, v15.d[0] | ||||
| OP_ir v25.2d, v5.2d, v14.d[0] | OP_ir v25.2d, v5.2d, v14.d[0] | ||||
| prfm PLDL1KEEP, [pA, #512] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| OP_rr v26.2d, v6.2d, v14.d[0] | OP_rr v26.2d, v6.2d, v14.d[0] | ||||
| OP_ii v26.2d, v7.2d, v15.d[0] | OP_ii v26.2d, v7.2d, v15.d[0] | ||||
| OP_ri v27.2d, v6.2d, v15.d[0] | OP_ri v27.2d, v6.2d, v15.d[0] | ||||
| OP_ir v27.2d, v7.2d, v14.d[0] | OP_ir v27.2d, v7.2d, v14.d[0] | ||||
| prfm PLDL1KEEP, [pB, #512] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||||
| OP_rr v28.2d, v4.2d, v14.d[1] | OP_rr v28.2d, v4.2d, v14.d[1] | ||||
| OP_ii v28.2d, v5.2d, v15.d[1] | OP_ii v28.2d, v5.2d, v15.d[1] | ||||
| @@ -415,6 +423,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v21.2d, v4.2d, v13.d[1] | OP_ri v21.2d, v4.2d, v13.d[1] | ||||
| OP_ir v21.2d, v5.2d, v12.d[1] | OP_ir v21.2d, v5.2d, v12.d[1] | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| OP_rr v22.2d, v6.2d, v12.d[1] | OP_rr v22.2d, v6.2d, v12.d[1] | ||||
| OP_ii v22.2d, v7.2d, v13.d[1] | OP_ii v22.2d, v7.2d, v13.d[1] | ||||
| OP_ri v23.2d, v6.2d, v13.d[1] | OP_ri v23.2d, v6.2d, v13.d[1] | ||||
| @@ -425,6 +435,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| OP_ri v25.2d, v4.2d, v15.d[0] | OP_ri v25.2d, v4.2d, v15.d[0] | ||||
| OP_ir v25.2d, v5.2d, v14.d[0] | OP_ir v25.2d, v5.2d, v14.d[0] | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||||
| OP_rr v26.2d, v6.2d, v14.d[0] | OP_rr v26.2d, v6.2d, v14.d[0] | ||||
| OP_ii v26.2d, v7.2d, v15.d[0] | OP_ii v26.2d, v7.2d, v15.d[0] | ||||
| OP_ri v27.2d, v6.2d, v15.d[0] | OP_ri v27.2d, v6.2d, v15.d[0] | ||||
| @@ -444,33 +456,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro KERNEL4x4_SUB | .macro KERNEL4x4_SUB | ||||
| ld2 {v8.2d, v9.2d}, [pB] | ld2 {v8.2d, v9.2d}, [pB] | ||||
| add pB, pB, #32 | add pB, pB, #32 | ||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| ld2 {v0.2d, v1.2d}, [pA] | ld2 {v0.2d, v1.2d}, [pA] | ||||
| add pA, pA, #32 | add pA, pA, #32 | ||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| OP_rr v16.2d, v0.2d, v8.d[0] | OP_rr v16.2d, v0.2d, v8.d[0] | ||||
| OP_ii v16.2d, v1.2d, v9.d[0] | OP_ii v16.2d, v1.2d, v9.d[0] | ||||
| OP_ri v17.2d, v0.2d, v9.d[0] | OP_ri v17.2d, v0.2d, v9.d[0] | ||||
| OP_ir v17.2d, v1.2d, v8.d[0] | OP_ir v17.2d, v1.2d, v8.d[0] | ||||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||||
| ld2 {v2.2d, v3.2d}, [pA] | |||||
| add pA, pA, #32 | |||||
| OP_rr v20.2d, v0.2d, v8.d[1] | OP_rr v20.2d, v0.2d, v8.d[1] | ||||
| OP_ii v20.2d, v1.2d, v9.d[1] | OP_ii v20.2d, v1.2d, v9.d[1] | ||||
| OP_ri v21.2d, v0.2d, v9.d[1] | OP_ri v21.2d, v0.2d, v9.d[1] | ||||
| OP_ir v21.2d, v1.2d, v8.d[1] | OP_ir v21.2d, v1.2d, v8.d[1] | ||||
| ld2 {v10.2d, v11.2d}, [pB] | |||||
| add pB, pB, #32 | |||||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| OP_rr v22.2d, v2.2d, v8.d[1] | OP_rr v22.2d, v2.2d, v8.d[1] | ||||
| OP_ii v22.2d, v3.2d, v9.d[1] | OP_ii v22.2d, v3.2d, v9.d[1] | ||||
| OP_ri v23.2d, v2.2d, v9.d[1] | OP_ri v23.2d, v2.2d, v9.d[1] | ||||
| OP_ir v23.2d, v3.2d, v8.d[1] | OP_ir v23.2d, v3.2d, v8.d[1] | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| OP_rr v24.2d, v0.2d, v10.d[0] | OP_rr v24.2d, v0.2d, v10.d[0] | ||||
| OP_ii v24.2d, v1.2d, v11.d[0] | OP_ii v24.2d, v1.2d, v11.d[0] | ||||
| OP_ri v25.2d, v0.2d, v11.d[0] | OP_ri v25.2d, v0.2d, v11.d[0] | ||||
| @@ -493,66 +512,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x4 | .macro SAVE4x4 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| fmul v0.2d, v16.2d, alphaV0_R | fmul v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmul v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||||
| add pCRow2, pCRow1, #32 | |||||
| fmul v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow0] | |||||
| add pCRow0, pCRow0, #32 | |||||
| fmul v2.2d, v18.2d, alphaV0_R | fmul v2.2d, v18.2d, alphaV0_R | ||||
| fmls v2.2d, v19.2d, alphaV0_I | fmls v2.2d, v19.2d, alphaV0_I | ||||
| fmul v3.2d, v18.2d, alphaV1_I | |||||
| fmla v3.2d, v19.2d, alphaV1_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||||
| fmul v3.2d, v18.2d, alphaV0_I | |||||
| fmla v3.2d, v19.2d, alphaV0_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow0] | |||||
| add pCRow0, pCRow0, #32 | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, LDC | |||||
| fmul v4.2d, v20.2d, alphaV0_R | fmul v4.2d, v20.2d, alphaV0_R | ||||
| fmls v4.2d, v21.2d, alphaV0_I | fmls v4.2d, v21.2d, alphaV0_I | ||||
| fmul v5.2d, v20.2d, alphaV1_I | |||||
| fmla v5.2d, v21.2d, alphaV1_R | |||||
| fmul v5.2d, v20.2d, alphaV0_I | |||||
| fmla v5.2d, v21.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, #32 | |||||
| add pCRow1, pCRow1, #32 | |||||
| fmul v6.2d, v22.2d, alphaV0_R | fmul v6.2d, v22.2d, alphaV0_R | ||||
| fmls v6.2d, v23.2d, alphaV0_I | fmls v6.2d, v23.2d, alphaV0_I | ||||
| fmul v7.2d, v22.2d, alphaV1_I | |||||
| fmla v7.2d, v23.2d, alphaV1_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||||
| fmul v7.2d, v22.2d, alphaV0_I | |||||
| fmla v7.2d, v23.2d, alphaV0_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow1] | |||||
| add pCRow1, pCRow1, #32 | |||||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, LDC | |||||
| fmul v0.2d, v24.2d, alphaV0_R | fmul v0.2d, v24.2d, alphaV0_R | ||||
| fmls v0.2d, v25.2d, alphaV0_I | fmls v0.2d, v25.2d, alphaV0_I | ||||
| fmul v1.2d, v24.2d, alphaV1_I | |||||
| fmla v1.2d, v25.2d, alphaV1_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||||
| add pCRow2, pCRow1, #32 | |||||
| fmul v1.2d, v24.2d, alphaV0_I | |||||
| fmla v1.2d, v25.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow2] | |||||
| add pCRow2, pCRow2, #32 | |||||
| fmul v2.2d, v26.2d, alphaV0_R | fmul v2.2d, v26.2d, alphaV0_R | ||||
| fmls v2.2d, v27.2d, alphaV0_I | fmls v2.2d, v27.2d, alphaV0_I | ||||
| fmul v3.2d, v26.2d, alphaV1_I | |||||
| fmla v3.2d, v27.2d, alphaV1_R | |||||
| fmul v3.2d, v26.2d, alphaV0_I | |||||
| fmla v3.2d, v27.2d, alphaV0_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow2] | st2 {v2.2d, v3.2d}, [pCRow2] | ||||
| add pCRow1, pCRow1, LDC | |||||
| add pCRow2, pCRow2, #32 | |||||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||||
| fmul v4.2d, v28.2d, alphaV0_R | fmul v4.2d, v28.2d, alphaV0_R | ||||
| fmls v4.2d, v29.2d, alphaV0_I | fmls v4.2d, v29.2d, alphaV0_I | ||||
| fmul v5.2d, v28.2d, alphaV1_I | |||||
| fmla v5.2d, v29.2d, alphaV1_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||||
| add pCRow2, pCRow1, #32 | |||||
| fmul v5.2d, v28.2d, alphaV0_I | |||||
| fmla v5.2d, v29.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow3] | |||||
| add pCRow3, pCRow3, #32 | |||||
| fmul v6.2d, v30.2d, alphaV0_R | fmul v6.2d, v30.2d, alphaV0_R | ||||
| fmls v6.2d, v31.2d, alphaV0_I | fmls v6.2d, v31.2d, alphaV0_I | ||||
| fmul v7.2d, v30.2d, alphaV1_I | |||||
| fmla v7.2d, v31.2d, alphaV1_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||||
| fmul v7.2d, v30.2d, alphaV0_I | |||||
| fmla v7.2d, v31.2d, alphaV0_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow3] | |||||
| add pCRow0, pCRow0, #64 | |||||
| add pCRow3, pCRow3, #32 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -599,41 +629,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x4 | .macro SAVE2x4 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| fmul v0.2d, v16.2d, alphaV0_R | fmul v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmul v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmul v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul v4.2d, v20.2d, alphaV0_R | fmul v4.2d, v20.2d, alphaV0_R | ||||
| fmls v4.2d, v21.2d, alphaV0_I | fmls v4.2d, v21.2d, alphaV0_I | ||||
| fmul v5.2d, v20.2d, alphaV1_I | |||||
| fmla v5.2d, v21.2d, alphaV1_R | |||||
| fmul v5.2d, v20.2d, alphaV0_I | |||||
| fmla v5.2d, v21.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul v0.2d, v24.2d, alphaV0_R | fmul v0.2d, v24.2d, alphaV0_R | ||||
| fmls v0.2d, v25.2d, alphaV0_I | fmls v0.2d, v25.2d, alphaV0_I | ||||
| fmul v1.2d, v24.2d, alphaV1_I | |||||
| fmla v1.2d, v25.2d, alphaV1_R | |||||
| fmul v1.2d, v24.2d, alphaV0_I | |||||
| fmla v1.2d, v25.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul v4.2d, v28.2d, alphaV0_R | fmul v4.2d, v28.2d, alphaV0_R | ||||
| fmls v4.2d, v29.2d, alphaV0_I | fmls v4.2d, v29.2d, alphaV0_I | ||||
| fmul v5.2d, v28.2d, alphaV1_I | |||||
| fmla v5.2d, v29.2d, alphaV1_R | |||||
| fmul v5.2d, v28.2d, alphaV0_I | |||||
| fmla v5.2d, v29.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -682,41 +710,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x4 | .macro SAVE1x4 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| fmul d0, d16, alphaV0_R | fmul d0, d16, alphaV0_R | ||||
| fmls d0, d17, alphaV0_I | fmls d0, d17, alphaV0_I | ||||
| fmul d1, d16, alphaV1_I | |||||
| fmla d1, d17, alphaV1_R | |||||
| fmul d1, d16, alphaV0_I | |||||
| fmla d1, d17, alphaV0_R | |||||
| st2 {v0.d, v1.d}[0], [pCRow1] | st2 {v0.d, v1.d}[0], [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul d4, d20, alphaV0_R | fmul d4, d20, alphaV0_R | ||||
| fmls d4, d21, alphaV0_I | fmls d4, d21, alphaV0_I | ||||
| fmul d5, d20, alphaV1_I | |||||
| fmla d5, d21, alphaV1_R | |||||
| fmul d5, d20, alphaV0_I | |||||
| fmla d5, d21, alphaV0_R | |||||
| st2 {v4.d, v5.d}[0], [pCRow1] | st2 {v4.d, v5.d}[0], [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul d0, d24, alphaV0_R | fmul d0, d24, alphaV0_R | ||||
| fmls d0, d25, alphaV0_I | fmls d0, d25, alphaV0_I | ||||
| fmul d1, d24, alphaV1_I | |||||
| fmla d1, d25, alphaV1_R | |||||
| fmul d1, d24, alphaV0_I | |||||
| fmla d1, d25, alphaV0_R | |||||
| st2 {v0.d, v1.d}[0], [pCRow1] | st2 {v0.d, v1.d}[0], [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul d4, d28, alphaV0_R | fmul d4, d28, alphaV0_R | ||||
| fmls d4, d29, alphaV0_I | fmls d4, d29, alphaV0_I | ||||
| fmul d5, d28, alphaV1_I | |||||
| fmla d5, d29, alphaV1_R | |||||
| fmul d5, d28, alphaV0_I | |||||
| fmla d5, d29, alphaV0_R | |||||
| st2 {v4.d, v5.d}[0], [pCRow1] | st2 {v4.d, v5.d}[0], [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| @@ -765,37 +791,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x2 | .macro SAVE4x2 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| fmul v0.2d, v16.2d, alphaV0_R | fmul v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmul v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmul v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, #32 | add pCRow2, pCRow1, #32 | ||||
| fmul v2.2d, v18.2d, alphaV0_R | fmul v2.2d, v18.2d, alphaV0_R | ||||
| fmls v2.2d, v19.2d, alphaV0_I | fmls v2.2d, v19.2d, alphaV0_I | ||||
| fmul v3.2d, v18.2d, alphaV1_I | |||||
| fmla v3.2d, v19.2d, alphaV1_R | |||||
| fmul v3.2d, v18.2d, alphaV0_I | |||||
| fmla v3.2d, v19.2d, alphaV0_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow2] | st2 {v2.2d, v3.2d}, [pCRow2] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul v4.2d, v20.2d, alphaV0_R | fmul v4.2d, v20.2d, alphaV0_R | ||||
| fmls v4.2d, v21.2d, alphaV0_I | fmls v4.2d, v21.2d, alphaV0_I | ||||
| fmul v5.2d, v20.2d, alphaV1_I | |||||
| fmla v5.2d, v21.2d, alphaV1_R | |||||
| fmul v5.2d, v20.2d, alphaV0_I | |||||
| fmla v5.2d, v21.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, #32 | add pCRow2, pCRow1, #32 | ||||
| fmul v6.2d, v22.2d, alphaV0_R | fmul v6.2d, v22.2d, alphaV0_R | ||||
| fmls v6.2d, v23.2d, alphaV0_I | fmls v6.2d, v23.2d, alphaV0_I | ||||
| fmul v7.2d, v22.2d, alphaV1_I | |||||
| fmla v7.2d, v23.2d, alphaV1_R | |||||
| fmul v7.2d, v22.2d, alphaV0_I | |||||
| fmla v7.2d, v23.2d, alphaV0_R | |||||
| st2 {v6.2d, v7.2d}, [pCRow2] | st2 {v6.2d, v7.2d}, [pCRow2] | ||||
| add pCRow0, pCRow0, #64 | add pCRow0, pCRow0, #64 | ||||
| @@ -828,25 +852,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x2 | .macro SAVE2x2 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| fmul v0.2d, v16.2d, alphaV0_R | fmul v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmul v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmul v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul v4.2d, v20.2d, alphaV0_R | fmul v4.2d, v20.2d, alphaV0_R | ||||
| fmls v4.2d, v21.2d, alphaV0_I | fmls v4.2d, v21.2d, alphaV0_I | ||||
| fmul v5.2d, v20.2d, alphaV1_I | |||||
| fmla v5.2d, v21.2d, alphaV1_R | |||||
| fmul v5.2d, v20.2d, alphaV0_I | |||||
| fmla v5.2d, v21.2d, alphaV0_R | |||||
| st2 {v4.2d, v5.2d}, [pCRow1] | st2 {v4.2d, v5.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -879,25 +901,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x2 | .macro SAVE1x2 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| fmul d0, d16, alphaV0_R | fmul d0, d16, alphaV0_R | ||||
| fmls d0, d17, alphaV0_I | fmls d0, d17, alphaV0_I | ||||
| fmul d1, d16, alphaV1_I | |||||
| fmla d1, d17, alphaV1_R | |||||
| fmul d1, d16, alphaV0_I | |||||
| fmla d1, d17, alphaV0_R | |||||
| st2 {v0.d, v1.d}[0], [pCRow1] | st2 {v0.d, v1.d}[0], [pCRow1] | ||||
| add pCRow1, pCRow1, LDC | add pCRow1, pCRow1, LDC | ||||
| fmul d4, d20, alphaV0_R | fmul d4, d20, alphaV0_R | ||||
| fmls d4, d21, alphaV0_I | fmls d4, d21, alphaV0_I | ||||
| fmul d5, d20, alphaV1_I | |||||
| fmla d5, d21, alphaV1_R | |||||
| fmul d5, d20, alphaV0_I | |||||
| fmla d5, d21, alphaV0_R | |||||
| st2 {v4.d, v5.d}[0], [pCRow1] | st2 {v4.d, v5.d}[0], [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| @@ -932,23 +952,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE4x1 | .macro SAVE4x1 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| fmul v0.2d, v16.2d, alphaV0_R | fmul v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmul v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmul v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, #32 | add pCRow2, pCRow1, #32 | ||||
| fmul v2.2d, v18.2d, alphaV0_R | fmul v2.2d, v18.2d, alphaV0_R | ||||
| fmls v2.2d, v19.2d, alphaV0_I | fmls v2.2d, v19.2d, alphaV0_I | ||||
| fmul v3.2d, v18.2d, alphaV1_I | |||||
| fmla v3.2d, v19.2d, alphaV1_R | |||||
| fmul v3.2d, v18.2d, alphaV0_I | |||||
| fmla v3.2d, v19.2d, alphaV0_R | |||||
| st2 {v2.2d, v3.2d}, [pCRow2] | st2 {v2.2d, v3.2d}, [pCRow2] | ||||
| add pCRow0, pCRow0, #64 | add pCRow0, pCRow0, #64 | ||||
| @@ -974,17 +992,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE2x1 | .macro SAVE2x1 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| fmul v0.2d, v16.2d, alphaV0_R | fmul v0.2d, v16.2d, alphaV0_R | ||||
| fmls v0.2d, v17.2d, alphaV0_I | fmls v0.2d, v17.2d, alphaV0_I | ||||
| fmul v1.2d, v16.2d, alphaV1_I | |||||
| fmla v1.2d, v17.2d, alphaV1_R | |||||
| fmul v1.2d, v16.2d, alphaV0_I | |||||
| fmla v1.2d, v17.2d, alphaV0_R | |||||
| st2 {v0.2d, v1.2d}, [pCRow1] | st2 {v0.2d, v1.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| @@ -1011,17 +1027,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .endm | .endm | ||||
| .macro SAVE1x1 | .macro SAVE1x1 | ||||
| fmov alpha0_R, alpha_save_R | |||||
| fmov alpha0_I, alpha_save_I | |||||
| fmov alpha1_R, alpha0_R | |||||
| fmov alpha1_I, alpha0_I | |||||
| fmov alpha0_R, alphaR | |||||
| fmov alpha0_I, alphaI | |||||
| mov pCRow1, pCRow0 | mov pCRow1, pCRow0 | ||||
| fmul d0, d16, alphaV0_R | fmul d0, d16, alphaV0_R | ||||
| fmls d0, d17, alphaV0_I | fmls d0, d17, alphaV0_I | ||||
| fmul d1, d16, alphaV1_I | |||||
| fmla d1, d17, alphaV1_R | |||||
| fmul d1, d16, alphaV0_I | |||||
| fmla d1, d17, alphaV0_R | |||||
| st2 {v0.d, v1.d}[0], [pCRow1] | st2 {v0.d, v1.d}[0], [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| @@ -1047,8 +1061,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stp x26, x27, [sp, #(9 * 16)] | stp x26, x27, [sp, #(9 * 16)] | ||||
| str x28, [sp, #(10 * 16)] | str x28, [sp, #(10 * 16)] | ||||
| fmov alpha_save_R, d0 | |||||
| fmov alpha_save_I, d1 | |||||
| prfm PLDL1KEEP, [origPB] | |||||
| prfm PLDL1KEEP, [origPA] | |||||
| fmov alphaR, d0 | |||||
| fmov alphaI, d1 | |||||
| lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 | lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 | ||||
| @@ -1064,8 +1081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| ble ztrmm_kernel_L2_BEGIN | ble ztrmm_kernel_L2_BEGIN | ||||
| ztrmm_kernel_L4_BEGIN: | ztrmm_kernel_L4_BEGIN: | ||||
| mov pCRow0, pC // pCRow0 = C | |||||
| add pC, pC, LDC, lsl #2 | |||||
| mov pCRow0, pC | |||||
| add pCRow1, pCRow0, LDC | |||||
| add pCRow2, pCRow1, LDC | |||||
| add pCRow3, pCRow2, LDC | |||||
| add pC, pCRow3, LDC | |||||
| #if defined(LEFT) | #if defined(LEFT) | ||||
| mov tempOffset, offset | mov tempOffset, offset | ||||
| @@ -1079,6 +1101,7 @@ ztrmm_kernel_L4_M4_BEGIN: | |||||
| cmp counterI, #0 | cmp counterI, #0 | ||||
| ble ztrmm_kernel_L4_M2_BEGIN | ble ztrmm_kernel_L4_M2_BEGIN | ||||
| .align 5 | |||||
| ztrmm_kernel_L4_M4_20: | ztrmm_kernel_L4_M4_20: | ||||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| @@ -1098,39 +1121,64 @@ ztrmm_kernel_L4_M4_20: | |||||
| add tempK, tempOffset, #4 | add tempK, tempOffset, #4 | ||||
| #endif | #endif | ||||
| asr counterL , tempK, #1 // L = K / 2 | |||||
| cmp counterL , #2 // is there at least 4 to do? | |||||
| asr counterL , tempK, #3 | |||||
| cmp counterL , #2 | |||||
| blt ztrmm_kernel_L4_M4_32 | blt ztrmm_kernel_L4_M4_32 | ||||
| KERNEL4x4_I // do one in the K | |||||
| KERNEL4x4_M2 // do another in the K | |||||
| KERNEL4x4_I | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| subs counterL, counterL, #2 | subs counterL, counterL, #2 | ||||
| ble ztrmm_kernel_L4_M4_22a | ble ztrmm_kernel_L4_M4_22a | ||||
| .align 5 | |||||
| .align 5 | |||||
| ztrmm_kernel_L4_M4_22: | ztrmm_kernel_L4_M4_22: | ||||
| KERNEL4x4_M1 | KERNEL4x4_M1 | ||||
| KERNEL4x4_M2 | KERNEL4x4_M2 | ||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt ztrmm_kernel_L4_M4_22 | bgt ztrmm_kernel_L4_M4_22 | ||||
| .align 5 | |||||
| ztrmm_kernel_L4_M4_22a: | ztrmm_kernel_L4_M4_22a: | ||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | KERNEL4x4_M1 | ||||
| KERNEL4x4_E | KERNEL4x4_E | ||||
| b ztrmm_kernel_L4_M4_44 | b ztrmm_kernel_L4_M4_44 | ||||
| .align 5 | |||||
| ztrmm_kernel_L4_M4_32: | ztrmm_kernel_L4_M4_32: | ||||
| tst counterL, #1 | tst counterL, #1 | ||||
| ble ztrmm_kernel_L4_M4_40 | ble ztrmm_kernel_L4_M4_40 | ||||
| KERNEL4x4_I | KERNEL4x4_I | ||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_M2 | |||||
| KERNEL4x4_M1 | |||||
| KERNEL4x4_E | KERNEL4x4_E | ||||
| b ztrmm_kernel_L4_M4_44 | b ztrmm_kernel_L4_M4_44 | ||||
| @@ -1142,12 +1190,16 @@ ztrmm_kernel_L4_M4_40: | |||||
| ztrmm_kernel_L4_M4_44: | ztrmm_kernel_L4_M4_44: | ||||
| ands counterL , tempK, #1 | |||||
| ands counterL , tempK, #7 | |||||
| ble ztrmm_kernel_L4_M4_100 | ble ztrmm_kernel_L4_M4_100 | ||||
| .align 5 | |||||
| ztrmm_kernel_L4_M4_46: | ztrmm_kernel_L4_M4_46: | ||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| subs counterL, counterL, #1 | |||||
| bne ztrmm_kernel_L4_M4_46 | |||||
| ztrmm_kernel_L4_M4_100: | ztrmm_kernel_L4_M4_100: | ||||
| SAVE4x4 | SAVE4x4 | ||||
| @@ -1167,6 +1219,10 @@ ztrmm_kernel_L4_M4_100: | |||||
| add tempOffset, tempOffset, #4 | add tempOffset, tempOffset, #4 | ||||
| #endif | #endif | ||||
| prfm PLDL1KEEP, [pA] | |||||
| prfm PLDL1KEEP, [pA, #64] | |||||
| prfm PLDL1KEEP, [origPB] | |||||
| ztrmm_kernel_L4_M4_END: | ztrmm_kernel_L4_M4_END: | ||||
| subs counterI, counterI, #1 | subs counterI, counterI, #1 | ||||
| bne ztrmm_kernel_L4_M4_20 | bne ztrmm_kernel_L4_M4_20 | ||||
| @@ -2341,13 +2341,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #define GEMM_DEFAULT_OFFSET_B 0 | #define GEMM_DEFAULT_OFFSET_B 0 | ||||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | #define GEMM_DEFAULT_ALIGN 0x03fffUL | ||||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||||
| #define SGEMM_DEFAULT_UNROLL_N 4 | #define SGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define DGEMM_DEFAULT_UNROLL_N 4 | #define DGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define CGEMM_DEFAULT_UNROLL_M 4 | |||||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||||
| #define CGEMM_DEFAULT_UNROLL_N 4 | #define CGEMM_DEFAULT_UNROLL_N 4 | ||||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | #define ZGEMM_DEFAULT_UNROLL_M 4 | ||||