| @@ -339,7 +339,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stp q0, q1, [pCRow0] | stp q0, q1, [pCRow0] | ||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| ldp q2, q3, [pCRow0] | ldp q2, q3, [pCRow0] | ||||
| fmla v2.2d, v18.2d, alphaV0 | fmla v2.2d, v18.2d, alphaV0 | ||||
| @@ -356,7 +355,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stp q4, q5, [pCRow1] | stp q4, q5, [pCRow1] | ||||
| add pCRow1, pCRow1, #32 | add pCRow1, pCRow1, #32 | ||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| ldp q6, q7, [pCRow1] | ldp q6, q7, [pCRow1] | ||||
| fmla v6.2d, v22.2d, alphaV0 | fmla v6.2d, v22.2d, alphaV0 | ||||
| @@ -373,7 +371,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stp q0, q1, [pCRow2] | stp q0, q1, [pCRow2] | ||||
| add pCRow2, pCRow2, #32 | add pCRow2, pCRow2, #32 | ||||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||||
| ldp q2, q3, [pCRow2] | ldp q2, q3, [pCRow2] | ||||
| fmla v2.2d, v26.2d, alphaV0 | fmla v2.2d, v26.2d, alphaV0 | ||||
| @@ -390,7 +387,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| stp q4, q5, [pCRow3] | stp q4, q5, [pCRow3] | ||||
| add pCRow3, pCRow3, #32 | add pCRow3, pCRow3, #32 | ||||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||||
| ldp q6, q7, [pCRow3] | ldp q6, q7, [pCRow3] | ||||
| fmla v6.2d, v30.2d, alphaV0 | fmla v6.2d, v30.2d, alphaV0 | ||||
| @@ -434,33 +430,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE4x4 | .macro SAVE4x4 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| ld1 {v8.2d, v9.2d}, [pCRow0] | ld1 {v8.2d, v9.2d}, [pCRow0] | ||||
| fmla v8.2d, v16.2d, alphaV0 | fmla v8.2d, v16.2d, alphaV0 | ||||
| fmla v9.2d, v17.2d, alphaV0 | fmla v9.2d, v17.2d, alphaV0 | ||||
| st1 {v8.2d, v9.2d}, [pCRow0] | st1 {v8.2d, v9.2d}, [pCRow0] | ||||
| add pCRow1, pCRow0, LDC | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #32 | |||||
| ld1 {v12.2d, v13.2d}, [pCRow1] | ld1 {v12.2d, v13.2d}, [pCRow1] | ||||
| fmla v12.2d, v20.2d, alphaV0 | fmla v12.2d, v20.2d, alphaV0 | ||||
| fmla v13.2d, v21.2d, alphaV0 | fmla v13.2d, v21.2d, alphaV0 | ||||
| st1 {v12.2d, v13.2d}, [pCRow1] | st1 {v12.2d, v13.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, LDC | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, #32 | |||||
| ld1 {v8.2d, v9.2d}, [pCRow2] | ld1 {v8.2d, v9.2d}, [pCRow2] | ||||
| fmla v8.2d, v24.2d, alphaV0 | fmla v8.2d, v24.2d, alphaV0 | ||||
| fmla v9.2d, v25.2d, alphaV0 | fmla v9.2d, v25.2d, alphaV0 | ||||
| st1 {v8.2d, v9.2d}, [pCRow2] | st1 {v8.2d, v9.2d}, [pCRow2] | ||||
| add pCRow1, pCRow2, LDC | |||||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||||
| add pCRow2, pCRow2, #32 | |||||
| ld1 {v12.2d, v13.2d}, [pCRow1] | |||||
| ld1 {v12.2d, v13.2d}, [pCRow3] | |||||
| fmla v12.2d, v28.2d, alphaV0 | fmla v12.2d, v28.2d, alphaV0 | ||||
| fmla v13.2d, v29.2d, alphaV0 | fmla v13.2d, v29.2d, alphaV0 | ||||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||||
| st1 {v12.2d, v13.2d}, [pCRow3] | |||||
| add pCRow0, pCRow0, #32 | |||||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||||
| add pCRow3, pCRow3, #32 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -487,29 +488,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE2x4 | .macro SAVE2x4 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| ld1 {v8.2d}, [pCRow0] | ld1 {v8.2d}, [pCRow0] | ||||
| fmla v8.2d, v16.2d, alphaV0 | fmla v8.2d, v16.2d, alphaV0 | ||||
| st1 {v8.2d}, [pCRow0] | st1 {v8.2d}, [pCRow0] | ||||
| add pCRow1, pCRow0, LDC | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #16 | |||||
| ld1 {v12.2d}, [pCRow1] | ld1 {v12.2d}, [pCRow1] | ||||
| fmla v12.2d, v20.2d, alphaV0 | fmla v12.2d, v20.2d, alphaV0 | ||||
| st1 {v12.2d}, [pCRow1] | st1 {v12.2d}, [pCRow1] | ||||
| add pCRow2, pCRow1, LDC | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, #16 | |||||
| ld1 {v8.2d}, [pCRow2] | ld1 {v8.2d}, [pCRow2] | ||||
| fmla v8.2d, v24.2d, alphaV0 | fmla v8.2d, v24.2d, alphaV0 | ||||
| st1 {v8.2d}, [pCRow2] | st1 {v8.2d}, [pCRow2] | ||||
| add pCRow1, pCRow2, LDC | |||||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||||
| add pCRow2, pCRow2, #16 | |||||
| ld1 {v12.2d}, [pCRow1] | |||||
| ld1 {v12.2d}, [pCRow3] | |||||
| fmla v12.2d, v28.2d, alphaV0 | fmla v12.2d, v28.2d, alphaV0 | ||||
| st1 {v12.2d}, [pCRow1] | |||||
| st1 {v12.2d}, [pCRow3] | |||||
| add pCRow0, pCRow0, #16 | |||||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||||
| add pCRow3, pCRow3, #16 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -532,7 +538,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE1x4 | .macro SAVE1x4 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| add pCRow1, pCRow0, LDC | |||||
| ld1 {v8.d}[0], [pCRow0] | ld1 {v8.d}[0], [pCRow0] | ||||
| ld1 {v8.d}[1], [pCRow1] | ld1 {v8.d}[1], [pCRow1] | ||||
| @@ -540,16 +545,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| st1 {v8.d}[0], [pCRow0] | st1 {v8.d}[0], [pCRow0] | ||||
| st1 {v8.d}[1], [pCRow1] | st1 {v8.d}[1], [pCRow1] | ||||
| add pCRow2, pCRow1, LDC | |||||
| add pCRow1, pCRow2, LDC | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #8 | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, #8 | |||||
| ld1 {v12.d}[0], [pCRow2] | ld1 {v12.d}[0], [pCRow2] | ||||
| ld1 {v12.d}[1], [pCRow1] | |||||
| ld1 {v12.d}[1], [pCRow3] | |||||
| fmla v12.2d, v20.2d, alphaV0 | fmla v12.2d, v20.2d, alphaV0 | ||||
| st1 {v12.d}[0], [pCRow2] | st1 {v12.d}[0], [pCRow2] | ||||
| st1 {v12.d}[1], [pCRow1] | |||||
| st1 {v12.d}[1], [pCRow3] | |||||
| add pCRow0, pCRow0, #8 | |||||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||||
| add pCRow2, pCRow2, #8 | |||||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||||
| add pCRow3, pCRow3, #8 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -578,6 +588,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| fmla v18.2d, v2.2d, v8.d[0] | fmla v18.2d, v2.2d, v8.d[0] | ||||
| fmla v19.2d, v3.2d, v8.d[0] | fmla v19.2d, v3.2d, v8.d[0] | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| fmla v20.2d, v0.2d, v8.d[1] | fmla v20.2d, v0.2d, v8.d[1] | ||||
| fmla v21.2d, v1.2d, v8.d[1] | fmla v21.2d, v1.2d, v8.d[1] | ||||
| fmla v22.2d, v2.2d, v8.d[1] | fmla v22.2d, v2.2d, v8.d[1] | ||||
| @@ -586,7 +598,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE8x2 | .macro SAVE8x2 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| add pCRow1, pCRow0, LDC | |||||
| ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | ||||
| fmla v0.2d, v16.2d, alphaV0 | fmla v0.2d, v16.2d, alphaV0 | ||||
| @@ -595,6 +606,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| fmla v3.2d, v19.2d, alphaV0 | fmla v3.2d, v19.2d, alphaV0 | ||||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | ||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #64 | |||||
| ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | ||||
| fmla v4.2d, v20.2d, alphaV0 | fmla v4.2d, v20.2d, alphaV0 | ||||
| fmla v5.2d, v21.2d, alphaV0 | fmla v5.2d, v21.2d, alphaV0 | ||||
| @@ -602,7 +616,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| fmla v7.2d, v23.2d, alphaV0 | fmla v7.2d, v23.2d, alphaV0 | ||||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #64 | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, #64 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -628,19 +643,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE4x2 | .macro SAVE4x2 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| ld1 {v8.2d, v9.2d}, [pCRow0] | ld1 {v8.2d, v9.2d}, [pCRow0] | ||||
| fmla v8.2d, v16.2d, alphaV0 | fmla v8.2d, v16.2d, alphaV0 | ||||
| fmla v9.2d, v17.2d, alphaV0 | fmla v9.2d, v17.2d, alphaV0 | ||||
| st1 {v8.2d, v9.2d}, [pCRow0] | st1 {v8.2d, v9.2d}, [pCRow0] | ||||
| add pCRow1, pCRow0, LDC | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #32 | |||||
| ld1 {v12.2d, v13.2d}, [pCRow1] | ld1 {v12.2d, v13.2d}, [pCRow1] | ||||
| fmla v12.2d, v20.2d, alphaV0 | fmla v12.2d, v20.2d, alphaV0 | ||||
| fmla v13.2d, v21.2d, alphaV0 | fmla v13.2d, v21.2d, alphaV0 | ||||
| st1 {v12.2d, v13.2d}, [pCRow1] | st1 {v12.2d, v13.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #32 | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, #32 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -663,17 +681,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE2x2 | .macro SAVE2x2 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| ld1 {v8.2d}, [pCRow0] | ld1 {v8.2d}, [pCRow0] | ||||
| fmla v8.2d, v16.2d, alphaV0 | fmla v8.2d, v16.2d, alphaV0 | ||||
| st1 {v8.2d}, [pCRow0] | st1 {v8.2d}, [pCRow0] | ||||
| add pCRow1 , pCRow0, LDC | |||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #16 | |||||
| ld1 {v12.2d}, [pCRow1] | ld1 {v12.2d}, [pCRow1] | ||||
| fmla v12.2d, v20.2d, alphaV0 | fmla v12.2d, v20.2d, alphaV0 | ||||
| st1 {v12.2d}, [pCRow1] | st1 {v12.2d}, [pCRow1] | ||||
| add pCRow0, pCRow0, #16 | |||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, #16 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -694,7 +715,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE1x2 | .macro SAVE1x2 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| add pCRow1 , pCRow0, LDC | |||||
| ld1 {v8.d}[0], [pCRow0] | ld1 {v8.d}[0], [pCRow0] | ||||
| ld1 {v8.d}[1], [pCRow1] | ld1 {v8.d}[1], [pCRow1] | ||||
| @@ -702,7 +722,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| st1 {v8.d}[0], [pCRow0] | st1 {v8.d}[0], [pCRow0] | ||||
| st1 {v8.d}[1], [pCRow1] | st1 {v8.d}[1], [pCRow1] | ||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #8 | add pCRow0, pCRow0, #8 | ||||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||||
| add pCRow1, pCRow1, #8 | |||||
| .endm | .endm | ||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| @@ -726,12 +749,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| fmla v16.2d, v0.2d, v8.d[0] | fmla v16.2d, v0.2d, v8.d[0] | ||||
| fmla v17.2d, v1.2d, v8.d[0] | fmla v17.2d, v1.2d, v8.d[0] | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| fmla v18.2d, v2.2d, v8.d[0] | fmla v18.2d, v2.2d, v8.d[0] | ||||
| fmla v19.2d, v3.2d, v8.d[0] | fmla v19.2d, v3.2d, v8.d[0] | ||||
| .endm | .endm | ||||
| .macro SAVE8x1 | .macro SAVE8x1 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | ||||
| fmla v0.2d, v16.2d, alphaV0 | fmla v0.2d, v16.2d, alphaV0 | ||||
| fmla v1.2d, v17.2d, alphaV0 | fmla v1.2d, v17.2d, alphaV0 | ||||
| @@ -739,6 +764,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| fmla v3.2d, v19.2d, alphaV0 | fmla v3.2d, v19.2d, alphaV0 | ||||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | ||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #64 | add pCRow0, pCRow0, #64 | ||||
| .endm | .endm | ||||
| @@ -763,11 +789,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE4x1 | .macro SAVE4x1 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| ld1 {v8.2d, v9.2d}, [pCRow0] | ld1 {v8.2d, v9.2d}, [pCRow0] | ||||
| fmla v8.2d, v16.2d, alphaV0 | fmla v8.2d, v16.2d, alphaV0 | ||||
| fmla v9.2d, v17.2d, alphaV0 | fmla v9.2d, v17.2d, alphaV0 | ||||
| st1 {v8.2d, v9.2d}, [pCRow0] | st1 {v8.2d, v9.2d}, [pCRow0] | ||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #32 | add pCRow0, pCRow0, #32 | ||||
| .endm | .endm | ||||
| @@ -790,10 +818,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| .macro SAVE2x1 | .macro SAVE2x1 | ||||
| fmov alpha0, alpha | fmov alpha0, alpha | ||||
| ld1 {v8.2d}, [pCRow0] | ld1 {v8.2d}, [pCRow0] | ||||
| fmla v8.2d, v16.2d, alphaV0 | fmla v8.2d, v16.2d, alphaV0 | ||||
| st1 {v8.2d}, [pCRow0] | st1 {v8.2d}, [pCRow0] | ||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #16 | add pCRow0, pCRow0, #16 | ||||
| .endm | .endm | ||||
| @@ -819,6 +849,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| fmadd d8, d16, alpha0, d8 | fmadd d8, d16, alpha0, d8 | ||||
| str d8, [pCRow0] | str d8, [pCRow0] | ||||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||||
| add pCRow0, pCRow0, #8 | add pCRow0, pCRow0, #8 | ||||
| .endm | .endm | ||||
| @@ -858,6 +889,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| /******************************************************************************/ | /******************************************************************************/ | ||||
| .align 5 | |||||
| dgemm_kernel_L4_BEGIN: | dgemm_kernel_L4_BEGIN: | ||||
| mov pCRow0, pC | mov pCRow0, pC | ||||
| add pCRow1, pCRow0, LDC | add pCRow1, pCRow0, LDC | ||||
| @@ -989,17 +1021,26 @@ dgemm_kernel_L4_M4_20: | |||||
| cmp counterL , #0 | cmp counterL , #0 | ||||
| ble dgemm_kernel_L4_M4_40 | ble dgemm_kernel_L4_M4_40 | ||||
| .align 5 | |||||
| dgemm_kernel_L4_M4_22: | dgemm_kernel_L4_M4_22: | ||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L4_M4_22 | bgt dgemm_kernel_L4_M4_22 | ||||
| @@ -1012,6 +1053,8 @@ dgemm_kernel_L4_M4_40: | |||||
| dgemm_kernel_L4_M4_42: | dgemm_kernel_L4_M4_42: | ||||
| KERNEL4x4_SUB | KERNEL4x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L4_M4_42 | bgt dgemm_kernel_L4_M4_42 | ||||
| @@ -1022,7 +1065,6 @@ dgemm_kernel_L4_M4_100: | |||||
| dgemm_kernel_L4_M4_END: | dgemm_kernel_L4_M4_END: | ||||
| dgemm_kernel_L4_M2_BEGIN: | dgemm_kernel_L4_M2_BEGIN: | ||||
| mov counterI, origM | mov counterI, origM | ||||
| @@ -1042,16 +1084,23 @@ dgemm_kernel_L4_M2_20: | |||||
| cmp counterL , #0 | cmp counterL , #0 | ||||
| ble dgemm_kernel_L4_M2_40 | ble dgemm_kernel_L4_M2_40 | ||||
| .align 5 | |||||
| dgemm_kernel_L4_M2_22: | dgemm_kernel_L4_M2_22: | ||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| @@ -1063,9 +1112,12 @@ dgemm_kernel_L4_M2_40: | |||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| ble dgemm_kernel_L4_M2_100 | ble dgemm_kernel_L4_M2_100 | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| dgemm_kernel_L4_M2_42: | dgemm_kernel_L4_M2_42: | ||||
| KERNEL2x4_SUB | KERNEL2x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L4_M2_42 | bgt dgemm_kernel_L4_M2_42 | ||||
| @@ -1092,15 +1144,22 @@ dgemm_kernel_L4_M1_20: | |||||
| cmp counterL , #0 | cmp counterL , #0 | ||||
| ble dgemm_kernel_L4_M1_40 | ble dgemm_kernel_L4_M1_40 | ||||
| .align 5 | |||||
| dgemm_kernel_L4_M1_22: | dgemm_kernel_L4_M1_22: | ||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| @@ -1112,9 +1171,11 @@ dgemm_kernel_L4_M1_40: | |||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| ble dgemm_kernel_L4_M1_100 | ble dgemm_kernel_L4_M1_100 | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| dgemm_kernel_L4_M1_42: | dgemm_kernel_L4_M1_42: | ||||
| KERNEL1x4_SUB | KERNEL1x4_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L4_M1_42 | bgt dgemm_kernel_L4_M1_42 | ||||
| @@ -1143,9 +1204,10 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||||
| tst counterJ , #2 | tst counterJ , #2 | ||||
| ble dgemm_kernel_L1_BEGIN | ble dgemm_kernel_L1_BEGIN | ||||
| mov pCRow0, pC // pCRow0 = pC | |||||
| mov pCRow0, pC | |||||
| add pCRow1, pCRow0, LDC | |||||
| add pC,pC,LDC, lsl #1 | |||||
| add pC, pCRow1, LDC | |||||
| mov pA, origPA // pA = A | mov pA, origPA // pA = A | ||||
| @@ -1156,6 +1218,7 @@ dgemm_kernel_L2_M8_BEGIN: | |||||
| cmp counterI, #0 | cmp counterI, #0 | ||||
| ble dgemm_kernel_L2_M4_BEGIN | ble dgemm_kernel_L2_M4_BEGIN | ||||
| .align 5 | |||||
| dgemm_kernel_L2_M8_20: | dgemm_kernel_L2_M8_20: | ||||
| INIT8x2 | INIT8x2 | ||||
| @@ -1165,28 +1228,31 @@ dgemm_kernel_L2_M8_20: | |||||
| asr counterL , origK, #3 // counterL = counterL / 8 | asr counterL , origK, #3 // counterL = counterL / 8 | ||||
| cmp counterL,#0 | cmp counterL,#0 | ||||
| ble dgemm_kernel_L2_M8_40 | ble dgemm_kernel_L2_M8_40 | ||||
| .align 5 | |||||
| .align 5 | |||||
| dgemm_kernel_L2_M8_22: | dgemm_kernel_L2_M8_22: | ||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L2_M8_22 | bgt dgemm_kernel_L2_M8_22 | ||||
| dgemm_kernel_L2_M8_40: | dgemm_kernel_L2_M8_40: | ||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| ble dgemm_kernel_L2_M8_100 | ble dgemm_kernel_L2_M8_100 | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||||
| dgemm_kernel_L2_M8_42: | dgemm_kernel_L2_M8_42: | ||||
| KERNEL8x2_SUB | KERNEL8x2_SUB | ||||
| @@ -1221,17 +1287,23 @@ dgemm_kernel_L2_M4_20: | |||||
| asr counterL , origK, #3 // counterL = counterL / 8 | asr counterL , origK, #3 // counterL = counterL / 8 | ||||
| cmp counterL,#0 | cmp counterL,#0 | ||||
| ble dgemm_kernel_L2_M4_40 | ble dgemm_kernel_L2_M4_40 | ||||
| .align 5 | |||||
| .align 5 | |||||
| dgemm_kernel_L2_M4_22: | dgemm_kernel_L2_M4_22: | ||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| @@ -1243,9 +1315,12 @@ dgemm_kernel_L2_M4_40: | |||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| ble dgemm_kernel_L2_M4_100 | ble dgemm_kernel_L2_M4_100 | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||||
| dgemm_kernel_L2_M4_42: | dgemm_kernel_L2_M4_42: | ||||
| KERNEL4x2_SUB | KERNEL4x2_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L2_M4_42 | bgt dgemm_kernel_L2_M4_42 | ||||
| @@ -1279,19 +1354,26 @@ dgemm_kernel_L2_M2_20: | |||||
| dgemm_kernel_L2_M2_22: | dgemm_kernel_L2_M2_22: | ||||
| KERNEL2x2_SUB | KERNEL2x2_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL2x2_SUB | KERNEL2x2_SUB | ||||
| KERNEL2x2_SUB | KERNEL2x2_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL2x2_SUB | KERNEL2x2_SUB | ||||
| KERNEL2x2_SUB | KERNEL2x2_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL2x2_SUB | KERNEL2x2_SUB | ||||
| KERNEL2x2_SUB | KERNEL2x2_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL2x2_SUB | KERNEL2x2_SUB | ||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L2_M2_22 | bgt dgemm_kernel_L2_M2_22 | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||||
| dgemm_kernel_L2_M2_40: | dgemm_kernel_L2_M2_40: | ||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| @@ -1329,18 +1411,24 @@ dgemm_kernel_L2_M1_20: | |||||
| dgemm_kernel_L2_M1_22: | dgemm_kernel_L2_M1_22: | ||||
| KERNEL1x2_SUB | KERNEL1x2_SUB | ||||
| KERNEL1x2_SUB | KERNEL1x2_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL1x2_SUB | KERNEL1x2_SUB | ||||
| KERNEL1x2_SUB | KERNEL1x2_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL1x2_SUB | KERNEL1x2_SUB | ||||
| KERNEL1x2_SUB | KERNEL1x2_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL1x2_SUB | KERNEL1x2_SUB | ||||
| KERNEL1x2_SUB | KERNEL1x2_SUB | ||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L2_M1_22 | bgt dgemm_kernel_L2_M1_22 | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||||
| dgemm_kernel_L2_M1_40: | dgemm_kernel_L2_M1_40: | ||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| @@ -1380,6 +1468,7 @@ dgemm_kernel_L1_M8_BEGIN: | |||||
| cmp counterI, #0 | cmp counterI, #0 | ||||
| ble dgemm_kernel_L1_M4_BEGIN | ble dgemm_kernel_L1_M4_BEGIN | ||||
| .align 5 | |||||
| dgemm_kernel_L1_M8_20: | dgemm_kernel_L1_M8_20: | ||||
| INIT8x1 | INIT8x1 | ||||
| @@ -1388,14 +1477,16 @@ dgemm_kernel_L1_M8_20: | |||||
| asr counterL , origK, #3 // counterL = counterL / 8 | asr counterL , origK, #3 // counterL = counterL / 8 | ||||
| cmp counterL , #0 | cmp counterL , #0 | ||||
| ble dgemm_kernel_L1_M8_40 | ble dgemm_kernel_L1_M8_40 | ||||
| .align 5 | |||||
| .align 5 | |||||
| dgemm_kernel_L1_M8_22: | dgemm_kernel_L1_M8_22: | ||||
| KERNEL8x1_SUB | KERNEL8x1_SUB | ||||
| KERNEL8x1_SUB | KERNEL8x1_SUB | ||||
| KERNEL8x1_SUB | KERNEL8x1_SUB | ||||
| KERNEL8x1_SUB | KERNEL8x1_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL8x1_SUB | KERNEL8x1_SUB | ||||
| KERNEL8x1_SUB | KERNEL8x1_SUB | ||||
| KERNEL8x1_SUB | KERNEL8x1_SUB | ||||
| @@ -1410,6 +1501,7 @@ dgemm_kernel_L1_M8_40: | |||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| ble dgemm_kernel_L1_M8_100 | ble dgemm_kernel_L1_M8_100 | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| dgemm_kernel_L1_M8_42: | dgemm_kernel_L1_M8_42: | ||||
| KERNEL8x1_SUB | KERNEL8x1_SUB | ||||
| @@ -1443,17 +1535,23 @@ dgemm_kernel_L1_M4_20: | |||||
| asr counterL , origK, #3 // counterL = counterL / 8 | asr counterL , origK, #3 // counterL = counterL / 8 | ||||
| cmp counterL , #0 | cmp counterL , #0 | ||||
| ble dgemm_kernel_L1_M4_40 | ble dgemm_kernel_L1_M4_40 | ||||
| .align 5 | |||||
| .align 5 | |||||
| dgemm_kernel_L1_M4_22: | dgemm_kernel_L1_M4_22: | ||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| @@ -1465,9 +1563,11 @@ dgemm_kernel_L1_M4_40: | |||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| ble dgemm_kernel_L1_M4_100 | ble dgemm_kernel_L1_M4_100 | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| dgemm_kernel_L1_M4_42: | dgemm_kernel_L1_M4_42: | ||||
| KERNEL4x1_SUB | KERNEL4x1_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L1_M4_42 | bgt dgemm_kernel_L1_M4_42 | ||||
| @@ -1501,18 +1601,24 @@ dgemm_kernel_L1_M2_22: | |||||
| KERNEL2x1_SUB | KERNEL2x1_SUB | ||||
| KERNEL2x1_SUB | KERNEL2x1_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL2x1_SUB | KERNEL2x1_SUB | ||||
| KERNEL2x1_SUB | KERNEL2x1_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL2x1_SUB | KERNEL2x1_SUB | ||||
| KERNEL2x1_SUB | KERNEL2x1_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL2x1_SUB | KERNEL2x1_SUB | ||||
| KERNEL2x1_SUB | KERNEL2x1_SUB | ||||
| subs counterL, counterL, #1 | subs counterL, counterL, #1 | ||||
| bgt dgemm_kernel_L1_M2_22 | bgt dgemm_kernel_L1_M2_22 | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| dgemm_kernel_L1_M2_40: | dgemm_kernel_L1_M2_40: | ||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| @@ -1547,14 +1653,17 @@ dgemm_kernel_L1_M1_20: | |||||
| cmp counterL , #0 | cmp counterL , #0 | ||||
| ble dgemm_kernel_L1_M1_40 | ble dgemm_kernel_L1_M1_40 | ||||
| dgemm_kernel_L1_M1_22: | dgemm_kernel_L1_M1_22: | ||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||
| @@ -1567,6 +1676,8 @@ dgemm_kernel_L1_M1_40: | |||||
| ands counterL , origK, #7 // counterL = counterL % 8 | ands counterL , origK, #7 // counterL = counterL % 8 | ||||
| ble dgemm_kernel_L1_M1_100 | ble dgemm_kernel_L1_M1_100 | ||||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||||
| dgemm_kernel_L1_M1_42: | dgemm_kernel_L1_M1_42: | ||||
| KERNEL1x1_SUB | KERNEL1x1_SUB | ||||