ARM64: Convert all labels to local labels (tags/v0.3.0)
| @@ -160,62 +160,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| cmp N, xzr | |||
| ble amax_kernel_zero | |||
| ble .Lamax_kernel_zero | |||
| cmp INC_X, xzr | |||
| ble amax_kernel_zero | |||
| ble .Lamax_kernel_zero | |||
| cmp INC_X, #1 | |||
| bne amax_kernel_S_BEGIN | |||
| bne .Lamax_kernel_S_BEGIN | |||
| amax_kernel_F_BEGIN: | |||
| .Lamax_kernel_F_BEGIN: | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| beq amax_kernel_F1_INIT | |||
| beq .Lamax_kernel_F1_INIT | |||
| INIT_F4 | |||
| subs I, I, #1 | |||
| beq amax_kernel_F1 | |||
| beq .Lamax_kernel_F1 | |||
| amax_kernel_F4: | |||
| .Lamax_kernel_F4: | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne amax_kernel_F4 | |||
| bne .Lamax_kernel_F4 | |||
| amax_kernel_F1: | |||
| .Lamax_kernel_F1: | |||
| ands I, N, #3 | |||
| ble amax_kernel_L999 | |||
| ble .Lamax_kernel_L999 | |||
| amax_kernel_F10: | |||
| .Lamax_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne amax_kernel_F10 | |||
| bne .Lamax_kernel_F10 | |||
| ret | |||
| amax_kernel_F1_INIT: | |||
| .Lamax_kernel_F1_INIT: | |||
| INIT_F1 | |||
| subs N, N, #1 | |||
| b amax_kernel_F1 | |||
| b .Lamax_kernel_F1 | |||
| amax_kernel_S_BEGIN: | |||
| .Lamax_kernel_S_BEGIN: | |||
| INIT_S | |||
| subs N, N, #1 | |||
| ble amax_kernel_L999 | |||
| ble .Lamax_kernel_L999 | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble amax_kernel_S1 | |||
| ble .Lamax_kernel_S1 | |||
| amax_kernel_S4: | |||
| .Lamax_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -223,25 +223,25 @@ amax_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne amax_kernel_S4 | |||
| bne .Lamax_kernel_S4 | |||
| amax_kernel_S1: | |||
| .Lamax_kernel_S1: | |||
| ands I, N, #3 | |||
| ble amax_kernel_L999 | |||
| ble .Lamax_kernel_L999 | |||
| amax_kernel_S10: | |||
| .Lamax_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne amax_kernel_S10 | |||
| bne .Lamax_kernel_S10 | |||
| amax_kernel_L999: | |||
| .Lamax_kernel_L999: | |||
| ret | |||
| amax_kernel_zero: | |||
| .Lamax_kernel_zero: | |||
| fmov MAXF, REG0 | |||
| ret | |||
| @@ -122,52 +122,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| cmp N, xzr | |||
| ble asum_kernel_L999 | |||
| ble .Lasum_kernel_L999 | |||
| cmp INC_X, xzr | |||
| ble asum_kernel_L999 | |||
| ble .Lasum_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne asum_kernel_S_BEGIN | |||
| bne .Lasum_kernel_S_BEGIN | |||
| asum_kernel_F_BEGIN: | |||
| .Lasum_kernel_F_BEGIN: | |||
| asr I, N, #3 | |||
| cmp I, xzr | |||
| beq asum_kernel_F1 | |||
| beq .Lasum_kernel_F1 | |||
| asum_kernel_F8: | |||
| .Lasum_kernel_F8: | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne asum_kernel_F8 | |||
| bne .Lasum_kernel_F8 | |||
| KERNEL_F8_FINALIZE | |||
| asum_kernel_F1: | |||
| .Lasum_kernel_F1: | |||
| ands I, N, #7 | |||
| ble asum_kernel_L999 | |||
| ble .Lasum_kernel_L999 | |||
| asum_kernel_F10: | |||
| .Lasum_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne asum_kernel_F10 | |||
| bne .Lasum_kernel_F10 | |||
| asum_kernel_L999: | |||
| .Lasum_kernel_L999: | |||
| ret | |||
| asum_kernel_S_BEGIN: | |||
| .Lasum_kernel_S_BEGIN: | |||
| INIT_S | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble asum_kernel_S1 | |||
| ble .Lasum_kernel_S1 | |||
| asum_kernel_S4: | |||
| .Lasum_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -175,19 +175,19 @@ asum_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne asum_kernel_S4 | |||
| bne .Lasum_kernel_S4 | |||
| asum_kernel_S1: | |||
| .Lasum_kernel_S1: | |||
| ands I, N, #3 | |||
| ble asum_kernel_L999 | |||
| ble .Lasum_kernel_L999 | |||
| asum_kernel_S10: | |||
| .Lasum_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne asum_kernel_S10 | |||
| bne .Lasum_kernel_S10 | |||
| ret | |||
| @@ -135,53 +135,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| cmp N, xzr | |||
| ble axpy_kernel_L999 | |||
| ble .Laxpy_kernel_L999 | |||
| fcmp DA, #0.0 | |||
| beq axpy_kernel_L999 | |||
| beq .Laxpy_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne axpy_kernel_S_BEGIN | |||
| bne .Laxpy_kernel_S_BEGIN | |||
| cmp INC_Y, #1 | |||
| bne axpy_kernel_S_BEGIN | |||
| bne .Laxpy_kernel_S_BEGIN | |||
| axpy_kernel_F_BEGIN: | |||
| .Laxpy_kernel_F_BEGIN: | |||
| asr I, N, #3 | |||
| cmp I, xzr | |||
| beq axpy_kernel_F1 | |||
| beq .Laxpy_kernel_F1 | |||
| axpy_kernel_F8: | |||
| .Laxpy_kernel_F8: | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne axpy_kernel_F8 | |||
| bne .Laxpy_kernel_F8 | |||
| axpy_kernel_F1: | |||
| .Laxpy_kernel_F1: | |||
| ands I, N, #7 | |||
| ble axpy_kernel_L999 | |||
| ble .Laxpy_kernel_L999 | |||
| axpy_kernel_F10: | |||
| .Laxpy_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne axpy_kernel_F10 | |||
| bne .Laxpy_kernel_F10 | |||
| mov w0, wzr | |||
| ret | |||
| axpy_kernel_S_BEGIN: | |||
| .Laxpy_kernel_S_BEGIN: | |||
| INIT_S | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble axpy_kernel_S1 | |||
| ble .Laxpy_kernel_S1 | |||
| axpy_kernel_S4: | |||
| .Laxpy_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -189,21 +189,21 @@ axpy_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne axpy_kernel_S4 | |||
| bne .Laxpy_kernel_S4 | |||
| axpy_kernel_S1: | |||
| .Laxpy_kernel_S1: | |||
| ands I, N, #3 | |||
| ble axpy_kernel_L999 | |||
| ble .Laxpy_kernel_L999 | |||
| axpy_kernel_S10: | |||
| .Laxpy_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne axpy_kernel_S10 | |||
| bne .Laxpy_kernel_S10 | |||
| axpy_kernel_L999: | |||
| .Laxpy_kernel_L999: | |||
| mov w0, wzr | |||
| ret | |||
| @@ -98,52 +98,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmov s1, SUMF | |||
| cmp N, xzr | |||
| ble asum_kernel_L999 | |||
| ble .Lcasum_kernel_L999 | |||
| cmp INC_X, xzr | |||
| ble asum_kernel_L999 | |||
| ble .Lcasum_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne asum_kernel_S_BEGIN | |||
| bne .Lcasum_kernel_S_BEGIN | |||
| asum_kernel_F_BEGIN: | |||
| .Lcasum_kernel_F_BEGIN: | |||
| asr I, N, #3 | |||
| cmp I, xzr | |||
| beq asum_kernel_F1 | |||
| beq .Lcasum_kernel_F1 | |||
| asum_kernel_F8: | |||
| .Lcasum_kernel_F8: | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne asum_kernel_F8 | |||
| bne .Lcasum_kernel_F8 | |||
| KERNEL_F8_FINALIZE | |||
| asum_kernel_F1: | |||
| .Lcasum_kernel_F1: | |||
| ands I, N, #7 | |||
| ble asum_kernel_L999 | |||
| ble .Lcasum_kernel_L999 | |||
| asum_kernel_F10: | |||
| .Lcasum_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne asum_kernel_F10 | |||
| bne .Lcasum_kernel_F10 | |||
| asum_kernel_L999: | |||
| .Lcasum_kernel_L999: | |||
| ret | |||
| asum_kernel_S_BEGIN: | |||
| .Lcasum_kernel_S_BEGIN: | |||
| INIT_S | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble asum_kernel_S1 | |||
| ble .Lcasum_kernel_S1 | |||
| asum_kernel_S4: | |||
| .Lcasum_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -151,19 +151,19 @@ asum_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne asum_kernel_S4 | |||
| bne .Lcasum_kernel_S4 | |||
| asum_kernel_S1: | |||
| .Lcasum_kernel_S1: | |||
| ands I, N, #3 | |||
| ble asum_kernel_L999 | |||
| ble .Lcasum_kernel_L999 | |||
| asum_kernel_S10: | |||
| .Lcasum_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne asum_kernel_S10 | |||
| bne .Lcasum_kernel_S10 | |||
| ret | |||
| @@ -1072,11 +1072,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #2 // J = J / 4 | |||
| cmp counterJ, #0 | |||
| ble cgemm_kernel_L2_BEGIN | |||
| ble .Lcgemm_kernel_L2_BEGIN | |||
| /******************************************************************************/ | |||
| cgemm_kernel_L4_BEGIN: | |||
| .Lcgemm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #2 | |||
| @@ -1084,96 +1084,96 @@ cgemm_kernel_L4_BEGIN: | |||
| mov pA, origPA // pA = start of A array | |||
| add ppA, temp, pA | |||
| cgemm_kernel_L4_M8_BEGIN: | |||
| .Lcgemm_kernel_L4_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble cgemm_kernel_L4_M4_BEGIN | |||
| ble .Lcgemm_kernel_L4_M4_BEGIN | |||
| cgemm_kernel_L4_M8_20: | |||
| .Lcgemm_kernel_L4_M8_20: | |||
| mov pB, origPB | |||
| asr counterL , origK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| blt cgemm_kernel_L4_M8_32 | |||
| blt .Lcgemm_kernel_L4_M8_32 | |||
| KERNEL8x4_I // do one in the K | |||
| KERNEL8x4_M2 // do another in the K | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble cgemm_kernel_L4_M8_22a | |||
| ble .Lcgemm_kernel_L4_M8_22a | |||
| .align 5 | |||
| cgemm_kernel_L4_M8_22: | |||
| .Lcgemm_kernel_L4_M8_22: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M8_22 | |||
| bgt .Lcgemm_kernel_L4_M8_22 | |||
| cgemm_kernel_L4_M8_22a: | |||
| .Lcgemm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b cgemm_kernel_L4_M8_44 | |||
| b .Lcgemm_kernel_L4_M8_44 | |||
| cgemm_kernel_L4_M8_32: | |||
| .Lcgemm_kernel_L4_M8_32: | |||
| tst counterL, #1 | |||
| ble cgemm_kernel_L4_M8_40 | |||
| ble .Lcgemm_kernel_L4_M8_40 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_E | |||
| b cgemm_kernel_L4_M8_44 | |||
| b .Lcgemm_kernel_L4_M8_44 | |||
| cgemm_kernel_L4_M8_40: | |||
| .Lcgemm_kernel_L4_M8_40: | |||
| INIT8x4 | |||
| cgemm_kernel_L4_M8_44: | |||
| .Lcgemm_kernel_L4_M8_44: | |||
| ands counterL , origK, #1 | |||
| ble cgemm_kernel_L4_M8_100 | |||
| ble .Lcgemm_kernel_L4_M8_100 | |||
| cgemm_kernel_L4_M8_46: | |||
| .Lcgemm_kernel_L4_M8_46: | |||
| KERNEL8x4_SUB | |||
| cgemm_kernel_L4_M8_100: | |||
| .Lcgemm_kernel_L4_M8_100: | |||
| SAVE8x4 | |||
| cgemm_kernel_L4_M8_END: | |||
| .Lcgemm_kernel_L4_M8_END: | |||
| lsl temp, origK, #5 // k * 4 * 8 | |||
| add pA, pA, temp | |||
| add ppA, ppA, temp | |||
| subs counterI, counterI, #1 | |||
| bne cgemm_kernel_L4_M8_20 | |||
| bne .Lcgemm_kernel_L4_M8_20 | |||
| cgemm_kernel_L4_M4_BEGIN: | |||
| .Lcgemm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble cgemm_kernel_L4_END | |||
| ble .Lcgemm_kernel_L4_END | |||
| tst counterI, #4 | |||
| ble cgemm_kernel_L4_M2_BEGIN | |||
| ble .Lcgemm_kernel_L4_M2_BEGIN | |||
| cgemm_kernel_L4_M4_20: | |||
| .Lcgemm_kernel_L4_M4_20: | |||
| INIT4x4 | |||
| mov pB, origPB | |||
| asr counterL, origK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble cgemm_kernel_L4_M4_40 | |||
| ble .Lcgemm_kernel_L4_M4_40 | |||
| cgemm_kernel_L4_M4_22: | |||
| .Lcgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_SUB | |||
| KERNEL4x4_SUB | |||
| @@ -1186,47 +1186,47 @@ cgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M4_22 | |||
| bgt .Lcgemm_kernel_L4_M4_22 | |||
| cgemm_kernel_L4_M4_40: | |||
| .Lcgemm_kernel_L4_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L4_M4_100 | |||
| ble .Lcgemm_kernel_L4_M4_100 | |||
| cgemm_kernel_L4_M4_42: | |||
| .Lcgemm_kernel_L4_M4_42: | |||
| KERNEL4x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M4_42 | |||
| bgt .Lcgemm_kernel_L4_M4_42 | |||
| cgemm_kernel_L4_M4_100: | |||
| .Lcgemm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| cgemm_kernel_L4_M4_END: | |||
| .Lcgemm_kernel_L4_M4_END: | |||
| cgemm_kernel_L4_M2_BEGIN: | |||
| .Lcgemm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble cgemm_kernel_L4_END | |||
| ble .Lcgemm_kernel_L4_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble cgemm_kernel_L4_M1_BEGIN | |||
| ble .Lcgemm_kernel_L4_M1_BEGIN | |||
| cgemm_kernel_L4_M2_20: | |||
| .Lcgemm_kernel_L4_M2_20: | |||
| INIT2x4 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble cgemm_kernel_L4_M2_40 | |||
| ble .Lcgemm_kernel_L4_M2_40 | |||
| cgemm_kernel_L4_M2_22: | |||
| .Lcgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| KERNEL2x4_SUB | |||
| @@ -1239,43 +1239,43 @@ cgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M2_22 | |||
| bgt .Lcgemm_kernel_L4_M2_22 | |||
| cgemm_kernel_L4_M2_40: | |||
| .Lcgemm_kernel_L4_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L4_M2_100 | |||
| ble .Lcgemm_kernel_L4_M2_100 | |||
| cgemm_kernel_L4_M2_42: | |||
| .Lcgemm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M2_42 | |||
| bgt .Lcgemm_kernel_L4_M2_42 | |||
| cgemm_kernel_L4_M2_100: | |||
| .Lcgemm_kernel_L4_M2_100: | |||
| SAVE2x4 | |||
| cgemm_kernel_L4_M2_END: | |||
| .Lcgemm_kernel_L4_M2_END: | |||
| cgemm_kernel_L4_M1_BEGIN: | |||
| .Lcgemm_kernel_L4_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble cgemm_kernel_L4_END | |||
| ble .Lcgemm_kernel_L4_END | |||
| cgemm_kernel_L4_M1_20: | |||
| .Lcgemm_kernel_L4_M1_20: | |||
| INIT1x4 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble cgemm_kernel_L4_M1_40 | |||
| ble .Lcgemm_kernel_L4_M1_40 | |||
| cgemm_kernel_L4_M1_22: | |||
| .Lcgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| @@ -1287,45 +1287,45 @@ cgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M1_22 | |||
| bgt .Lcgemm_kernel_L4_M1_22 | |||
| cgemm_kernel_L4_M1_40: | |||
| .Lcgemm_kernel_L4_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L4_M1_100 | |||
| ble .Lcgemm_kernel_L4_M1_100 | |||
| cgemm_kernel_L4_M1_42: | |||
| .Lcgemm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M1_42 | |||
| bgt .Lcgemm_kernel_L4_M1_42 | |||
| cgemm_kernel_L4_M1_100: | |||
| .Lcgemm_kernel_L4_M1_100: | |||
| SAVE1x4 | |||
| cgemm_kernel_L4_END: | |||
| .Lcgemm_kernel_L4_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt cgemm_kernel_L4_BEGIN | |||
| bgt .Lcgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| cgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| .Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble cgemm_kernel_L999 // error, N was less than 4? | |||
| ble .Lcgemm_kernel_L999 // error, N was less than 4? | |||
| tst counterJ , #2 | |||
| ble cgemm_kernel_L1_BEGIN | |||
| ble .Lcgemm_kernel_L1_BEGIN | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| @@ -1335,24 +1335,24 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| cgemm_kernel_L2_M4_BEGIN: | |||
| .Lcgemm_kernel_L2_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI,#0 | |||
| ble cgemm_kernel_L2_M2_BEGIN | |||
| ble .Lcgemm_kernel_L2_M2_BEGIN | |||
| cgemm_kernel_L2_M4_20: | |||
| .Lcgemm_kernel_L2_M4_20: | |||
| INIT4x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble cgemm_kernel_L2_M4_40 | |||
| ble .Lcgemm_kernel_L2_M4_40 | |||
| .align 5 | |||
| cgemm_kernel_L2_M4_22: | |||
| .Lcgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| @@ -1364,50 +1364,50 @@ cgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M4_22 | |||
| bgt .Lcgemm_kernel_L2_M4_22 | |||
| cgemm_kernel_L2_M4_40: | |||
| .Lcgemm_kernel_L2_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L2_M4_100 | |||
| ble .Lcgemm_kernel_L2_M4_100 | |||
| cgemm_kernel_L2_M4_42: | |||
| .Lcgemm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M4_42 | |||
| bgt .Lcgemm_kernel_L2_M4_42 | |||
| cgemm_kernel_L2_M4_100: | |||
| .Lcgemm_kernel_L2_M4_100: | |||
| SAVE4x2 | |||
| cgemm_kernel_L2_M4_END: | |||
| .Lcgemm_kernel_L2_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt cgemm_kernel_L2_M4_20 | |||
| bgt .Lcgemm_kernel_L2_M4_20 | |||
| cgemm_kernel_L2_M2_BEGIN: | |||
| .Lcgemm_kernel_L2_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble cgemm_kernel_L2_END | |||
| ble .Lcgemm_kernel_L2_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble cgemm_kernel_L2_M1_BEGIN | |||
| ble .Lcgemm_kernel_L2_M1_BEGIN | |||
| cgemm_kernel_L2_M2_20: | |||
| .Lcgemm_kernel_L2_M2_20: | |||
| INIT2x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble cgemm_kernel_L2_M2_40 | |||
| ble .Lcgemm_kernel_L2_M2_40 | |||
| cgemm_kernel_L2_M2_22: | |||
| .Lcgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| @@ -1420,43 +1420,43 @@ cgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M2_22 | |||
| bgt .Lcgemm_kernel_L2_M2_22 | |||
| cgemm_kernel_L2_M2_40: | |||
| .Lcgemm_kernel_L2_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L2_M2_100 | |||
| ble .Lcgemm_kernel_L2_M2_100 | |||
| cgemm_kernel_L2_M2_42: | |||
| .Lcgemm_kernel_L2_M2_42: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M2_42 | |||
| bgt .Lcgemm_kernel_L2_M2_42 | |||
| cgemm_kernel_L2_M2_100: | |||
| .Lcgemm_kernel_L2_M2_100: | |||
| SAVE2x2 | |||
| cgemm_kernel_L2_M2_END: | |||
| .Lcgemm_kernel_L2_M2_END: | |||
| cgemm_kernel_L2_M1_BEGIN: | |||
| .Lcgemm_kernel_L2_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble cgemm_kernel_L2_END | |||
| ble .Lcgemm_kernel_L2_END | |||
| cgemm_kernel_L2_M1_20: | |||
| .Lcgemm_kernel_L2_M1_20: | |||
| INIT1x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble cgemm_kernel_L2_M1_40 | |||
| ble .Lcgemm_kernel_L2_M1_40 | |||
| cgemm_kernel_L2_M1_22: | |||
| .Lcgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| @@ -1468,36 +1468,36 @@ cgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M1_22 | |||
| bgt .Lcgemm_kernel_L2_M1_22 | |||
| cgemm_kernel_L2_M1_40: | |||
| .Lcgemm_kernel_L2_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L2_M1_100 | |||
| ble .Lcgemm_kernel_L2_M1_100 | |||
| cgemm_kernel_L2_M1_42: | |||
| .Lcgemm_kernel_L2_M1_42: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M1_42 | |||
| bgt .Lcgemm_kernel_L2_M1_42 | |||
| cgemm_kernel_L2_M1_100: | |||
| .Lcgemm_kernel_L2_M1_100: | |||
| SAVE1x2 | |||
| cgemm_kernel_L2_END: | |||
| .Lcgemm_kernel_L2_END: | |||
| add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 | |||
| /******************************************************************************/ | |||
| cgemm_kernel_L1_BEGIN: | |||
| .Lcgemm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble cgemm_kernel_L999 // done | |||
| ble .Lcgemm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| @@ -1507,24 +1507,24 @@ cgemm_kernel_L1_BEGIN: | |||
| cgemm_kernel_L1_M4_BEGIN: | |||
| .Lcgemm_kernel_L1_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble cgemm_kernel_L1_M2_BEGIN | |||
| ble .Lcgemm_kernel_L1_M2_BEGIN | |||
| cgemm_kernel_L1_M4_20: | |||
| .Lcgemm_kernel_L1_M4_20: | |||
| INIT4x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble cgemm_kernel_L1_M4_40 | |||
| ble .Lcgemm_kernel_L1_M4_40 | |||
| .align 5 | |||
| cgemm_kernel_L1_M4_22: | |||
| .Lcgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| @@ -1536,50 +1536,50 @@ cgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M4_22 | |||
| bgt .Lcgemm_kernel_L1_M4_22 | |||
| cgemm_kernel_L1_M4_40: | |||
| .Lcgemm_kernel_L1_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L1_M4_100 | |||
| ble .Lcgemm_kernel_L1_M4_100 | |||
| cgemm_kernel_L1_M4_42: | |||
| .Lcgemm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M4_42 | |||
| bgt .Lcgemm_kernel_L1_M4_42 | |||
| cgemm_kernel_L1_M4_100: | |||
| .Lcgemm_kernel_L1_M4_100: | |||
| SAVE4x1 | |||
| cgemm_kernel_L1_M4_END: | |||
| .Lcgemm_kernel_L1_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt cgemm_kernel_L1_M4_20 | |||
| bgt .Lcgemm_kernel_L1_M4_20 | |||
| cgemm_kernel_L1_M2_BEGIN: | |||
| .Lcgemm_kernel_L1_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble cgemm_kernel_L1_END | |||
| ble .Lcgemm_kernel_L1_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble cgemm_kernel_L1_M1_BEGIN | |||
| ble .Lcgemm_kernel_L1_M1_BEGIN | |||
| cgemm_kernel_L1_M2_20: | |||
| .Lcgemm_kernel_L1_M2_20: | |||
| INIT2x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble cgemm_kernel_L1_M2_40 | |||
| ble .Lcgemm_kernel_L1_M2_40 | |||
| cgemm_kernel_L1_M2_22: | |||
| .Lcgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| @@ -1592,43 +1592,43 @@ cgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M2_22 | |||
| bgt .Lcgemm_kernel_L1_M2_22 | |||
| cgemm_kernel_L1_M2_40: | |||
| .Lcgemm_kernel_L1_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L1_M2_100 | |||
| ble .Lcgemm_kernel_L1_M2_100 | |||
| cgemm_kernel_L1_M2_42: | |||
| .Lcgemm_kernel_L1_M2_42: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M2_42 | |||
| bgt .Lcgemm_kernel_L1_M2_42 | |||
| cgemm_kernel_L1_M2_100: | |||
| .Lcgemm_kernel_L1_M2_100: | |||
| SAVE2x1 | |||
| cgemm_kernel_L1_M2_END: | |||
| .Lcgemm_kernel_L1_M2_END: | |||
| cgemm_kernel_L1_M1_BEGIN: | |||
| .Lcgemm_kernel_L1_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble cgemm_kernel_L1_END | |||
| ble .Lcgemm_kernel_L1_END | |||
| cgemm_kernel_L1_M1_20: | |||
| .Lcgemm_kernel_L1_M1_20: | |||
| INIT1x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble cgemm_kernel_L1_M1_40 | |||
| ble .Lcgemm_kernel_L1_M1_40 | |||
| cgemm_kernel_L1_M1_22: | |||
| .Lcgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| @@ -1640,30 +1640,30 @@ cgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M1_22 | |||
| bgt .Lcgemm_kernel_L1_M1_22 | |||
| cgemm_kernel_L1_M1_40: | |||
| .Lcgemm_kernel_L1_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L1_M1_100 | |||
| ble .Lcgemm_kernel_L1_M1_100 | |||
| cgemm_kernel_L1_M1_42: | |||
| .Lcgemm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M1_42 | |||
| bgt .Lcgemm_kernel_L1_M1_42 | |||
| cgemm_kernel_L1_M1_100: | |||
| .Lcgemm_kernel_L1_M1_100: | |||
| SAVE1x1 | |||
| cgemm_kernel_L1_END: | |||
| .Lcgemm_kernel_L1_END: | |||
| cgemm_kernel_L999: | |||
| .Lcgemm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| @@ -1407,11 +1407,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #2 // J = J / 4 | |||
| cmp counterJ, #0 | |||
| ble cgemm_kernel_L2_BEGIN | |||
| ble .Lcgemm_kernel_L2_BEGIN | |||
| /******************************************************************************/ | |||
| cgemm_kernel_L4_BEGIN: | |||
| .Lcgemm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| @@ -1421,21 +1421,21 @@ cgemm_kernel_L4_BEGIN: | |||
| mov pA, origPA // pA = start of A array | |||
| cgemm_kernel_L4_M8_BEGIN: | |||
| .Lcgemm_kernel_L4_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble cgemm_kernel_L4_M4_BEGIN | |||
| ble .Lcgemm_kernel_L4_M4_BEGIN | |||
| .align 5 | |||
| cgemm_kernel_L4_M8_20: | |||
| .Lcgemm_kernel_L4_M8_20: | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 | |||
| cmp counterL , #2 | |||
| blt cgemm_kernel_L4_M8_32 | |||
| blt .Lcgemm_kernel_L4_M8_32 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| @@ -1447,10 +1447,10 @@ cgemm_kernel_L4_M8_20: | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble cgemm_kernel_L4_M8_22a | |||
| ble .Lcgemm_kernel_L4_M8_22a | |||
| .align 5 | |||
| cgemm_kernel_L4_M8_22: | |||
| .Lcgemm_kernel_L4_M8_22: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| @@ -1462,10 +1462,10 @@ cgemm_kernel_L4_M8_22: | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M8_22 | |||
| bgt .Lcgemm_kernel_L4_M8_22 | |||
| .align 5 | |||
| cgemm_kernel_L4_M8_22a: | |||
| .Lcgemm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| @@ -1476,13 +1476,13 @@ cgemm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b cgemm_kernel_L4_M8_44 | |||
| b .Lcgemm_kernel_L4_M8_44 | |||
| .align 5 | |||
| cgemm_kernel_L4_M8_32: | |||
| .Lcgemm_kernel_L4_M8_32: | |||
| tst counterL, #1 | |||
| ble cgemm_kernel_L4_M8_40 | |||
| ble .Lcgemm_kernel_L4_M8_40 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| @@ -1493,116 +1493,116 @@ cgemm_kernel_L4_M8_32: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b cgemm_kernel_L4_M8_44 | |||
| b .Lcgemm_kernel_L4_M8_44 | |||
| cgemm_kernel_L4_M8_40: | |||
| .Lcgemm_kernel_L4_M8_40: | |||
| INIT8x4 | |||
| cgemm_kernel_L4_M8_44: | |||
| .Lcgemm_kernel_L4_M8_44: | |||
| ands counterL , origK, #7 | |||
| ble cgemm_kernel_L4_M8_100 | |||
| ble .Lcgemm_kernel_L4_M8_100 | |||
| .align 5 | |||
| cgemm_kernel_L4_M8_46: | |||
| .Lcgemm_kernel_L4_M8_46: | |||
| KERNEL8x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne cgemm_kernel_L4_M8_46 | |||
| bne .Lcgemm_kernel_L4_M8_46 | |||
| cgemm_kernel_L4_M8_100: | |||
| .Lcgemm_kernel_L4_M8_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVE8x4 | |||
| cgemm_kernel_L4_M8_END: | |||
| .Lcgemm_kernel_L4_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bne cgemm_kernel_L4_M8_20 | |||
| bne .Lcgemm_kernel_L4_M8_20 | |||
| cgemm_kernel_L4_M4_BEGIN: | |||
| .Lcgemm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble cgemm_kernel_L4_END | |||
| ble .Lcgemm_kernel_L4_END | |||
| tst counterI, #4 | |||
| ble cgemm_kernel_L4_M2_BEGIN | |||
| ble .Lcgemm_kernel_L4_M2_BEGIN | |||
| cgemm_kernel_L4_M4_20: | |||
| .Lcgemm_kernel_L4_M4_20: | |||
| mov pB, origPB | |||
| asr counterL , origK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| blt cgemm_kernel_L4_M4_32 | |||
| blt .Lcgemm_kernel_L4_M4_32 | |||
| KERNEL4x4_I // do one in the K | |||
| KERNEL4x4_M2 // do another in the K | |||
| subs counterL, counterL, #2 | |||
| ble cgemm_kernel_L4_M4_22a | |||
| ble .Lcgemm_kernel_L4_M4_22a | |||
| .align 5 | |||
| cgemm_kernel_L4_M4_22: | |||
| .Lcgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M4_22 | |||
| bgt .Lcgemm_kernel_L4_M4_22 | |||
| cgemm_kernel_L4_M4_22a: | |||
| .Lcgemm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b cgemm_kernel_L4_M4_44 | |||
| cgemm_kernel_L4_M4_32: | |||
| b .Lcgemm_kernel_L4_M4_44 | |||
| .Lcgemm_kernel_L4_M4_32: | |||
| tst counterL, #1 | |||
| ble cgemm_kernel_L4_M4_40 | |||
| ble .Lcgemm_kernel_L4_M4_40 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_E | |||
| b cgemm_kernel_L4_M4_44 | |||
| cgemm_kernel_L4_M4_40: | |||
| b .Lcgemm_kernel_L4_M4_44 | |||
| .Lcgemm_kernel_L4_M4_40: | |||
| INIT4x4 | |||
| cgemm_kernel_L4_M4_44: | |||
| .Lcgemm_kernel_L4_M4_44: | |||
| ands counterL , origK, #1 | |||
| ble cgemm_kernel_L4_M4_100 | |||
| ble .Lcgemm_kernel_L4_M4_100 | |||
| cgemm_kernel_L4_M4_46: | |||
| .Lcgemm_kernel_L4_M4_46: | |||
| KERNEL4x4_SUB | |||
| cgemm_kernel_L4_M4_100: | |||
| .Lcgemm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| cgemm_kernel_L4_M4_END: | |||
| .Lcgemm_kernel_L4_M4_END: | |||
| cgemm_kernel_L4_M2_BEGIN: | |||
| .Lcgemm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble cgemm_kernel_L4_END | |||
| ble .Lcgemm_kernel_L4_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble cgemm_kernel_L4_M1_BEGIN | |||
| ble .Lcgemm_kernel_L4_M1_BEGIN | |||
| cgemm_kernel_L4_M2_20: | |||
| .Lcgemm_kernel_L4_M2_20: | |||
| INIT2x4 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble cgemm_kernel_L4_M2_40 | |||
| ble .Lcgemm_kernel_L4_M2_40 | |||
| cgemm_kernel_L4_M2_22: | |||
| .Lcgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| KERNEL2x4_SUB | |||
| @@ -1615,43 +1615,43 @@ cgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M2_22 | |||
| bgt .Lcgemm_kernel_L4_M2_22 | |||
| cgemm_kernel_L4_M2_40: | |||
| .Lcgemm_kernel_L4_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L4_M2_100 | |||
| ble .Lcgemm_kernel_L4_M2_100 | |||
| cgemm_kernel_L4_M2_42: | |||
| .Lcgemm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M2_42 | |||
| bgt .Lcgemm_kernel_L4_M2_42 | |||
| cgemm_kernel_L4_M2_100: | |||
| .Lcgemm_kernel_L4_M2_100: | |||
| SAVE2x4 | |||
| cgemm_kernel_L4_M2_END: | |||
| .Lcgemm_kernel_L4_M2_END: | |||
| cgemm_kernel_L4_M1_BEGIN: | |||
| .Lcgemm_kernel_L4_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble cgemm_kernel_L4_END | |||
| ble .Lcgemm_kernel_L4_END | |||
| cgemm_kernel_L4_M1_20: | |||
| .Lcgemm_kernel_L4_M1_20: | |||
| INIT1x4 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble cgemm_kernel_L4_M1_40 | |||
| ble .Lcgemm_kernel_L4_M1_40 | |||
| cgemm_kernel_L4_M1_22: | |||
| .Lcgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| @@ -1663,45 +1663,45 @@ cgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M1_22 | |||
| bgt .Lcgemm_kernel_L4_M1_22 | |||
| cgemm_kernel_L4_M1_40: | |||
| .Lcgemm_kernel_L4_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L4_M1_100 | |||
| ble .Lcgemm_kernel_L4_M1_100 | |||
| cgemm_kernel_L4_M1_42: | |||
| .Lcgemm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M1_42 | |||
| bgt .Lcgemm_kernel_L4_M1_42 | |||
| cgemm_kernel_L4_M1_100: | |||
| .Lcgemm_kernel_L4_M1_100: | |||
| SAVE1x4 | |||
| cgemm_kernel_L4_END: | |||
| .Lcgemm_kernel_L4_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt cgemm_kernel_L4_BEGIN | |||
| bgt .Lcgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| cgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| .Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble cgemm_kernel_L999 // error, N was less than 4? | |||
| ble .Lcgemm_kernel_L999 // error, N was less than 4? | |||
| tst counterJ , #2 | |||
| ble cgemm_kernel_L1_BEGIN | |||
| ble .Lcgemm_kernel_L1_BEGIN | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| @@ -1710,14 +1710,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov pA, origPA // pA = A | |||
| cgemm_kernel_L2_M8_BEGIN: | |||
| .Lcgemm_kernel_L2_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble cgemm_kernel_L2_M4_BEGIN | |||
| ble .Lcgemm_kernel_L2_M4_BEGIN | |||
| cgemm_kernel_L2_M8_20: | |||
| .Lcgemm_kernel_L2_M8_20: | |||
| INIT8x2 | |||
| @@ -1725,10 +1725,10 @@ cgemm_kernel_L2_M8_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble cgemm_kernel_L2_M8_40 | |||
| ble .Lcgemm_kernel_L2_M8_40 | |||
| .align 5 | |||
| cgemm_kernel_L2_M8_22: | |||
| .Lcgemm_kernel_L2_M8_22: | |||
| KERNEL8x2_SUB | |||
| KERNEL8x2_SUB | |||
| KERNEL8x2_SUB | |||
| @@ -1740,50 +1740,50 @@ cgemm_kernel_L2_M8_22: | |||
| KERNEL8x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M8_22 | |||
| bgt .Lcgemm_kernel_L2_M8_22 | |||
| cgemm_kernel_L2_M8_40: | |||
| .Lcgemm_kernel_L2_M8_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L2_M8_100 | |||
| ble .Lcgemm_kernel_L2_M8_100 | |||
| cgemm_kernel_L2_M8_42: | |||
| .Lcgemm_kernel_L2_M8_42: | |||
| KERNEL8x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M8_42 | |||
| bgt .Lcgemm_kernel_L2_M8_42 | |||
| cgemm_kernel_L2_M8_100: | |||
| .Lcgemm_kernel_L2_M8_100: | |||
| SAVE8x2 | |||
| cgemm_kernel_L2_M8_END: | |||
| .Lcgemm_kernel_L2_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bgt cgemm_kernel_L2_M8_20 | |||
| bgt .Lcgemm_kernel_L2_M8_20 | |||
| cgemm_kernel_L2_M4_BEGIN: | |||
| .Lcgemm_kernel_L2_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble cgemm_kernel_L2_END | |||
| ble .Lcgemm_kernel_L2_END | |||
| tst counterI, #4 // counterI = counterI / 2 | |||
| ble cgemm_kernel_L2_M2_BEGIN | |||
| ble .Lcgemm_kernel_L2_M2_BEGIN | |||
| cgemm_kernel_L2_M4_20: | |||
| .Lcgemm_kernel_L2_M4_20: | |||
| INIT4x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble cgemm_kernel_L2_M4_40 | |||
| ble .Lcgemm_kernel_L2_M4_40 | |||
| .align 5 | |||
| cgemm_kernel_L2_M4_22: | |||
| .Lcgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| @@ -1795,46 +1795,46 @@ cgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M4_22 | |||
| bgt .Lcgemm_kernel_L2_M4_22 | |||
| cgemm_kernel_L2_M4_40: | |||
| .Lcgemm_kernel_L2_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L2_M4_100 | |||
| ble .Lcgemm_kernel_L2_M4_100 | |||
| cgemm_kernel_L2_M4_42: | |||
| .Lcgemm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M4_42 | |||
| bgt .Lcgemm_kernel_L2_M4_42 | |||
| cgemm_kernel_L2_M4_100: | |||
| .Lcgemm_kernel_L2_M4_100: | |||
| SAVE4x2 | |||
| cgemm_kernel_L2_M4_END: | |||
| .Lcgemm_kernel_L2_M4_END: | |||
| cgemm_kernel_L2_M2_BEGIN: | |||
| .Lcgemm_kernel_L2_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble cgemm_kernel_L2_END | |||
| ble .Lcgemm_kernel_L2_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble cgemm_kernel_L2_M1_BEGIN | |||
| ble .Lcgemm_kernel_L2_M1_BEGIN | |||
| cgemm_kernel_L2_M2_20: | |||
| .Lcgemm_kernel_L2_M2_20: | |||
| INIT2x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble cgemm_kernel_L2_M2_40 | |||
| ble .Lcgemm_kernel_L2_M2_40 | |||
| cgemm_kernel_L2_M2_22: | |||
| .Lcgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| @@ -1847,43 +1847,43 @@ cgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M2_22 | |||
| bgt .Lcgemm_kernel_L2_M2_22 | |||
| cgemm_kernel_L2_M2_40: | |||
| .Lcgemm_kernel_L2_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L2_M2_100 | |||
| ble .Lcgemm_kernel_L2_M2_100 | |||
| cgemm_kernel_L2_M2_42: | |||
| .Lcgemm_kernel_L2_M2_42: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M2_42 | |||
| bgt .Lcgemm_kernel_L2_M2_42 | |||
| cgemm_kernel_L2_M2_100: | |||
| .Lcgemm_kernel_L2_M2_100: | |||
| SAVE2x2 | |||
| cgemm_kernel_L2_M2_END: | |||
| .Lcgemm_kernel_L2_M2_END: | |||
| cgemm_kernel_L2_M1_BEGIN: | |||
| .Lcgemm_kernel_L2_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble cgemm_kernel_L2_END | |||
| ble .Lcgemm_kernel_L2_END | |||
| cgemm_kernel_L2_M1_20: | |||
| .Lcgemm_kernel_L2_M1_20: | |||
| INIT1x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble cgemm_kernel_L2_M1_40 | |||
| ble .Lcgemm_kernel_L2_M1_40 | |||
| cgemm_kernel_L2_M1_22: | |||
| .Lcgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| @@ -1895,36 +1895,36 @@ cgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M1_22 | |||
| bgt .Lcgemm_kernel_L2_M1_22 | |||
| cgemm_kernel_L2_M1_40: | |||
| .Lcgemm_kernel_L2_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L2_M1_100 | |||
| ble .Lcgemm_kernel_L2_M1_100 | |||
| cgemm_kernel_L2_M1_42: | |||
| .Lcgemm_kernel_L2_M1_42: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M1_42 | |||
| bgt .Lcgemm_kernel_L2_M1_42 | |||
| cgemm_kernel_L2_M1_100: | |||
| .Lcgemm_kernel_L2_M1_100: | |||
| SAVE1x2 | |||
| cgemm_kernel_L2_END: | |||
| .Lcgemm_kernel_L2_END: | |||
| add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 | |||
| /******************************************************************************/ | |||
| cgemm_kernel_L1_BEGIN: | |||
| .Lcgemm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble cgemm_kernel_L999 // done | |||
| ble .Lcgemm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| @@ -1933,24 +1933,24 @@ cgemm_kernel_L1_BEGIN: | |||
| mov pA, origPA // pA = A | |||
| cgemm_kernel_L1_M8_BEGIN: | |||
| .Lcgemm_kernel_L1_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble cgemm_kernel_L1_M4_BEGIN | |||
| ble .Lcgemm_kernel_L1_M4_BEGIN | |||
| cgemm_kernel_L1_M8_20: | |||
| .Lcgemm_kernel_L1_M8_20: | |||
| INIT8x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble cgemm_kernel_L1_M8_40 | |||
| ble .Lcgemm_kernel_L1_M8_40 | |||
| .align 5 | |||
| cgemm_kernel_L1_M8_22: | |||
| .Lcgemm_kernel_L1_M8_22: | |||
| KERNEL8x1_SUB | |||
| KERNEL8x1_SUB | |||
| KERNEL8x1_SUB | |||
| @@ -1962,51 +1962,51 @@ cgemm_kernel_L1_M8_22: | |||
| KERNEL8x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M8_22 | |||
| bgt .Lcgemm_kernel_L1_M8_22 | |||
| cgemm_kernel_L1_M8_40: | |||
| .Lcgemm_kernel_L1_M8_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L1_M8_100 | |||
| ble .Lcgemm_kernel_L1_M8_100 | |||
| cgemm_kernel_L1_M8_42: | |||
| .Lcgemm_kernel_L1_M8_42: | |||
| KERNEL8x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M8_42 | |||
| bgt .Lcgemm_kernel_L1_M8_42 | |||
| cgemm_kernel_L1_M8_100: | |||
| .Lcgemm_kernel_L1_M8_100: | |||
| SAVE8x1 | |||
| cgemm_kernel_L1_M8_END: | |||
| .Lcgemm_kernel_L1_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bgt cgemm_kernel_L1_M8_20 | |||
| bgt .Lcgemm_kernel_L1_M8_20 | |||
| cgemm_kernel_L1_M4_BEGIN: | |||
| .Lcgemm_kernel_L1_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble cgemm_kernel_L1_END | |||
| ble .Lcgemm_kernel_L1_END | |||
| tst counterI, #4 // counterI = counterI / 2 | |||
| ble cgemm_kernel_L1_M2_BEGIN | |||
| ble .Lcgemm_kernel_L1_M2_BEGIN | |||
| cgemm_kernel_L1_M4_20: | |||
| .Lcgemm_kernel_L1_M4_20: | |||
| INIT4x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble cgemm_kernel_L1_M4_40 | |||
| ble .Lcgemm_kernel_L1_M4_40 | |||
| .align 5 | |||
| cgemm_kernel_L1_M4_22: | |||
| .Lcgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| @@ -2018,47 +2018,47 @@ cgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M4_22 | |||
| bgt .Lcgemm_kernel_L1_M4_22 | |||
| cgemm_kernel_L1_M4_40: | |||
| .Lcgemm_kernel_L1_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L1_M4_100 | |||
| ble .Lcgemm_kernel_L1_M4_100 | |||
| cgemm_kernel_L1_M4_42: | |||
| .Lcgemm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M4_42 | |||
| bgt .Lcgemm_kernel_L1_M4_42 | |||
| cgemm_kernel_L1_M4_100: | |||
| .Lcgemm_kernel_L1_M4_100: | |||
| SAVE4x1 | |||
| cgemm_kernel_L1_M4_END: | |||
| .Lcgemm_kernel_L1_M4_END: | |||
| cgemm_kernel_L1_M2_BEGIN: | |||
| .Lcgemm_kernel_L1_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble cgemm_kernel_L1_END | |||
| ble .Lcgemm_kernel_L1_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble cgemm_kernel_L1_M1_BEGIN | |||
| ble .Lcgemm_kernel_L1_M1_BEGIN | |||
| cgemm_kernel_L1_M2_20: | |||
| .Lcgemm_kernel_L1_M2_20: | |||
| INIT2x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble cgemm_kernel_L1_M2_40 | |||
| ble .Lcgemm_kernel_L1_M2_40 | |||
| cgemm_kernel_L1_M2_22: | |||
| .Lcgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| @@ -2071,43 +2071,43 @@ cgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M2_22 | |||
| bgt .Lcgemm_kernel_L1_M2_22 | |||
| cgemm_kernel_L1_M2_40: | |||
| .Lcgemm_kernel_L1_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L1_M2_100 | |||
| ble .Lcgemm_kernel_L1_M2_100 | |||
| cgemm_kernel_L1_M2_42: | |||
| .Lcgemm_kernel_L1_M2_42: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M2_42 | |||
| bgt .Lcgemm_kernel_L1_M2_42 | |||
| cgemm_kernel_L1_M2_100: | |||
| .Lcgemm_kernel_L1_M2_100: | |||
| SAVE2x1 | |||
| cgemm_kernel_L1_M2_END: | |||
| .Lcgemm_kernel_L1_M2_END: | |||
| cgemm_kernel_L1_M1_BEGIN: | |||
| .Lcgemm_kernel_L1_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble cgemm_kernel_L1_END | |||
| ble .Lcgemm_kernel_L1_END | |||
| cgemm_kernel_L1_M1_20: | |||
| .Lcgemm_kernel_L1_M1_20: | |||
| INIT1x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble cgemm_kernel_L1_M1_40 | |||
| ble .Lcgemm_kernel_L1_M1_40 | |||
| cgemm_kernel_L1_M1_22: | |||
| .Lcgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| @@ -2119,30 +2119,30 @@ cgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M1_22 | |||
| bgt .Lcgemm_kernel_L1_M1_22 | |||
| cgemm_kernel_L1_M1_40: | |||
| .Lcgemm_kernel_L1_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L1_M1_100 | |||
| ble .Lcgemm_kernel_L1_M1_100 | |||
| cgemm_kernel_L1_M1_42: | |||
| .Lcgemm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M1_42 | |||
| bgt .Lcgemm_kernel_L1_M1_42 | |||
| cgemm_kernel_L1_M1_100: | |||
| .Lcgemm_kernel_L1_M1_100: | |||
| SAVE1x1 | |||
| cgemm_kernel_L1_END: | |||
| .Lcgemm_kernel_L1_END: | |||
| cgemm_kernel_L999: | |||
| .Lcgemm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| @@ -1432,11 +1432,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #2 // J = J / 4 | |||
| cmp counterJ, #0 | |||
| ble cgemm_kernel_L2_BEGIN | |||
| ble .Lcgemm_kernel_L2_BEGIN | |||
| /******************************************************************************/ | |||
| cgemm_kernel_L4_BEGIN: | |||
| .Lcgemm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| @@ -1446,21 +1446,21 @@ cgemm_kernel_L4_BEGIN: | |||
| mov pA, origPA // pA = start of A array | |||
| cgemm_kernel_L4_M8_BEGIN: | |||
| .Lcgemm_kernel_L4_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble cgemm_kernel_L4_M4_BEGIN | |||
| ble .Lcgemm_kernel_L4_M4_BEGIN | |||
| .align 5 | |||
| cgemm_kernel_L4_M8_20: | |||
| .Lcgemm_kernel_L4_M8_20: | |||
| mov pB, origPB | |||
| asr counterL , origK, #5 // origK / 32 | |||
| cmp counterL , #2 | |||
| blt cgemm_kernel_L4_M8_32 | |||
| blt .Lcgemm_kernel_L4_M8_32 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| @@ -1470,18 +1470,18 @@ cgemm_kernel_L4_M8_20: | |||
| KERNEL8x4_M1_M2_x8 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble cgemm_kernel_L4_M8_22a | |||
| ble .Lcgemm_kernel_L4_M8_22a | |||
| .align 5 | |||
| cgemm_kernel_L4_M8_22: | |||
| .Lcgemm_kernel_L4_M8_22: | |||
| KERNEL8x4_M1_M2_x16 | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M8_22 | |||
| bgt .Lcgemm_kernel_L4_M8_22 | |||
| .align 5 | |||
| cgemm_kernel_L4_M8_22a: | |||
| .Lcgemm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1_M2_x8 | |||
| KERNEL8x4_M1_M2_x4 | |||
| @@ -1490,13 +1490,13 @@ cgemm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b cgemm_kernel_L4_M8_44 | |||
| b .Lcgemm_kernel_L4_M8_44 | |||
| .align 5 | |||
| cgemm_kernel_L4_M8_32: | |||
| .Lcgemm_kernel_L4_M8_32: | |||
| tst counterL, #1 | |||
| ble cgemm_kernel_L4_M8_40 | |||
| ble .Lcgemm_kernel_L4_M8_40 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| @@ -1506,116 +1506,116 @@ cgemm_kernel_L4_M8_32: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b cgemm_kernel_L4_M8_44 | |||
| b .Lcgemm_kernel_L4_M8_44 | |||
| cgemm_kernel_L4_M8_40: | |||
| .Lcgemm_kernel_L4_M8_40: | |||
| INIT8x4 | |||
| cgemm_kernel_L4_M8_44: | |||
| .Lcgemm_kernel_L4_M8_44: | |||
| ands counterL , origK, #31 | |||
| ble cgemm_kernel_L4_M8_100 | |||
| ble .Lcgemm_kernel_L4_M8_100 | |||
| .align 5 | |||
| cgemm_kernel_L4_M8_46: | |||
| .Lcgemm_kernel_L4_M8_46: | |||
| KERNEL8x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne cgemm_kernel_L4_M8_46 | |||
| bne .Lcgemm_kernel_L4_M8_46 | |||
| cgemm_kernel_L4_M8_100: | |||
| .Lcgemm_kernel_L4_M8_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVE8x4 | |||
| cgemm_kernel_L4_M8_END: | |||
| .Lcgemm_kernel_L4_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bne cgemm_kernel_L4_M8_20 | |||
| bne .Lcgemm_kernel_L4_M8_20 | |||
| cgemm_kernel_L4_M4_BEGIN: | |||
| .Lcgemm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble cgemm_kernel_L4_END | |||
| ble .Lcgemm_kernel_L4_END | |||
| tst counterI, #4 | |||
| ble cgemm_kernel_L4_M2_BEGIN | |||
| ble .Lcgemm_kernel_L4_M2_BEGIN | |||
| cgemm_kernel_L4_M4_20: | |||
| .Lcgemm_kernel_L4_M4_20: | |||
| mov pB, origPB | |||
| asr counterL , origK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| blt cgemm_kernel_L4_M4_32 | |||
| blt .Lcgemm_kernel_L4_M4_32 | |||
| KERNEL4x4_I // do one in the K | |||
| KERNEL4x4_M2 // do another in the K | |||
| subs counterL, counterL, #2 | |||
| ble cgemm_kernel_L4_M4_22a | |||
| ble .Lcgemm_kernel_L4_M4_22a | |||
| .align 5 | |||
| cgemm_kernel_L4_M4_22: | |||
| .Lcgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M4_22 | |||
| bgt .Lcgemm_kernel_L4_M4_22 | |||
| cgemm_kernel_L4_M4_22a: | |||
| .Lcgemm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b cgemm_kernel_L4_M4_44 | |||
| cgemm_kernel_L4_M4_32: | |||
| b .Lcgemm_kernel_L4_M4_44 | |||
| .Lcgemm_kernel_L4_M4_32: | |||
| tst counterL, #1 | |||
| ble cgemm_kernel_L4_M4_40 | |||
| ble .Lcgemm_kernel_L4_M4_40 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_E | |||
| b cgemm_kernel_L4_M4_44 | |||
| cgemm_kernel_L4_M4_40: | |||
| b .Lcgemm_kernel_L4_M4_44 | |||
| .Lcgemm_kernel_L4_M4_40: | |||
| INIT4x4 | |||
| cgemm_kernel_L4_M4_44: | |||
| .Lcgemm_kernel_L4_M4_44: | |||
| ands counterL , origK, #1 | |||
| ble cgemm_kernel_L4_M4_100 | |||
| ble .Lcgemm_kernel_L4_M4_100 | |||
| cgemm_kernel_L4_M4_46: | |||
| .Lcgemm_kernel_L4_M4_46: | |||
| KERNEL4x4_SUB | |||
| cgemm_kernel_L4_M4_100: | |||
| .Lcgemm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| cgemm_kernel_L4_M4_END: | |||
| .Lcgemm_kernel_L4_M4_END: | |||
| cgemm_kernel_L4_M2_BEGIN: | |||
| .Lcgemm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble cgemm_kernel_L4_END | |||
| ble .Lcgemm_kernel_L4_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble cgemm_kernel_L4_M1_BEGIN | |||
| ble .Lcgemm_kernel_L4_M1_BEGIN | |||
| cgemm_kernel_L4_M2_20: | |||
| .Lcgemm_kernel_L4_M2_20: | |||
| INIT2x4 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble cgemm_kernel_L4_M2_40 | |||
| ble .Lcgemm_kernel_L4_M2_40 | |||
| cgemm_kernel_L4_M2_22: | |||
| .Lcgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| KERNEL2x4_SUB | |||
| @@ -1628,43 +1628,43 @@ cgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M2_22 | |||
| bgt .Lcgemm_kernel_L4_M2_22 | |||
| cgemm_kernel_L4_M2_40: | |||
| .Lcgemm_kernel_L4_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L4_M2_100 | |||
| ble .Lcgemm_kernel_L4_M2_100 | |||
| cgemm_kernel_L4_M2_42: | |||
| .Lcgemm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M2_42 | |||
| bgt .Lcgemm_kernel_L4_M2_42 | |||
| cgemm_kernel_L4_M2_100: | |||
| .Lcgemm_kernel_L4_M2_100: | |||
| SAVE2x4 | |||
| cgemm_kernel_L4_M2_END: | |||
| .Lcgemm_kernel_L4_M2_END: | |||
| cgemm_kernel_L4_M1_BEGIN: | |||
| .Lcgemm_kernel_L4_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble cgemm_kernel_L4_END | |||
| ble .Lcgemm_kernel_L4_END | |||
| cgemm_kernel_L4_M1_20: | |||
| .Lcgemm_kernel_L4_M1_20: | |||
| INIT1x4 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble cgemm_kernel_L4_M1_40 | |||
| ble .Lcgemm_kernel_L4_M1_40 | |||
| cgemm_kernel_L4_M1_22: | |||
| .Lcgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| @@ -1676,45 +1676,45 @@ cgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M1_22 | |||
| bgt .Lcgemm_kernel_L4_M1_22 | |||
| cgemm_kernel_L4_M1_40: | |||
| .Lcgemm_kernel_L4_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L4_M1_100 | |||
| ble .Lcgemm_kernel_L4_M1_100 | |||
| cgemm_kernel_L4_M1_42: | |||
| .Lcgemm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L4_M1_42 | |||
| bgt .Lcgemm_kernel_L4_M1_42 | |||
| cgemm_kernel_L4_M1_100: | |||
| .Lcgemm_kernel_L4_M1_100: | |||
| SAVE1x4 | |||
| cgemm_kernel_L4_END: | |||
| .Lcgemm_kernel_L4_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt cgemm_kernel_L4_BEGIN | |||
| bgt .Lcgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| cgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| .Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble cgemm_kernel_L999 // error, N was less than 4? | |||
| ble .Lcgemm_kernel_L999 // error, N was less than 4? | |||
| tst counterJ , #2 | |||
| ble cgemm_kernel_L1_BEGIN | |||
| ble .Lcgemm_kernel_L1_BEGIN | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| @@ -1723,14 +1723,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov pA, origPA // pA = A | |||
| cgemm_kernel_L2_M8_BEGIN: | |||
| .Lcgemm_kernel_L2_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble cgemm_kernel_L2_M4_BEGIN | |||
| ble .Lcgemm_kernel_L2_M4_BEGIN | |||
| cgemm_kernel_L2_M8_20: | |||
| .Lcgemm_kernel_L2_M8_20: | |||
| INIT8x2 | |||
| @@ -1738,10 +1738,10 @@ cgemm_kernel_L2_M8_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble cgemm_kernel_L2_M8_40 | |||
| ble .Lcgemm_kernel_L2_M8_40 | |||
| .align 5 | |||
| cgemm_kernel_L2_M8_22: | |||
| .Lcgemm_kernel_L2_M8_22: | |||
| KERNEL8x2_SUB | |||
| KERNEL8x2_SUB | |||
| KERNEL8x2_SUB | |||
| @@ -1753,50 +1753,50 @@ cgemm_kernel_L2_M8_22: | |||
| KERNEL8x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M8_22 | |||
| bgt .Lcgemm_kernel_L2_M8_22 | |||
| cgemm_kernel_L2_M8_40: | |||
| .Lcgemm_kernel_L2_M8_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L2_M8_100 | |||
| ble .Lcgemm_kernel_L2_M8_100 | |||
| cgemm_kernel_L2_M8_42: | |||
| .Lcgemm_kernel_L2_M8_42: | |||
| KERNEL8x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M8_42 | |||
| bgt .Lcgemm_kernel_L2_M8_42 | |||
| cgemm_kernel_L2_M8_100: | |||
| .Lcgemm_kernel_L2_M8_100: | |||
| SAVE8x2 | |||
| cgemm_kernel_L2_M8_END: | |||
| .Lcgemm_kernel_L2_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bgt cgemm_kernel_L2_M8_20 | |||
| bgt .Lcgemm_kernel_L2_M8_20 | |||
| cgemm_kernel_L2_M4_BEGIN: | |||
| .Lcgemm_kernel_L2_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble cgemm_kernel_L2_END | |||
| ble .Lcgemm_kernel_L2_END | |||
| tst counterI, #4 // counterI = counterI / 2 | |||
| ble cgemm_kernel_L2_M2_BEGIN | |||
| ble .Lcgemm_kernel_L2_M2_BEGIN | |||
| cgemm_kernel_L2_M4_20: | |||
| .Lcgemm_kernel_L2_M4_20: | |||
| INIT4x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble cgemm_kernel_L2_M4_40 | |||
| ble .Lcgemm_kernel_L2_M4_40 | |||
| .align 5 | |||
| cgemm_kernel_L2_M4_22: | |||
| .Lcgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| @@ -1808,46 +1808,46 @@ cgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M4_22 | |||
| bgt .Lcgemm_kernel_L2_M4_22 | |||
| cgemm_kernel_L2_M4_40: | |||
| .Lcgemm_kernel_L2_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L2_M4_100 | |||
| ble .Lcgemm_kernel_L2_M4_100 | |||
| cgemm_kernel_L2_M4_42: | |||
| .Lcgemm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M4_42 | |||
| bgt .Lcgemm_kernel_L2_M4_42 | |||
| cgemm_kernel_L2_M4_100: | |||
| .Lcgemm_kernel_L2_M4_100: | |||
| SAVE4x2 | |||
| cgemm_kernel_L2_M4_END: | |||
| .Lcgemm_kernel_L2_M4_END: | |||
| cgemm_kernel_L2_M2_BEGIN: | |||
| .Lcgemm_kernel_L2_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble cgemm_kernel_L2_END | |||
| ble .Lcgemm_kernel_L2_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble cgemm_kernel_L2_M1_BEGIN | |||
| ble .Lcgemm_kernel_L2_M1_BEGIN | |||
| cgemm_kernel_L2_M2_20: | |||
| .Lcgemm_kernel_L2_M2_20: | |||
| INIT2x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble cgemm_kernel_L2_M2_40 | |||
| ble .Lcgemm_kernel_L2_M2_40 | |||
| cgemm_kernel_L2_M2_22: | |||
| .Lcgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| @@ -1860,43 +1860,43 @@ cgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M2_22 | |||
| bgt .Lcgemm_kernel_L2_M2_22 | |||
| cgemm_kernel_L2_M2_40: | |||
| .Lcgemm_kernel_L2_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L2_M2_100 | |||
| ble .Lcgemm_kernel_L2_M2_100 | |||
| cgemm_kernel_L2_M2_42: | |||
| .Lcgemm_kernel_L2_M2_42: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M2_42 | |||
| bgt .Lcgemm_kernel_L2_M2_42 | |||
| cgemm_kernel_L2_M2_100: | |||
| .Lcgemm_kernel_L2_M2_100: | |||
| SAVE2x2 | |||
| cgemm_kernel_L2_M2_END: | |||
| .Lcgemm_kernel_L2_M2_END: | |||
| cgemm_kernel_L2_M1_BEGIN: | |||
| .Lcgemm_kernel_L2_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble cgemm_kernel_L2_END | |||
| ble .Lcgemm_kernel_L2_END | |||
| cgemm_kernel_L2_M1_20: | |||
| .Lcgemm_kernel_L2_M1_20: | |||
| INIT1x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble cgemm_kernel_L2_M1_40 | |||
| ble .Lcgemm_kernel_L2_M1_40 | |||
| cgemm_kernel_L2_M1_22: | |||
| .Lcgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| @@ -1908,36 +1908,36 @@ cgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M1_22 | |||
| bgt .Lcgemm_kernel_L2_M1_22 | |||
| cgemm_kernel_L2_M1_40: | |||
| .Lcgemm_kernel_L2_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L2_M1_100 | |||
| ble .Lcgemm_kernel_L2_M1_100 | |||
| cgemm_kernel_L2_M1_42: | |||
| .Lcgemm_kernel_L2_M1_42: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L2_M1_42 | |||
| bgt .Lcgemm_kernel_L2_M1_42 | |||
| cgemm_kernel_L2_M1_100: | |||
| .Lcgemm_kernel_L2_M1_100: | |||
| SAVE1x2 | |||
| cgemm_kernel_L2_END: | |||
| .Lcgemm_kernel_L2_END: | |||
| add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 | |||
| /******************************************************************************/ | |||
| cgemm_kernel_L1_BEGIN: | |||
| .Lcgemm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble cgemm_kernel_L999 // done | |||
| ble .Lcgemm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| @@ -1946,24 +1946,24 @@ cgemm_kernel_L1_BEGIN: | |||
| mov pA, origPA // pA = A | |||
| cgemm_kernel_L1_M8_BEGIN: | |||
| .Lcgemm_kernel_L1_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble cgemm_kernel_L1_M4_BEGIN | |||
| ble .Lcgemm_kernel_L1_M4_BEGIN | |||
| cgemm_kernel_L1_M8_20: | |||
| .Lcgemm_kernel_L1_M8_20: | |||
| INIT8x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble cgemm_kernel_L1_M8_40 | |||
| ble .Lcgemm_kernel_L1_M8_40 | |||
| .align 5 | |||
| cgemm_kernel_L1_M8_22: | |||
| .Lcgemm_kernel_L1_M8_22: | |||
| KERNEL8x1_SUB | |||
| KERNEL8x1_SUB | |||
| KERNEL8x1_SUB | |||
| @@ -1975,51 +1975,51 @@ cgemm_kernel_L1_M8_22: | |||
| KERNEL8x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M8_22 | |||
| bgt .Lcgemm_kernel_L1_M8_22 | |||
| cgemm_kernel_L1_M8_40: | |||
| .Lcgemm_kernel_L1_M8_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L1_M8_100 | |||
| ble .Lcgemm_kernel_L1_M8_100 | |||
| cgemm_kernel_L1_M8_42: | |||
| .Lcgemm_kernel_L1_M8_42: | |||
| KERNEL8x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M8_42 | |||
| bgt .Lcgemm_kernel_L1_M8_42 | |||
| cgemm_kernel_L1_M8_100: | |||
| .Lcgemm_kernel_L1_M8_100: | |||
| SAVE8x1 | |||
| cgemm_kernel_L1_M8_END: | |||
| .Lcgemm_kernel_L1_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bgt cgemm_kernel_L1_M8_20 | |||
| bgt .Lcgemm_kernel_L1_M8_20 | |||
| cgemm_kernel_L1_M4_BEGIN: | |||
| .Lcgemm_kernel_L1_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble cgemm_kernel_L1_END | |||
| ble .Lcgemm_kernel_L1_END | |||
| tst counterI, #4 // counterI = counterI / 2 | |||
| ble cgemm_kernel_L1_M2_BEGIN | |||
| ble .Lcgemm_kernel_L1_M2_BEGIN | |||
| cgemm_kernel_L1_M4_20: | |||
| .Lcgemm_kernel_L1_M4_20: | |||
| INIT4x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble cgemm_kernel_L1_M4_40 | |||
| ble .Lcgemm_kernel_L1_M4_40 | |||
| .align 5 | |||
| cgemm_kernel_L1_M4_22: | |||
| .Lcgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| @@ -2031,47 +2031,47 @@ cgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M4_22 | |||
| bgt .Lcgemm_kernel_L1_M4_22 | |||
| cgemm_kernel_L1_M4_40: | |||
| .Lcgemm_kernel_L1_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L1_M4_100 | |||
| ble .Lcgemm_kernel_L1_M4_100 | |||
| cgemm_kernel_L1_M4_42: | |||
| .Lcgemm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M4_42 | |||
| bgt .Lcgemm_kernel_L1_M4_42 | |||
| cgemm_kernel_L1_M4_100: | |||
| .Lcgemm_kernel_L1_M4_100: | |||
| SAVE4x1 | |||
| cgemm_kernel_L1_M4_END: | |||
| .Lcgemm_kernel_L1_M4_END: | |||
| cgemm_kernel_L1_M2_BEGIN: | |||
| .Lcgemm_kernel_L1_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble cgemm_kernel_L1_END | |||
| ble .Lcgemm_kernel_L1_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble cgemm_kernel_L1_M1_BEGIN | |||
| ble .Lcgemm_kernel_L1_M1_BEGIN | |||
| cgemm_kernel_L1_M2_20: | |||
| .Lcgemm_kernel_L1_M2_20: | |||
| INIT2x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble cgemm_kernel_L1_M2_40 | |||
| ble .Lcgemm_kernel_L1_M2_40 | |||
| cgemm_kernel_L1_M2_22: | |||
| .Lcgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| @@ -2084,43 +2084,43 @@ cgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M2_22 | |||
| bgt .Lcgemm_kernel_L1_M2_22 | |||
| cgemm_kernel_L1_M2_40: | |||
| .Lcgemm_kernel_L1_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L1_M2_100 | |||
| ble .Lcgemm_kernel_L1_M2_100 | |||
| cgemm_kernel_L1_M2_42: | |||
| .Lcgemm_kernel_L1_M2_42: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M2_42 | |||
| bgt .Lcgemm_kernel_L1_M2_42 | |||
| cgemm_kernel_L1_M2_100: | |||
| .Lcgemm_kernel_L1_M2_100: | |||
| SAVE2x1 | |||
| cgemm_kernel_L1_M2_END: | |||
| .Lcgemm_kernel_L1_M2_END: | |||
| cgemm_kernel_L1_M1_BEGIN: | |||
| .Lcgemm_kernel_L1_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble cgemm_kernel_L1_END | |||
| ble .Lcgemm_kernel_L1_END | |||
| cgemm_kernel_L1_M1_20: | |||
| .Lcgemm_kernel_L1_M1_20: | |||
| INIT1x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble cgemm_kernel_L1_M1_40 | |||
| ble .Lcgemm_kernel_L1_M1_40 | |||
| cgemm_kernel_L1_M1_22: | |||
| .Lcgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| @@ -2132,30 +2132,30 @@ cgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M1_22 | |||
| bgt .Lcgemm_kernel_L1_M1_22 | |||
| cgemm_kernel_L1_M1_40: | |||
| .Lcgemm_kernel_L1_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble cgemm_kernel_L1_M1_100 | |||
| ble .Lcgemm_kernel_L1_M1_100 | |||
| cgemm_kernel_L1_M1_42: | |||
| .Lcgemm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt cgemm_kernel_L1_M1_42 | |||
| bgt .Lcgemm_kernel_L1_M1_42 | |||
| cgemm_kernel_L1_M1_100: | |||
| .Lcgemm_kernel_L1_M1_100: | |||
| SAVE1x1 | |||
| cgemm_kernel_L1_END: | |||
| .Lcgemm_kernel_L1_END: | |||
| cgemm_kernel_L999: | |||
| .Lcgemm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| @@ -159,50 +159,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| cmp N, xzr | |||
| ble copy_kernel_L999 | |||
| ble .Lcopy_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne copy_kernel_S_BEGIN | |||
| bne .Lcopy_kernel_S_BEGIN | |||
| cmp INC_Y, #1 | |||
| bne copy_kernel_S_BEGIN | |||
| bne .Lcopy_kernel_S_BEGIN | |||
| copy_kernel_F_BEGIN: | |||
| .Lcopy_kernel_F_BEGIN: | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| beq copy_kernel_F1 | |||
| beq .Lcopy_kernel_F1 | |||
| copy_kernel_F4: | |||
| .Lcopy_kernel_F4: | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne copy_kernel_F4 | |||
| bne .Lcopy_kernel_F4 | |||
| copy_kernel_F1: | |||
| .Lcopy_kernel_F1: | |||
| ands I, N, #3 | |||
| ble copy_kernel_L999 | |||
| ble .Lcopy_kernel_L999 | |||
| copy_kernel_F10: | |||
| .Lcopy_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne copy_kernel_F10 | |||
| bne .Lcopy_kernel_F10 | |||
| mov w0, wzr | |||
| ret | |||
| copy_kernel_S_BEGIN: | |||
| .Lcopy_kernel_S_BEGIN: | |||
| INIT_S | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble copy_kernel_S1 | |||
| ble .Lcopy_kernel_S1 | |||
| copy_kernel_S4: | |||
| .Lcopy_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -210,21 +210,21 @@ copy_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne copy_kernel_S4 | |||
| bne .Lcopy_kernel_S4 | |||
| copy_kernel_S1: | |||
| .Lcopy_kernel_S1: | |||
| ands I, N, #3 | |||
| ble copy_kernel_L999 | |||
| ble .Lcopy_kernel_L999 | |||
| copy_kernel_S10: | |||
| .Lcopy_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne copy_kernel_S10 | |||
| bne .Lcopy_kernel_S10 | |||
| copy_kernel_L999: | |||
| .Lcopy_kernel_L999: | |||
| mov w0, wzr | |||
| ret | |||
| @@ -785,11 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #2 // J = J / 4 | |||
| cmp counterJ, #0 | |||
| ble ctrmm_kernel_L2_BEGIN | |||
| ble .Lctrmm_kernel_L2_BEGIN | |||
| /******************************************************************************/ | |||
| ctrmm_kernel_L4_BEGIN: | |||
| .Lctrmm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #2 | |||
| @@ -798,14 +798,14 @@ ctrmm_kernel_L4_BEGIN: | |||
| #endif | |||
| mov pA, origPA // pA = start of A array | |||
| ctrmm_kernel_L4_M4_BEGIN: | |||
| .Lctrmm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble ctrmm_kernel_L4_M2_BEGIN | |||
| ble .Lctrmm_kernel_L4_M2_BEGIN | |||
| ctrmm_kernel_L4_M4_20: | |||
| .Lctrmm_kernel_L4_M4_20: | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| mov pB, origPB | |||
| @@ -826,55 +826,55 @@ ctrmm_kernel_L4_M4_20: | |||
| asr counterL , tempK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| blt ctrmm_kernel_L4_M4_32 | |||
| blt .Lctrmm_kernel_L4_M4_32 | |||
| KERNEL4x4_I // do one in the K | |||
| KERNEL4x4_M2 // do another in the K | |||
| subs counterL, counterL, #2 | |||
| ble ctrmm_kernel_L4_M4_22a | |||
| ble .Lctrmm_kernel_L4_M4_22a | |||
| .align 5 | |||
| ctrmm_kernel_L4_M4_22: | |||
| .Lctrmm_kernel_L4_M4_22: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L4_M4_22 | |||
| bgt .Lctrmm_kernel_L4_M4_22 | |||
| ctrmm_kernel_L4_M4_22a: | |||
| .Lctrmm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b ctrmm_kernel_L4_M4_44 | |||
| b .Lctrmm_kernel_L4_M4_44 | |||
| ctrmm_kernel_L4_M4_32: | |||
| .Lctrmm_kernel_L4_M4_32: | |||
| tst counterL, #1 | |||
| ble ctrmm_kernel_L4_M4_40 | |||
| ble .Lctrmm_kernel_L4_M4_40 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_E | |||
| b ctrmm_kernel_L4_M4_44 | |||
| b .Lctrmm_kernel_L4_M4_44 | |||
| ctrmm_kernel_L4_M4_40: | |||
| .Lctrmm_kernel_L4_M4_40: | |||
| INIT4x4 | |||
| ctrmm_kernel_L4_M4_44: | |||
| .Lctrmm_kernel_L4_M4_44: | |||
| ands counterL , tempK, #1 | |||
| ble ctrmm_kernel_L4_M4_100 | |||
| ble .Lctrmm_kernel_L4_M4_100 | |||
| ctrmm_kernel_L4_M4_46: | |||
| .Lctrmm_kernel_L4_M4_46: | |||
| KERNEL4x4_SUB | |||
| ctrmm_kernel_L4_M4_100: | |||
| .Lctrmm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| @@ -893,20 +893,20 @@ ctrmm_kernel_L4_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| ctrmm_kernel_L4_M4_END: | |||
| .Lctrmm_kernel_L4_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bne ctrmm_kernel_L4_M4_20 | |||
| bne .Lctrmm_kernel_L4_M4_20 | |||
| ctrmm_kernel_L4_M2_BEGIN: | |||
| .Lctrmm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble ctrmm_kernel_L4_END | |||
| ble .Lctrmm_kernel_L4_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble ctrmm_kernel_L4_M1_BEGIN | |||
| ble .Lctrmm_kernel_L4_M1_BEGIN | |||
| ctrmm_kernel_L4_M2_20: | |||
| .Lctrmm_kernel_L4_M2_20: | |||
| INIT2x4 | |||
| @@ -930,9 +930,9 @@ ctrmm_kernel_L4_M2_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble ctrmm_kernel_L4_M2_40 | |||
| ble .Lctrmm_kernel_L4_M2_40 | |||
| ctrmm_kernel_L4_M2_22: | |||
| .Lctrmm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| KERNEL2x4_SUB | |||
| @@ -945,22 +945,22 @@ ctrmm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L4_M2_22 | |||
| bgt .Lctrmm_kernel_L4_M2_22 | |||
| ctrmm_kernel_L4_M2_40: | |||
| .Lctrmm_kernel_L4_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L4_M2_100 | |||
| ble .Lctrmm_kernel_L4_M2_100 | |||
| ctrmm_kernel_L4_M2_42: | |||
| .Lctrmm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L4_M2_42 | |||
| bgt .Lctrmm_kernel_L4_M2_42 | |||
| ctrmm_kernel_L4_M2_100: | |||
| .Lctrmm_kernel_L4_M2_100: | |||
| SAVE2x4 | |||
| @@ -980,15 +980,15 @@ ctrmm_kernel_L4_M2_100: | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| ctrmm_kernel_L4_M2_END: | |||
| .Lctrmm_kernel_L4_M2_END: | |||
| ctrmm_kernel_L4_M1_BEGIN: | |||
| .Lctrmm_kernel_L4_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble ctrmm_kernel_L4_END | |||
| ble .Lctrmm_kernel_L4_END | |||
| ctrmm_kernel_L4_M1_20: | |||
| .Lctrmm_kernel_L4_M1_20: | |||
| INIT1x4 | |||
| @@ -1012,9 +1012,9 @@ ctrmm_kernel_L4_M1_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble ctrmm_kernel_L4_M1_40 | |||
| ble .Lctrmm_kernel_L4_M1_40 | |||
| ctrmm_kernel_L4_M1_22: | |||
| .Lctrmm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| @@ -1026,22 +1026,22 @@ ctrmm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L4_M1_22 | |||
| bgt .Lctrmm_kernel_L4_M1_22 | |||
| ctrmm_kernel_L4_M1_40: | |||
| .Lctrmm_kernel_L4_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L4_M1_100 | |||
| ble .Lctrmm_kernel_L4_M1_100 | |||
| ctrmm_kernel_L4_M1_42: | |||
| .Lctrmm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L4_M1_42 | |||
| bgt .Lctrmm_kernel_L4_M1_42 | |||
| ctrmm_kernel_L4_M1_100: | |||
| .Lctrmm_kernel_L4_M1_100: | |||
| SAVE1x4 | |||
| @@ -1061,7 +1061,7 @@ ctrmm_kernel_L4_M1_100: | |||
| add tempOffset, tempOffset, #1 | |||
| #endif | |||
| ctrmm_kernel_L4_END: | |||
| .Lctrmm_kernel_L4_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 | |||
| @@ -1071,19 +1071,19 @@ ctrmm_kernel_L4_END: | |||
| #endif | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt ctrmm_kernel_L4_BEGIN | |||
| bgt .Lctrmm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| .Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble ctrmm_kernel_L999 // error, N was less than 4? | |||
| ble .Lctrmm_kernel_L999 // error, N was less than 4? | |||
| tst counterJ , #2 | |||
| ble ctrmm_kernel_L1_BEGIN | |||
| ble .Lctrmm_kernel_L1_BEGIN | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| @@ -1095,14 +1095,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov pA, origPA // pA = A | |||
| ctrmm_kernel_L2_M4_BEGIN: | |||
| .Lctrmm_kernel_L2_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI,#0 | |||
| ble ctrmm_kernel_L2_M2_BEGIN | |||
| ble .Lctrmm_kernel_L2_M2_BEGIN | |||
| ctrmm_kernel_L2_M4_20: | |||
| .Lctrmm_kernel_L2_M4_20: | |||
| INIT4x2 | |||
| @@ -1126,10 +1126,10 @@ ctrmm_kernel_L2_M4_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble ctrmm_kernel_L2_M4_40 | |||
| ble .Lctrmm_kernel_L2_M4_40 | |||
| .align 5 | |||
| ctrmm_kernel_L2_M4_22: | |||
| .Lctrmm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| @@ -1141,22 +1141,22 @@ ctrmm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L2_M4_22 | |||
| bgt .Lctrmm_kernel_L2_M4_22 | |||
| ctrmm_kernel_L2_M4_40: | |||
| .Lctrmm_kernel_L2_M4_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L2_M4_100 | |||
| ble .Lctrmm_kernel_L2_M4_100 | |||
| ctrmm_kernel_L2_M4_42: | |||
| .Lctrmm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L2_M4_42 | |||
| bgt .Lctrmm_kernel_L2_M4_42 | |||
| ctrmm_kernel_L2_M4_100: | |||
| .Lctrmm_kernel_L2_M4_100: | |||
| SAVE4x2 | |||
| @@ -1176,22 +1176,22 @@ ctrmm_kernel_L2_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| ctrmm_kernel_L2_M4_END: | |||
| .Lctrmm_kernel_L2_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt ctrmm_kernel_L2_M4_20 | |||
| bgt .Lctrmm_kernel_L2_M4_20 | |||
| ctrmm_kernel_L2_M2_BEGIN: | |||
| .Lctrmm_kernel_L2_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble ctrmm_kernel_L2_END | |||
| ble .Lctrmm_kernel_L2_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble ctrmm_kernel_L2_M1_BEGIN | |||
| ble .Lctrmm_kernel_L2_M1_BEGIN | |||
| ctrmm_kernel_L2_M2_20: | |||
| .Lctrmm_kernel_L2_M2_20: | |||
| INIT2x2 | |||
| @@ -1215,9 +1215,9 @@ ctrmm_kernel_L2_M2_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble ctrmm_kernel_L2_M2_40 | |||
| ble .Lctrmm_kernel_L2_M2_40 | |||
| ctrmm_kernel_L2_M2_22: | |||
| .Lctrmm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| @@ -1230,22 +1230,22 @@ ctrmm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L2_M2_22 | |||
| bgt .Lctrmm_kernel_L2_M2_22 | |||
| ctrmm_kernel_L2_M2_40: | |||
| .Lctrmm_kernel_L2_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L2_M2_100 | |||
| ble .Lctrmm_kernel_L2_M2_100 | |||
| ctrmm_kernel_L2_M2_42: | |||
| .Lctrmm_kernel_L2_M2_42: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L2_M2_42 | |||
| bgt .Lctrmm_kernel_L2_M2_42 | |||
| ctrmm_kernel_L2_M2_100: | |||
| .Lctrmm_kernel_L2_M2_100: | |||
| SAVE2x2 | |||
| @@ -1265,15 +1265,15 @@ ctrmm_kernel_L2_M2_100: | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| ctrmm_kernel_L2_M2_END: | |||
| .Lctrmm_kernel_L2_M2_END: | |||
| ctrmm_kernel_L2_M1_BEGIN: | |||
| .Lctrmm_kernel_L2_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble ctrmm_kernel_L2_END | |||
| ble .Lctrmm_kernel_L2_END | |||
| ctrmm_kernel_L2_M1_20: | |||
| .Lctrmm_kernel_L2_M1_20: | |||
| INIT1x2 | |||
| @@ -1297,9 +1297,9 @@ ctrmm_kernel_L2_M1_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble ctrmm_kernel_L2_M1_40 | |||
| ble .Lctrmm_kernel_L2_M1_40 | |||
| ctrmm_kernel_L2_M1_22: | |||
| .Lctrmm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| @@ -1311,22 +1311,22 @@ ctrmm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L2_M1_22 | |||
| bgt .Lctrmm_kernel_L2_M1_22 | |||
| ctrmm_kernel_L2_M1_40: | |||
| .Lctrmm_kernel_L2_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L2_M1_100 | |||
| ble .Lctrmm_kernel_L2_M1_100 | |||
| ctrmm_kernel_L2_M1_42: | |||
| .Lctrmm_kernel_L2_M1_42: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L2_M1_42 | |||
| bgt .Lctrmm_kernel_L2_M1_42 | |||
| ctrmm_kernel_L2_M1_100: | |||
| .Lctrmm_kernel_L2_M1_100: | |||
| SAVE1x2 | |||
| @@ -1346,7 +1346,7 @@ ctrmm_kernel_L2_M1_100: | |||
| add tempOffset, tempOffset, #1 | |||
| #endif | |||
| ctrmm_kernel_L2_END: | |||
| .Lctrmm_kernel_L2_END: | |||
| #if !defined(LEFT) | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| @@ -1354,11 +1354,11 @@ ctrmm_kernel_L2_END: | |||
| /******************************************************************************/ | |||
| ctrmm_kernel_L1_BEGIN: | |||
| .Lctrmm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble ctrmm_kernel_L999 // done | |||
| ble .Lctrmm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| @@ -1370,14 +1370,14 @@ ctrmm_kernel_L1_BEGIN: | |||
| mov pA, origPA // pA = A | |||
| ctrmm_kernel_L1_M4_BEGIN: | |||
| .Lctrmm_kernel_L1_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble ctrmm_kernel_L1_M2_BEGIN | |||
| ble .Lctrmm_kernel_L1_M2_BEGIN | |||
| ctrmm_kernel_L1_M4_20: | |||
| .Lctrmm_kernel_L1_M4_20: | |||
| INIT4x1 | |||
| @@ -1401,10 +1401,10 @@ ctrmm_kernel_L1_M4_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble ctrmm_kernel_L1_M4_40 | |||
| ble .Lctrmm_kernel_L1_M4_40 | |||
| .align 5 | |||
| ctrmm_kernel_L1_M4_22: | |||
| .Lctrmm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| @@ -1416,22 +1416,22 @@ ctrmm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L1_M4_22 | |||
| bgt .Lctrmm_kernel_L1_M4_22 | |||
| ctrmm_kernel_L1_M4_40: | |||
| .Lctrmm_kernel_L1_M4_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L1_M4_100 | |||
| ble .Lctrmm_kernel_L1_M4_100 | |||
| ctrmm_kernel_L1_M4_42: | |||
| .Lctrmm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L1_M4_42 | |||
| bgt .Lctrmm_kernel_L1_M4_42 | |||
| ctrmm_kernel_L1_M4_100: | |||
| .Lctrmm_kernel_L1_M4_100: | |||
| SAVE4x1 | |||
| @@ -1451,22 +1451,22 @@ ctrmm_kernel_L1_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| ctrmm_kernel_L1_M4_END: | |||
| .Lctrmm_kernel_L1_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt ctrmm_kernel_L1_M4_20 | |||
| bgt .Lctrmm_kernel_L1_M4_20 | |||
| ctrmm_kernel_L1_M2_BEGIN: | |||
| .Lctrmm_kernel_L1_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble ctrmm_kernel_L1_END | |||
| ble .Lctrmm_kernel_L1_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble ctrmm_kernel_L1_M1_BEGIN | |||
| ble .Lctrmm_kernel_L1_M1_BEGIN | |||
| ctrmm_kernel_L1_M2_20: | |||
| .Lctrmm_kernel_L1_M2_20: | |||
| INIT2x1 | |||
| @@ -1490,9 +1490,9 @@ ctrmm_kernel_L1_M2_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble ctrmm_kernel_L1_M2_40 | |||
| ble .Lctrmm_kernel_L1_M2_40 | |||
| ctrmm_kernel_L1_M2_22: | |||
| .Lctrmm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| @@ -1505,22 +1505,22 @@ ctrmm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L1_M2_22 | |||
| bgt .Lctrmm_kernel_L1_M2_22 | |||
| ctrmm_kernel_L1_M2_40: | |||
| .Lctrmm_kernel_L1_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L1_M2_100 | |||
| ble .Lctrmm_kernel_L1_M2_100 | |||
| ctrmm_kernel_L1_M2_42: | |||
| .Lctrmm_kernel_L1_M2_42: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L1_M2_42 | |||
| bgt .Lctrmm_kernel_L1_M2_42 | |||
| ctrmm_kernel_L1_M2_100: | |||
| .Lctrmm_kernel_L1_M2_100: | |||
| SAVE2x1 | |||
| @@ -1540,15 +1540,15 @@ ctrmm_kernel_L1_M2_100: | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| ctrmm_kernel_L1_M2_END: | |||
| .Lctrmm_kernel_L1_M2_END: | |||
| ctrmm_kernel_L1_M1_BEGIN: | |||
| .Lctrmm_kernel_L1_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble ctrmm_kernel_L1_END | |||
| ble .Lctrmm_kernel_L1_END | |||
| ctrmm_kernel_L1_M1_20: | |||
| .Lctrmm_kernel_L1_M1_20: | |||
| INIT1x1 | |||
| @@ -1572,9 +1572,9 @@ ctrmm_kernel_L1_M1_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble ctrmm_kernel_L1_M1_40 | |||
| ble .Lctrmm_kernel_L1_M1_40 | |||
| ctrmm_kernel_L1_M1_22: | |||
| .Lctrmm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| @@ -1586,30 +1586,30 @@ ctrmm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L1_M1_22 | |||
| bgt .Lctrmm_kernel_L1_M1_22 | |||
| ctrmm_kernel_L1_M1_40: | |||
| .Lctrmm_kernel_L1_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L1_M1_100 | |||
| ble .Lctrmm_kernel_L1_M1_100 | |||
| ctrmm_kernel_L1_M1_42: | |||
| .Lctrmm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L1_M1_42 | |||
| bgt .Lctrmm_kernel_L1_M1_42 | |||
| ctrmm_kernel_L1_M1_100: | |||
| .Lctrmm_kernel_L1_M1_100: | |||
| SAVE1x1 | |||
| ctrmm_kernel_L1_END: | |||
| .Lctrmm_kernel_L1_END: | |||
| ctrmm_kernel_L999: | |||
| .Lctrmm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| @@ -1405,11 +1405,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #2 // J = J / 4 | |||
| cmp counterJ, #0 | |||
| ble ctrmm_kernel_L2_BEGIN | |||
| ble .Lctrmm_kernel_L2_BEGIN | |||
| /******************************************************************************/ | |||
| ctrmm_kernel_L4_BEGIN: | |||
| .Lctrmm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| @@ -1423,14 +1423,14 @@ ctrmm_kernel_L4_BEGIN: | |||
| #endif | |||
| mov pA, origPA // pA = start of A array | |||
| ctrmm_kernel_L4_M8_BEGIN: | |||
| .Lctrmm_kernel_L4_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble ctrmm_kernel_L4_M4_BEGIN | |||
| ble .Lctrmm_kernel_L4_M4_BEGIN | |||
| ctrmm_kernel_L4_M8_20: | |||
| .Lctrmm_kernel_L4_M8_20: | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| mov pB, origPB | |||
| @@ -1452,7 +1452,7 @@ ctrmm_kernel_L4_M8_20: | |||
| asr counterL , tempK, #3 | |||
| cmp counterL , #2 | |||
| blt ctrmm_kernel_L4_M8_32 | |||
| blt .Lctrmm_kernel_L4_M8_32 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| @@ -1464,10 +1464,10 @@ ctrmm_kernel_L4_M8_20: | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble ctrmm_kernel_L4_M8_22a | |||
| ble .Lctrmm_kernel_L4_M8_22a | |||
| .align 5 | |||
| ctrmm_kernel_L4_M8_22: | |||
| .Lctrmm_kernel_L4_M8_22: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| @@ -1479,10 +1479,10 @@ ctrmm_kernel_L4_M8_22: | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L4_M8_22 | |||
| bgt .Lctrmm_kernel_L4_M8_22 | |||
| .align 5 | |||
| ctrmm_kernel_L4_M8_22a: | |||
| .Lctrmm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| @@ -1493,13 +1493,13 @@ ctrmm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b ctrmm_kernel_L4_M8_44 | |||
| b .Lctrmm_kernel_L4_M8_44 | |||
| .align 5 | |||
| ctrmm_kernel_L4_M8_32: | |||
| .Lctrmm_kernel_L4_M8_32: | |||
| tst counterL, #1 | |||
| ble ctrmm_kernel_L4_M8_40 | |||
| ble .Lctrmm_kernel_L4_M8_40 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| @@ -1510,26 +1510,26 @@ ctrmm_kernel_L4_M8_32: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b ctrmm_kernel_L4_M8_44 | |||
| b .Lctrmm_kernel_L4_M8_44 | |||
| ctrmm_kernel_L4_M8_40: | |||
| .Lctrmm_kernel_L4_M8_40: | |||
| INIT8x4 | |||
| ctrmm_kernel_L4_M8_44: | |||
| .Lctrmm_kernel_L4_M8_44: | |||
| ands counterL , tempK, #7 | |||
| ble ctrmm_kernel_L4_M8_100 | |||
| ble .Lctrmm_kernel_L4_M8_100 | |||
| .align 5 | |||
| ctrmm_kernel_L4_M8_46: | |||
| .Lctrmm_kernel_L4_M8_46: | |||
| KERNEL8x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne ctrmm_kernel_L4_M8_46 | |||
| bne .Lctrmm_kernel_L4_M8_46 | |||
| ctrmm_kernel_L4_M8_100: | |||
| .Lctrmm_kernel_L4_M8_100: | |||
| SAVE8x4 | |||
| @@ -1552,21 +1552,21 @@ ctrmm_kernel_L4_M8_100: | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| ctrmm_kernel_L4_M8_END: | |||
| .Lctrmm_kernel_L4_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bne ctrmm_kernel_L4_M8_20 | |||
| bne .Lctrmm_kernel_L4_M8_20 | |||
| ctrmm_kernel_L4_M4_BEGIN: | |||
| .Lctrmm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble ctrmm_kernel_L4_END | |||
| ble .Lctrmm_kernel_L4_END | |||
| tst counterI, #4 | |||
| ble ctrmm_kernel_L4_M2_BEGIN | |||
| ble .Lctrmm_kernel_L4_M2_BEGIN | |||
| ctrmm_kernel_L4_M4_20: | |||
| .Lctrmm_kernel_L4_M4_20: | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| mov pB, origPB | |||
| @@ -1587,46 +1587,46 @@ ctrmm_kernel_L4_M4_20: | |||
| asr counterL , tempK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| blt ctrmm_kernel_L4_M4_32 | |||
| blt .Lctrmm_kernel_L4_M4_32 | |||
| KERNEL4x4_I // do one in the K | |||
| KERNEL4x4_M2 // do another in the K | |||
| subs counterL, counterL, #2 | |||
| ble ctrmm_kernel_L4_M4_22a | |||
| ble .Lctrmm_kernel_L4_M4_22a | |||
| .align 5 | |||
| ctrmm_kernel_L4_M4_22: | |||
| .Lctrmm_kernel_L4_M4_22: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L4_M4_22 | |||
| bgt .Lctrmm_kernel_L4_M4_22 | |||
| ctrmm_kernel_L4_M4_22a: | |||
| .Lctrmm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b ctrmm_kernel_L4_M4_44 | |||
| ctrmm_kernel_L4_M4_32: | |||
| b .Lctrmm_kernel_L4_M4_44 | |||
| .Lctrmm_kernel_L4_M4_32: | |||
| tst counterL, #1 | |||
| ble ctrmm_kernel_L4_M4_40 | |||
| ble .Lctrmm_kernel_L4_M4_40 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_E | |||
| b ctrmm_kernel_L4_M4_44 | |||
| ctrmm_kernel_L4_M4_40: | |||
| b .Lctrmm_kernel_L4_M4_44 | |||
| .Lctrmm_kernel_L4_M4_40: | |||
| INIT4x4 | |||
| ctrmm_kernel_L4_M4_44: | |||
| .Lctrmm_kernel_L4_M4_44: | |||
| ands counterL , tempK, #1 | |||
| ble ctrmm_kernel_L4_M4_100 | |||
| ble .Lctrmm_kernel_L4_M4_100 | |||
| ctrmm_kernel_L4_M4_46: | |||
| .Lctrmm_kernel_L4_M4_46: | |||
| KERNEL4x4_SUB | |||
| ctrmm_kernel_L4_M4_100: | |||
| .Lctrmm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| @@ -1645,18 +1645,18 @@ ctrmm_kernel_L4_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| ctrmm_kernel_L4_M4_END: | |||
| .Lctrmm_kernel_L4_M4_END: | |||
| ctrmm_kernel_L4_M2_BEGIN: | |||
| .Lctrmm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble ctrmm_kernel_L4_END | |||
| ble .Lctrmm_kernel_L4_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble ctrmm_kernel_L4_M1_BEGIN | |||
| ble .Lctrmm_kernel_L4_M1_BEGIN | |||
| ctrmm_kernel_L4_M2_20: | |||
| .Lctrmm_kernel_L4_M2_20: | |||
| INIT2x4 | |||
| @@ -1679,9 +1679,9 @@ ctrmm_kernel_L4_M2_20: | |||
| #endif | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble ctrmm_kernel_L4_M2_40 | |||
| ble .Lctrmm_kernel_L4_M2_40 | |||
| ctrmm_kernel_L4_M2_22: | |||
| .Lctrmm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| KERNEL2x4_SUB | |||
| @@ -1694,22 +1694,22 @@ ctrmm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L4_M2_22 | |||
| bgt .Lctrmm_kernel_L4_M2_22 | |||
| ctrmm_kernel_L4_M2_40: | |||
| .Lctrmm_kernel_L4_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L4_M2_100 | |||
| ble .Lctrmm_kernel_L4_M2_100 | |||
| ctrmm_kernel_L4_M2_42: | |||
| .Lctrmm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L4_M2_42 | |||
| bgt .Lctrmm_kernel_L4_M2_42 | |||
| ctrmm_kernel_L4_M2_100: | |||
| .Lctrmm_kernel_L4_M2_100: | |||
| SAVE2x4 | |||
| @@ -1729,15 +1729,15 @@ ctrmm_kernel_L4_M2_100: | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| ctrmm_kernel_L4_M2_END: | |||
| .Lctrmm_kernel_L4_M2_END: | |||
| ctrmm_kernel_L4_M1_BEGIN: | |||
| .Lctrmm_kernel_L4_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble ctrmm_kernel_L4_END | |||
| ble .Lctrmm_kernel_L4_END | |||
| ctrmm_kernel_L4_M1_20: | |||
| .Lctrmm_kernel_L4_M1_20: | |||
| INIT1x4 | |||
| @@ -1761,9 +1761,9 @@ ctrmm_kernel_L4_M1_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble ctrmm_kernel_L4_M1_40 | |||
| ble .Lctrmm_kernel_L4_M1_40 | |||
| ctrmm_kernel_L4_M1_22: | |||
| .Lctrmm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| @@ -1775,22 +1775,22 @@ ctrmm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L4_M1_22 | |||
| bgt .Lctrmm_kernel_L4_M1_22 | |||
| ctrmm_kernel_L4_M1_40: | |||
| .Lctrmm_kernel_L4_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L4_M1_100 | |||
| ble .Lctrmm_kernel_L4_M1_100 | |||
| ctrmm_kernel_L4_M1_42: | |||
| .Lctrmm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L4_M1_42 | |||
| bgt .Lctrmm_kernel_L4_M1_42 | |||
| ctrmm_kernel_L4_M1_100: | |||
| .Lctrmm_kernel_L4_M1_100: | |||
| SAVE1x4 | |||
| @@ -1810,7 +1810,7 @@ ctrmm_kernel_L4_M1_100: | |||
| add tempOffset, tempOffset, #1 | |||
| #endif | |||
| ctrmm_kernel_L4_END: | |||
| .Lctrmm_kernel_L4_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 | |||
| @@ -1820,19 +1820,19 @@ ctrmm_kernel_L4_END: | |||
| #endif | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt ctrmm_kernel_L4_BEGIN | |||
| bgt .Lctrmm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| .Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble ctrmm_kernel_L999 // error, N was less than 4? | |||
| ble .Lctrmm_kernel_L999 // error, N was less than 4? | |||
| tst counterJ , #2 | |||
| ble ctrmm_kernel_L1_BEGIN | |||
| ble .Lctrmm_kernel_L1_BEGIN | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| @@ -1843,14 +1843,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| #endif | |||
| mov pA, origPA // pA = A | |||
| ctrmm_kernel_L2_M8_BEGIN: | |||
| .Lctrmm_kernel_L2_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble ctrmm_kernel_L2_M4_BEGIN | |||
| ble .Lctrmm_kernel_L2_M4_BEGIN | |||
| ctrmm_kernel_L2_M8_20: | |||
| .Lctrmm_kernel_L2_M8_20: | |||
| INIT8x2 | |||
| @@ -1874,10 +1874,10 @@ ctrmm_kernel_L2_M8_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble ctrmm_kernel_L2_M8_40 | |||
| ble .Lctrmm_kernel_L2_M8_40 | |||
| .align 5 | |||
| ctrmm_kernel_L2_M8_22: | |||
| .Lctrmm_kernel_L2_M8_22: | |||
| KERNEL8x2_SUB | |||
| KERNEL8x2_SUB | |||
| KERNEL8x2_SUB | |||
| @@ -1889,22 +1889,22 @@ ctrmm_kernel_L2_M8_22: | |||
| KERNEL8x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L2_M8_22 | |||
| bgt .Lctrmm_kernel_L2_M8_22 | |||
| ctrmm_kernel_L2_M8_40: | |||
| .Lctrmm_kernel_L2_M8_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L2_M8_100 | |||
| ble .Lctrmm_kernel_L2_M8_100 | |||
| ctrmm_kernel_L2_M8_42: | |||
| .Lctrmm_kernel_L2_M8_42: | |||
| KERNEL8x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L2_M8_42 | |||
| bgt .Lctrmm_kernel_L2_M8_42 | |||
| ctrmm_kernel_L2_M8_100: | |||
| .Lctrmm_kernel_L2_M8_100: | |||
| SAVE8x2 | |||
| @@ -1924,21 +1924,21 @@ ctrmm_kernel_L2_M8_100: | |||
| add tempOffset, tempOffset, #8 | |||
| #endif | |||
| ctrmm_kernel_L2_M8_END: | |||
| .Lctrmm_kernel_L2_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bgt ctrmm_kernel_L2_M8_20 | |||
| bgt .Lctrmm_kernel_L2_M8_20 | |||
| ctrmm_kernel_L2_M4_BEGIN: | |||
| .Lctrmm_kernel_L2_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble ctrmm_kernel_L2_END | |||
| ble .Lctrmm_kernel_L2_END | |||
| tst counterI, #4 // counterI = counterI / 2 | |||
| ble ctrmm_kernel_L2_M2_BEGIN | |||
| ble .Lctrmm_kernel_L2_M2_BEGIN | |||
| ctrmm_kernel_L2_M4_20: | |||
| .Lctrmm_kernel_L2_M4_20: | |||
| INIT4x2 | |||
| @@ -1962,10 +1962,10 @@ ctrmm_kernel_L2_M4_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble ctrmm_kernel_L2_M4_40 | |||
| ble .Lctrmm_kernel_L2_M4_40 | |||
| .align 5 | |||
| ctrmm_kernel_L2_M4_22: | |||
| .Lctrmm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| @@ -1977,22 +1977,22 @@ ctrmm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L2_M4_22 | |||
| bgt .Lctrmm_kernel_L2_M4_22 | |||
| ctrmm_kernel_L2_M4_40: | |||
| .Lctrmm_kernel_L2_M4_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L2_M4_100 | |||
| ble .Lctrmm_kernel_L2_M4_100 | |||
| ctrmm_kernel_L2_M4_42: | |||
| .Lctrmm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L2_M4_42 | |||
| bgt .Lctrmm_kernel_L2_M4_42 | |||
| ctrmm_kernel_L2_M4_100: | |||
| .Lctrmm_kernel_L2_M4_100: | |||
| SAVE4x2 | |||
| @@ -2012,19 +2012,19 @@ ctrmm_kernel_L2_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| ctrmm_kernel_L2_M4_END: | |||
| .Lctrmm_kernel_L2_M4_END: | |||
| ctrmm_kernel_L2_M2_BEGIN: | |||
| .Lctrmm_kernel_L2_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble ctrmm_kernel_L2_END | |||
| ble .Lctrmm_kernel_L2_END | |||
| tst counterI, #2 // test bit 1 of counterI (origM & 2); counterI is not modified | |||
| ble ctrmm_kernel_L2_M1_BEGIN | |||
| ble .Lctrmm_kernel_L2_M1_BEGIN | |||
| ctrmm_kernel_L2_M2_20: | |||
| .Lctrmm_kernel_L2_M2_20: | |||
| INIT2x2 | |||
| @@ -2048,9 +2048,9 @@ ctrmm_kernel_L2_M2_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble ctrmm_kernel_L2_M2_40 | |||
| ble .Lctrmm_kernel_L2_M2_40 | |||
| ctrmm_kernel_L2_M2_22: | |||
| .Lctrmm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| @@ -2063,22 +2063,22 @@ ctrmm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L2_M2_22 | |||
| bgt .Lctrmm_kernel_L2_M2_22 | |||
| ctrmm_kernel_L2_M2_40: | |||
| .Lctrmm_kernel_L2_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L2_M2_100 | |||
| ble .Lctrmm_kernel_L2_M2_100 | |||
| ctrmm_kernel_L2_M2_42: | |||
| .Lctrmm_kernel_L2_M2_42: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L2_M2_42 | |||
| bgt .Lctrmm_kernel_L2_M2_42 | |||
| ctrmm_kernel_L2_M2_100: | |||
| .Lctrmm_kernel_L2_M2_100: | |||
| SAVE2x2 | |||
| @@ -2098,15 +2098,15 @@ ctrmm_kernel_L2_M2_100: | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| ctrmm_kernel_L2_M2_END: | |||
| .Lctrmm_kernel_L2_M2_END: | |||
| ctrmm_kernel_L2_M1_BEGIN: | |||
| .Lctrmm_kernel_L2_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble ctrmm_kernel_L2_END | |||
| ble .Lctrmm_kernel_L2_END | |||
| ctrmm_kernel_L2_M1_20: | |||
| .Lctrmm_kernel_L2_M1_20: | |||
| INIT1x2 | |||
| @@ -2130,9 +2130,9 @@ ctrmm_kernel_L2_M1_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble ctrmm_kernel_L2_M1_40 | |||
| ble .Lctrmm_kernel_L2_M1_40 | |||
| ctrmm_kernel_L2_M1_22: | |||
| .Lctrmm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| @@ -2144,22 +2144,22 @@ ctrmm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L2_M1_22 | |||
| bgt .Lctrmm_kernel_L2_M1_22 | |||
| ctrmm_kernel_L2_M1_40: | |||
| .Lctrmm_kernel_L2_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L2_M1_100 | |||
| ble .Lctrmm_kernel_L2_M1_100 | |||
| ctrmm_kernel_L2_M1_42: | |||
| .Lctrmm_kernel_L2_M1_42: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L2_M1_42 | |||
| bgt .Lctrmm_kernel_L2_M1_42 | |||
| ctrmm_kernel_L2_M1_100: | |||
| .Lctrmm_kernel_L2_M1_100: | |||
| SAVE1x2 | |||
| @@ -2179,7 +2179,7 @@ ctrmm_kernel_L2_M1_100: | |||
| add tempOffset, tempOffset, #1 | |||
| #endif | |||
| ctrmm_kernel_L2_END: | |||
| .Lctrmm_kernel_L2_END: | |||
| #if !defined(LEFT) | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| @@ -2187,11 +2187,11 @@ ctrmm_kernel_L2_END: | |||
| /******************************************************************************/ | |||
| ctrmm_kernel_L1_BEGIN: | |||
| .Lctrmm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble ctrmm_kernel_L999 // done | |||
| ble .Lctrmm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC , pC , LDC // Update pC to point to next | |||
| @@ -2201,14 +2201,14 @@ ctrmm_kernel_L1_BEGIN: | |||
| #endif | |||
| mov pA, origPA // pA = A | |||
| ctrmm_kernel_L1_M8_BEGIN: | |||
| .Lctrmm_kernel_L1_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble ctrmm_kernel_L1_M4_BEGIN | |||
| ble .Lctrmm_kernel_L1_M4_BEGIN | |||
| ctrmm_kernel_L1_M8_20: | |||
| .Lctrmm_kernel_L1_M8_20: | |||
| INIT8x1 | |||
| @@ -2232,10 +2232,10 @@ ctrmm_kernel_L1_M8_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble ctrmm_kernel_L1_M8_40 | |||
| ble .Lctrmm_kernel_L1_M8_40 | |||
| .align 5 | |||
| ctrmm_kernel_L1_M8_22: | |||
| .Lctrmm_kernel_L1_M8_22: | |||
| KERNEL8x1_SUB | |||
| KERNEL8x1_SUB | |||
| KERNEL8x1_SUB | |||
| @@ -2247,22 +2247,22 @@ ctrmm_kernel_L1_M8_22: | |||
| KERNEL8x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L1_M8_22 | |||
| bgt .Lctrmm_kernel_L1_M8_22 | |||
| ctrmm_kernel_L1_M8_40: | |||
| .Lctrmm_kernel_L1_M8_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L1_M8_100 | |||
| ble .Lctrmm_kernel_L1_M8_100 | |||
| ctrmm_kernel_L1_M8_42: | |||
| .Lctrmm_kernel_L1_M8_42: | |||
| KERNEL8x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L1_M8_42 | |||
| bgt .Lctrmm_kernel_L1_M8_42 | |||
| ctrmm_kernel_L1_M8_100: | |||
| .Lctrmm_kernel_L1_M8_100: | |||
| SAVE8x1 | |||
| @@ -2282,21 +2282,21 @@ ctrmm_kernel_L1_M8_100: | |||
| add tempOffset, tempOffset, #8 | |||
| #endif | |||
| ctrmm_kernel_L1_M8_END: | |||
| .Lctrmm_kernel_L1_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bgt ctrmm_kernel_L1_M8_20 | |||
| bgt .Lctrmm_kernel_L1_M8_20 | |||
| ctrmm_kernel_L1_M4_BEGIN: | |||
| .Lctrmm_kernel_L1_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble ctrmm_kernel_L1_END | |||
| ble .Lctrmm_kernel_L1_END | |||
| tst counterI, #4 // test bit 2 of counterI (origM & 4); counterI is not modified | |||
| ble ctrmm_kernel_L1_M2_BEGIN | |||
| ble .Lctrmm_kernel_L1_M2_BEGIN | |||
| ctrmm_kernel_L1_M4_20: | |||
| .Lctrmm_kernel_L1_M4_20: | |||
| INIT4x1 | |||
| @@ -2319,10 +2319,10 @@ ctrmm_kernel_L1_M4_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble ctrmm_kernel_L1_M4_40 | |||
| ble .Lctrmm_kernel_L1_M4_40 | |||
| .align 5 | |||
| ctrmm_kernel_L1_M4_22: | |||
| .Lctrmm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| @@ -2334,22 +2334,22 @@ ctrmm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L1_M4_22 | |||
| bgt .Lctrmm_kernel_L1_M4_22 | |||
| ctrmm_kernel_L1_M4_40: | |||
| .Lctrmm_kernel_L1_M4_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L1_M4_100 | |||
| ble .Lctrmm_kernel_L1_M4_100 | |||
| ctrmm_kernel_L1_M4_42: | |||
| .Lctrmm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L1_M4_42 | |||
| bgt .Lctrmm_kernel_L1_M4_42 | |||
| ctrmm_kernel_L1_M4_100: | |||
| .Lctrmm_kernel_L1_M4_100: | |||
| SAVE4x1 | |||
| @@ -2369,18 +2369,18 @@ ctrmm_kernel_L1_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| ctrmm_kernel_L1_M4_END: | |||
| .Lctrmm_kernel_L1_M4_END: | |||
| ctrmm_kernel_L1_M2_BEGIN: | |||
| .Lctrmm_kernel_L1_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble ctrmm_kernel_L1_END | |||
| ble .Lctrmm_kernel_L1_END | |||
| tst counterI, #2 // test bit 1 of counterI (origM & 2); counterI is not modified | |||
| ble ctrmm_kernel_L1_M1_BEGIN | |||
| ble .Lctrmm_kernel_L1_M1_BEGIN | |||
| ctrmm_kernel_L1_M2_20: | |||
| .Lctrmm_kernel_L1_M2_20: | |||
| INIT2x1 | |||
| @@ -2404,9 +2404,9 @@ ctrmm_kernel_L1_M2_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble ctrmm_kernel_L1_M2_40 | |||
| ble .Lctrmm_kernel_L1_M2_40 | |||
| ctrmm_kernel_L1_M2_22: | |||
| .Lctrmm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| @@ -2419,22 +2419,22 @@ ctrmm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L1_M2_22 | |||
| bgt .Lctrmm_kernel_L1_M2_22 | |||
| ctrmm_kernel_L1_M2_40: | |||
| .Lctrmm_kernel_L1_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L1_M2_100 | |||
| ble .Lctrmm_kernel_L1_M2_100 | |||
| ctrmm_kernel_L1_M2_42: | |||
| .Lctrmm_kernel_L1_M2_42: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L1_M2_42 | |||
| bgt .Lctrmm_kernel_L1_M2_42 | |||
| ctrmm_kernel_L1_M2_100: | |||
| .Lctrmm_kernel_L1_M2_100: | |||
| SAVE2x1 | |||
| @@ -2454,15 +2454,15 @@ ctrmm_kernel_L1_M2_100: | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| ctrmm_kernel_L1_M2_END: | |||
| .Lctrmm_kernel_L1_M2_END: | |||
| ctrmm_kernel_L1_M1_BEGIN: | |||
| .Lctrmm_kernel_L1_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble ctrmm_kernel_L1_END | |||
| ble .Lctrmm_kernel_L1_END | |||
| ctrmm_kernel_L1_M1_20: | |||
| .Lctrmm_kernel_L1_M1_20: | |||
| INIT1x1 | |||
| @@ -2486,9 +2486,9 @@ ctrmm_kernel_L1_M1_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble ctrmm_kernel_L1_M1_40 | |||
| ble .Lctrmm_kernel_L1_M1_40 | |||
| ctrmm_kernel_L1_M1_22: | |||
| .Lctrmm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| @@ -2500,30 +2500,30 @@ ctrmm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L1_M1_22 | |||
| bgt .Lctrmm_kernel_L1_M1_22 | |||
| ctrmm_kernel_L1_M1_40: | |||
| .Lctrmm_kernel_L1_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble ctrmm_kernel_L1_M1_100 | |||
| ble .Lctrmm_kernel_L1_M1_100 | |||
| ctrmm_kernel_L1_M1_42: | |||
| .Lctrmm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ctrmm_kernel_L1_M1_42 | |||
| bgt .Lctrmm_kernel_L1_M1_42 | |||
| ctrmm_kernel_L1_M1_100: | |||
| .Lctrmm_kernel_L1_M1_100: | |||
| SAVE1x1 | |||
| ctrmm_kernel_L1_END: | |||
| .Lctrmm_kernel_L1_END: | |||
| ctrmm_kernel_L999: | |||
| .Lctrmm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| @@ -122,53 +122,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| cmp N, xzr | |||
| ble axpy_kernel_L999 | |||
| ble .Ldaxpy_kernel_L999 | |||
| fcmp DA, #0.0 | |||
| beq axpy_kernel_L999 | |||
| beq .Ldaxpy_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne axpy_kernel_S_BEGIN | |||
| bne .Ldaxpy_kernel_S_BEGIN | |||
| cmp INC_Y, #1 | |||
| bne axpy_kernel_S_BEGIN | |||
| bne .Ldaxpy_kernel_S_BEGIN | |||
| axpy_kernel_F_BEGIN: | |||
| .Ldaxpy_kernel_F_BEGIN: | |||
| asr I, N, #5 | |||
| cmp I, xzr | |||
| beq axpy_kernel_F1 | |||
| beq .Ldaxpy_kernel_F1 | |||
| .align 5 | |||
| axpy_kernel_F32: | |||
| .Ldaxpy_kernel_F32: | |||
| KERNEL_F32 | |||
| subs I, I, #1 | |||
| bne axpy_kernel_F32 | |||
| bne .Ldaxpy_kernel_F32 | |||
| axpy_kernel_F1: | |||
| .Ldaxpy_kernel_F1: | |||
| ands I, N, #31 | |||
| ble axpy_kernel_L999 | |||
| ble .Ldaxpy_kernel_L999 | |||
| axpy_kernel_F10: | |||
| .Ldaxpy_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne axpy_kernel_F10 | |||
| bne .Ldaxpy_kernel_F10 | |||
| b axpy_kernel_L999 | |||
| b .Ldaxpy_kernel_L999 | |||
| axpy_kernel_S_BEGIN: | |||
| .Ldaxpy_kernel_S_BEGIN: | |||
| INIT_S | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble axpy_kernel_S1 | |||
| ble .Ldaxpy_kernel_S1 | |||
| axpy_kernel_S4: | |||
| .Ldaxpy_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -176,21 +176,21 @@ axpy_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne axpy_kernel_S4 | |||
| bne .Ldaxpy_kernel_S4 | |||
| axpy_kernel_S1: | |||
| .Ldaxpy_kernel_S1: | |||
| ands I, N, #3 | |||
| ble axpy_kernel_L999 | |||
| ble .Ldaxpy_kernel_L999 | |||
| axpy_kernel_S10: | |||
| .Ldaxpy_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne axpy_kernel_S10 | |||
| bne .Ldaxpy_kernel_S10 | |||
| axpy_kernel_L999: | |||
| .Ldaxpy_kernel_L999: | |||
| mov w0, wzr | |||
| ret | |||
| @@ -775,9 +775,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #2 // J = J / 4 | |||
| cmp counterJ, #0 | |||
| ble dgemm_kernel_L2_BEGIN | |||
| ble .Ldgemm_kernel_L2_BEGIN | |||
| dgemm_kernel_L4_BEGIN: | |||
| .Ldgemm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| @@ -791,20 +791,20 @@ dgemm_kernel_L4_BEGIN: | |||
| //------------------------------------------------------------------------------ | |||
| dgemm_kernel_L4_M8_BEGIN: | |||
| .Ldgemm_kernel_L4_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble dgemm_kernel_L4_M4_BEGIN | |||
| ble .Ldgemm_kernel_L4_M4_BEGIN | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_20: | |||
| .Ldgemm_kernel_L4_M8_20: | |||
| mov pB, origPB | |||
| asr counterL , origK, #2 // L = K / 4 | |||
| cmp counterL , #2 | |||
| blt dgemm_kernel_L4_M8_32 | |||
| blt .Ldgemm_kernel_L4_M8_32 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| @@ -812,60 +812,60 @@ dgemm_kernel_L4_M8_20: | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble dgemm_kernel_L4_M8_22a | |||
| ble .Ldgemm_kernel_L4_M8_22a | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_22: | |||
| .Ldgemm_kernel_L4_M8_22: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M8_22 | |||
| bgt .Ldgemm_kernel_L4_M8_22 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_22a: | |||
| .Ldgemm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b dgemm_kernel_L4_M8_44 | |||
| b .Ldgemm_kernel_L4_M8_44 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_32: | |||
| .Ldgemm_kernel_L4_M8_32: | |||
| tst counterL, #1 | |||
| ble dgemm_kernel_L4_M8_40 | |||
| ble .Ldgemm_kernel_L4_M8_40 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b dgemm_kernel_L4_M8_44 | |||
| b .Ldgemm_kernel_L4_M8_44 | |||
| dgemm_kernel_L4_M8_40: | |||
| .Ldgemm_kernel_L4_M8_40: | |||
| INIT8x4 | |||
| dgemm_kernel_L4_M8_44: | |||
| .Ldgemm_kernel_L4_M8_44: | |||
| ands counterL , origK, #3 | |||
| ble dgemm_kernel_L4_M8_100 | |||
| ble .Ldgemm_kernel_L4_M8_100 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_46: | |||
| .Ldgemm_kernel_L4_M8_46: | |||
| KERNEL8x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne dgemm_kernel_L4_M8_46 | |||
| bne .Ldgemm_kernel_L4_M8_46 | |||
| dgemm_kernel_L4_M8_100: | |||
| .Ldgemm_kernel_L4_M8_100: | |||
| lsl temp, origK, #5 | |||
| prfm PLDL1KEEP, [pA, temp] | |||
| prfm PLDL1KEEP, [ppA, temp] | |||
| @@ -873,31 +873,31 @@ dgemm_kernel_L4_M8_100: | |||
| SAVE8x4 | |||
| dgemm_kernel_L4_M8_END: | |||
| .Ldgemm_kernel_L4_M8_END: | |||
| lsl temp, origK, #5 // k * 4 * 8 | |||
| add pA, pA, temp | |||
| add ppA, ppA, temp | |||
| subs counterI, counterI, #1 | |||
| bne dgemm_kernel_L4_M8_20 | |||
| bne .Ldgemm_kernel_L4_M8_20 | |||
| dgemm_kernel_L4_M4_BEGIN: | |||
| .Ldgemm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble dgemm_kernel_L4_END | |||
| ble .Ldgemm_kernel_L4_END | |||
| tst counterI, #4 | |||
| ble dgemm_kernel_L4_M2_BEGIN | |||
| ble .Ldgemm_kernel_L4_M2_BEGIN | |||
| dgemm_kernel_L4_M4_20: | |||
| .Ldgemm_kernel_L4_M4_20: | |||
| INIT4x4 | |||
| mov pB, origPB | |||
| asr counterL, origK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble dgemm_kernel_L4_M4_40 | |||
| ble .Ldgemm_kernel_L4_M4_40 | |||
| dgemm_kernel_L4_M4_22: | |||
| .Ldgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_SUB | |||
| KERNEL4x4_SUB | |||
| @@ -910,47 +910,47 @@ dgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M4_22 | |||
| bgt .Ldgemm_kernel_L4_M4_22 | |||
| dgemm_kernel_L4_M4_40: | |||
| .Ldgemm_kernel_L4_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L4_M4_100 | |||
| ble .Ldgemm_kernel_L4_M4_100 | |||
| dgemm_kernel_L4_M4_42: | |||
| .Ldgemm_kernel_L4_M4_42: | |||
| KERNEL4x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M4_42 | |||
| bgt .Ldgemm_kernel_L4_M4_42 | |||
| dgemm_kernel_L4_M4_100: | |||
| .Ldgemm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| dgemm_kernel_L4_M4_END: | |||
| .Ldgemm_kernel_L4_M4_END: | |||
| dgemm_kernel_L4_M2_BEGIN: | |||
| .Ldgemm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dgemm_kernel_L4_END | |||
| ble .Ldgemm_kernel_L4_END | |||
| tst counterI, #2 // test bit 1 of counterI (origM & 2); counterI is not modified | |||
| ble dgemm_kernel_L4_M1_BEGIN | |||
| ble .Ldgemm_kernel_L4_M1_BEGIN | |||
| dgemm_kernel_L4_M2_20: | |||
| .Ldgemm_kernel_L4_M2_20: | |||
| INIT2x4 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L4_M2_40 | |||
| ble .Ldgemm_kernel_L4_M2_40 | |||
| dgemm_kernel_L4_M2_22: | |||
| .Ldgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| KERNEL2x4_SUB | |||
| @@ -963,43 +963,43 @@ dgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M2_22 | |||
| bgt .Ldgemm_kernel_L4_M2_22 | |||
| dgemm_kernel_L4_M2_40: | |||
| .Ldgemm_kernel_L4_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L4_M2_100 | |||
| ble .Ldgemm_kernel_L4_M2_100 | |||
| dgemm_kernel_L4_M2_42: | |||
| .Ldgemm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M2_42 | |||
| bgt .Ldgemm_kernel_L4_M2_42 | |||
| dgemm_kernel_L4_M2_100: | |||
| .Ldgemm_kernel_L4_M2_100: | |||
| SAVE2x4 | |||
| dgemm_kernel_L4_M2_END: | |||
| .Ldgemm_kernel_L4_M2_END: | |||
| dgemm_kernel_L4_M1_BEGIN: | |||
| .Ldgemm_kernel_L4_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dgemm_kernel_L4_END | |||
| ble .Ldgemm_kernel_L4_END | |||
| dgemm_kernel_L4_M1_20: | |||
| .Ldgemm_kernel_L4_M1_20: | |||
| INIT1x4 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L4_M1_40 | |||
| ble .Ldgemm_kernel_L4_M1_40 | |||
| dgemm_kernel_L4_M1_22: | |||
| .Ldgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| @@ -1011,45 +1011,45 @@ dgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M1_22 | |||
| bgt .Ldgemm_kernel_L4_M1_22 | |||
| dgemm_kernel_L4_M1_40: | |||
| .Ldgemm_kernel_L4_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L4_M1_100 | |||
| ble .Ldgemm_kernel_L4_M1_100 | |||
| dgemm_kernel_L4_M1_42: | |||
| .Ldgemm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M1_42 | |||
| bgt .Ldgemm_kernel_L4_M1_42 | |||
| dgemm_kernel_L4_M1_100: | |||
| .Ldgemm_kernel_L4_M1_100: | |||
| SAVE1x4 | |||
| dgemm_kernel_L4_END: | |||
| .Ldgemm_kernel_L4_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt dgemm_kernel_L4_BEGIN | |||
| bgt .Ldgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| dgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| .Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble dgemm_kernel_L999 // (origN & 3) == 0: no remainder columns, done | |||
| ble .Ldgemm_kernel_L999 // (origN & 3) == 0: no remainder columns, done | |||
| tst counterJ , #2 | |||
| ble dgemm_kernel_L1_BEGIN | |||
| ble .Ldgemm_kernel_L1_BEGIN | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| @@ -1059,24 +1059,24 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| dgemm_kernel_L2_M4_BEGIN: | |||
| .Ldgemm_kernel_L2_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI,#0 | |||
| ble dgemm_kernel_L2_M2_BEGIN | |||
| ble .Ldgemm_kernel_L2_M2_BEGIN | |||
| dgemm_kernel_L2_M4_20: | |||
| .Ldgemm_kernel_L2_M4_20: | |||
| INIT4x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dgemm_kernel_L2_M4_40 | |||
| ble .Ldgemm_kernel_L2_M4_40 | |||
| .align 5 | |||
| dgemm_kernel_L2_M4_22: | |||
| .Ldgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| @@ -1088,50 +1088,50 @@ dgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M4_22 | |||
| bgt .Ldgemm_kernel_L2_M4_22 | |||
| dgemm_kernel_L2_M4_40: | |||
| .Ldgemm_kernel_L2_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L2_M4_100 | |||
| ble .Ldgemm_kernel_L2_M4_100 | |||
| dgemm_kernel_L2_M4_42: | |||
| .Ldgemm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M4_42 | |||
| bgt .Ldgemm_kernel_L2_M4_42 | |||
| dgemm_kernel_L2_M4_100: | |||
| .Ldgemm_kernel_L2_M4_100: | |||
| SAVE4x2 | |||
| dgemm_kernel_L2_M4_END: | |||
| .Ldgemm_kernel_L2_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt dgemm_kernel_L2_M4_20 | |||
| bgt .Ldgemm_kernel_L2_M4_20 | |||
| dgemm_kernel_L2_M2_BEGIN: | |||
| .Ldgemm_kernel_L2_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dgemm_kernel_L2_END | |||
| ble .Ldgemm_kernel_L2_END | |||
| tst counterI, #2 // test bit 1 of counterI (origM & 2); counterI is not modified | |||
| ble dgemm_kernel_L2_M1_BEGIN | |||
| ble .Ldgemm_kernel_L2_M1_BEGIN | |||
| dgemm_kernel_L2_M2_20: | |||
| .Ldgemm_kernel_L2_M2_20: | |||
| INIT2x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dgemm_kernel_L2_M2_40 | |||
| ble .Ldgemm_kernel_L2_M2_40 | |||
| dgemm_kernel_L2_M2_22: | |||
| .Ldgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| @@ -1144,43 +1144,43 @@ dgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M2_22 | |||
| bgt .Ldgemm_kernel_L2_M2_22 | |||
| dgemm_kernel_L2_M2_40: | |||
| .Ldgemm_kernel_L2_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L2_M2_100 | |||
| ble .Ldgemm_kernel_L2_M2_100 | |||
| dgemm_kernel_L2_M2_42: | |||
| .Ldgemm_kernel_L2_M2_42: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M2_42 | |||
| bgt .Ldgemm_kernel_L2_M2_42 | |||
| dgemm_kernel_L2_M2_100: | |||
| .Ldgemm_kernel_L2_M2_100: | |||
| SAVE2x2 | |||
| dgemm_kernel_L2_M2_END: | |||
| .Ldgemm_kernel_L2_M2_END: | |||
| dgemm_kernel_L2_M1_BEGIN: | |||
| .Ldgemm_kernel_L2_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dgemm_kernel_L2_END | |||
| ble .Ldgemm_kernel_L2_END | |||
| dgemm_kernel_L2_M1_20: | |||
| .Ldgemm_kernel_L2_M1_20: | |||
| INIT1x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble dgemm_kernel_L2_M1_40 | |||
| ble .Ldgemm_kernel_L2_M1_40 | |||
| dgemm_kernel_L2_M1_22: | |||
| .Ldgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| @@ -1192,36 +1192,36 @@ dgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M1_22 | |||
| bgt .Ldgemm_kernel_L2_M1_22 | |||
| dgemm_kernel_L2_M1_40: | |||
| .Ldgemm_kernel_L2_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L2_M1_100 | |||
| ble .Ldgemm_kernel_L2_M1_100 | |||
| dgemm_kernel_L2_M1_42: | |||
| .Ldgemm_kernel_L2_M1_42: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M1_42 | |||
| bgt .Ldgemm_kernel_L2_M1_42 | |||
| dgemm_kernel_L2_M1_100: | |||
| .Ldgemm_kernel_L2_M1_100: | |||
| SAVE1x2 | |||
| dgemm_kernel_L2_END: | |||
| .Ldgemm_kernel_L2_END: | |||
| add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 | |||
| /******************************************************************************/ | |||
| dgemm_kernel_L1_BEGIN: | |||
| .Ldgemm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble dgemm_kernel_L999 // done | |||
| ble .Ldgemm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| @@ -1231,24 +1231,24 @@ dgemm_kernel_L1_BEGIN: | |||
| dgemm_kernel_L1_M4_BEGIN: | |||
| .Ldgemm_kernel_L1_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble dgemm_kernel_L1_M2_BEGIN | |||
| ble .Ldgemm_kernel_L1_M2_BEGIN | |||
| dgemm_kernel_L1_M4_20: | |||
| .Ldgemm_kernel_L1_M4_20: | |||
| INIT4x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L1_M4_40 | |||
| ble .Ldgemm_kernel_L1_M4_40 | |||
| .align 5 | |||
| dgemm_kernel_L1_M4_22: | |||
| .Ldgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| @@ -1260,50 +1260,50 @@ dgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M4_22 | |||
| bgt .Ldgemm_kernel_L1_M4_22 | |||
| dgemm_kernel_L1_M4_40: | |||
| .Ldgemm_kernel_L1_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L1_M4_100 | |||
| ble .Ldgemm_kernel_L1_M4_100 | |||
| dgemm_kernel_L1_M4_42: | |||
| .Ldgemm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M4_42 | |||
| bgt .Ldgemm_kernel_L1_M4_42 | |||
| dgemm_kernel_L1_M4_100: | |||
| .Ldgemm_kernel_L1_M4_100: | |||
| SAVE4x1 | |||
| dgemm_kernel_L1_M4_END: | |||
| .Ldgemm_kernel_L1_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt dgemm_kernel_L1_M4_20 | |||
| bgt .Ldgemm_kernel_L1_M4_20 | |||
| dgemm_kernel_L1_M2_BEGIN: | |||
| .Ldgemm_kernel_L1_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dgemm_kernel_L1_END | |||
| ble .Ldgemm_kernel_L1_END | |||
| tst counterI, #2 // test bit 1 of counterI (origM & 2); counterI is not modified | |||
| ble dgemm_kernel_L1_M1_BEGIN | |||
| ble .Ldgemm_kernel_L1_M1_BEGIN | |||
| dgemm_kernel_L1_M2_20: | |||
| .Ldgemm_kernel_L1_M2_20: | |||
| INIT2x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L1_M2_40 | |||
| ble .Ldgemm_kernel_L1_M2_40 | |||
| dgemm_kernel_L1_M2_22: | |||
| .Ldgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| @@ -1316,43 +1316,43 @@ dgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M2_22 | |||
| bgt .Ldgemm_kernel_L1_M2_22 | |||
| dgemm_kernel_L1_M2_40: | |||
| .Ldgemm_kernel_L1_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L1_M2_100 | |||
| ble .Ldgemm_kernel_L1_M2_100 | |||
| dgemm_kernel_L1_M2_42: | |||
| .Ldgemm_kernel_L1_M2_42: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M2_42 | |||
| bgt .Ldgemm_kernel_L1_M2_42 | |||
| dgemm_kernel_L1_M2_100: | |||
| .Ldgemm_kernel_L1_M2_100: | |||
| SAVE2x1 | |||
| dgemm_kernel_L1_M2_END: | |||
| .Ldgemm_kernel_L1_M2_END: | |||
| dgemm_kernel_L1_M1_BEGIN: | |||
| .Ldgemm_kernel_L1_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dgemm_kernel_L1_END | |||
| ble .Ldgemm_kernel_L1_END | |||
| dgemm_kernel_L1_M1_20: | |||
| .Ldgemm_kernel_L1_M1_20: | |||
| INIT1x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L1_M1_40 | |||
| ble .Ldgemm_kernel_L1_M1_40 | |||
| dgemm_kernel_L1_M1_22: | |||
| .Ldgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| @@ -1364,30 +1364,30 @@ dgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M1_22 | |||
| bgt .Ldgemm_kernel_L1_M1_22 | |||
| dgemm_kernel_L1_M1_40: | |||
| .Ldgemm_kernel_L1_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L1_M1_100 | |||
| ble .Ldgemm_kernel_L1_M1_100 | |||
| dgemm_kernel_L1_M1_42: | |||
| .Ldgemm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M1_42 | |||
| bgt .Ldgemm_kernel_L1_M1_42 | |||
| dgemm_kernel_L1_M1_100: | |||
| .Ldgemm_kernel_L1_M1_100: | |||
| SAVE1x1 | |||
| dgemm_kernel_L1_END: | |||
| .Ldgemm_kernel_L1_END: | |||
| dgemm_kernel_L999: | |||
| .Ldgemm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| @@ -938,98 +938,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #3 // J = J / 8 | |||
| cmp counterJ, #0 | |||
| ble dgemm_kernel_L4_BEGIN | |||
| ble .Ldgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| dgemm_kernel_L8_BEGIN: | |||
| .Ldgemm_kernel_L8_BEGIN: | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #3 | |||
| mov pA, origPA // pA = start of A array | |||
| dgemm_kernel_L8_M4_BEGIN: | |||
| .Ldgemm_kernel_L8_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble dgemm_kernel_L8_M2_BEGIN | |||
| ble .Ldgemm_kernel_L8_M2_BEGIN | |||
| dgemm_kernel_L8_M4_20: | |||
| .Ldgemm_kernel_L8_M4_20: | |||
| mov pB, origPB | |||
| asr counterL , origK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| blt dgemm_kernel_L8_M4_32 | |||
| blt .Ldgemm_kernel_L8_M4_32 | |||
| KERNEL4x8_I // do one in the K | |||
| KERNEL4x8_M2 // do another in the K | |||
| subs counterL, counterL, #2 | |||
| ble dgemm_kernel_L8_M4_22a | |||
| ble .Ldgemm_kernel_L8_M4_22a | |||
| .align 5 | |||
| dgemm_kernel_L8_M4_22: | |||
| .Ldgemm_kernel_L8_M4_22: | |||
| KERNEL4x8_M1 | |||
| KERNEL4x8_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L8_M4_22 | |||
| bgt .Ldgemm_kernel_L8_M4_22 | |||
| dgemm_kernel_L8_M4_22a: | |||
| .Ldgemm_kernel_L8_M4_22a: | |||
| KERNEL4x8_M1 | |||
| KERNEL4x8_E | |||
| b dgemm_kernel_L8_M4_44 | |||
| b .Ldgemm_kernel_L8_M4_44 | |||
| dgemm_kernel_L8_M4_32: | |||
| .Ldgemm_kernel_L8_M4_32: | |||
| tst counterL, #1 | |||
| ble dgemm_kernel_L8_M4_40 | |||
| ble .Ldgemm_kernel_L8_M4_40 | |||
| KERNEL4x8_I | |||
| KERNEL4x8_E | |||
| b dgemm_kernel_L8_M4_44 | |||
| b .Ldgemm_kernel_L8_M4_44 | |||
| dgemm_kernel_L8_M4_40: | |||
| .Ldgemm_kernel_L8_M4_40: | |||
| INIT4x8 | |||
| dgemm_kernel_L8_M4_44: | |||
| .Ldgemm_kernel_L8_M4_44: | |||
| ands counterL , origK, #1 | |||
| ble dgemm_kernel_L8_M4_100 | |||
| ble .Ldgemm_kernel_L8_M4_100 | |||
| dgemm_kernel_L8_M4_46: | |||
| .Ldgemm_kernel_L8_M4_46: | |||
| KERNEL4x8_SUB | |||
| dgemm_kernel_L8_M4_100: | |||
| .Ldgemm_kernel_L8_M4_100: | |||
| SAVE4x8 | |||
| dgemm_kernel_L8_M4_END: | |||
| .Ldgemm_kernel_L8_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bne dgemm_kernel_L8_M4_20 | |||
| bne .Ldgemm_kernel_L8_M4_20 | |||
| dgemm_kernel_L8_M2_BEGIN: | |||
| .Ldgemm_kernel_L8_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dgemm_kernel_L8_END | |||
| ble .Ldgemm_kernel_L8_END | |||
| tst counterI, #2 // test bit 1 of counterI (origM & 2); counterI is not modified | |||
| ble dgemm_kernel_L8_M1_BEGIN | |||
| ble .Ldgemm_kernel_L8_M1_BEGIN | |||
| dgemm_kernel_L8_M2_20: | |||
| .Ldgemm_kernel_L8_M2_20: | |||
| INIT2x8 | |||
| @@ -1037,9 +1037,9 @@ dgemm_kernel_L8_M2_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L8_M2_40 | |||
| ble .Ldgemm_kernel_L8_M2_40 | |||
| dgemm_kernel_L8_M2_22: | |||
| .Ldgemm_kernel_L8_M2_22: | |||
| KERNEL2x8_SUB | |||
| KERNEL2x8_SUB | |||
| @@ -1052,34 +1052,34 @@ dgemm_kernel_L8_M2_22: | |||
| KERNEL2x8_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L8_M2_22 | |||
| bgt .Ldgemm_kernel_L8_M2_22 | |||
| dgemm_kernel_L8_M2_40: | |||
| .Ldgemm_kernel_L8_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L8_M2_100 | |||
| ble .Ldgemm_kernel_L8_M2_100 | |||
| dgemm_kernel_L8_M2_42: | |||
| .Ldgemm_kernel_L8_M2_42: | |||
| KERNEL2x8_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L8_M2_42 | |||
| bgt .Ldgemm_kernel_L8_M2_42 | |||
| dgemm_kernel_L8_M2_100: | |||
| .Ldgemm_kernel_L8_M2_100: | |||
| SAVE2x8 | |||
| dgemm_kernel_L8_M2_END: | |||
| .Ldgemm_kernel_L8_M2_END: | |||
| dgemm_kernel_L8_M1_BEGIN: | |||
| .Ldgemm_kernel_L8_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dgemm_kernel_L8_END | |||
| ble .Ldgemm_kernel_L8_END | |||
| dgemm_kernel_L8_M1_20: | |||
| .Ldgemm_kernel_L8_M1_20: | |||
| INIT1x8 | |||
| @@ -1087,9 +1087,9 @@ dgemm_kernel_L8_M1_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L8_M1_40 | |||
| ble .Ldgemm_kernel_L8_M1_40 | |||
| dgemm_kernel_L8_M1_22: | |||
| .Ldgemm_kernel_L8_M1_22: | |||
| KERNEL1x8_SUB | |||
| KERNEL1x8_SUB | |||
| KERNEL1x8_SUB | |||
| @@ -1101,131 +1101,131 @@ dgemm_kernel_L8_M1_22: | |||
| KERNEL1x8_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L8_M1_22 | |||
| bgt .Ldgemm_kernel_L8_M1_22 | |||
| dgemm_kernel_L8_M1_40: | |||
| .Ldgemm_kernel_L8_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L8_M1_100 | |||
| ble .Ldgemm_kernel_L8_M1_100 | |||
| dgemm_kernel_L8_M1_42: | |||
| .Ldgemm_kernel_L8_M1_42: | |||
| KERNEL1x8_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L8_M1_42 | |||
| bgt .Ldgemm_kernel_L8_M1_42 | |||
| dgemm_kernel_L8_M1_100: | |||
| .Ldgemm_kernel_L8_M1_100: | |||
| SAVE1x8 | |||
| dgemm_kernel_L8_END: | |||
| .Ldgemm_kernel_L8_END: | |||
| lsl temp, origK, #6 | |||
| add origPB, origPB, temp // B = B + K * 8 * 8 | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt dgemm_kernel_L8_BEGIN | |||
| bgt .Ldgemm_kernel_L8_BEGIN | |||
| /******************************************************************************/ | |||
| dgemm_kernel_L4_BEGIN: | |||
| .Ldgemm_kernel_L4_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #7 | |||
| ble dgemm_kernel_L999 | |||
| ble .Ldgemm_kernel_L999 | |||
| tst counterJ , #4 | |||
| ble dgemm_kernel_L2_BEGIN | |||
| ble .Ldgemm_kernel_L2_BEGIN | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #2 | |||
| mov pA, origPA // pA = start of A array | |||
| dgemm_kernel_L4_M4_BEGIN: | |||
| .Ldgemm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble dgemm_kernel_L4_M2_BEGIN | |||
| ble .Ldgemm_kernel_L4_M2_BEGIN | |||
| dgemm_kernel_L4_M4_20: | |||
| .Ldgemm_kernel_L4_M4_20: | |||
| mov pB, origPB | |||
| asr counterL , origK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| blt dgemm_kernel_L4_M4_32 | |||
| blt .Ldgemm_kernel_L4_M4_32 | |||
| KERNEL4x4_I // do one in the K | |||
| KERNEL4x4_M2 // do another in the K | |||
| subs counterL, counterL, #2 | |||
| ble dgemm_kernel_L4_M4_22a | |||
| ble .Ldgemm_kernel_L4_M4_22a | |||
| .align 5 | |||
| dgemm_kernel_L4_M4_22: | |||
| .Ldgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M4_22 | |||
| bgt .Ldgemm_kernel_L4_M4_22 | |||
| dgemm_kernel_L4_M4_22a: | |||
| .Ldgemm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b dgemm_kernel_L4_M4_44 | |||
| b .Ldgemm_kernel_L4_M4_44 | |||
| dgemm_kernel_L4_M4_32: | |||
| .Ldgemm_kernel_L4_M4_32: | |||
| tst counterL, #1 | |||
| ble dgemm_kernel_L4_M4_40 | |||
| ble .Ldgemm_kernel_L4_M4_40 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_E | |||
| b dgemm_kernel_L4_M4_44 | |||
| b .Ldgemm_kernel_L4_M4_44 | |||
| dgemm_kernel_L4_M4_40: | |||
| .Ldgemm_kernel_L4_M4_40: | |||
| INIT4x4 | |||
| dgemm_kernel_L4_M4_44: | |||
| .Ldgemm_kernel_L4_M4_44: | |||
| ands counterL , origK, #1 | |||
| ble dgemm_kernel_L4_M4_100 | |||
| ble .Ldgemm_kernel_L4_M4_100 | |||
| dgemm_kernel_L4_M4_46: | |||
| .Ldgemm_kernel_L4_M4_46: | |||
| KERNEL4x4_SUB | |||
| dgemm_kernel_L4_M4_100: | |||
| .Ldgemm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| dgemm_kernel_L4_M4_END: | |||
| .Ldgemm_kernel_L4_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bne dgemm_kernel_L4_M4_20 | |||
| bne .Ldgemm_kernel_L4_M4_20 | |||
| dgemm_kernel_L4_M2_BEGIN: | |||
| .Ldgemm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dgemm_kernel_L4_END | |||
| ble .Ldgemm_kernel_L4_END | |||
| tst counterI, #2 // flags = M & 2 (bit 1 of M); counterI itself is not modified | |||
| ble dgemm_kernel_L4_M1_BEGIN | |||
| ble .Ldgemm_kernel_L4_M1_BEGIN | |||
| dgemm_kernel_L4_M2_20: | |||
| .Ldgemm_kernel_L4_M2_20: | |||
| INIT2x4 | |||
| @@ -1233,9 +1233,9 @@ dgemm_kernel_L4_M2_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L4_M2_40 | |||
| ble .Ldgemm_kernel_L4_M2_40 | |||
| dgemm_kernel_L4_M2_22: | |||
| .Ldgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| KERNEL2x4_SUB | |||
| @@ -1248,34 +1248,34 @@ dgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M2_22 | |||
| bgt .Ldgemm_kernel_L4_M2_22 | |||
| dgemm_kernel_L4_M2_40: | |||
| .Ldgemm_kernel_L4_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L4_M2_100 | |||
| ble .Ldgemm_kernel_L4_M2_100 | |||
| dgemm_kernel_L4_M2_42: | |||
| .Ldgemm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M2_42 | |||
| bgt .Ldgemm_kernel_L4_M2_42 | |||
| dgemm_kernel_L4_M2_100: | |||
| .Ldgemm_kernel_L4_M2_100: | |||
| SAVE2x4 | |||
| dgemm_kernel_L4_M2_END: | |||
| .Ldgemm_kernel_L4_M2_END: | |||
| dgemm_kernel_L4_M1_BEGIN: | |||
| .Ldgemm_kernel_L4_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dgemm_kernel_L4_END | |||
| ble .Ldgemm_kernel_L4_END | |||
| dgemm_kernel_L4_M1_20: | |||
| .Ldgemm_kernel_L4_M1_20: | |||
| INIT1x4 | |||
| @@ -1283,9 +1283,9 @@ dgemm_kernel_L4_M1_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L4_M1_40 | |||
| ble .Ldgemm_kernel_L4_M1_40 | |||
| dgemm_kernel_L4_M1_22: | |||
| .Ldgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| @@ -1297,40 +1297,40 @@ dgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M1_22 | |||
| bgt .Ldgemm_kernel_L4_M1_22 | |||
| dgemm_kernel_L4_M1_40: | |||
| .Ldgemm_kernel_L4_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L4_M1_100 | |||
| ble .Ldgemm_kernel_L4_M1_100 | |||
| dgemm_kernel_L4_M1_42: | |||
| .Ldgemm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M1_42 | |||
| bgt .Ldgemm_kernel_L4_M1_42 | |||
| dgemm_kernel_L4_M1_100: | |||
| .Ldgemm_kernel_L4_M1_100: | |||
| SAVE1x4 | |||
| dgemm_kernel_L4_END: | |||
| .Ldgemm_kernel_L4_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 | |||
| /******************************************************************************/ | |||
| dgemm_kernel_L2_BEGIN: // handle the remaining N & 3 columns in N direction | |||
| .Ldgemm_kernel_L2_BEGIN: // handle the remaining N & 3 columns in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble dgemm_kernel_L999 // (N & 3) == 0: no columns left | |||
| ble .Ldgemm_kernel_L999 // (N & 3) == 0: no columns left | |||
| tst counterJ , #2 | |||
| ble dgemm_kernel_L1_BEGIN | |||
| ble .Ldgemm_kernel_L1_BEGIN | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| @@ -1339,14 +1339,14 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov pA, origPA // pA = A | |||
| dgemm_kernel_L2_M4_BEGIN: | |||
| .Ldgemm_kernel_L2_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI,#0 | |||
| ble dgemm_kernel_L2_M2_BEGIN | |||
| ble .Ldgemm_kernel_L2_M2_BEGIN | |||
| dgemm_kernel_L2_M4_20: | |||
| .Ldgemm_kernel_L2_M4_20: | |||
| INIT4x2 | |||
| @@ -1354,10 +1354,10 @@ dgemm_kernel_L2_M4_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dgemm_kernel_L2_M4_40 | |||
| ble .Ldgemm_kernel_L2_M4_40 | |||
| .align 5 | |||
| dgemm_kernel_L2_M4_22: | |||
| .Ldgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| @@ -1369,41 +1369,41 @@ dgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M4_22 | |||
| bgt .Ldgemm_kernel_L2_M4_22 | |||
| dgemm_kernel_L2_M4_40: | |||
| .Ldgemm_kernel_L2_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L2_M4_100 | |||
| ble .Ldgemm_kernel_L2_M4_100 | |||
| dgemm_kernel_L2_M4_42: | |||
| .Ldgemm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M4_42 | |||
| bgt .Ldgemm_kernel_L2_M4_42 | |||
| dgemm_kernel_L2_M4_100: | |||
| .Ldgemm_kernel_L2_M4_100: | |||
| SAVE4x2 | |||
| dgemm_kernel_L2_M4_END: | |||
| .Ldgemm_kernel_L2_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt dgemm_kernel_L2_M4_20 | |||
| bgt .Ldgemm_kernel_L2_M4_20 | |||
| dgemm_kernel_L2_M2_BEGIN: | |||
| .Ldgemm_kernel_L2_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dgemm_kernel_L2_END | |||
| ble .Ldgemm_kernel_L2_END | |||
| tst counterI, #2 // flags = M & 2 (bit 1 of M); counterI itself is not modified | |||
| ble dgemm_kernel_L2_M1_BEGIN | |||
| ble .Ldgemm_kernel_L2_M1_BEGIN | |||
| dgemm_kernel_L2_M2_20: | |||
| .Ldgemm_kernel_L2_M2_20: | |||
| INIT2x2 | |||
| @@ -1411,9 +1411,9 @@ dgemm_kernel_L2_M2_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dgemm_kernel_L2_M2_40 | |||
| ble .Ldgemm_kernel_L2_M2_40 | |||
| dgemm_kernel_L2_M2_22: | |||
| .Ldgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| @@ -1426,34 +1426,34 @@ dgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M2_22 | |||
| bgt .Ldgemm_kernel_L2_M2_22 | |||
| dgemm_kernel_L2_M2_40: | |||
| .Ldgemm_kernel_L2_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L2_M2_100 | |||
| ble .Ldgemm_kernel_L2_M2_100 | |||
| dgemm_kernel_L2_M2_42: | |||
| .Ldgemm_kernel_L2_M2_42: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M2_42 | |||
| bgt .Ldgemm_kernel_L2_M2_42 | |||
| dgemm_kernel_L2_M2_100: | |||
| .Ldgemm_kernel_L2_M2_100: | |||
| SAVE2x2 | |||
| dgemm_kernel_L2_M2_END: | |||
| .Ldgemm_kernel_L2_M2_END: | |||
| dgemm_kernel_L2_M1_BEGIN: | |||
| .Ldgemm_kernel_L2_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dgemm_kernel_L2_END | |||
| ble .Ldgemm_kernel_L2_END | |||
| dgemm_kernel_L2_M1_20: | |||
| .Ldgemm_kernel_L2_M1_20: | |||
| INIT1x2 | |||
| @@ -1461,9 +1461,9 @@ dgemm_kernel_L2_M1_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble dgemm_kernel_L2_M1_40 | |||
| ble .Ldgemm_kernel_L2_M1_40 | |||
| dgemm_kernel_L2_M1_22: | |||
| .Ldgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| @@ -1475,35 +1475,35 @@ dgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M1_22 | |||
| bgt .Ldgemm_kernel_L2_M1_22 | |||
| dgemm_kernel_L2_M1_40: | |||
| .Ldgemm_kernel_L2_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L2_M1_100 | |||
| ble .Ldgemm_kernel_L2_M1_100 | |||
| dgemm_kernel_L2_M1_42: | |||
| .Ldgemm_kernel_L2_M1_42: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M1_42 | |||
| bgt .Ldgemm_kernel_L2_M1_42 | |||
| dgemm_kernel_L2_M1_100: | |||
| .Ldgemm_kernel_L2_M1_100: | |||
| SAVE1x2 | |||
| dgemm_kernel_L2_END: | |||
| .Ldgemm_kernel_L2_END: | |||
| add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 | |||
| /******************************************************************************/ | |||
| dgemm_kernel_L1_BEGIN: | |||
| .Ldgemm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble dgemm_kernel_L999 // done | |||
| ble .Ldgemm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| @@ -1511,24 +1511,24 @@ dgemm_kernel_L1_BEGIN: | |||
| mov pA, origPA // pA = A | |||
| dgemm_kernel_L1_M4_BEGIN: | |||
| .Ldgemm_kernel_L1_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble dgemm_kernel_L1_M2_BEGIN | |||
| ble .Ldgemm_kernel_L1_M2_BEGIN | |||
| dgemm_kernel_L1_M4_20: | |||
| .Ldgemm_kernel_L1_M4_20: | |||
| INIT4x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L1_M4_40 | |||
| ble .Ldgemm_kernel_L1_M4_40 | |||
| .align 5 | |||
| dgemm_kernel_L1_M4_22: | |||
| .Ldgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| @@ -1540,41 +1540,41 @@ dgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M4_22 | |||
| bgt .Ldgemm_kernel_L1_M4_22 | |||
| dgemm_kernel_L1_M4_40: | |||
| .Ldgemm_kernel_L1_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L1_M4_100 | |||
| ble .Ldgemm_kernel_L1_M4_100 | |||
| dgemm_kernel_L1_M4_42: | |||
| .Ldgemm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M4_42 | |||
| bgt .Ldgemm_kernel_L1_M4_42 | |||
| dgemm_kernel_L1_M4_100: | |||
| .Ldgemm_kernel_L1_M4_100: | |||
| SAVE4x1 | |||
| dgemm_kernel_L1_M4_END: | |||
| .Ldgemm_kernel_L1_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt dgemm_kernel_L1_M4_20 | |||
| bgt .Ldgemm_kernel_L1_M4_20 | |||
| dgemm_kernel_L1_M2_BEGIN: | |||
| .Ldgemm_kernel_L1_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dgemm_kernel_L1_END | |||
| ble .Ldgemm_kernel_L1_END | |||
| tst counterI, #2 // flags = M & 2 (bit 1 of M); counterI itself is not modified | |||
| ble dgemm_kernel_L1_M1_BEGIN | |||
| ble .Ldgemm_kernel_L1_M1_BEGIN | |||
| dgemm_kernel_L1_M2_20: | |||
| .Ldgemm_kernel_L1_M2_20: | |||
| INIT2x1 | |||
| @@ -1582,9 +1582,9 @@ dgemm_kernel_L1_M2_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L1_M2_40 | |||
| ble .Ldgemm_kernel_L1_M2_40 | |||
| dgemm_kernel_L1_M2_22: | |||
| .Ldgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| @@ -1597,34 +1597,34 @@ dgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M2_22 | |||
| bgt .Ldgemm_kernel_L1_M2_22 | |||
| dgemm_kernel_L1_M2_40: | |||
| .Ldgemm_kernel_L1_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L1_M2_100 | |||
| ble .Ldgemm_kernel_L1_M2_100 | |||
| dgemm_kernel_L1_M2_42: | |||
| .Ldgemm_kernel_L1_M2_42: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M2_42 | |||
| bgt .Ldgemm_kernel_L1_M2_42 | |||
| dgemm_kernel_L1_M2_100: | |||
| .Ldgemm_kernel_L1_M2_100: | |||
| SAVE2x1 | |||
| dgemm_kernel_L1_M2_END: | |||
| .Ldgemm_kernel_L1_M2_END: | |||
| dgemm_kernel_L1_M1_BEGIN: | |||
| .Ldgemm_kernel_L1_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dgemm_kernel_L1_END | |||
| ble .Ldgemm_kernel_L1_END | |||
| dgemm_kernel_L1_M1_20: | |||
| .Ldgemm_kernel_L1_M1_20: | |||
| INIT1x1 | |||
| @@ -1632,9 +1632,9 @@ dgemm_kernel_L1_M1_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L1_M1_40 | |||
| ble .Ldgemm_kernel_L1_M1_40 | |||
| dgemm_kernel_L1_M1_22: | |||
| .Ldgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| @@ -1646,30 +1646,30 @@ dgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M1_22 | |||
| bgt .Ldgemm_kernel_L1_M1_22 | |||
| dgemm_kernel_L1_M1_40: | |||
| .Ldgemm_kernel_L1_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L1_M1_100 | |||
| ble .Ldgemm_kernel_L1_M1_100 | |||
| dgemm_kernel_L1_M1_42: | |||
| .Ldgemm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M1_42 | |||
| bgt .Ldgemm_kernel_L1_M1_42 | |||
| dgemm_kernel_L1_M1_100: | |||
| .Ldgemm_kernel_L1_M1_100: | |||
| SAVE1x1 | |||
| dgemm_kernel_L1_END: | |||
| .Ldgemm_kernel_L1_END: | |||
| dgemm_kernel_L999: | |||
| .Ldgemm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| @@ -885,12 +885,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #2 // J = J / 4 | |||
| cmp counterJ, #0 | |||
| ble dgemm_kernel_L2_BEGIN | |||
| ble .Ldgemm_kernel_L2_BEGIN | |||
| /******************************************************************************/ | |||
| .align 5 | |||
| dgemm_kernel_L4_BEGIN: | |||
| .Ldgemm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| @@ -900,21 +900,21 @@ dgemm_kernel_L4_BEGIN: | |||
| mov pA, origPA // pA = start of A array | |||
| dgemm_kernel_L4_M8_BEGIN: | |||
| .Ldgemm_kernel_L4_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble dgemm_kernel_L4_M4_BEGIN | |||
| ble .Ldgemm_kernel_L4_M4_BEGIN | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_20: | |||
| .Ldgemm_kernel_L4_M8_20: | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // L = K / 8 | |||
| cmp counterL , #2 // at least 2 groups of 8 in K (K >= 16)? | |||
| blt dgemm_kernel_L4_M8_32 | |||
| blt .Ldgemm_kernel_L4_M8_32 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| @@ -926,10 +926,10 @@ dgemm_kernel_L4_M8_20: | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble dgemm_kernel_L4_M8_22a | |||
| ble .Ldgemm_kernel_L4_M8_22a | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_22: | |||
| .Ldgemm_kernel_L4_M8_22: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| @@ -941,10 +941,10 @@ dgemm_kernel_L4_M8_22: | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M8_22 | |||
| bgt .Ldgemm_kernel_L4_M8_22 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_22a: | |||
| .Ldgemm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| @@ -955,13 +955,13 @@ dgemm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b dgemm_kernel_L4_M8_44 | |||
| b .Ldgemm_kernel_L4_M8_44 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_32: | |||
| .Ldgemm_kernel_L4_M8_32: | |||
| tst counterL, #1 | |||
| ble dgemm_kernel_L4_M8_40 | |||
| ble .Ldgemm_kernel_L4_M8_40 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| @@ -972,46 +972,46 @@ dgemm_kernel_L4_M8_32: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b dgemm_kernel_L4_M8_44 | |||
| b .Ldgemm_kernel_L4_M8_44 | |||
| dgemm_kernel_L4_M8_40: | |||
| .Ldgemm_kernel_L4_M8_40: | |||
| INIT8x4 | |||
| dgemm_kernel_L4_M8_44: | |||
| .Ldgemm_kernel_L4_M8_44: | |||
| ands counterL , origK, #7 | |||
| ble dgemm_kernel_L4_M8_100 | |||
| ble .Ldgemm_kernel_L4_M8_100 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_46: | |||
| .Ldgemm_kernel_L4_M8_46: | |||
| KERNEL8x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne dgemm_kernel_L4_M8_46 | |||
| bne .Ldgemm_kernel_L4_M8_46 | |||
| dgemm_kernel_L4_M8_100: | |||
| .Ldgemm_kernel_L4_M8_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVE8x4 | |||
| dgemm_kernel_L4_M8_END: | |||
| .Ldgemm_kernel_L4_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bne dgemm_kernel_L4_M8_20 | |||
| bne .Ldgemm_kernel_L4_M8_20 | |||
| dgemm_kernel_L4_M4_BEGIN: | |||
| .Ldgemm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble dgemm_kernel_L4_END | |||
| ble .Ldgemm_kernel_L4_END | |||
| tst counterI, #4 | |||
| ble dgemm_kernel_L4_M2_BEGIN | |||
| ble .Ldgemm_kernel_L4_M2_BEGIN | |||
| dgemm_kernel_L4_M4_20: | |||
| .Ldgemm_kernel_L4_M4_20: | |||
| INIT4x4 | |||
| @@ -1019,10 +1019,10 @@ dgemm_kernel_L4_M4_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L4_M4_40 | |||
| ble .Ldgemm_kernel_L4_M4_40 | |||
| .align 5 | |||
| dgemm_kernel_L4_M4_22: | |||
| .Ldgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| @@ -1043,38 +1043,38 @@ dgemm_kernel_L4_M4_22: | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M4_22 | |||
| bgt .Ldgemm_kernel_L4_M4_22 | |||
| dgemm_kernel_L4_M4_40: | |||
| .Ldgemm_kernel_L4_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L4_M4_100 | |||
| ble .Ldgemm_kernel_L4_M4_100 | |||
| dgemm_kernel_L4_M4_42: | |||
| .Ldgemm_kernel_L4_M4_42: | |||
| KERNEL4x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M4_42 | |||
| bgt .Ldgemm_kernel_L4_M4_42 | |||
| dgemm_kernel_L4_M4_100: | |||
| .Ldgemm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| dgemm_kernel_L4_M4_END: | |||
| .Ldgemm_kernel_L4_M4_END: | |||
| dgemm_kernel_L4_M2_BEGIN: | |||
| .Ldgemm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dgemm_kernel_L4_END | |||
| ble .Ldgemm_kernel_L4_END | |||
| tst counterI, #2 // flags = M & 2 (bit 1 of M); counterI itself is not modified | |||
| ble dgemm_kernel_L4_M1_BEGIN | |||
| ble .Ldgemm_kernel_L4_M1_BEGIN | |||
| dgemm_kernel_L4_M2_20: | |||
| .Ldgemm_kernel_L4_M2_20: | |||
| INIT2x4 | |||
| @@ -1082,10 +1082,10 @@ dgemm_kernel_L4_M2_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L4_M2_40 | |||
| ble .Ldgemm_kernel_L4_M2_40 | |||
| .align 5 | |||
| dgemm_kernel_L4_M2_22: | |||
| .Ldgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| @@ -1104,37 +1104,37 @@ dgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M2_22 | |||
| bgt .Ldgemm_kernel_L4_M2_22 | |||
| dgemm_kernel_L4_M2_40: | |||
| .Ldgemm_kernel_L4_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L4_M2_100 | |||
| ble .Ldgemm_kernel_L4_M2_100 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| dgemm_kernel_L4_M2_42: | |||
| .Ldgemm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M2_42 | |||
| bgt .Ldgemm_kernel_L4_M2_42 | |||
| dgemm_kernel_L4_M2_100: | |||
| .Ldgemm_kernel_L4_M2_100: | |||
| SAVE2x4 | |||
| dgemm_kernel_L4_M2_END: | |||
| .Ldgemm_kernel_L4_M2_END: | |||
| dgemm_kernel_L4_M1_BEGIN: | |||
| .Ldgemm_kernel_L4_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dgemm_kernel_L4_END | |||
| ble .Ldgemm_kernel_L4_END | |||
| dgemm_kernel_L4_M1_20: | |||
| .Ldgemm_kernel_L4_M1_20: | |||
| INIT1x4 | |||
| @@ -1142,10 +1142,10 @@ dgemm_kernel_L4_M1_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L4_M1_40 | |||
| ble .Ldgemm_kernel_L4_M1_40 | |||
| .align 5 | |||
| dgemm_kernel_L4_M1_22: | |||
| .Ldgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| KERNEL1x4_SUB | |||
| @@ -1163,46 +1163,46 @@ dgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M1_22 | |||
| bgt .Ldgemm_kernel_L4_M1_22 | |||
| dgemm_kernel_L4_M1_40: | |||
| .Ldgemm_kernel_L4_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L4_M1_100 | |||
| ble .Ldgemm_kernel_L4_M1_100 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| dgemm_kernel_L4_M1_42: | |||
| .Ldgemm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M1_42 | |||
| bgt .Ldgemm_kernel_L4_M1_42 | |||
| dgemm_kernel_L4_M1_100: | |||
| .Ldgemm_kernel_L4_M1_100: | |||
| SAVE1x4 | |||
| dgemm_kernel_L4_END: | |||
| .Ldgemm_kernel_L4_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt dgemm_kernel_L4_BEGIN | |||
| bgt .Ldgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| dgemm_kernel_L2_BEGIN: // handle the remaining N & 3 columns in N direction | |||
| .Ldgemm_kernel_L2_BEGIN: // handle the remaining N & 3 columns in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble dgemm_kernel_L999 // (N & 3) == 0: no columns left | |||
| ble .Ldgemm_kernel_L999 // (N & 3) == 0: no columns left | |||
| tst counterJ , #2 | |||
| ble dgemm_kernel_L1_BEGIN | |||
| ble .Ldgemm_kernel_L1_BEGIN | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| @@ -1211,15 +1211,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov pA, origPA // pA = A | |||
| dgemm_kernel_L2_M8_BEGIN: | |||
| .Ldgemm_kernel_L2_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble dgemm_kernel_L2_M4_BEGIN | |||
| ble .Ldgemm_kernel_L2_M4_BEGIN | |||
| .align 5 | |||
| dgemm_kernel_L2_M8_20: | |||
| .Ldgemm_kernel_L2_M8_20: | |||
| INIT8x2 | |||
| @@ -1227,10 +1227,10 @@ dgemm_kernel_L2_M8_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dgemm_kernel_L2_M8_40 | |||
| ble .Ldgemm_kernel_L2_M8_40 | |||
| .align 5 | |||
| dgemm_kernel_L2_M8_22: | |||
| .Ldgemm_kernel_L2_M8_22: | |||
| KERNEL8x2_SUB | |||
| KERNEL8x2_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| @@ -1244,41 +1244,41 @@ dgemm_kernel_L2_M8_22: | |||
| KERNEL8x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M8_22 | |||
| bgt .Ldgemm_kernel_L2_M8_22 | |||
| dgemm_kernel_L2_M8_40: | |||
| .Ldgemm_kernel_L2_M8_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L2_M8_100 | |||
| ble .Ldgemm_kernel_L2_M8_100 | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| dgemm_kernel_L2_M8_42: | |||
| .Ldgemm_kernel_L2_M8_42: | |||
| KERNEL8x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M8_42 | |||
| bgt .Ldgemm_kernel_L2_M8_42 | |||
| dgemm_kernel_L2_M8_100: | |||
| .Ldgemm_kernel_L2_M8_100: | |||
| SAVE8x2 | |||
| dgemm_kernel_L2_M8_END: | |||
| .Ldgemm_kernel_L2_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bgt dgemm_kernel_L2_M8_20 | |||
| bgt .Ldgemm_kernel_L2_M8_20 | |||
| dgemm_kernel_L2_M4_BEGIN: | |||
| .Ldgemm_kernel_L2_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble dgemm_kernel_L2_END | |||
| ble .Ldgemm_kernel_L2_END | |||
| tst counterI, #4 // flags = M & 4 (bit 2 of M); counterI itself is not modified | |||
| ble dgemm_kernel_L2_M2_BEGIN | |||
| ble .Ldgemm_kernel_L2_M2_BEGIN | |||
| dgemm_kernel_L2_M4_20: | |||
| .Ldgemm_kernel_L2_M4_20: | |||
| INIT4x2 | |||
| @@ -1286,10 +1286,10 @@ dgemm_kernel_L2_M4_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dgemm_kernel_L2_M4_40 | |||
| ble .Ldgemm_kernel_L2_M4_40 | |||
| .align 5 | |||
| dgemm_kernel_L2_M4_22: | |||
| .Ldgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL4x2_SUB | |||
| @@ -1307,41 +1307,41 @@ dgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M4_22 | |||
| bgt .Ldgemm_kernel_L2_M4_22 | |||
| dgemm_kernel_L2_M4_40: | |||
| .Ldgemm_kernel_L2_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L2_M4_100 | |||
| ble .Ldgemm_kernel_L2_M4_100 | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| dgemm_kernel_L2_M4_42: | |||
| .Ldgemm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M4_42 | |||
| bgt .Ldgemm_kernel_L2_M4_42 | |||
| dgemm_kernel_L2_M4_100: | |||
| .Ldgemm_kernel_L2_M4_100: | |||
| SAVE4x2 | |||
| dgemm_kernel_L2_M4_END: | |||
| .Ldgemm_kernel_L2_M4_END: | |||
| dgemm_kernel_L2_M2_BEGIN: | |||
| .Ldgemm_kernel_L2_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dgemm_kernel_L2_END | |||
| ble .Ldgemm_kernel_L2_END | |||
| tst counterI, #2 // flags = M & 2 (bit 1 of M); counterI itself is not modified | |||
| ble dgemm_kernel_L2_M1_BEGIN | |||
| ble .Ldgemm_kernel_L2_M1_BEGIN | |||
| dgemm_kernel_L2_M2_20: | |||
| .Ldgemm_kernel_L2_M2_20: | |||
| INIT2x2 | |||
| @@ -1349,9 +1349,9 @@ dgemm_kernel_L2_M2_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dgemm_kernel_L2_M2_40 | |||
| ble .Ldgemm_kernel_L2_M2_40 | |||
| dgemm_kernel_L2_M2_22: | |||
| .Ldgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| @@ -1368,37 +1368,37 @@ dgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M2_22 | |||
| bgt .Ldgemm_kernel_L2_M2_22 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| dgemm_kernel_L2_M2_40: | |||
| .Ldgemm_kernel_L2_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L2_M2_100 | |||
| ble .Ldgemm_kernel_L2_M2_100 | |||
| dgemm_kernel_L2_M2_42: | |||
| .Ldgemm_kernel_L2_M2_42: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M2_42 | |||
| bgt .Ldgemm_kernel_L2_M2_42 | |||
| dgemm_kernel_L2_M2_100: | |||
| .Ldgemm_kernel_L2_M2_100: | |||
| SAVE2x2 | |||
| dgemm_kernel_L2_M2_END: | |||
| .Ldgemm_kernel_L2_M2_END: | |||
| dgemm_kernel_L2_M1_BEGIN: | |||
| .Ldgemm_kernel_L2_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dgemm_kernel_L2_END | |||
| ble .Ldgemm_kernel_L2_END | |||
| dgemm_kernel_L2_M1_20: | |||
| .Ldgemm_kernel_L2_M1_20: | |||
| INIT1x2 | |||
| @@ -1406,9 +1406,9 @@ dgemm_kernel_L2_M1_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble dgemm_kernel_L2_M1_40 | |||
| ble .Ldgemm_kernel_L2_M1_40 | |||
| dgemm_kernel_L2_M1_22: | |||
| .Ldgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| @@ -1424,62 +1424,62 @@ dgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M1_22 | |||
| bgt .Ldgemm_kernel_L2_M1_22 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| dgemm_kernel_L2_M1_40: | |||
| .Ldgemm_kernel_L2_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L2_M1_100 | |||
| ble .Ldgemm_kernel_L2_M1_100 | |||
| dgemm_kernel_L2_M1_42: | |||
| .Ldgemm_kernel_L2_M1_42: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M1_42 | |||
| bgt .Ldgemm_kernel_L2_M1_42 | |||
| dgemm_kernel_L2_M1_100: | |||
| .Ldgemm_kernel_L2_M1_100: | |||
| SAVE1x2 | |||
| dgemm_kernel_L2_END: | |||
| .Ldgemm_kernel_L2_END: | |||
| add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 | |||
| /******************************************************************************/ | |||
| dgemm_kernel_L1_BEGIN: | |||
| .Ldgemm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble dgemm_kernel_L999 // done | |||
| ble .Ldgemm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC , pC , LDC // Update pC to point to next | |||
| mov pA, origPA // pA = A | |||
| dgemm_kernel_L1_M8_BEGIN: | |||
| .Ldgemm_kernel_L1_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble dgemm_kernel_L1_M4_BEGIN | |||
| ble .Ldgemm_kernel_L1_M4_BEGIN | |||
| .align 5 | |||
| dgemm_kernel_L1_M8_20: | |||
| .Ldgemm_kernel_L1_M8_20: | |||
| INIT8x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L1_M8_40 | |||
| ble .Ldgemm_kernel_L1_M8_40 | |||
| .align 5 | |||
| dgemm_kernel_L1_M8_22: | |||
| .Ldgemm_kernel_L1_M8_22: | |||
| KERNEL8x1_SUB | |||
| KERNEL8x1_SUB | |||
| KERNEL8x1_SUB | |||
| @@ -1493,51 +1493,51 @@ dgemm_kernel_L1_M8_22: | |||
| KERNEL8x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M8_22 | |||
| bgt .Ldgemm_kernel_L1_M8_22 | |||
| dgemm_kernel_L1_M8_40: | |||
| .Ldgemm_kernel_L1_M8_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L1_M8_100 | |||
| ble .Ldgemm_kernel_L1_M8_100 | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| dgemm_kernel_L1_M8_42: | |||
| .Ldgemm_kernel_L1_M8_42: | |||
| KERNEL8x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M8_42 | |||
| bgt .Ldgemm_kernel_L1_M8_42 | |||
| dgemm_kernel_L1_M8_100: | |||
| .Ldgemm_kernel_L1_M8_100: | |||
| SAVE8x1 | |||
| dgemm_kernel_L1_M8_END: | |||
| .Ldgemm_kernel_L1_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bgt dgemm_kernel_L1_M8_20 | |||
| bgt .Ldgemm_kernel_L1_M8_20 | |||
| dgemm_kernel_L1_M4_BEGIN: | |||
| .Ldgemm_kernel_L1_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble dgemm_kernel_L1_END | |||
| ble .Ldgemm_kernel_L1_END | |||
| tst counterI, #4 // counterI = counterI / 2 | |||
| ble dgemm_kernel_L1_M2_BEGIN | |||
| ble .Ldgemm_kernel_L1_M2_BEGIN | |||
| dgemm_kernel_L1_M4_20: | |||
| .Ldgemm_kernel_L1_M4_20: | |||
| INIT4x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L1_M4_40 | |||
| ble .Ldgemm_kernel_L1_M4_40 | |||
| .align 5 | |||
| dgemm_kernel_L1_M4_22: | |||
| .Ldgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| KERNEL4x1_SUB | |||
| @@ -1555,39 +1555,39 @@ dgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M4_22 | |||
| bgt .Ldgemm_kernel_L1_M4_22 | |||
| dgemm_kernel_L1_M4_40: | |||
| .Ldgemm_kernel_L1_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L1_M4_100 | |||
| ble .Ldgemm_kernel_L1_M4_100 | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| dgemm_kernel_L1_M4_42: | |||
| .Ldgemm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M4_42 | |||
| bgt .Ldgemm_kernel_L1_M4_42 | |||
| dgemm_kernel_L1_M4_100: | |||
| .Ldgemm_kernel_L1_M4_100: | |||
| SAVE4x1 | |||
| dgemm_kernel_L1_M4_END: | |||
| .Ldgemm_kernel_L1_M4_END: | |||
| dgemm_kernel_L1_M2_BEGIN: | |||
| .Ldgemm_kernel_L1_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dgemm_kernel_L1_END | |||
| ble .Ldgemm_kernel_L1_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble dgemm_kernel_L1_M1_BEGIN | |||
| ble .Ldgemm_kernel_L1_M1_BEGIN | |||
| dgemm_kernel_L1_M2_20: | |||
| .Ldgemm_kernel_L1_M2_20: | |||
| INIT2x1 | |||
| @@ -1595,9 +1595,9 @@ dgemm_kernel_L1_M2_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L1_M2_40 | |||
| ble .Ldgemm_kernel_L1_M2_40 | |||
| dgemm_kernel_L1_M2_22: | |||
| .Ldgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| @@ -1614,36 +1614,36 @@ dgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M2_22 | |||
| bgt .Ldgemm_kernel_L1_M2_22 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| dgemm_kernel_L1_M2_40: | |||
| .Ldgemm_kernel_L1_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L1_M2_100 | |||
| ble .Ldgemm_kernel_L1_M2_100 | |||
| dgemm_kernel_L1_M2_42: | |||
| .Ldgemm_kernel_L1_M2_42: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M2_42 | |||
| bgt .Ldgemm_kernel_L1_M2_42 | |||
| dgemm_kernel_L1_M2_100: | |||
| .Ldgemm_kernel_L1_M2_100: | |||
| SAVE2x1 | |||
| dgemm_kernel_L1_M2_END: | |||
| .Ldgemm_kernel_L1_M2_END: | |||
| dgemm_kernel_L1_M1_BEGIN: | |||
| .Ldgemm_kernel_L1_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dgemm_kernel_L1_END | |||
| ble .Ldgemm_kernel_L1_END | |||
| dgemm_kernel_L1_M1_20: | |||
| .Ldgemm_kernel_L1_M1_20: | |||
| INIT1x1 | |||
| @@ -1651,10 +1651,10 @@ dgemm_kernel_L1_M1_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L1_M1_40 | |||
| ble .Ldgemm_kernel_L1_M1_40 | |||
| dgemm_kernel_L1_M1_22: | |||
| .Ldgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| @@ -1668,32 +1668,32 @@ dgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M1_22 | |||
| bgt .Ldgemm_kernel_L1_M1_22 | |||
| dgemm_kernel_L1_M1_40: | |||
| .Ldgemm_kernel_L1_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L1_M1_100 | |||
| ble .Ldgemm_kernel_L1_M1_100 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| dgemm_kernel_L1_M1_42: | |||
| .Ldgemm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M1_42 | |||
| bgt .Ldgemm_kernel_L1_M1_42 | |||
| dgemm_kernel_L1_M1_100: | |||
| .Ldgemm_kernel_L1_M1_100: | |||
| SAVE1x1 | |||
| dgemm_kernel_L1_END: | |||
| .Ldgemm_kernel_L1_END: | |||
| dgemm_kernel_L999: | |||
| .Ldgemm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| @@ -962,12 +962,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #2 // J = J / 4 | |||
| cmp counterJ, #0 | |||
| ble dgemm_kernel_L2_BEGIN | |||
| ble .Ldgemm_kernel_L2_BEGIN | |||
| /******************************************************************************/ | |||
| .align 5 | |||
| dgemm_kernel_L4_BEGIN: | |||
| .Ldgemm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| @@ -977,21 +977,21 @@ dgemm_kernel_L4_BEGIN: | |||
| mov pA, origPA // pA = start of A array | |||
| dgemm_kernel_L4_M8_BEGIN: | |||
| .Ldgemm_kernel_L4_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble dgemm_kernel_L4_M4_BEGIN | |||
| ble .Ldgemm_kernel_L4_M4_BEGIN | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_20: | |||
| .Ldgemm_kernel_L4_M8_20: | |||
| mov pB, origPB | |||
| asr counterL , origK, #7 // L = K / 128 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| blt dgemm_kernel_L4_M8_32 | |||
| blt .Ldgemm_kernel_L4_M8_32 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| @@ -1003,18 +1003,18 @@ dgemm_kernel_L4_M8_20: | |||
| KERNEL8x4_M1_M2_x1 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble dgemm_kernel_L4_M8_22a | |||
| ble .Ldgemm_kernel_L4_M8_22a | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_22: | |||
| .Ldgemm_kernel_L4_M8_22: | |||
| KERNEL8x4_M1_M2_x64 | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M8_22 | |||
| bgt .Ldgemm_kernel_L4_M8_22 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_22a: | |||
| .Ldgemm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1_M2_x32 | |||
| KERNEL8x4_M1_M2_x16 | |||
| @@ -1025,13 +1025,13 @@ dgemm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b dgemm_kernel_L4_M8_44 | |||
| b .Ldgemm_kernel_L4_M8_44 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_32: | |||
| .Ldgemm_kernel_L4_M8_32: | |||
| tst counterL, #1 | |||
| ble dgemm_kernel_L4_M8_40 | |||
| ble .Ldgemm_kernel_L4_M8_40 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| @@ -1043,26 +1043,26 @@ dgemm_kernel_L4_M8_32: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b dgemm_kernel_L4_M8_44 | |||
| b .Ldgemm_kernel_L4_M8_44 | |||
| dgemm_kernel_L4_M8_40: | |||
| .Ldgemm_kernel_L4_M8_40: | |||
| INIT8x4 | |||
| dgemm_kernel_L4_M8_44: | |||
| .Ldgemm_kernel_L4_M8_44: | |||
| ands counterL , origK, #127 | |||
| ble dgemm_kernel_L4_M8_100 | |||
| ble .Ldgemm_kernel_L4_M8_100 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_46: | |||
| .Ldgemm_kernel_L4_M8_46: | |||
| KERNEL8x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne dgemm_kernel_L4_M8_46 | |||
| bne .Ldgemm_kernel_L4_M8_46 | |||
| dgemm_kernel_L4_M8_100: | |||
| .Ldgemm_kernel_L4_M8_100: | |||
| prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] | |||
| prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE] | |||
| prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE] | |||
| @@ -1073,20 +1073,20 @@ dgemm_kernel_L4_M8_100: | |||
| SAVE8x4 | |||
| dgemm_kernel_L4_M8_END: | |||
| .Ldgemm_kernel_L4_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bne dgemm_kernel_L4_M8_20 | |||
| bne .Ldgemm_kernel_L4_M8_20 | |||
| dgemm_kernel_L4_M4_BEGIN: | |||
| .Ldgemm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble dgemm_kernel_L4_END | |||
| ble .Ldgemm_kernel_L4_END | |||
| tst counterI, #4 | |||
| ble dgemm_kernel_L4_M2_BEGIN | |||
| ble .Ldgemm_kernel_L4_M2_BEGIN | |||
| dgemm_kernel_L4_M4_20: | |||
| .Ldgemm_kernel_L4_M4_20: | |||
| INIT4x4 | |||
| @@ -1094,10 +1094,10 @@ dgemm_kernel_L4_M4_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L4_M4_40 | |||
| ble .Ldgemm_kernel_L4_M4_40 | |||
| .align 5 | |||
| dgemm_kernel_L4_M4_22: | |||
| .Ldgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_SUB | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE] | |||
| @@ -1118,38 +1118,38 @@ dgemm_kernel_L4_M4_22: | |||
| prfm PLDL1KEEP, [pA, A_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M4_22 | |||
| bgt .Ldgemm_kernel_L4_M4_22 | |||
| dgemm_kernel_L4_M4_40: | |||
| .Ldgemm_kernel_L4_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L4_M4_100 | |||
| ble .Ldgemm_kernel_L4_M4_100 | |||
| dgemm_kernel_L4_M4_42: | |||
| .Ldgemm_kernel_L4_M4_42: | |||
| KERNEL4x4_SUB | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, A_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M4_42 | |||
| bgt .Ldgemm_kernel_L4_M4_42 | |||
| dgemm_kernel_L4_M4_100: | |||
| .Ldgemm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| dgemm_kernel_L4_M4_END: | |||
| .Ldgemm_kernel_L4_M4_END: | |||
| dgemm_kernel_L4_M2_BEGIN: | |||
| .Ldgemm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dgemm_kernel_L4_END | |||
| ble .Ldgemm_kernel_L4_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble dgemm_kernel_L4_M1_BEGIN | |||
| ble .Ldgemm_kernel_L4_M1_BEGIN | |||
| dgemm_kernel_L4_M2_20: | |||
| .Ldgemm_kernel_L4_M2_20: | |||
| INIT2x4 | |||
| @@ -1157,10 +1157,10 @@ dgemm_kernel_L4_M2_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L4_M2_40 | |||
| ble .Ldgemm_kernel_L4_M2_40 | |||
| .align 5 | |||
| dgemm_kernel_L4_M2_22: | |||
| .Ldgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE] | |||
| @@ -1179,37 +1179,37 @@ dgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M2_22 | |||
| bgt .Ldgemm_kernel_L4_M2_22 | |||
| dgemm_kernel_L4_M2_40: | |||
| .Ldgemm_kernel_L4_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L4_M2_100 | |||
| ble .Ldgemm_kernel_L4_M2_100 | |||
| prfm PLDL1KEEP, [pA, A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] | |||
| dgemm_kernel_L4_M2_42: | |||
| .Ldgemm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M2_42 | |||
| bgt .Ldgemm_kernel_L4_M2_42 | |||
| dgemm_kernel_L4_M2_100: | |||
| .Ldgemm_kernel_L4_M2_100: | |||
| SAVE2x4 | |||
| dgemm_kernel_L4_M2_END: | |||
| .Ldgemm_kernel_L4_M2_END: | |||
| dgemm_kernel_L4_M1_BEGIN: | |||
| .Ldgemm_kernel_L4_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dgemm_kernel_L4_END | |||
| ble .Ldgemm_kernel_L4_END | |||
| dgemm_kernel_L4_M1_20: | |||
| .Ldgemm_kernel_L4_M1_20: | |||
| INIT1x4 | |||
| @@ -1217,10 +1217,10 @@ dgemm_kernel_L4_M1_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L4_M1_40 | |||
| ble .Ldgemm_kernel_L4_M1_40 | |||
| .align 5 | |||
| dgemm_kernel_L4_M1_22: | |||
| .Ldgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE] | |||
| KERNEL1x4_SUB | |||
| @@ -1238,46 +1238,46 @@ dgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M1_22 | |||
| bgt .Ldgemm_kernel_L4_M1_22 | |||
| dgemm_kernel_L4_M1_40: | |||
| .Ldgemm_kernel_L4_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L4_M1_100 | |||
| ble .Ldgemm_kernel_L4_M1_100 | |||
| prfm PLDL1KEEP, [pA, A_PRE_SIZE] | |||
| dgemm_kernel_L4_M1_42: | |||
| .Ldgemm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M1_42 | |||
| bgt .Ldgemm_kernel_L4_M1_42 | |||
| dgemm_kernel_L4_M1_100: | |||
| .Ldgemm_kernel_L4_M1_100: | |||
| SAVE1x4 | |||
| dgemm_kernel_L4_END: | |||
| .Ldgemm_kernel_L4_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt dgemm_kernel_L4_BEGIN | |||
| bgt .Ldgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| dgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| .Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble dgemm_kernel_L999 // error, N was less than 4? | |||
| ble .Ldgemm_kernel_L999 // error, N was less than 4? | |||
| tst counterJ , #2 | |||
| ble dgemm_kernel_L1_BEGIN | |||
| ble .Ldgemm_kernel_L1_BEGIN | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| @@ -1286,15 +1286,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov pA, origPA // pA = A | |||
| dgemm_kernel_L2_M8_BEGIN: | |||
| .Ldgemm_kernel_L2_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble dgemm_kernel_L2_M4_BEGIN | |||
| ble .Ldgemm_kernel_L2_M4_BEGIN | |||
| .align 5 | |||
| dgemm_kernel_L2_M8_20: | |||
| .Ldgemm_kernel_L2_M8_20: | |||
| INIT8x2 | |||
| @@ -1302,10 +1302,10 @@ dgemm_kernel_L2_M8_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dgemm_kernel_L2_M8_40 | |||
| ble .Ldgemm_kernel_L2_M8_40 | |||
| .align 5 | |||
| dgemm_kernel_L2_M8_22: | |||
| .Ldgemm_kernel_L2_M8_22: | |||
| KERNEL8x2_SUB | |||
| KERNEL8x2_SUB | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE] | |||
| @@ -1319,41 +1319,41 @@ dgemm_kernel_L2_M8_22: | |||
| KERNEL8x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M8_22 | |||
| bgt .Ldgemm_kernel_L2_M8_22 | |||
| dgemm_kernel_L2_M8_40: | |||
| .Ldgemm_kernel_L2_M8_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L2_M8_100 | |||
| ble .Ldgemm_kernel_L2_M8_100 | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] | |||
| dgemm_kernel_L2_M8_42: | |||
| .Ldgemm_kernel_L2_M8_42: | |||
| KERNEL8x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M8_42 | |||
| bgt .Ldgemm_kernel_L2_M8_42 | |||
| dgemm_kernel_L2_M8_100: | |||
| .Ldgemm_kernel_L2_M8_100: | |||
| SAVE8x2 | |||
| dgemm_kernel_L2_M8_END: | |||
| .Ldgemm_kernel_L2_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bgt dgemm_kernel_L2_M8_20 | |||
| bgt .Ldgemm_kernel_L2_M8_20 | |||
| dgemm_kernel_L2_M4_BEGIN: | |||
| .Ldgemm_kernel_L2_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble dgemm_kernel_L2_END | |||
| ble .Ldgemm_kernel_L2_END | |||
| tst counterI, #4 // counterI = counterI / 2 | |||
| ble dgemm_kernel_L2_M2_BEGIN | |||
| ble .Ldgemm_kernel_L2_M2_BEGIN | |||
| dgemm_kernel_L2_M4_20: | |||
| .Ldgemm_kernel_L2_M4_20: | |||
| INIT4x2 | |||
| @@ -1361,10 +1361,10 @@ dgemm_kernel_L2_M4_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dgemm_kernel_L2_M4_40 | |||
| ble .Ldgemm_kernel_L2_M4_40 | |||
| .align 5 | |||
| dgemm_kernel_L2_M4_22: | |||
| .Ldgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| prfm PLDL1KEEP, [pA, A_PRE_SIZE] | |||
| KERNEL4x2_SUB | |||
| @@ -1382,41 +1382,41 @@ dgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M4_22 | |||
| bgt .Ldgemm_kernel_L2_M4_22 | |||
| dgemm_kernel_L2_M4_40: | |||
| .Ldgemm_kernel_L2_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L2_M4_100 | |||
| ble .Ldgemm_kernel_L2_M4_100 | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] | |||
| dgemm_kernel_L2_M4_42: | |||
| .Ldgemm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| prfm PLDL1KEEP, [pA, A_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M4_42 | |||
| bgt .Ldgemm_kernel_L2_M4_42 | |||
| dgemm_kernel_L2_M4_100: | |||
| .Ldgemm_kernel_L2_M4_100: | |||
| SAVE4x2 | |||
| dgemm_kernel_L2_M4_END: | |||
| .Ldgemm_kernel_L2_M4_END: | |||
| dgemm_kernel_L2_M2_BEGIN: | |||
| .Ldgemm_kernel_L2_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dgemm_kernel_L2_END | |||
| ble .Ldgemm_kernel_L2_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble dgemm_kernel_L2_M1_BEGIN | |||
| ble .Ldgemm_kernel_L2_M1_BEGIN | |||
| dgemm_kernel_L2_M2_20: | |||
| .Ldgemm_kernel_L2_M2_20: | |||
| INIT2x2 | |||
| @@ -1424,9 +1424,9 @@ dgemm_kernel_L2_M2_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dgemm_kernel_L2_M2_40 | |||
| ble .Ldgemm_kernel_L2_M2_40 | |||
| dgemm_kernel_L2_M2_22: | |||
| .Ldgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE] | |||
| @@ -1443,37 +1443,37 @@ dgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M2_22 | |||
| bgt .Ldgemm_kernel_L2_M2_22 | |||
| prfm PLDL1KEEP, [pA, A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] | |||
| dgemm_kernel_L2_M2_40: | |||
| .Ldgemm_kernel_L2_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L2_M2_100 | |||
| ble .Ldgemm_kernel_L2_M2_100 | |||
| dgemm_kernel_L2_M2_42: | |||
| .Ldgemm_kernel_L2_M2_42: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M2_42 | |||
| bgt .Ldgemm_kernel_L2_M2_42 | |||
| dgemm_kernel_L2_M2_100: | |||
| .Ldgemm_kernel_L2_M2_100: | |||
| SAVE2x2 | |||
| dgemm_kernel_L2_M2_END: | |||
| .Ldgemm_kernel_L2_M2_END: | |||
| dgemm_kernel_L2_M1_BEGIN: | |||
| .Ldgemm_kernel_L2_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dgemm_kernel_L2_END | |||
| ble .Ldgemm_kernel_L2_END | |||
| dgemm_kernel_L2_M1_20: | |||
| .Ldgemm_kernel_L2_M1_20: | |||
| INIT1x2 | |||
| @@ -1481,9 +1481,9 @@ dgemm_kernel_L2_M1_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble dgemm_kernel_L2_M1_40 | |||
| ble .Ldgemm_kernel_L2_M1_40 | |||
| dgemm_kernel_L2_M1_22: | |||
| .Ldgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE] | |||
| @@ -1499,62 +1499,62 @@ dgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M1_22 | |||
| bgt .Ldgemm_kernel_L2_M1_22 | |||
| prfm PLDL1KEEP, [pA, A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] | |||
| dgemm_kernel_L2_M1_40: | |||
| .Ldgemm_kernel_L2_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L2_M1_100 | |||
| ble .Ldgemm_kernel_L2_M1_100 | |||
| dgemm_kernel_L2_M1_42: | |||
| .Ldgemm_kernel_L2_M1_42: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L2_M1_42 | |||
| bgt .Ldgemm_kernel_L2_M1_42 | |||
| dgemm_kernel_L2_M1_100: | |||
| .Ldgemm_kernel_L2_M1_100: | |||
| SAVE1x2 | |||
| dgemm_kernel_L2_END: | |||
| .Ldgemm_kernel_L2_END: | |||
| add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 | |||
| /******************************************************************************/ | |||
| dgemm_kernel_L1_BEGIN: | |||
| .Ldgemm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble dgemm_kernel_L999 // done | |||
| ble .Ldgemm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC , pC , LDC // Update pC to point to next | |||
| mov pA, origPA // pA = A | |||
| dgemm_kernel_L1_M8_BEGIN: | |||
| .Ldgemm_kernel_L1_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble dgemm_kernel_L1_M4_BEGIN | |||
| ble .Ldgemm_kernel_L1_M4_BEGIN | |||
| .align 5 | |||
| dgemm_kernel_L1_M8_20: | |||
| .Ldgemm_kernel_L1_M8_20: | |||
| INIT8x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L1_M8_40 | |||
| ble .Ldgemm_kernel_L1_M8_40 | |||
| .align 5 | |||
| dgemm_kernel_L1_M8_22: | |||
| .Ldgemm_kernel_L1_M8_22: | |||
| KERNEL8x1_SUB | |||
| KERNEL8x1_SUB | |||
| KERNEL8x1_SUB | |||
| @@ -1568,51 +1568,51 @@ dgemm_kernel_L1_M8_22: | |||
| KERNEL8x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M8_22 | |||
| bgt .Ldgemm_kernel_L1_M8_22 | |||
| dgemm_kernel_L1_M8_40: | |||
| .Ldgemm_kernel_L1_M8_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L1_M8_100 | |||
| ble .Ldgemm_kernel_L1_M8_100 | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE] | |||
| dgemm_kernel_L1_M8_42: | |||
| .Ldgemm_kernel_L1_M8_42: | |||
| KERNEL8x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M8_42 | |||
| bgt .Ldgemm_kernel_L1_M8_42 | |||
| dgemm_kernel_L1_M8_100: | |||
| .Ldgemm_kernel_L1_M8_100: | |||
| SAVE8x1 | |||
| dgemm_kernel_L1_M8_END: | |||
| .Ldgemm_kernel_L1_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bgt dgemm_kernel_L1_M8_20 | |||
| bgt .Ldgemm_kernel_L1_M8_20 | |||
| dgemm_kernel_L1_M4_BEGIN: | |||
| .Ldgemm_kernel_L1_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble dgemm_kernel_L1_END | |||
| ble .Ldgemm_kernel_L1_END | |||
| tst counterI, #4 // counterI = counterI / 2 | |||
| ble dgemm_kernel_L1_M2_BEGIN | |||
| ble .Ldgemm_kernel_L1_M2_BEGIN | |||
| dgemm_kernel_L1_M4_20: | |||
| .Ldgemm_kernel_L1_M4_20: | |||
| INIT4x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L1_M4_40 | |||
| ble .Ldgemm_kernel_L1_M4_40 | |||
| .align 5 | |||
| dgemm_kernel_L1_M4_22: | |||
| .Ldgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| prfm PLDL1KEEP, [pA, A_PRE_SIZE] | |||
| KERNEL4x1_SUB | |||
| @@ -1630,39 +1630,39 @@ dgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M4_22 | |||
| bgt .Ldgemm_kernel_L1_M4_22 | |||
| dgemm_kernel_L1_M4_40: | |||
| .Ldgemm_kernel_L1_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L1_M4_100 | |||
| ble .Ldgemm_kernel_L1_M4_100 | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE] | |||
| dgemm_kernel_L1_M4_42: | |||
| .Ldgemm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| prfm PLDL1KEEP, [pA, A_PRE_SIZE] | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M4_42 | |||
| bgt .Ldgemm_kernel_L1_M4_42 | |||
| dgemm_kernel_L1_M4_100: | |||
| .Ldgemm_kernel_L1_M4_100: | |||
| SAVE4x1 | |||
| dgemm_kernel_L1_M4_END: | |||
| .Ldgemm_kernel_L1_M4_END: | |||
| dgemm_kernel_L1_M2_BEGIN: | |||
| .Ldgemm_kernel_L1_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dgemm_kernel_L1_END | |||
| ble .Ldgemm_kernel_L1_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble dgemm_kernel_L1_M1_BEGIN | |||
| ble .Ldgemm_kernel_L1_M1_BEGIN | |||
| dgemm_kernel_L1_M2_20: | |||
| .Ldgemm_kernel_L1_M2_20: | |||
| INIT2x1 | |||
| @@ -1670,9 +1670,9 @@ dgemm_kernel_L1_M2_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L1_M2_40 | |||
| ble .Ldgemm_kernel_L1_M2_40 | |||
| dgemm_kernel_L1_M2_22: | |||
| .Ldgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| @@ -1689,36 +1689,36 @@ dgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M2_22 | |||
| bgt .Ldgemm_kernel_L1_M2_22 | |||
| prfm PLDL1KEEP, [pA, A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE] | |||
| dgemm_kernel_L1_M2_40: | |||
| .Ldgemm_kernel_L1_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L1_M2_100 | |||
| ble .Ldgemm_kernel_L1_M2_100 | |||
| dgemm_kernel_L1_M2_42: | |||
| .Ldgemm_kernel_L1_M2_42: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M2_42 | |||
| bgt .Ldgemm_kernel_L1_M2_42 | |||
| dgemm_kernel_L1_M2_100: | |||
| .Ldgemm_kernel_L1_M2_100: | |||
| SAVE2x1 | |||
| dgemm_kernel_L1_M2_END: | |||
| .Ldgemm_kernel_L1_M2_END: | |||
| dgemm_kernel_L1_M1_BEGIN: | |||
| .Ldgemm_kernel_L1_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dgemm_kernel_L1_END | |||
| ble .Ldgemm_kernel_L1_END | |||
| dgemm_kernel_L1_M1_20: | |||
| .Ldgemm_kernel_L1_M1_20: | |||
| INIT1x1 | |||
| @@ -1726,10 +1726,10 @@ dgemm_kernel_L1_M1_20: | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dgemm_kernel_L1_M1_40 | |||
| ble .Ldgemm_kernel_L1_M1_40 | |||
| dgemm_kernel_L1_M1_22: | |||
| .Ldgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| prfm PLDL1KEEP, [pA, A_PRE_SIZE] | |||
| @@ -1743,32 +1743,32 @@ dgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M1_22 | |||
| bgt .Ldgemm_kernel_L1_M1_22 | |||
| dgemm_kernel_L1_M1_40: | |||
| .Ldgemm_kernel_L1_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble dgemm_kernel_L1_M1_100 | |||
| ble .Ldgemm_kernel_L1_M1_100 | |||
| prfm PLDL1KEEP, [pA, A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [pB, B_PRE_SIZE] | |||
| dgemm_kernel_L1_M1_42: | |||
| .Ldgemm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L1_M1_42 | |||
| bgt .Ldgemm_kernel_L1_M1_42 | |||
| dgemm_kernel_L1_M1_100: | |||
| .Ldgemm_kernel_L1_M1_100: | |||
| SAVE1x1 | |||
| dgemm_kernel_L1_END: | |||
| .Ldgemm_kernel_L1_END: | |||
| dgemm_kernel_L999: | |||
| .Ldgemm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| @@ -192,14 +192,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lsl LDA, LDA, #3 // LDA = LDA * SIZE | |||
| dgemm_ncopy_L4_BEGIN: | |||
| .Ldgemm_ncopy_L4_BEGIN: | |||
| asr J, N, #2 // J = N / 4 | |||
| cmp J, #0 | |||
| ble dgemm_ncopy_L2_BEGIN | |||
| ble .Ldgemm_ncopy_L2_BEGIN | |||
| .align 5 | |||
| dgemm_ncopy_L4_M4_BEGIN: | |||
| .Ldgemm_ncopy_L4_M4_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| @@ -209,128 +209,128 @@ dgemm_ncopy_L4_M4_BEGIN: | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L4_M4_40 | |||
| ble .Ldgemm_ncopy_L4_M4_40 | |||
| .align 5 | |||
| dgemm_ncopy_L4_M4_20: | |||
| .Ldgemm_ncopy_L4_M4_20: | |||
| COPY4x4 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L4_M4_20 | |||
| bne .Ldgemm_ncopy_L4_M4_20 | |||
| dgemm_ncopy_L4_M4_40: | |||
| .Ldgemm_ncopy_L4_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L4_M4_END | |||
| ble .Ldgemm_ncopy_L4_M4_END | |||
| .align 5 | |||
| dgemm_ncopy_L4_M4_60: | |||
| .Ldgemm_ncopy_L4_M4_60: | |||
| COPY1x4 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L4_M4_60 | |||
| bne .Ldgemm_ncopy_L4_M4_60 | |||
| dgemm_ncopy_L4_M4_END: | |||
| .Ldgemm_ncopy_L4_M4_END: | |||
| subs J , J, #1 // j-- | |||
| bne dgemm_ncopy_L4_M4_BEGIN | |||
| bne .Ldgemm_ncopy_L4_M4_BEGIN | |||
| /*********************************************************************************************/ | |||
| dgemm_ncopy_L2_BEGIN: | |||
| .Ldgemm_ncopy_L2_BEGIN: | |||
| tst N, #3 | |||
| ble dgemm_ncopy_L999 | |||
| ble .Ldgemm_ncopy_L999 | |||
| tst N, #2 | |||
| ble dgemm_ncopy_L1_BEGIN | |||
| ble .Ldgemm_ncopy_L1_BEGIN | |||
| dgemm_ncopy_L2_M4_BEGIN: | |||
| .Ldgemm_ncopy_L2_M4_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A00, A02, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L2_M4_40 | |||
| ble .Ldgemm_ncopy_L2_M4_40 | |||
| .align 5 | |||
| dgemm_ncopy_L2_M4_20: | |||
| .Ldgemm_ncopy_L2_M4_20: | |||
| COPY4x2 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L2_M4_20 | |||
| bne .Ldgemm_ncopy_L2_M4_20 | |||
| dgemm_ncopy_L2_M4_40: | |||
| .Ldgemm_ncopy_L2_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L2_M4_END | |||
| ble .Ldgemm_ncopy_L2_M4_END | |||
| .align 5 | |||
| dgemm_ncopy_L2_M4_60: | |||
| .Ldgemm_ncopy_L2_M4_60: | |||
| COPY1x2 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L2_M4_60 | |||
| bne .Ldgemm_ncopy_L2_M4_60 | |||
| dgemm_ncopy_L2_M4_END: | |||
| .Ldgemm_ncopy_L2_M4_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_ncopy_L1_BEGIN: | |||
| .Ldgemm_ncopy_L1_BEGIN: | |||
| tst N, #1 | |||
| ble dgemm_ncopy_L999 | |||
| ble .Ldgemm_ncopy_L999 | |||
| dgemm_ncopy_L1_M4_BEGIN: | |||
| .Ldgemm_ncopy_L1_M4_BEGIN: | |||
| mov A01, A00 | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L1_M4_40 | |||
| ble .Ldgemm_ncopy_L1_M4_40 | |||
| .align 5 | |||
| dgemm_ncopy_L1_M4_20: | |||
| .Ldgemm_ncopy_L1_M4_20: | |||
| COPY4x1 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L1_M4_20 | |||
| bne .Ldgemm_ncopy_L1_M4_20 | |||
| dgemm_ncopy_L1_M4_40: | |||
| .Ldgemm_ncopy_L1_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L1_M4_END | |||
| ble .Ldgemm_ncopy_L1_M4_END | |||
| .align 5 | |||
| dgemm_ncopy_L1_M4_60: | |||
| .Ldgemm_ncopy_L1_M4_60: | |||
| COPY1x1 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L1_M4_60 | |||
| bne .Ldgemm_ncopy_L1_M4_60 | |||
| dgemm_ncopy_L1_M4_END: | |||
| .Ldgemm_ncopy_L1_M4_END: | |||
| dgemm_ncopy_L999: | |||
| .Ldgemm_ncopy_L999: | |||
| mov x0, #0 | |||
| RESTORE_REGS | |||
| @@ -353,13 +353,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lsl LDA, LDA, #3 // LDA = LDA * SIZE | |||
| dgemm_ncopy_L8_BEGIN: | |||
| .Ldgemm_ncopy_L8_BEGIN: | |||
| asr J, N, #3 // J = N / 8 | |||
| cmp J, #0 | |||
| ble dgemm_ncopy_L4_BEGIN | |||
| ble .Ldgemm_ncopy_L4_BEGIN | |||
| dgemm_ncopy_L8_M8_BEGIN: | |||
| .Ldgemm_ncopy_L8_M8_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| @@ -374,46 +374,46 @@ dgemm_ncopy_L8_M8_BEGIN: | |||
| asr I, M, #3 // I = M / 8 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L8_M8_40 | |||
| ble .Ldgemm_ncopy_L8_M8_40 | |||
| dgemm_ncopy_L8_M8_20: | |||
| .Ldgemm_ncopy_L8_M8_20: | |||
| COPY8x8 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L8_M8_20 | |||
| bne .Ldgemm_ncopy_L8_M8_20 | |||
| dgemm_ncopy_L8_M8_40: | |||
| .Ldgemm_ncopy_L8_M8_40: | |||
| and I, M , #7 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L8_M8_END | |||
| ble .Ldgemm_ncopy_L8_M8_END | |||
| dgemm_ncopy_L8_M8_60: | |||
| .Ldgemm_ncopy_L8_M8_60: | |||
| COPY1x8 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L8_M8_60 | |||
| bne .Ldgemm_ncopy_L8_M8_60 | |||
| dgemm_ncopy_L8_M8_END: | |||
| .Ldgemm_ncopy_L8_M8_END: | |||
| subs J , J, #1 // j-- | |||
| bne dgemm_ncopy_L8_M8_BEGIN | |||
| bne .Ldgemm_ncopy_L8_M8_BEGIN | |||
| /*********************************************************************************************/ | |||
| dgemm_ncopy_L4_BEGIN: | |||
| .Ldgemm_ncopy_L4_BEGIN: | |||
| tst N, #7 | |||
| ble dgemm_ncopy_L999 | |||
| ble .Ldgemm_ncopy_L999 | |||
| tst N, #4 | |||
| ble dgemm_ncopy_L2_BEGIN | |||
| ble .Ldgemm_ncopy_L2_BEGIN | |||
| dgemm_ncopy_L4_M8_BEGIN: | |||
| .Ldgemm_ncopy_L4_M8_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| @@ -423,118 +423,118 @@ dgemm_ncopy_L4_M8_BEGIN: | |||
| asr I, M, #3 // I = M / 8 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L4_M8_40 | |||
| ble .Ldgemm_ncopy_L4_M8_40 | |||
| dgemm_ncopy_L4_M8_20: | |||
| .Ldgemm_ncopy_L4_M8_20: | |||
| COPY8x4 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L4_M8_20 | |||
| bne .Ldgemm_ncopy_L4_M8_20 | |||
| dgemm_ncopy_L4_M8_40: | |||
| .Ldgemm_ncopy_L4_M8_40: | |||
| and I, M , #7 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L4_M8_END | |||
| ble .Ldgemm_ncopy_L4_M8_END | |||
| dgemm_ncopy_L4_M8_60: | |||
| .Ldgemm_ncopy_L4_M8_60: | |||
| COPY1x4 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L4_M8_60 | |||
| bne .Ldgemm_ncopy_L4_M8_60 | |||
| dgemm_ncopy_L4_M8_END: | |||
| .Ldgemm_ncopy_L4_M8_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_ncopy_L2_BEGIN: | |||
| .Ldgemm_ncopy_L2_BEGIN: | |||
| tst N, #3 | |||
| ble dgemm_ncopy_L999 | |||
| ble .Ldgemm_ncopy_L999 | |||
| tst N, #2 | |||
| ble dgemm_ncopy_L1_BEGIN | |||
| ble .Ldgemm_ncopy_L1_BEGIN | |||
| dgemm_ncopy_L2_M8_BEGIN: | |||
| .Ldgemm_ncopy_L2_M8_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A00, A02, LDA | |||
| asr I, M, #3 // I = M / 8 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L2_M8_40 | |||
| ble .Ldgemm_ncopy_L2_M8_40 | |||
| dgemm_ncopy_L2_M8_20: | |||
| .Ldgemm_ncopy_L2_M8_20: | |||
| COPY8x2 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L2_M8_20 | |||
| bne .Ldgemm_ncopy_L2_M8_20 | |||
| dgemm_ncopy_L2_M8_40: | |||
| .Ldgemm_ncopy_L2_M8_40: | |||
| and I, M , #7 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L2_M8_END | |||
| ble .Ldgemm_ncopy_L2_M8_END | |||
| dgemm_ncopy_L2_M8_60: | |||
| .Ldgemm_ncopy_L2_M8_60: | |||
| COPY1x2 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L2_M8_60 | |||
| bne .Ldgemm_ncopy_L2_M8_60 | |||
| dgemm_ncopy_L2_M8_END: | |||
| .Ldgemm_ncopy_L2_M8_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_ncopy_L1_BEGIN: | |||
| .Ldgemm_ncopy_L1_BEGIN: | |||
| tst N, #1 | |||
| ble dgemm_ncopy_L999 | |||
| ble .Ldgemm_ncopy_L999 | |||
| dgemm_ncopy_L1_M8_BEGIN: | |||
| .Ldgemm_ncopy_L1_M8_BEGIN: | |||
| mov A01, A00 | |||
| asr I, M, #3 // I = M / 8 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L1_M8_40 | |||
| ble .Ldgemm_ncopy_L1_M8_40 | |||
| dgemm_ncopy_L1_M8_20: | |||
| .Ldgemm_ncopy_L1_M8_20: | |||
| COPY8x1 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L1_M8_20 | |||
| bne .Ldgemm_ncopy_L1_M8_20 | |||
| dgemm_ncopy_L1_M8_40: | |||
| .Ldgemm_ncopy_L1_M8_40: | |||
| and I, M , #7 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L1_M8_END | |||
| ble .Ldgemm_ncopy_L1_M8_END | |||
| dgemm_ncopy_L1_M8_60: | |||
| .Ldgemm_ncopy_L1_M8_60: | |||
| COPY1x1 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L1_M8_60 | |||
| bne .Ldgemm_ncopy_L1_M8_60 | |||
| dgemm_ncopy_L1_M8_END: | |||
| .Ldgemm_ncopy_L1_M8_END: | |||
| dgemm_ncopy_L999: | |||
| .Ldgemm_ncopy_L999: | |||
| mov x0, #0 | |||
| RESTORE_REGS | |||
| @@ -247,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lsl M4, M, #5 // M4 = M * 4 * SIZE | |||
| dgemm_tcopy_L4_BEGIN: | |||
| .Ldgemm_tcopy_L4_BEGIN: | |||
| asr J, M, #2 // J = M / 4 | |||
| cmp J, #0 | |||
| ble dgemm_tcopy_L2_BEGIN | |||
| ble .Ldgemm_tcopy_L2_BEGIN | |||
| .align 5 | |||
| dgemm_tcopy_L4_M4_BEGIN: | |||
| .Ldgemm_tcopy_L4_M4_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| @@ -266,51 +266,51 @@ dgemm_tcopy_L4_M4_BEGIN: | |||
| asr I, N, #2 // I = N / 4 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L4_M4_40 | |||
| ble .Ldgemm_tcopy_L4_M4_40 | |||
| .align 5 | |||
| dgemm_tcopy_L4_M4_20: | |||
| .Ldgemm_tcopy_L4_M4_20: | |||
| COPY4x4 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L4_M4_20 | |||
| bne .Ldgemm_tcopy_L4_M4_20 | |||
| dgemm_tcopy_L4_M4_40: | |||
| .Ldgemm_tcopy_L4_M4_40: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L4_M4_60 | |||
| ble .Ldgemm_tcopy_L4_M4_60 | |||
| COPY2x4 | |||
| dgemm_tcopy_L4_M4_60: | |||
| .Ldgemm_tcopy_L4_M4_60: | |||
| tst N, #1 | |||
| ble dgemm_tcopy_L4_M4_END | |||
| ble .Ldgemm_tcopy_L4_M4_END | |||
| COPY1x4 | |||
| dgemm_tcopy_L4_M4_END: | |||
| .Ldgemm_tcopy_L4_M4_END: | |||
| subs J , J, #1 // j-- | |||
| bne dgemm_tcopy_L4_M4_BEGIN | |||
| bne .Ldgemm_tcopy_L4_M4_BEGIN | |||
| /*********************************************************************************************/ | |||
| dgemm_tcopy_L2_BEGIN: | |||
| .Ldgemm_tcopy_L2_BEGIN: | |||
| tst M, #3 | |||
| ble dgemm_tcopy_L999 | |||
| ble .Ldgemm_tcopy_L999 | |||
| tst M, #2 | |||
| ble dgemm_tcopy_L1_BEGIN | |||
| ble .Ldgemm_tcopy_L1_BEGIN | |||
| dgemm_tcopy_L2_M4_BEGIN: | |||
| .Ldgemm_tcopy_L2_M4_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A, A02, LDA | |||
| @@ -320,80 +320,80 @@ dgemm_tcopy_L2_M4_BEGIN: | |||
| asr I, N, #2 // I = N / 4 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L2_M4_40 | |||
| ble .Ldgemm_tcopy_L2_M4_40 | |||
| .align 5 | |||
| dgemm_tcopy_L2_M4_20: | |||
| .Ldgemm_tcopy_L2_M4_20: | |||
| COPY4x2 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L2_M4_20 | |||
| bne .Ldgemm_tcopy_L2_M4_20 | |||
| dgemm_tcopy_L2_M4_40: | |||
| .Ldgemm_tcopy_L2_M4_40: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L2_M4_60 | |||
| ble .Ldgemm_tcopy_L2_M4_60 | |||
| COPY2x2 | |||
| dgemm_tcopy_L2_M4_60: | |||
| .Ldgemm_tcopy_L2_M4_60: | |||
| tst N , #1 | |||
| ble dgemm_tcopy_L2_M4_END | |||
| ble .Ldgemm_tcopy_L2_M4_END | |||
| COPY1x2 | |||
| dgemm_tcopy_L2_M4_END: | |||
| .Ldgemm_tcopy_L2_M4_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_tcopy_L1_BEGIN: | |||
| .Ldgemm_tcopy_L1_BEGIN: | |||
| tst M, #1 | |||
| ble dgemm_tcopy_L999 | |||
| ble .Ldgemm_tcopy_L999 | |||
| dgemm_tcopy_L1_M4_BEGIN: | |||
| .Ldgemm_tcopy_L1_M4_BEGIN: | |||
| mov A01, A // A01 = A | |||
| mov B01, B | |||
| asr I, N, #2 // I = N / 4 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L1_M4_40 | |||
| ble .Ldgemm_tcopy_L1_M4_40 | |||
| .align 5 | |||
| dgemm_tcopy_L1_M4_20: | |||
| .Ldgemm_tcopy_L1_M4_20: | |||
| COPY4x1 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L1_M4_20 | |||
| bne .Ldgemm_tcopy_L1_M4_20 | |||
| dgemm_tcopy_L1_M4_40: | |||
| .Ldgemm_tcopy_L1_M4_40: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L1_M4_60 | |||
| ble .Ldgemm_tcopy_L1_M4_60 | |||
| COPY2x1 | |||
| dgemm_tcopy_L1_M4_60: | |||
| .Ldgemm_tcopy_L1_M4_60: | |||
| tst N , #1 | |||
| ble dgemm_tcopy_L1_M4_END | |||
| ble .Ldgemm_tcopy_L1_M4_END | |||
| COPY1x1 | |||
| dgemm_tcopy_L1_M4_END: | |||
| .Ldgemm_tcopy_L1_M4_END: | |||
| dgemm_tcopy_L999: | |||
| .Ldgemm_tcopy_L999: | |||
| mov x0, #0 // set return value | |||
| RESTORE_REGS | |||
| ret | |||
| @@ -454,13 +454,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| lsl M8, M, #6 // M8 = M * 8 * SIZE | |||
| dgemm_tcopy_L8_BEGIN: | |||
| .Ldgemm_tcopy_L8_BEGIN: | |||
| asr J, M, #3 // J = M / 8 | |||
| cmp J, #0 | |||
| ble dgemm_tcopy_L4_BEGIN | |||
| ble .Ldgemm_tcopy_L4_BEGIN | |||
| .align 5 | |||
| dgemm_tcopy_L8_M8_BEGIN: | |||
| .Ldgemm_tcopy_L8_M8_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| @@ -477,53 +477,53 @@ dgemm_tcopy_L8_M8_BEGIN: | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L8_M8_40 | |||
| ble .Ldgemm_tcopy_L8_M8_40 | |||
| .align 5 | |||
| dgemm_tcopy_L8_M8_20: | |||
| .Ldgemm_tcopy_L8_M8_20: | |||
| COPY8x8 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L8_M8_20 | |||
| bne .Ldgemm_tcopy_L8_M8_20 | |||
| dgemm_tcopy_L8_M8_40: | |||
| .Ldgemm_tcopy_L8_M8_40: | |||
| tst N , #4 | |||
| ble dgemm_tcopy_L8_M8_60 | |||
| ble .Ldgemm_tcopy_L8_M8_60 | |||
| COPY4x8 | |||
| dgemm_tcopy_L8_M8_60: | |||
| .Ldgemm_tcopy_L8_M8_60: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L8_M8_80 | |||
| ble .Ldgemm_tcopy_L8_M8_80 | |||
| COPY2x8 | |||
| dgemm_tcopy_L8_M8_80: | |||
| .Ldgemm_tcopy_L8_M8_80: | |||
| tst N, #1 | |||
| ble dgemm_tcopy_L8_M8_END | |||
| ble .Ldgemm_tcopy_L8_M8_END | |||
| COPY1x8 | |||
| dgemm_tcopy_L8_M8_END: | |||
| .Ldgemm_tcopy_L8_M8_END: | |||
| subs J , J, #1 // j-- | |||
| bne dgemm_tcopy_L8_M8_BEGIN | |||
| bne .Ldgemm_tcopy_L8_M8_BEGIN | |||
| /*********************************************************************************************/ | |||
| dgemm_tcopy_L4_BEGIN: | |||
| .Ldgemm_tcopy_L4_BEGIN: | |||
| tst M, #7 | |||
| ble dgemm_tcopy_L999 | |||
| ble .Ldgemm_tcopy_L999 | |||
| tst M, #4 | |||
| ble dgemm_tcopy_L2_BEGIN | |||
| ble .Ldgemm_tcopy_L2_BEGIN | |||
| dgemm_tcopy_L4_M8_BEGIN: | |||
| .Ldgemm_tcopy_L4_M8_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| @@ -536,51 +536,51 @@ dgemm_tcopy_L4_M8_BEGIN: | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L4_M8_40 | |||
| ble .Ldgemm_tcopy_L4_M8_40 | |||
| .align 5 | |||
| dgemm_tcopy_L4_M8_20: | |||
| .Ldgemm_tcopy_L4_M8_20: | |||
| COPY8x4 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L4_M8_20 | |||
| bne .Ldgemm_tcopy_L4_M8_20 | |||
| dgemm_tcopy_L4_M8_40: | |||
| .Ldgemm_tcopy_L4_M8_40: | |||
| tst N , #4 | |||
| ble dgemm_tcopy_L4_M8_60 | |||
| ble .Ldgemm_tcopy_L4_M8_60 | |||
| COPY4x4 | |||
| dgemm_tcopy_L4_M8_60: | |||
| .Ldgemm_tcopy_L4_M8_60: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L4_M8_80 | |||
| ble .Ldgemm_tcopy_L4_M8_80 | |||
| COPY2x4 | |||
| dgemm_tcopy_L4_M8_80: | |||
| .Ldgemm_tcopy_L4_M8_80: | |||
| tst N, #1 | |||
| ble dgemm_tcopy_L4_M8_END | |||
| ble .Ldgemm_tcopy_L4_M8_END | |||
| COPY1x4 | |||
| dgemm_tcopy_L4_M8_END: | |||
| .Ldgemm_tcopy_L4_M8_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_tcopy_L2_BEGIN: | |||
| .Ldgemm_tcopy_L2_BEGIN: | |||
| tst M, #3 | |||
| ble dgemm_tcopy_L999 | |||
| ble .Ldgemm_tcopy_L999 | |||
| tst M, #2 | |||
| ble dgemm_tcopy_L1_BEGIN | |||
| ble .Ldgemm_tcopy_L1_BEGIN | |||
| dgemm_tcopy_L2_M8_BEGIN: | |||
| .Ldgemm_tcopy_L2_M8_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A, A02, LDA | |||
| @@ -590,90 +590,90 @@ dgemm_tcopy_L2_M8_BEGIN: | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L2_M8_40 | |||
| ble .Ldgemm_tcopy_L2_M8_40 | |||
| .align 5 | |||
| dgemm_tcopy_L2_M8_20: | |||
| .Ldgemm_tcopy_L2_M8_20: | |||
| COPY8x2 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L2_M8_20 | |||
| bne .Ldgemm_tcopy_L2_M8_20 | |||
| dgemm_tcopy_L2_M8_40: | |||
| .Ldgemm_tcopy_L2_M8_40: | |||
| tst N , #4 | |||
| ble dgemm_tcopy_L2_M8_60 | |||
| ble .Ldgemm_tcopy_L2_M8_60 | |||
| COPY4x2 | |||
| dgemm_tcopy_L2_M8_60: | |||
| .Ldgemm_tcopy_L2_M8_60: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L2_M8_80 | |||
| ble .Ldgemm_tcopy_L2_M8_80 | |||
| COPY2x2 | |||
| dgemm_tcopy_L2_M8_80: | |||
| .Ldgemm_tcopy_L2_M8_80: | |||
| tst N , #1 | |||
| ble dgemm_tcopy_L2_M8_END | |||
| ble .Ldgemm_tcopy_L2_M8_END | |||
| COPY1x2 | |||
| dgemm_tcopy_L2_M8_END: | |||
| .Ldgemm_tcopy_L2_M8_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_tcopy_L1_BEGIN: | |||
| .Ldgemm_tcopy_L1_BEGIN: | |||
| tst M, #1 | |||
| ble dgemm_tcopy_L999 | |||
| ble .Ldgemm_tcopy_L999 | |||
| dgemm_tcopy_L1_M8_BEGIN: | |||
| .Ldgemm_tcopy_L1_M8_BEGIN: | |||
| mov A01, A // A01 = A | |||
| mov B01, B | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L1_M8_40 | |||
| ble .Ldgemm_tcopy_L1_M8_40 | |||
| .align 5 | |||
| dgemm_tcopy_L1_M8_20: | |||
| .Ldgemm_tcopy_L1_M8_20: | |||
| COPY8x1 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L1_M8_20 | |||
| bne .Ldgemm_tcopy_L1_M8_20 | |||
| dgemm_tcopy_L1_M8_40: | |||
| .Ldgemm_tcopy_L1_M8_40: | |||
| tst N , #4 | |||
| ble dgemm_tcopy_L1_M8_60 | |||
| ble .Ldgemm_tcopy_L1_M8_60 | |||
| COPY4x1 | |||
| dgemm_tcopy_L1_M8_60: | |||
| .Ldgemm_tcopy_L1_M8_60: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L1_M8_80 | |||
| ble .Ldgemm_tcopy_L1_M8_80 | |||
| COPY2x1 | |||
| dgemm_tcopy_L1_M8_80: | |||
| .Ldgemm_tcopy_L1_M8_80: | |||
| tst N , #1 | |||
| ble dgemm_tcopy_L1_M8_END | |||
| ble .Ldgemm_tcopy_L1_M8_END | |||
| COPY1x1 | |||
| dgemm_tcopy_L1_M8_END: | |||
| .Ldgemm_tcopy_L1_M8_END: | |||
| dgemm_tcopy_L999: | |||
| .Ldgemm_tcopy_L999: | |||
| mov x0, #0 // set return value | |||
| RESTORE_REGS | |||
| ret | |||
| @@ -154,51 +154,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| cmp N, xzr | |||
| ble dot_kernel_L999 | |||
| ble .Ldot_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne dot_kernel_S_BEGIN | |||
| bne .Ldot_kernel_S_BEGIN | |||
| cmp INC_Y, #1 | |||
| bne dot_kernel_S_BEGIN | |||
| bne .Ldot_kernel_S_BEGIN | |||
| dot_kernel_F_BEGIN: | |||
| .Ldot_kernel_F_BEGIN: | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| beq dot_kernel_F1 | |||
| beq .Ldot_kernel_F1 | |||
| dot_kernel_F4: | |||
| .Ldot_kernel_F4: | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne dot_kernel_F4 | |||
| bne .Ldot_kernel_F4 | |||
| KERNEL_F4_FINALIZE | |||
| dot_kernel_F1: | |||
| .Ldot_kernel_F1: | |||
| ands I, N, #3 | |||
| ble dot_kernel_L999 | |||
| ble .Ldot_kernel_L999 | |||
| dot_kernel_F10: | |||
| .Ldot_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne dot_kernel_F10 | |||
| bne .Ldot_kernel_F10 | |||
| ret | |||
| dot_kernel_S_BEGIN: | |||
| .Ldot_kernel_S_BEGIN: | |||
| INIT_S | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble dot_kernel_S1 | |||
| ble .Ldot_kernel_S1 | |||
| dot_kernel_S4: | |||
| .Ldot_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -206,21 +206,21 @@ dot_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne dot_kernel_S4 | |||
| bne .Ldot_kernel_S4 | |||
| dot_kernel_S1: | |||
| .Ldot_kernel_S1: | |||
| ands I, N, #3 | |||
| ble dot_kernel_L999 | |||
| ble .Ldot_kernel_L999 | |||
| dot_kernel_S10: | |||
| .Ldot_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne dot_kernel_S10 | |||
| bne .Ldot_kernel_S10 | |||
| dot_kernel_L999: | |||
| .Ldot_kernel_L999: | |||
| ret | |||
| @@ -549,11 +549,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #2 // J = J / 4 | |||
| cmp counterJ, #0 | |||
| ble dtrmm_kernel_L2_BEGIN | |||
| ble .Ldtrmm_kernel_L2_BEGIN | |||
| /******************************************************************************/ | |||
| dtrmm_kernel_L4_BEGIN: | |||
| .Ldtrmm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #2 | |||
| @@ -563,14 +563,14 @@ dtrmm_kernel_L4_BEGIN: | |||
| mov pA, origPA // pA = start of A array | |||
| dtrmm_kernel_L4_M4_BEGIN: | |||
| .Ldtrmm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble dtrmm_kernel_L4_M2_BEGIN | |||
| ble .Ldtrmm_kernel_L4_M2_BEGIN | |||
| dtrmm_kernel_L4_M4_20: | |||
| .Ldtrmm_kernel_L4_M4_20: | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| mov pB, origPB | |||
| @@ -591,57 +591,57 @@ dtrmm_kernel_L4_M4_20: | |||
| asr counterL , tempK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| blt dtrmm_kernel_L4_M4_32 | |||
| blt .Ldtrmm_kernel_L4_M4_32 | |||
| KERNEL4x4_I // do one in the K | |||
| KERNEL4x4_M2 // do another in the K | |||
| subs counterL, counterL, #2 | |||
| ble dtrmm_kernel_L4_M4_22a | |||
| ble .Ldtrmm_kernel_L4_M4_22a | |||
| .align 5 | |||
| dtrmm_kernel_L4_M4_22: | |||
| .Ldtrmm_kernel_L4_M4_22: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M4_22 | |||
| bgt .Ldtrmm_kernel_L4_M4_22 | |||
| dtrmm_kernel_L4_M4_22a: | |||
| .Ldtrmm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b dtrmm_kernel_L4_M4_44 | |||
| b .Ldtrmm_kernel_L4_M4_44 | |||
| dtrmm_kernel_L4_M4_32: | |||
| .Ldtrmm_kernel_L4_M4_32: | |||
| tst counterL, #1 | |||
| ble dtrmm_kernel_L4_M4_40 | |||
| ble .Ldtrmm_kernel_L4_M4_40 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_E | |||
| b dtrmm_kernel_L4_M4_44 | |||
| b .Ldtrmm_kernel_L4_M4_44 | |||
| dtrmm_kernel_L4_M4_40: | |||
| .Ldtrmm_kernel_L4_M4_40: | |||
| INIT4x4 | |||
| dtrmm_kernel_L4_M4_44: | |||
| .Ldtrmm_kernel_L4_M4_44: | |||
| ands counterL , tempK, #1 | |||
| ble dtrmm_kernel_L4_M4_100 | |||
| ble .Ldtrmm_kernel_L4_M4_100 | |||
| dtrmm_kernel_L4_M4_46: | |||
| .Ldtrmm_kernel_L4_M4_46: | |||
| KERNEL4x4_SUB | |||
| dtrmm_kernel_L4_M4_100: | |||
| .Ldtrmm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| @@ -660,20 +660,20 @@ dtrmm_kernel_L4_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| dtrmm_kernel_L4_M4_END: | |||
| .Ldtrmm_kernel_L4_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bne dtrmm_kernel_L4_M4_20 | |||
| bne .Ldtrmm_kernel_L4_M4_20 | |||
| dtrmm_kernel_L4_M2_BEGIN: | |||
| .Ldtrmm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dtrmm_kernel_L4_END | |||
| ble .Ldtrmm_kernel_L4_END | |||
| tst counterI, #2 // test bit 1 of counterI (counterI & 2); counterI is not modified | |||
| ble dtrmm_kernel_L4_M1_BEGIN | |||
| ble .Ldtrmm_kernel_L4_M1_BEGIN | |||
| dtrmm_kernel_L4_M2_20: | |||
| .Ldtrmm_kernel_L4_M2_20: | |||
| INIT2x4 | |||
| @@ -697,9 +697,9 @@ dtrmm_kernel_L4_M2_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L4_M2_40 | |||
| ble .Ldtrmm_kernel_L4_M2_40 | |||
| dtrmm_kernel_L4_M2_22: | |||
| .Ldtrmm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| KERNEL2x4_SUB | |||
| @@ -712,22 +712,22 @@ dtrmm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M2_22 | |||
| bgt .Ldtrmm_kernel_L4_M2_22 | |||
| dtrmm_kernel_L4_M2_40: | |||
| .Ldtrmm_kernel_L4_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L4_M2_100 | |||
| ble .Ldtrmm_kernel_L4_M2_100 | |||
| dtrmm_kernel_L4_M2_42: | |||
| .Ldtrmm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M2_42 | |||
| bgt .Ldtrmm_kernel_L4_M2_42 | |||
| dtrmm_kernel_L4_M2_100: | |||
| .Ldtrmm_kernel_L4_M2_100: | |||
| SAVE2x4 | |||
| @@ -747,15 +747,15 @@ dtrmm_kernel_L4_M2_100: | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| dtrmm_kernel_L4_M2_END: | |||
| .Ldtrmm_kernel_L4_M2_END: | |||
| dtrmm_kernel_L4_M1_BEGIN: | |||
| .Ldtrmm_kernel_L4_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dtrmm_kernel_L4_END | |||
| ble .Ldtrmm_kernel_L4_END | |||
| dtrmm_kernel_L4_M1_20: | |||
| .Ldtrmm_kernel_L4_M1_20: | |||
| INIT1x4 | |||
| @@ -779,9 +779,9 @@ dtrmm_kernel_L4_M1_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L4_M1_40 | |||
| ble .Ldtrmm_kernel_L4_M1_40 | |||
| dtrmm_kernel_L4_M1_22: | |||
| .Ldtrmm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| @@ -793,22 +793,22 @@ dtrmm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M1_22 | |||
| bgt .Ldtrmm_kernel_L4_M1_22 | |||
| dtrmm_kernel_L4_M1_40: | |||
| .Ldtrmm_kernel_L4_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L4_M1_100 | |||
| ble .Ldtrmm_kernel_L4_M1_100 | |||
| dtrmm_kernel_L4_M1_42: | |||
| .Ldtrmm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M1_42 | |||
| bgt .Ldtrmm_kernel_L4_M1_42 | |||
| dtrmm_kernel_L4_M1_100: | |||
| .Ldtrmm_kernel_L4_M1_100: | |||
| SAVE1x4 | |||
| @@ -828,7 +828,7 @@ dtrmm_kernel_L4_M1_100: | |||
| add tempOffset, tempOffset, #1 | |||
| #endif | |||
| dtrmm_kernel_L4_END: | |||
| .Ldtrmm_kernel_L4_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 | |||
| @@ -838,19 +838,19 @@ dtrmm_kernel_L4_END: | |||
| #endif | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt dtrmm_kernel_L4_BEGIN | |||
| bgt .Ldtrmm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| .Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble dtrmm_kernel_L999 // error, N was less than 4? | |||
| ble .Ldtrmm_kernel_L999 // error, N was less than 4? | |||
| tst counterJ , #2 | |||
| ble dtrmm_kernel_L1_BEGIN | |||
| ble .Ldtrmm_kernel_L1_BEGIN | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| @@ -863,14 +863,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov pA, origPA // pA = A | |||
| dtrmm_kernel_L2_M4_BEGIN: | |||
| .Ldtrmm_kernel_L2_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI,#0 | |||
| ble dtrmm_kernel_L2_M2_BEGIN | |||
| ble .Ldtrmm_kernel_L2_M2_BEGIN | |||
| dtrmm_kernel_L2_M4_20: | |||
| .Ldtrmm_kernel_L2_M4_20: | |||
| INIT4x2 | |||
| @@ -894,10 +894,10 @@ dtrmm_kernel_L2_M4_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dtrmm_kernel_L2_M4_40 | |||
| ble .Ldtrmm_kernel_L2_M4_40 | |||
| .align 5 | |||
| dtrmm_kernel_L2_M4_22: | |||
| .Ldtrmm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| @@ -909,22 +909,22 @@ dtrmm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M4_22 | |||
| bgt .Ldtrmm_kernel_L2_M4_22 | |||
| dtrmm_kernel_L2_M4_40: | |||
| .Ldtrmm_kernel_L2_M4_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L2_M4_100 | |||
| ble .Ldtrmm_kernel_L2_M4_100 | |||
| dtrmm_kernel_L2_M4_42: | |||
| .Ldtrmm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M4_42 | |||
| bgt .Ldtrmm_kernel_L2_M4_42 | |||
| dtrmm_kernel_L2_M4_100: | |||
| .Ldtrmm_kernel_L2_M4_100: | |||
| SAVE4x2 | |||
| @@ -944,22 +944,22 @@ dtrmm_kernel_L2_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| dtrmm_kernel_L2_M4_END: | |||
| .Ldtrmm_kernel_L2_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt dtrmm_kernel_L2_M4_20 | |||
| bgt .Ldtrmm_kernel_L2_M4_20 | |||
| dtrmm_kernel_L2_M2_BEGIN: | |||
| .Ldtrmm_kernel_L2_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dtrmm_kernel_L2_END | |||
| ble .Ldtrmm_kernel_L2_END | |||
| tst counterI, #2 // test bit 1 of counterI (counterI & 2); counterI is not modified | |||
| ble dtrmm_kernel_L2_M1_BEGIN | |||
| ble .Ldtrmm_kernel_L2_M1_BEGIN | |||
| dtrmm_kernel_L2_M2_20: | |||
| .Ldtrmm_kernel_L2_M2_20: | |||
| INIT2x2 | |||
| @@ -983,9 +983,9 @@ dtrmm_kernel_L2_M2_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dtrmm_kernel_L2_M2_40 | |||
| ble .Ldtrmm_kernel_L2_M2_40 | |||
| dtrmm_kernel_L2_M2_22: | |||
| .Ldtrmm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| @@ -998,22 +998,22 @@ dtrmm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M2_22 | |||
| bgt .Ldtrmm_kernel_L2_M2_22 | |||
| dtrmm_kernel_L2_M2_40: | |||
| .Ldtrmm_kernel_L2_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L2_M2_100 | |||
| ble .Ldtrmm_kernel_L2_M2_100 | |||
| dtrmm_kernel_L2_M2_42: | |||
| .Ldtrmm_kernel_L2_M2_42: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M2_42 | |||
| bgt .Ldtrmm_kernel_L2_M2_42 | |||
| dtrmm_kernel_L2_M2_100: | |||
| .Ldtrmm_kernel_L2_M2_100: | |||
| SAVE2x2 | |||
| @@ -1033,15 +1033,15 @@ dtrmm_kernel_L2_M2_100: | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| dtrmm_kernel_L2_M2_END: | |||
| .Ldtrmm_kernel_L2_M2_END: | |||
| dtrmm_kernel_L2_M1_BEGIN: | |||
| .Ldtrmm_kernel_L2_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dtrmm_kernel_L2_END | |||
| ble .Ldtrmm_kernel_L2_END | |||
| dtrmm_kernel_L2_M1_20: | |||
| .Ldtrmm_kernel_L2_M1_20: | |||
| INIT1x2 | |||
| @@ -1065,9 +1065,9 @@ dtrmm_kernel_L2_M1_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble dtrmm_kernel_L2_M1_40 | |||
| ble .Ldtrmm_kernel_L2_M1_40 | |||
| dtrmm_kernel_L2_M1_22: | |||
| .Ldtrmm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| @@ -1079,22 +1079,22 @@ dtrmm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M1_22 | |||
| bgt .Ldtrmm_kernel_L2_M1_22 | |||
| dtrmm_kernel_L2_M1_40: | |||
| .Ldtrmm_kernel_L2_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L2_M1_100 | |||
| ble .Ldtrmm_kernel_L2_M1_100 | |||
| dtrmm_kernel_L2_M1_42: | |||
| .Ldtrmm_kernel_L2_M1_42: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M1_42 | |||
| bgt .Ldtrmm_kernel_L2_M1_42 | |||
| dtrmm_kernel_L2_M1_100: | |||
| .Ldtrmm_kernel_L2_M1_100: | |||
| SAVE1x2 | |||
| @@ -1114,7 +1114,7 @@ dtrmm_kernel_L2_M1_100: | |||
| add tempOffset, tempOffset, #1 | |||
| #endif | |||
| dtrmm_kernel_L2_END: | |||
| .Ldtrmm_kernel_L2_END: | |||
| #if !defined(LEFT) | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| @@ -1122,11 +1122,11 @@ dtrmm_kernel_L2_END: | |||
| /******************************************************************************/ | |||
| dtrmm_kernel_L1_BEGIN: | |||
| .Ldtrmm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble dtrmm_kernel_L999 // done | |||
| ble .Ldtrmm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| @@ -1138,14 +1138,14 @@ dtrmm_kernel_L1_BEGIN: | |||
| mov pA, origPA // pA = A | |||
| dtrmm_kernel_L1_M4_BEGIN: | |||
| .Ldtrmm_kernel_L1_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble dtrmm_kernel_L1_M2_BEGIN | |||
| ble .Ldtrmm_kernel_L1_M2_BEGIN | |||
| dtrmm_kernel_L1_M4_20: | |||
| .Ldtrmm_kernel_L1_M4_20: | |||
| INIT4x1 | |||
| @@ -1169,10 +1169,10 @@ dtrmm_kernel_L1_M4_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L1_M4_40 | |||
| ble .Ldtrmm_kernel_L1_M4_40 | |||
| .align 5 | |||
| dtrmm_kernel_L1_M4_22: | |||
| .Ldtrmm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| @@ -1184,22 +1184,22 @@ dtrmm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M4_22 | |||
| bgt .Ldtrmm_kernel_L1_M4_22 | |||
| dtrmm_kernel_L1_M4_40: | |||
| .Ldtrmm_kernel_L1_M4_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L1_M4_100 | |||
| ble .Ldtrmm_kernel_L1_M4_100 | |||
| dtrmm_kernel_L1_M4_42: | |||
| .Ldtrmm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M4_42 | |||
| bgt .Ldtrmm_kernel_L1_M4_42 | |||
| dtrmm_kernel_L1_M4_100: | |||
| .Ldtrmm_kernel_L1_M4_100: | |||
| SAVE4x1 | |||
| @@ -1220,22 +1220,22 @@ dtrmm_kernel_L1_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| dtrmm_kernel_L1_M4_END: | |||
| .Ldtrmm_kernel_L1_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt dtrmm_kernel_L1_M4_20 | |||
| bgt .Ldtrmm_kernel_L1_M4_20 | |||
| dtrmm_kernel_L1_M2_BEGIN: | |||
| .Ldtrmm_kernel_L1_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dtrmm_kernel_L1_END | |||
| ble .Ldtrmm_kernel_L1_END | |||
| tst counterI, #2 // test bit 1 of counterI (counterI & 2); counterI is not modified | |||
| ble dtrmm_kernel_L1_M1_BEGIN | |||
| ble .Ldtrmm_kernel_L1_M1_BEGIN | |||
| dtrmm_kernel_L1_M2_20: | |||
| .Ldtrmm_kernel_L1_M2_20: | |||
| INIT2x1 | |||
| @@ -1259,9 +1259,9 @@ dtrmm_kernel_L1_M2_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L1_M2_40 | |||
| ble .Ldtrmm_kernel_L1_M2_40 | |||
| dtrmm_kernel_L1_M2_22: | |||
| .Ldtrmm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| @@ -1274,22 +1274,22 @@ dtrmm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M2_22 | |||
| bgt .Ldtrmm_kernel_L1_M2_22 | |||
| dtrmm_kernel_L1_M2_40: | |||
| .Ldtrmm_kernel_L1_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L1_M2_100 | |||
| ble .Ldtrmm_kernel_L1_M2_100 | |||
| dtrmm_kernel_L1_M2_42: | |||
| .Ldtrmm_kernel_L1_M2_42: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M2_42 | |||
| bgt .Ldtrmm_kernel_L1_M2_42 | |||
| dtrmm_kernel_L1_M2_100: | |||
| .Ldtrmm_kernel_L1_M2_100: | |||
| SAVE2x1 | |||
| @@ -1309,15 +1309,15 @@ dtrmm_kernel_L1_M2_100: | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| dtrmm_kernel_L1_M2_END: | |||
| .Ldtrmm_kernel_L1_M2_END: | |||
| dtrmm_kernel_L1_M1_BEGIN: | |||
| .Ldtrmm_kernel_L1_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dtrmm_kernel_L1_END | |||
| ble .Ldtrmm_kernel_L1_END | |||
| dtrmm_kernel_L1_M1_20: | |||
| .Ldtrmm_kernel_L1_M1_20: | |||
| INIT1x1 | |||
| @@ -1341,9 +1341,9 @@ dtrmm_kernel_L1_M1_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L1_M1_40 | |||
| ble .Ldtrmm_kernel_L1_M1_40 | |||
| dtrmm_kernel_L1_M1_22: | |||
| .Ldtrmm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| @@ -1355,30 +1355,30 @@ dtrmm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M1_22 | |||
| bgt .Ldtrmm_kernel_L1_M1_22 | |||
| dtrmm_kernel_L1_M1_40: | |||
| .Ldtrmm_kernel_L1_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L1_M1_100 | |||
| ble .Ldtrmm_kernel_L1_M1_100 | |||
| dtrmm_kernel_L1_M1_42: | |||
| .Ldtrmm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M1_42 | |||
| bgt .Ldtrmm_kernel_L1_M1_42 | |||
| dtrmm_kernel_L1_M1_100: | |||
| .Ldtrmm_kernel_L1_M1_100: | |||
| SAVE1x1 | |||
| dtrmm_kernel_L1_END: | |||
| .Ldtrmm_kernel_L1_END: | |||
| dtrmm_kernel_L999: | |||
| .Ldtrmm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| @@ -900,11 +900,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #3 // J = J / 8 | |||
| cmp counterJ, #0 | |||
| ble dtrmm_kernel_L4_BEGIN | |||
| ble .Ldtrmm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| dtrmm_kernel_L8_BEGIN: | |||
| .Ldtrmm_kernel_L8_BEGIN: | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #3 | |||
| @@ -915,14 +915,14 @@ dtrmm_kernel_L8_BEGIN: | |||
| mov pA, origPA // pA = start of A array | |||
| dtrmm_kernel_L8_M4_BEGIN: | |||
| .Ldtrmm_kernel_L8_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble dtrmm_kernel_L8_M2_BEGIN | |||
| ble .Ldtrmm_kernel_L8_M2_BEGIN | |||
| dtrmm_kernel_L8_M4_20: | |||
| .Ldtrmm_kernel_L8_M4_20: | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| mov pB, origPB | |||
| @@ -944,57 +944,57 @@ dtrmm_kernel_L8_M4_20: | |||
| asr counterL, tempK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| blt dtrmm_kernel_L8_M4_32 | |||
| blt .Ldtrmm_kernel_L8_M4_32 | |||
| KERNEL4x8_I // do one in the K | |||
| KERNEL4x8_M2 // do another in the K | |||
| subs counterL, counterL, #2 | |||
| ble dtrmm_kernel_L8_M4_22a | |||
| ble .Ldtrmm_kernel_L8_M4_22a | |||
| .align 5 | |||
| dtrmm_kernel_L8_M4_22: | |||
| .Ldtrmm_kernel_L8_M4_22: | |||
| KERNEL4x8_M1 | |||
| KERNEL4x8_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L8_M4_22 | |||
| bgt .Ldtrmm_kernel_L8_M4_22 | |||
| dtrmm_kernel_L8_M4_22a: | |||
| .Ldtrmm_kernel_L8_M4_22a: | |||
| KERNEL4x8_M1 | |||
| KERNEL4x8_E | |||
| b dtrmm_kernel_L8_M4_44 | |||
| b .Ldtrmm_kernel_L8_M4_44 | |||
| dtrmm_kernel_L8_M4_32: | |||
| .Ldtrmm_kernel_L8_M4_32: | |||
| tst counterL, #1 | |||
| ble dtrmm_kernel_L8_M4_40 | |||
| ble .Ldtrmm_kernel_L8_M4_40 | |||
| KERNEL4x8_I | |||
| KERNEL4x8_E | |||
| b dtrmm_kernel_L8_M4_44 | |||
| b .Ldtrmm_kernel_L8_M4_44 | |||
| dtrmm_kernel_L8_M4_40: | |||
| .Ldtrmm_kernel_L8_M4_40: | |||
| INIT4x8 | |||
| dtrmm_kernel_L8_M4_44: | |||
| .Ldtrmm_kernel_L8_M4_44: | |||
| ands counterL, tempK, #1 | |||
| ble dtrmm_kernel_L8_M4_100 | |||
| ble .Ldtrmm_kernel_L8_M4_100 | |||
| dtrmm_kernel_L8_M4_46: | |||
| .Ldtrmm_kernel_L8_M4_46: | |||
| KERNEL4x8_SUB | |||
| dtrmm_kernel_L8_M4_100: | |||
| .Ldtrmm_kernel_L8_M4_100: | |||
| SAVE4x8 | |||
| @@ -1014,20 +1014,20 @@ dtrmm_kernel_L8_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| dtrmm_kernel_L8_M4_END: | |||
| .Ldtrmm_kernel_L8_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bne dtrmm_kernel_L8_M4_20 | |||
| bne .Ldtrmm_kernel_L8_M4_20 | |||
| dtrmm_kernel_L8_M2_BEGIN: | |||
| .Ldtrmm_kernel_L8_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dtrmm_kernel_L8_END | |||
| ble .Ldtrmm_kernel_L8_END | |||
| tst counterI, #2 // is bit 1 of origM set? (tst leaves counterI unchanged) | |||
| ble dtrmm_kernel_L8_M1_BEGIN | |||
| ble .Ldtrmm_kernel_L8_M1_BEGIN | |||
| dtrmm_kernel_L8_M2_20: | |||
| .Ldtrmm_kernel_L8_M2_20: | |||
| INIT2x8 | |||
| @@ -1051,9 +1051,9 @@ dtrmm_kernel_L8_M2_20: | |||
| asr counterL, tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L8_M2_40 | |||
| ble .Ldtrmm_kernel_L8_M2_40 | |||
| dtrmm_kernel_L8_M2_22: | |||
| .Ldtrmm_kernel_L8_M2_22: | |||
| KERNEL2x8_SUB | |||
| KERNEL2x8_SUB | |||
| @@ -1066,22 +1066,22 @@ dtrmm_kernel_L8_M2_22: | |||
| KERNEL2x8_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L8_M2_22 | |||
| bgt .Ldtrmm_kernel_L8_M2_22 | |||
| dtrmm_kernel_L8_M2_40: | |||
| .Ldtrmm_kernel_L8_M2_40: | |||
| ands counterL, tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L8_M2_100 | |||
| ble .Ldtrmm_kernel_L8_M2_100 | |||
| dtrmm_kernel_L8_M2_42: | |||
| .Ldtrmm_kernel_L8_M2_42: | |||
| KERNEL2x8_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L8_M2_42 | |||
| bgt .Ldtrmm_kernel_L8_M2_42 | |||
| dtrmm_kernel_L8_M2_100: | |||
| .Ldtrmm_kernel_L8_M2_100: | |||
| SAVE2x8 | |||
| @@ -1102,15 +1102,15 @@ dtrmm_kernel_L8_M2_100: | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| dtrmm_kernel_L8_M2_END: | |||
| .Ldtrmm_kernel_L8_M2_END: | |||
| dtrmm_kernel_L8_M1_BEGIN: | |||
| .Ldtrmm_kernel_L8_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dtrmm_kernel_L8_END | |||
| ble .Ldtrmm_kernel_L8_END | |||
| dtrmm_kernel_L8_M1_20: | |||
| .Ldtrmm_kernel_L8_M1_20: | |||
| INIT1x8 | |||
| @@ -1134,9 +1134,9 @@ dtrmm_kernel_L8_M1_20: | |||
| asr counterL, tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L8_M1_40 | |||
| ble .Ldtrmm_kernel_L8_M1_40 | |||
| dtrmm_kernel_L8_M1_22: | |||
| .Ldtrmm_kernel_L8_M1_22: | |||
| KERNEL1x8_SUB | |||
| KERNEL1x8_SUB | |||
| KERNEL1x8_SUB | |||
| @@ -1148,22 +1148,22 @@ dtrmm_kernel_L8_M1_22: | |||
| KERNEL1x8_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L8_M1_22 | |||
| bgt .Ldtrmm_kernel_L8_M1_22 | |||
| dtrmm_kernel_L8_M1_40: | |||
| .Ldtrmm_kernel_L8_M1_40: | |||
| ands counterL, tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L8_M1_100 | |||
| ble .Ldtrmm_kernel_L8_M1_100 | |||
| dtrmm_kernel_L8_M1_42: | |||
| .Ldtrmm_kernel_L8_M1_42: | |||
| KERNEL1x8_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L8_M1_42 | |||
| bgt .Ldtrmm_kernel_L8_M1_42 | |||
| dtrmm_kernel_L8_M1_100: | |||
| .Ldtrmm_kernel_L8_M1_100: | |||
| SAVE1x8 | |||
| @@ -1183,7 +1183,7 @@ dtrmm_kernel_L8_M1_100: | |||
| add tempOffset, tempOffset, #1 | |||
| #endif | |||
| dtrmm_kernel_L8_END: | |||
| .Ldtrmm_kernel_L8_END: | |||
| lsl temp, origK, #6 | |||
| add origPB, origPB, temp // B = B + K * 8 * 8 | |||
| @@ -1193,19 +1193,19 @@ dtrmm_kernel_L8_END: | |||
| #endif | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt dtrmm_kernel_L8_BEGIN | |||
| bgt .Ldtrmm_kernel_L8_BEGIN | |||
| /******************************************************************************/ | |||
| dtrmm_kernel_L4_BEGIN: | |||
| .Ldtrmm_kernel_L4_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #7 | |||
| ble dtrmm_kernel_L999 | |||
| ble .Ldtrmm_kernel_L999 | |||
| tst counterJ , #4 | |||
| ble dtrmm_kernel_L2_BEGIN | |||
| ble .Ldtrmm_kernel_L2_BEGIN | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #2 | |||
| @@ -1216,14 +1216,14 @@ dtrmm_kernel_L4_BEGIN: | |||
| mov pA, origPA // pA = start of A array | |||
| dtrmm_kernel_L4_M4_BEGIN: | |||
| .Ldtrmm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble dtrmm_kernel_L4_M2_BEGIN | |||
| ble .Ldtrmm_kernel_L4_M2_BEGIN | |||
| dtrmm_kernel_L4_M4_20: | |||
| .Ldtrmm_kernel_L4_M4_20: | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| mov pB, origPB | |||
| @@ -1244,57 +1244,57 @@ dtrmm_kernel_L4_M4_20: | |||
| asr counterL, tempK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| blt dtrmm_kernel_L4_M4_32 | |||
| blt .Ldtrmm_kernel_L4_M4_32 | |||
| KERNEL4x4_I // do one in the K | |||
| KERNEL4x4_M2 // do another in the K | |||
| subs counterL, counterL, #2 | |||
| ble dtrmm_kernel_L4_M4_22a | |||
| ble .Ldtrmm_kernel_L4_M4_22a | |||
| .align 5 | |||
| dtrmm_kernel_L4_M4_22: | |||
| .Ldtrmm_kernel_L4_M4_22: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M4_22 | |||
| bgt .Ldtrmm_kernel_L4_M4_22 | |||
| dtrmm_kernel_L4_M4_22a: | |||
| .Ldtrmm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b dtrmm_kernel_L4_M4_44 | |||
| b .Ldtrmm_kernel_L4_M4_44 | |||
| dtrmm_kernel_L4_M4_32: | |||
| .Ldtrmm_kernel_L4_M4_32: | |||
| tst counterL, #1 | |||
| ble dtrmm_kernel_L4_M4_40 | |||
| ble .Ldtrmm_kernel_L4_M4_40 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_E | |||
| b dtrmm_kernel_L4_M4_44 | |||
| b .Ldtrmm_kernel_L4_M4_44 | |||
| dtrmm_kernel_L4_M4_40: | |||
| .Ldtrmm_kernel_L4_M4_40: | |||
| INIT4x4 | |||
| dtrmm_kernel_L4_M4_44: | |||
| .Ldtrmm_kernel_L4_M4_44: | |||
| ands counterL , tempK, #1 | |||
| ble dtrmm_kernel_L4_M4_100 | |||
| ble .Ldtrmm_kernel_L4_M4_100 | |||
| dtrmm_kernel_L4_M4_46: | |||
| .Ldtrmm_kernel_L4_M4_46: | |||
| KERNEL4x4_SUB | |||
| dtrmm_kernel_L4_M4_100: | |||
| .Ldtrmm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| @@ -1312,20 +1312,20 @@ dtrmm_kernel_L4_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| dtrmm_kernel_L4_M4_END: | |||
| .Ldtrmm_kernel_L4_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bne dtrmm_kernel_L4_M4_20 | |||
| bne .Ldtrmm_kernel_L4_M4_20 | |||
| dtrmm_kernel_L4_M2_BEGIN: | |||
| .Ldtrmm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dtrmm_kernel_L4_END | |||
| ble .Ldtrmm_kernel_L4_END | |||
| tst counterI, #2 // is bit 1 of origM set? (tst leaves counterI unchanged) | |||
| ble dtrmm_kernel_L4_M1_BEGIN | |||
| ble .Ldtrmm_kernel_L4_M1_BEGIN | |||
| dtrmm_kernel_L4_M2_20: | |||
| .Ldtrmm_kernel_L4_M2_20: | |||
| INIT2x4 | |||
| @@ -1348,9 +1348,9 @@ dtrmm_kernel_L4_M2_20: | |||
| #endif | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L4_M2_40 | |||
| ble .Ldtrmm_kernel_L4_M2_40 | |||
| dtrmm_kernel_L4_M2_22: | |||
| .Ldtrmm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| KERNEL2x4_SUB | |||
| @@ -1363,22 +1363,22 @@ dtrmm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M2_22 | |||
| bgt .Ldtrmm_kernel_L4_M2_22 | |||
| dtrmm_kernel_L4_M2_40: | |||
| .Ldtrmm_kernel_L4_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L4_M2_100 | |||
| ble .Ldtrmm_kernel_L4_M2_100 | |||
| dtrmm_kernel_L4_M2_42: | |||
| .Ldtrmm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M2_42 | |||
| bgt .Ldtrmm_kernel_L4_M2_42 | |||
| dtrmm_kernel_L4_M2_100: | |||
| .Ldtrmm_kernel_L4_M2_100: | |||
| SAVE2x4 | |||
| @@ -1397,15 +1397,15 @@ dtrmm_kernel_L4_M2_100: | |||
| #if defined(LEFT) | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| dtrmm_kernel_L4_M2_END: | |||
| .Ldtrmm_kernel_L4_M2_END: | |||
| dtrmm_kernel_L4_M1_BEGIN: | |||
| .Ldtrmm_kernel_L4_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dtrmm_kernel_L4_END | |||
| ble .Ldtrmm_kernel_L4_END | |||
| dtrmm_kernel_L4_M1_20: | |||
| .Ldtrmm_kernel_L4_M1_20: | |||
| INIT1x4 | |||
| @@ -1428,9 +1428,9 @@ dtrmm_kernel_L4_M1_20: | |||
| #endif | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L4_M1_40 | |||
| ble .Ldtrmm_kernel_L4_M1_40 | |||
| dtrmm_kernel_L4_M1_22: | |||
| .Ldtrmm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| @@ -1442,22 +1442,22 @@ dtrmm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M1_22 | |||
| bgt .Ldtrmm_kernel_L4_M1_22 | |||
| dtrmm_kernel_L4_M1_40: | |||
| .Ldtrmm_kernel_L4_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L4_M1_100 | |||
| ble .Ldtrmm_kernel_L4_M1_100 | |||
| dtrmm_kernel_L4_M1_42: | |||
| .Ldtrmm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M1_42 | |||
| bgt .Ldtrmm_kernel_L4_M1_42 | |||
| dtrmm_kernel_L4_M1_100: | |||
| .Ldtrmm_kernel_L4_M1_100: | |||
| SAVE1x4 | |||
| @@ -1476,7 +1476,7 @@ dtrmm_kernel_L4_M1_100: | |||
| #if defined(LEFT) | |||
| add tempOffset, tempOffset, #1 | |||
| #endif | |||
| dtrmm_kernel_L4_END: | |||
| .Ldtrmm_kernel_L4_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 | |||
| @@ -1486,14 +1486,14 @@ dtrmm_kernel_L4_END: | |||
| /******************************************************************************/ | |||
| dtrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction | |||
| .Ldtrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble dtrmm_kernel_L999 // N % 4 == 0: nothing left, done | |||
| ble .Ldtrmm_kernel_L999 // N % 4 == 0: nothing left, done | |||
| tst counterJ , #2 | |||
| ble dtrmm_kernel_L1_BEGIN | |||
| ble .Ldtrmm_kernel_L1_BEGIN | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| @@ -1505,14 +1505,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov pA, origPA // pA = A | |||
| dtrmm_kernel_L2_M4_BEGIN: | |||
| .Ldtrmm_kernel_L2_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI,#0 | |||
| ble dtrmm_kernel_L2_M2_BEGIN | |||
| ble .Ldtrmm_kernel_L2_M2_BEGIN | |||
| dtrmm_kernel_L2_M4_20: | |||
| .Ldtrmm_kernel_L2_M4_20: | |||
| INIT4x2 | |||
| @@ -1535,10 +1535,10 @@ dtrmm_kernel_L2_M4_20: | |||
| #endif | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dtrmm_kernel_L2_M4_40 | |||
| ble .Ldtrmm_kernel_L2_M4_40 | |||
| .align 5 | |||
| dtrmm_kernel_L2_M4_22: | |||
| .Ldtrmm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| @@ -1550,22 +1550,22 @@ dtrmm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M4_22 | |||
| bgt .Ldtrmm_kernel_L2_M4_22 | |||
| dtrmm_kernel_L2_M4_40: | |||
| .Ldtrmm_kernel_L2_M4_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L2_M4_100 | |||
| ble .Ldtrmm_kernel_L2_M4_100 | |||
| dtrmm_kernel_L2_M4_42: | |||
| .Ldtrmm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M4_42 | |||
| bgt .Ldtrmm_kernel_L2_M4_42 | |||
| dtrmm_kernel_L2_M4_100: | |||
| .Ldtrmm_kernel_L2_M4_100: | |||
| SAVE4x2 | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| @@ -1584,22 +1584,22 @@ dtrmm_kernel_L2_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| dtrmm_kernel_L2_M4_END: | |||
| .Ldtrmm_kernel_L2_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt dtrmm_kernel_L2_M4_20 | |||
| bgt .Ldtrmm_kernel_L2_M4_20 | |||
| dtrmm_kernel_L2_M2_BEGIN: | |||
| .Ldtrmm_kernel_L2_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dtrmm_kernel_L2_END | |||
| ble .Ldtrmm_kernel_L2_END | |||
| tst counterI, #2 // is bit 1 of origM set? (tst leaves counterI unchanged) | |||
| ble dtrmm_kernel_L2_M1_BEGIN | |||
| ble .Ldtrmm_kernel_L2_M1_BEGIN | |||
| dtrmm_kernel_L2_M2_20: | |||
| .Ldtrmm_kernel_L2_M2_20: | |||
| INIT2x2 | |||
| @@ -1622,9 +1622,9 @@ dtrmm_kernel_L2_M2_20: | |||
| #endif | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dtrmm_kernel_L2_M2_40 | |||
| ble .Ldtrmm_kernel_L2_M2_40 | |||
| dtrmm_kernel_L2_M2_22: | |||
| .Ldtrmm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| @@ -1637,22 +1637,22 @@ dtrmm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M2_22 | |||
| bgt .Ldtrmm_kernel_L2_M2_22 | |||
| dtrmm_kernel_L2_M2_40: | |||
| .Ldtrmm_kernel_L2_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L2_M2_100 | |||
| ble .Ldtrmm_kernel_L2_M2_100 | |||
| dtrmm_kernel_L2_M2_42: | |||
| .Ldtrmm_kernel_L2_M2_42: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M2_42 | |||
| bgt .Ldtrmm_kernel_L2_M2_42 | |||
| dtrmm_kernel_L2_M2_100: | |||
| .Ldtrmm_kernel_L2_M2_100: | |||
| SAVE2x2 | |||
| @@ -1671,15 +1671,15 @@ dtrmm_kernel_L2_M2_100: | |||
| #if defined(LEFT) | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| dtrmm_kernel_L2_M2_END: | |||
| .Ldtrmm_kernel_L2_M2_END: | |||
| dtrmm_kernel_L2_M1_BEGIN: | |||
| .Ldtrmm_kernel_L2_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dtrmm_kernel_L2_END | |||
| ble .Ldtrmm_kernel_L2_END | |||
| dtrmm_kernel_L2_M1_20: | |||
| .Ldtrmm_kernel_L2_M1_20: | |||
| INIT1x2 | |||
| @@ -1702,9 +1702,9 @@ dtrmm_kernel_L2_M1_20: | |||
| #endif | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble dtrmm_kernel_L2_M1_40 | |||
| ble .Ldtrmm_kernel_L2_M1_40 | |||
| dtrmm_kernel_L2_M1_22: | |||
| .Ldtrmm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| @@ -1716,22 +1716,22 @@ dtrmm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M1_22 | |||
| bgt .Ldtrmm_kernel_L2_M1_22 | |||
| dtrmm_kernel_L2_M1_40: | |||
| .Ldtrmm_kernel_L2_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L2_M1_100 | |||
| ble .Ldtrmm_kernel_L2_M1_100 | |||
| dtrmm_kernel_L2_M1_42: | |||
| .Ldtrmm_kernel_L2_M1_42: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M1_42 | |||
| bgt .Ldtrmm_kernel_L2_M1_42 | |||
| dtrmm_kernel_L2_M1_100: | |||
| .Ldtrmm_kernel_L2_M1_100: | |||
| SAVE1x2 | |||
| @@ -1750,7 +1750,7 @@ dtrmm_kernel_L2_M1_100: | |||
| #if defined(LEFT) | |||
| add tempOffset, tempOffset, #1 | |||
| #endif | |||
| dtrmm_kernel_L2_END: | |||
| .Ldtrmm_kernel_L2_END: | |||
| #if !defined(LEFT) | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| @@ -1758,11 +1758,11 @@ dtrmm_kernel_L2_END: | |||
| /******************************************************************************/ | |||
| dtrmm_kernel_L1_BEGIN: | |||
| .Ldtrmm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble dtrmm_kernel_L999 // done | |||
| ble .Ldtrmm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| @@ -1773,14 +1773,14 @@ dtrmm_kernel_L1_BEGIN: | |||
| #endif | |||
| mov pA, origPA // pA = A | |||
| dtrmm_kernel_L1_M4_BEGIN: | |||
| .Ldtrmm_kernel_L1_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble dtrmm_kernel_L1_M2_BEGIN | |||
| ble .Ldtrmm_kernel_L1_M2_BEGIN | |||
| dtrmm_kernel_L1_M4_20: | |||
| .Ldtrmm_kernel_L1_M4_20: | |||
| INIT4x1 | |||
| @@ -1802,10 +1802,10 @@ dtrmm_kernel_L1_M4_20: | |||
| #endif | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L1_M4_40 | |||
| ble .Ldtrmm_kernel_L1_M4_40 | |||
| .align 5 | |||
| dtrmm_kernel_L1_M4_22: | |||
| .Ldtrmm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| @@ -1817,22 +1817,22 @@ dtrmm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M4_22 | |||
| bgt .Ldtrmm_kernel_L1_M4_22 | |||
| dtrmm_kernel_L1_M4_40: | |||
| .Ldtrmm_kernel_L1_M4_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L1_M4_100 | |||
| ble .Ldtrmm_kernel_L1_M4_100 | |||
| dtrmm_kernel_L1_M4_42: | |||
| .Ldtrmm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M4_42 | |||
| bgt .Ldtrmm_kernel_L1_M4_42 | |||
| dtrmm_kernel_L1_M4_100: | |||
| .Ldtrmm_kernel_L1_M4_100: | |||
| SAVE4x1 | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| @@ -1851,22 +1851,22 @@ dtrmm_kernel_L1_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| dtrmm_kernel_L1_M4_END: | |||
| .Ldtrmm_kernel_L1_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt dtrmm_kernel_L1_M4_20 | |||
| bgt .Ldtrmm_kernel_L1_M4_20 | |||
| dtrmm_kernel_L1_M2_BEGIN: | |||
| .Ldtrmm_kernel_L1_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dtrmm_kernel_L1_END | |||
| ble .Ldtrmm_kernel_L1_END | |||
| tst counterI, #2 // is bit 1 of origM set? (tst leaves counterI unchanged) | |||
| ble dtrmm_kernel_L1_M1_BEGIN | |||
| ble .Ldtrmm_kernel_L1_M1_BEGIN | |||
| dtrmm_kernel_L1_M2_20: | |||
| .Ldtrmm_kernel_L1_M2_20: | |||
| INIT2x1 | |||
| @@ -1889,9 +1889,9 @@ dtrmm_kernel_L1_M2_20: | |||
| #endif | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L1_M2_40 | |||
| ble .Ldtrmm_kernel_L1_M2_40 | |||
| dtrmm_kernel_L1_M2_22: | |||
| .Ldtrmm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| @@ -1904,22 +1904,22 @@ dtrmm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M2_22 | |||
| bgt .Ldtrmm_kernel_L1_M2_22 | |||
| dtrmm_kernel_L1_M2_40: | |||
| .Ldtrmm_kernel_L1_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L1_M2_100 | |||
| ble .Ldtrmm_kernel_L1_M2_100 | |||
| dtrmm_kernel_L1_M2_42: | |||
| .Ldtrmm_kernel_L1_M2_42: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M2_42 | |||
| bgt .Ldtrmm_kernel_L1_M2_42 | |||
| dtrmm_kernel_L1_M2_100: | |||
| .Ldtrmm_kernel_L1_M2_100: | |||
| SAVE2x1 | |||
| @@ -1938,15 +1938,15 @@ dtrmm_kernel_L1_M2_100: | |||
| #if defined(LEFT) | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| dtrmm_kernel_L1_M2_END: | |||
| .Ldtrmm_kernel_L1_M2_END: | |||
| dtrmm_kernel_L1_M1_BEGIN: | |||
| .Ldtrmm_kernel_L1_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dtrmm_kernel_L1_END | |||
| ble .Ldtrmm_kernel_L1_END | |||
| dtrmm_kernel_L1_M1_20: | |||
| .Ldtrmm_kernel_L1_M1_20: | |||
| INIT1x1 | |||
| @@ -1969,9 +1969,9 @@ dtrmm_kernel_L1_M1_20: | |||
| #endif | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L1_M1_40 | |||
| ble .Ldtrmm_kernel_L1_M1_40 | |||
| dtrmm_kernel_L1_M1_22: | |||
| .Ldtrmm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| @@ -1983,30 +1983,30 @@ dtrmm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M1_22 | |||
| bgt .Ldtrmm_kernel_L1_M1_22 | |||
| dtrmm_kernel_L1_M1_40: | |||
| .Ldtrmm_kernel_L1_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L1_M1_100 | |||
| ble .Ldtrmm_kernel_L1_M1_100 | |||
| dtrmm_kernel_L1_M1_42: | |||
| .Ldtrmm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M1_42 | |||
| bgt .Ldtrmm_kernel_L1_M1_42 | |||
| dtrmm_kernel_L1_M1_100: | |||
| .Ldtrmm_kernel_L1_M1_100: | |||
| SAVE1x1 | |||
| dtrmm_kernel_L1_END: | |||
| .Ldtrmm_kernel_L1_END: | |||
| dtrmm_kernel_L999: | |||
| .Ldtrmm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| @@ -829,11 +829,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #2 // J = J / 4 | |||
| cmp counterJ, #0 | |||
| ble dtrmm_kernel_L2_BEGIN | |||
| ble .Ldtrmm_kernel_L2_BEGIN | |||
| /******************************************************************************/ | |||
| dtrmm_kernel_L4_BEGIN: | |||
| .Ldtrmm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| @@ -847,15 +847,15 @@ dtrmm_kernel_L4_BEGIN: | |||
| #endif | |||
| mov pA, origPA // pA = start of A array | |||
| dtrmm_kernel_L4_M8_BEGIN: | |||
| .Ldtrmm_kernel_L4_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble dtrmm_kernel_L4_M4_BEGIN | |||
| ble .Ldtrmm_kernel_L4_M4_BEGIN | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_20: | |||
| .Ldtrmm_kernel_L4_M8_20: | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| mov pB, origPB | |||
| @@ -877,7 +877,7 @@ dtrmm_kernel_L4_M8_20: | |||
| asr counterL , tempK, #3 // counterL = tempK / 8 | |||
| cmp counterL , #2 // at least 2 groups of 8 (tempK >= 16)? | |||
| blt dtrmm_kernel_L4_M8_32 | |||
| blt .Ldtrmm_kernel_L4_M8_32 | |||
| KERNEL8x4_I // do one in the K | |||
| KERNEL8x4_M2 // do another in the K | |||
| @@ -889,10 +889,10 @@ dtrmm_kernel_L4_M8_20: | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble dtrmm_kernel_L4_M8_22a | |||
| ble .Ldtrmm_kernel_L4_M8_22a | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_22: | |||
| .Ldtrmm_kernel_L4_M8_22: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| @@ -904,10 +904,10 @@ dtrmm_kernel_L4_M8_22: | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M8_22 | |||
| bgt .Ldtrmm_kernel_L4_M8_22 | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_22a: | |||
| .Ldtrmm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| @@ -918,13 +918,13 @@ dtrmm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b dtrmm_kernel_L4_M8_44 | |||
| b .Ldtrmm_kernel_L4_M8_44 | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_32: | |||
| .Ldtrmm_kernel_L4_M8_32: | |||
| tst counterL, #1 | |||
| ble dtrmm_kernel_L4_M8_40 | |||
| ble .Ldtrmm_kernel_L4_M8_40 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| @@ -935,26 +935,26 @@ dtrmm_kernel_L4_M8_32: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b dtrmm_kernel_L4_M8_44 | |||
| b .Ldtrmm_kernel_L4_M8_44 | |||
| dtrmm_kernel_L4_M8_40: | |||
| .Ldtrmm_kernel_L4_M8_40: | |||
| INIT8x4 | |||
| dtrmm_kernel_L4_M8_44: | |||
| .Ldtrmm_kernel_L4_M8_44: | |||
| ands counterL , tempK, #7 | |||
| ble dtrmm_kernel_L4_M8_100 | |||
| ble .Ldtrmm_kernel_L4_M8_100 | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_46: | |||
| .Ldtrmm_kernel_L4_M8_46: | |||
| KERNEL8x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne dtrmm_kernel_L4_M8_46 | |||
| bne .Ldtrmm_kernel_L4_M8_46 | |||
| dtrmm_kernel_L4_M8_100: | |||
| .Ldtrmm_kernel_L4_M8_100: | |||
| SAVE8x4 | |||
| @@ -977,20 +977,20 @@ dtrmm_kernel_L4_M8_100: | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| dtrmm_kernel_L4_M8_END: | |||
| .Ldtrmm_kernel_L4_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bne dtrmm_kernel_L4_M8_20 | |||
| bne .Ldtrmm_kernel_L4_M8_20 | |||
| dtrmm_kernel_L4_M4_BEGIN: | |||
| .Ldtrmm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble dtrmm_kernel_L4_END | |||
| ble .Ldtrmm_kernel_L4_END | |||
| tst counterI, #4 | |||
| ble dtrmm_kernel_L4_M2_BEGIN | |||
| ble .Ldtrmm_kernel_L4_M2_BEGIN | |||
| dtrmm_kernel_L4_M4_20: | |||
| .Ldtrmm_kernel_L4_M4_20: | |||
| INIT4x4 | |||
| @@ -1013,9 +1013,9 @@ dtrmm_kernel_L4_M4_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L4_M4_40 | |||
| ble .Ldtrmm_kernel_L4_M4_40 | |||
| dtrmm_kernel_L4_M4_22: | |||
| .Ldtrmm_kernel_L4_M4_22: | |||
| KERNEL4x4_SUB | |||
| KERNEL4x4_SUB | |||
| @@ -1028,22 +1028,22 @@ dtrmm_kernel_L4_M4_22: | |||
| KERNEL4x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M4_22 | |||
| bgt .Ldtrmm_kernel_L4_M4_22 | |||
| dtrmm_kernel_L4_M4_40: | |||
| .Ldtrmm_kernel_L4_M4_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L4_M4_100 | |||
| ble .Ldtrmm_kernel_L4_M4_100 | |||
| dtrmm_kernel_L4_M4_42: | |||
| .Ldtrmm_kernel_L4_M4_42: | |||
| KERNEL4x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M4_42 | |||
| bgt .Ldtrmm_kernel_L4_M4_42 | |||
| dtrmm_kernel_L4_M4_100: | |||
| .Ldtrmm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| @@ -1062,19 +1062,19 @@ dtrmm_kernel_L4_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| dtrmm_kernel_L4_M4_END: | |||
| .Ldtrmm_kernel_L4_M4_END: | |||
| dtrmm_kernel_L4_M2_BEGIN: | |||
| .Ldtrmm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dtrmm_kernel_L4_END | |||
| ble .Ldtrmm_kernel_L4_END | |||
| tst counterI, #2 // is bit 1 of origM set? (tst leaves counterI unchanged) | |||
| ble dtrmm_kernel_L4_M1_BEGIN | |||
| ble .Ldtrmm_kernel_L4_M1_BEGIN | |||
| dtrmm_kernel_L4_M2_20: | |||
| .Ldtrmm_kernel_L4_M2_20: | |||
| INIT2x4 | |||
| @@ -1097,9 +1097,9 @@ dtrmm_kernel_L4_M2_20: | |||
| #endif | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L4_M2_40 | |||
| ble .Ldtrmm_kernel_L4_M2_40 | |||
| dtrmm_kernel_L4_M2_22: | |||
| .Ldtrmm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| KERNEL2x4_SUB | |||
| @@ -1112,22 +1112,22 @@ dtrmm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M2_22 | |||
| bgt .Ldtrmm_kernel_L4_M2_22 | |||
| dtrmm_kernel_L4_M2_40: | |||
| .Ldtrmm_kernel_L4_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L4_M2_100 | |||
| ble .Ldtrmm_kernel_L4_M2_100 | |||
| dtrmm_kernel_L4_M2_42: | |||
| .Ldtrmm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M2_42 | |||
| bgt .Ldtrmm_kernel_L4_M2_42 | |||
| dtrmm_kernel_L4_M2_100: | |||
| .Ldtrmm_kernel_L4_M2_100: | |||
| SAVE2x4 | |||
| @@ -1147,15 +1147,15 @@ dtrmm_kernel_L4_M2_100: | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| dtrmm_kernel_L4_M2_END: | |||
| .Ldtrmm_kernel_L4_M2_END: | |||
| dtrmm_kernel_L4_M1_BEGIN: | |||
| .Ldtrmm_kernel_L4_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dtrmm_kernel_L4_END | |||
| ble .Ldtrmm_kernel_L4_END | |||
| dtrmm_kernel_L4_M1_20: | |||
| .Ldtrmm_kernel_L4_M1_20: | |||
| INIT1x4 | |||
| @@ -1179,9 +1179,9 @@ dtrmm_kernel_L4_M1_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L4_M1_40 | |||
| ble .Ldtrmm_kernel_L4_M1_40 | |||
| dtrmm_kernel_L4_M1_22: | |||
| .Ldtrmm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| @@ -1193,22 +1193,22 @@ dtrmm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M1_22 | |||
| bgt .Ldtrmm_kernel_L4_M1_22 | |||
| dtrmm_kernel_L4_M1_40: | |||
| .Ldtrmm_kernel_L4_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L4_M1_100 | |||
| ble .Ldtrmm_kernel_L4_M1_100 | |||
| dtrmm_kernel_L4_M1_42: | |||
| .Ldtrmm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M1_42 | |||
| bgt .Ldtrmm_kernel_L4_M1_42 | |||
| dtrmm_kernel_L4_M1_100: | |||
| .Ldtrmm_kernel_L4_M1_100: | |||
| SAVE1x4 | |||
| @@ -1228,7 +1228,7 @@ dtrmm_kernel_L4_M1_100: | |||
| add tempOffset, tempOffset, #1 | |||
| #endif | |||
| dtrmm_kernel_L4_END: | |||
| .Ldtrmm_kernel_L4_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 | |||
| @@ -1238,19 +1238,19 @@ dtrmm_kernel_L4_END: | |||
| #endif | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt dtrmm_kernel_L4_BEGIN | |||
| bgt .Ldtrmm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| dtrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction | |||
| .Ldtrmm_kernel_L2_BEGIN: // fewer than 4 left in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble dtrmm_kernel_L999 // N % 4 == 0: nothing left, done | |||
| ble .Ldtrmm_kernel_L999 // N % 4 == 0: nothing left, done | |||
| tst counterJ , #2 | |||
| ble dtrmm_kernel_L1_BEGIN | |||
| ble .Ldtrmm_kernel_L1_BEGIN | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| @@ -1261,14 +1261,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| #endif | |||
| mov pA, origPA // pA = A | |||
| dtrmm_kernel_L2_M8_BEGIN: | |||
| .Ldtrmm_kernel_L2_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble dtrmm_kernel_L2_M4_BEGIN | |||
| ble .Ldtrmm_kernel_L2_M4_BEGIN | |||
| dtrmm_kernel_L2_M8_20: | |||
| .Ldtrmm_kernel_L2_M8_20: | |||
| INIT8x2 | |||
| @@ -1292,10 +1292,10 @@ dtrmm_kernel_L2_M8_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dtrmm_kernel_L2_M8_40 | |||
| ble .Ldtrmm_kernel_L2_M8_40 | |||
| .align 5 | |||
| dtrmm_kernel_L2_M8_22: | |||
| .Ldtrmm_kernel_L2_M8_22: | |||
| KERNEL8x2_SUB | |||
| KERNEL8x2_SUB | |||
| KERNEL8x2_SUB | |||
| @@ -1307,22 +1307,22 @@ dtrmm_kernel_L2_M8_22: | |||
| KERNEL8x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M8_22 | |||
| bgt .Ldtrmm_kernel_L2_M8_22 | |||
| dtrmm_kernel_L2_M8_40: | |||
| .Ldtrmm_kernel_L2_M8_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L2_M8_100 | |||
| ble .Ldtrmm_kernel_L2_M8_100 | |||
| dtrmm_kernel_L2_M8_42: | |||
| .Ldtrmm_kernel_L2_M8_42: | |||
| KERNEL8x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M8_42 | |||
| bgt .Ldtrmm_kernel_L2_M8_42 | |||
| dtrmm_kernel_L2_M8_100: | |||
| .Ldtrmm_kernel_L2_M8_100: | |||
| SAVE8x2 | |||
| @@ -1342,21 +1342,21 @@ dtrmm_kernel_L2_M8_100: | |||
| add tempOffset, tempOffset, #8 | |||
| #endif | |||
| dtrmm_kernel_L2_M8_END: | |||
| .Ldtrmm_kernel_L2_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bgt dtrmm_kernel_L2_M8_20 | |||
| bgt .Ldtrmm_kernel_L2_M8_20 | |||
| dtrmm_kernel_L2_M4_BEGIN: | |||
| .Ldtrmm_kernel_L2_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble dtrmm_kernel_L2_END | |||
| ble .Ldtrmm_kernel_L2_END | |||
| tst counterI, #4 // test bit 2 of counterI (origM & 4); tst only sets flags, counterI is unchanged | |||
| ble dtrmm_kernel_L2_M2_BEGIN | |||
| ble .Ldtrmm_kernel_L2_M2_BEGIN | |||
| dtrmm_kernel_L2_M4_20: | |||
| .Ldtrmm_kernel_L2_M4_20: | |||
| INIT4x2 | |||
| @@ -1380,10 +1380,10 @@ dtrmm_kernel_L2_M4_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dtrmm_kernel_L2_M4_40 | |||
| ble .Ldtrmm_kernel_L2_M4_40 | |||
| .align 5 | |||
| dtrmm_kernel_L2_M4_22: | |||
| .Ldtrmm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| @@ -1395,22 +1395,22 @@ dtrmm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M4_22 | |||
| bgt .Ldtrmm_kernel_L2_M4_22 | |||
| dtrmm_kernel_L2_M4_40: | |||
| .Ldtrmm_kernel_L2_M4_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L2_M4_100 | |||
| ble .Ldtrmm_kernel_L2_M4_100 | |||
| dtrmm_kernel_L2_M4_42: | |||
| .Ldtrmm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M4_42 | |||
| bgt .Ldtrmm_kernel_L2_M4_42 | |||
| dtrmm_kernel_L2_M4_100: | |||
| .Ldtrmm_kernel_L2_M4_100: | |||
| SAVE4x2 | |||
| @@ -1430,19 +1430,19 @@ dtrmm_kernel_L2_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| dtrmm_kernel_L2_M4_END: | |||
| .Ldtrmm_kernel_L2_M4_END: | |||
| dtrmm_kernel_L2_M2_BEGIN: | |||
| .Ldtrmm_kernel_L2_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dtrmm_kernel_L2_END | |||
| ble .Ldtrmm_kernel_L2_END | |||
| tst counterI, #2 // test bit 1 of counterI (origM & 2); tst only sets flags, counterI is unchanged | |||
| ble dtrmm_kernel_L2_M1_BEGIN | |||
| ble .Ldtrmm_kernel_L2_M1_BEGIN | |||
| dtrmm_kernel_L2_M2_20: | |||
| .Ldtrmm_kernel_L2_M2_20: | |||
| INIT2x2 | |||
| @@ -1466,9 +1466,9 @@ dtrmm_kernel_L2_M2_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble dtrmm_kernel_L2_M2_40 | |||
| ble .Ldtrmm_kernel_L2_M2_40 | |||
| dtrmm_kernel_L2_M2_22: | |||
| .Ldtrmm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| @@ -1481,22 +1481,22 @@ dtrmm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M2_22 | |||
| bgt .Ldtrmm_kernel_L2_M2_22 | |||
| dtrmm_kernel_L2_M2_40: | |||
| .Ldtrmm_kernel_L2_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L2_M2_100 | |||
| ble .Ldtrmm_kernel_L2_M2_100 | |||
| dtrmm_kernel_L2_M2_42: | |||
| .Ldtrmm_kernel_L2_M2_42: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M2_42 | |||
| bgt .Ldtrmm_kernel_L2_M2_42 | |||
| dtrmm_kernel_L2_M2_100: | |||
| .Ldtrmm_kernel_L2_M2_100: | |||
| SAVE2x2 | |||
| @@ -1516,15 +1516,15 @@ dtrmm_kernel_L2_M2_100: | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| dtrmm_kernel_L2_M2_END: | |||
| .Ldtrmm_kernel_L2_M2_END: | |||
| dtrmm_kernel_L2_M1_BEGIN: | |||
| .Ldtrmm_kernel_L2_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dtrmm_kernel_L2_END | |||
| ble .Ldtrmm_kernel_L2_END | |||
| dtrmm_kernel_L2_M1_20: | |||
| .Ldtrmm_kernel_L2_M1_20: | |||
| INIT1x2 | |||
| @@ -1548,9 +1548,9 @@ dtrmm_kernel_L2_M1_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble dtrmm_kernel_L2_M1_40 | |||
| ble .Ldtrmm_kernel_L2_M1_40 | |||
| dtrmm_kernel_L2_M1_22: | |||
| .Ldtrmm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| @@ -1562,22 +1562,22 @@ dtrmm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M1_22 | |||
| bgt .Ldtrmm_kernel_L2_M1_22 | |||
| dtrmm_kernel_L2_M1_40: | |||
| .Ldtrmm_kernel_L2_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L2_M1_100 | |||
| ble .Ldtrmm_kernel_L2_M1_100 | |||
| dtrmm_kernel_L2_M1_42: | |||
| .Ldtrmm_kernel_L2_M1_42: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L2_M1_42 | |||
| bgt .Ldtrmm_kernel_L2_M1_42 | |||
| dtrmm_kernel_L2_M1_100: | |||
| .Ldtrmm_kernel_L2_M1_100: | |||
| SAVE1x2 | |||
| @@ -1597,7 +1597,7 @@ dtrmm_kernel_L2_M1_100: | |||
| add tempOffset, tempOffset, #1 | |||
| #endif | |||
| dtrmm_kernel_L2_END: | |||
| .Ldtrmm_kernel_L2_END: | |||
| #if !defined(LEFT) | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| @@ -1605,11 +1605,11 @@ dtrmm_kernel_L2_END: | |||
| /******************************************************************************/ | |||
| dtrmm_kernel_L1_BEGIN: | |||
| .Ldtrmm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble dtrmm_kernel_L999 // done | |||
| ble .Ldtrmm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC , pC , LDC // Update pC to point to next | |||
| @@ -1619,14 +1619,14 @@ dtrmm_kernel_L1_BEGIN: | |||
| #endif | |||
| mov pA, origPA // pA = A | |||
| dtrmm_kernel_L1_M8_BEGIN: | |||
| .Ldtrmm_kernel_L1_M8_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #3 // counterI = counterI / 8 | |||
| cmp counterI, #0 | |||
| ble dtrmm_kernel_L1_M4_BEGIN | |||
| ble .Ldtrmm_kernel_L1_M4_BEGIN | |||
| dtrmm_kernel_L1_M8_20: | |||
| .Ldtrmm_kernel_L1_M8_20: | |||
| INIT8x1 | |||
| @@ -1650,10 +1650,10 @@ dtrmm_kernel_L1_M8_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L1_M8_40 | |||
| ble .Ldtrmm_kernel_L1_M8_40 | |||
| .align 5 | |||
| dtrmm_kernel_L1_M8_22: | |||
| .Ldtrmm_kernel_L1_M8_22: | |||
| KERNEL8x1_SUB | |||
| KERNEL8x1_SUB | |||
| KERNEL8x1_SUB | |||
| @@ -1665,22 +1665,22 @@ dtrmm_kernel_L1_M8_22: | |||
| KERNEL8x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M8_22 | |||
| bgt .Ldtrmm_kernel_L1_M8_22 | |||
| dtrmm_kernel_L1_M8_40: | |||
| .Ldtrmm_kernel_L1_M8_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L1_M8_100 | |||
| ble .Ldtrmm_kernel_L1_M8_100 | |||
| dtrmm_kernel_L1_M8_42: | |||
| .Ldtrmm_kernel_L1_M8_42: | |||
| KERNEL8x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M8_42 | |||
| bgt .Ldtrmm_kernel_L1_M8_42 | |||
| dtrmm_kernel_L1_M8_100: | |||
| .Ldtrmm_kernel_L1_M8_100: | |||
| SAVE8x1 | |||
| @@ -1700,21 +1700,21 @@ dtrmm_kernel_L1_M8_100: | |||
| add tempOffset, tempOffset, #8 | |||
| #endif | |||
| dtrmm_kernel_L1_M8_END: | |||
| .Ldtrmm_kernel_L1_M8_END: | |||
| subs counterI, counterI, #1 | |||
| bgt dtrmm_kernel_L1_M8_20 | |||
| bgt .Ldtrmm_kernel_L1_M8_20 | |||
| dtrmm_kernel_L1_M4_BEGIN: | |||
| .Ldtrmm_kernel_L1_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble dtrmm_kernel_L1_END | |||
| ble .Ldtrmm_kernel_L1_END | |||
| tst counterI, #4 // test bit 2 of counterI (origM & 4); tst only sets flags, counterI is unchanged | |||
| ble dtrmm_kernel_L1_M2_BEGIN | |||
| ble .Ldtrmm_kernel_L1_M2_BEGIN | |||
| dtrmm_kernel_L1_M4_20: | |||
| .Ldtrmm_kernel_L1_M4_20: | |||
| INIT4x1 | |||
| @@ -1737,10 +1737,10 @@ dtrmm_kernel_L1_M4_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L1_M4_40 | |||
| ble .Ldtrmm_kernel_L1_M4_40 | |||
| .align 5 | |||
| dtrmm_kernel_L1_M4_22: | |||
| .Ldtrmm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| @@ -1752,22 +1752,22 @@ dtrmm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M4_22 | |||
| bgt .Ldtrmm_kernel_L1_M4_22 | |||
| dtrmm_kernel_L1_M4_40: | |||
| .Ldtrmm_kernel_L1_M4_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L1_M4_100 | |||
| ble .Ldtrmm_kernel_L1_M4_100 | |||
| dtrmm_kernel_L1_M4_42: | |||
| .Ldtrmm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M4_42 | |||
| bgt .Ldtrmm_kernel_L1_M4_42 | |||
| dtrmm_kernel_L1_M4_100: | |||
| .Ldtrmm_kernel_L1_M4_100: | |||
| SAVE4x1 | |||
| @@ -1787,18 +1787,18 @@ dtrmm_kernel_L1_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| dtrmm_kernel_L1_M4_END: | |||
| .Ldtrmm_kernel_L1_M4_END: | |||
| dtrmm_kernel_L1_M2_BEGIN: | |||
| .Ldtrmm_kernel_L1_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble dtrmm_kernel_L1_END | |||
| ble .Ldtrmm_kernel_L1_END | |||
| tst counterI, #2 // test bit 1 of counterI (origM & 2); tst only sets flags, counterI is unchanged | |||
| ble dtrmm_kernel_L1_M1_BEGIN | |||
| ble .Ldtrmm_kernel_L1_M1_BEGIN | |||
| dtrmm_kernel_L1_M2_20: | |||
| .Ldtrmm_kernel_L1_M2_20: | |||
| INIT2x1 | |||
| @@ -1822,9 +1822,9 @@ dtrmm_kernel_L1_M2_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L1_M2_40 | |||
| ble .Ldtrmm_kernel_L1_M2_40 | |||
| dtrmm_kernel_L1_M2_22: | |||
| .Ldtrmm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| @@ -1837,22 +1837,22 @@ dtrmm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M2_22 | |||
| bgt .Ldtrmm_kernel_L1_M2_22 | |||
| dtrmm_kernel_L1_M2_40: | |||
| .Ldtrmm_kernel_L1_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L1_M2_100 | |||
| ble .Ldtrmm_kernel_L1_M2_100 | |||
| dtrmm_kernel_L1_M2_42: | |||
| .Ldtrmm_kernel_L1_M2_42: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M2_42 | |||
| bgt .Ldtrmm_kernel_L1_M2_42 | |||
| dtrmm_kernel_L1_M2_100: | |||
| .Ldtrmm_kernel_L1_M2_100: | |||
| SAVE2x1 | |||
| @@ -1872,15 +1872,15 @@ dtrmm_kernel_L1_M2_100: | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| dtrmm_kernel_L1_M2_END: | |||
| .Ldtrmm_kernel_L1_M2_END: | |||
| dtrmm_kernel_L1_M1_BEGIN: | |||
| .Ldtrmm_kernel_L1_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble dtrmm_kernel_L1_END | |||
| ble .Ldtrmm_kernel_L1_END | |||
| dtrmm_kernel_L1_M1_20: | |||
| .Ldtrmm_kernel_L1_M1_20: | |||
| INIT1x1 | |||
| @@ -1904,9 +1904,9 @@ dtrmm_kernel_L1_M1_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble dtrmm_kernel_L1_M1_40 | |||
| ble .Ldtrmm_kernel_L1_M1_40 | |||
| dtrmm_kernel_L1_M1_22: | |||
| .Ldtrmm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| @@ -1918,30 +1918,30 @@ dtrmm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M1_22 | |||
| bgt .Ldtrmm_kernel_L1_M1_22 | |||
| dtrmm_kernel_L1_M1_40: | |||
| .Ldtrmm_kernel_L1_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble dtrmm_kernel_L1_M1_100 | |||
| ble .Ldtrmm_kernel_L1_M1_100 | |||
| dtrmm_kernel_L1_M1_42: | |||
| .Ldtrmm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L1_M1_42 | |||
| bgt .Ldtrmm_kernel_L1_M1_42 | |||
| dtrmm_kernel_L1_M1_100: | |||
| .Ldtrmm_kernel_L1_M1_100: | |||
| SAVE1x1 | |||
| dtrmm_kernel_L1_END: | |||
| .Ldtrmm_kernel_L1_END: | |||
| dtrmm_kernel_L999: | |||
| .Ldtrmm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| @@ -203,18 +203,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| SAVE_REGS | |||
| cmp N, xzr | |||
| ble gemv_n_kernel_L999 | |||
| ble .Lgemv_n_kernel_L999 | |||
| cmp M, xzr | |||
| ble gemv_n_kernel_L999 | |||
| ble .Lgemv_n_kernel_L999 | |||
| lsl LDA, LDA, #SHZ | |||
| lsl INC_X, INC_X, #SHZ | |||
| mov J, N | |||
| cmp INC_Y, #1 | |||
| bne gemv_n_kernel_S_BEGIN | |||
| bne .Lgemv_n_kernel_S_BEGIN | |||
| gemv_n_kernel_F_LOOP: | |||
| .Lgemv_n_kernel_F_LOOP: | |||
| ld1 TEMPV, [X], INC_X | |||
| fmul TEMP, ALPHA, TEMP | |||
| @@ -229,57 +229,57 @@ gemv_n_kernel_F_LOOP: | |||
| mov Y_IPTR, Y | |||
| mov Y_OPTR, Y | |||
| gemv_n_kernel_F32: | |||
| .Lgemv_n_kernel_F32: | |||
| asr I, M, #5 | |||
| cmp I, xzr | |||
| beq gemv_n_kernel_F4 | |||
| beq .Lgemv_n_kernel_F4 | |||
| gemv_n_kernel_F320: | |||
| .Lgemv_n_kernel_F320: | |||
| KERNEL_F16 | |||
| KERNEL_F16 | |||
| subs I, I, #1 | |||
| bne gemv_n_kernel_F320 | |||
| bne .Lgemv_n_kernel_F320 | |||
| gemv_n_kernel_F4: | |||
| .Lgemv_n_kernel_F4: | |||
| ands I, M, #31 | |||
| asr I, I, #2 | |||
| cmp I, xzr | |||
| beq gemv_n_kernel_F1 | |||
| beq .Lgemv_n_kernel_F1 | |||
| gemv_n_kernel_F40: | |||
| .Lgemv_n_kernel_F40: | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne gemv_n_kernel_F40 | |||
| bne .Lgemv_n_kernel_F40 | |||
| gemv_n_kernel_F1: | |||
| .Lgemv_n_kernel_F1: | |||
| ands I, M, #3 | |||
| ble gemv_n_kernel_F_END | |||
| ble .Lgemv_n_kernel_F_END | |||
| gemv_n_kernel_F10: | |||
| .Lgemv_n_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne gemv_n_kernel_F10 | |||
| bne .Lgemv_n_kernel_F10 | |||
| gemv_n_kernel_F_END: | |||
| .Lgemv_n_kernel_F_END: | |||
| add A, A, LDA | |||
| subs J, J, #1 | |||
| bne gemv_n_kernel_F_LOOP | |||
| bne .Lgemv_n_kernel_F_LOOP | |||
| b gemv_n_kernel_L999 | |||
| b .Lgemv_n_kernel_L999 | |||
| gemv_n_kernel_S_BEGIN: | |||
| .Lgemv_n_kernel_S_BEGIN: | |||
| INIT_S | |||
| gemv_n_kernel_S_LOOP: | |||
| .Lgemv_n_kernel_S_LOOP: | |||
| ld1 TEMPV, [X], INC_X | |||
| fmul TEMP, ALPHA, TEMP | |||
| @@ -288,9 +288,9 @@ gemv_n_kernel_S_LOOP: | |||
| asr I, M, #2 | |||
| cmp I, xzr | |||
| ble gemv_n_kernel_S1 | |||
| ble .Lgemv_n_kernel_S1 | |||
| gemv_n_kernel_S4: | |||
| .Lgemv_n_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -298,27 +298,27 @@ gemv_n_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne gemv_n_kernel_S4 | |||
| bne .Lgemv_n_kernel_S4 | |||
| gemv_n_kernel_S1: | |||
| .Lgemv_n_kernel_S1: | |||
| ands I, M, #3 | |||
| ble gemv_n_kernel_S_END | |||
| ble .Lgemv_n_kernel_S_END | |||
| gemv_n_kernel_S10: | |||
| .Lgemv_n_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne gemv_n_kernel_S10 | |||
| bne .Lgemv_n_kernel_S10 | |||
| gemv_n_kernel_S_END: | |||
| .Lgemv_n_kernel_S_END: | |||
| add A, A, LDA | |||
| subs J, J, #1 | |||
| bne gemv_n_kernel_S_LOOP | |||
| bne .Lgemv_n_kernel_S_LOOP | |||
| gemv_n_kernel_L999: | |||
| .Lgemv_n_kernel_L999: | |||
| mov w0, wzr | |||
| @@ -233,18 +233,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| SAVE_REGS | |||
| cmp N, xzr | |||
| ble gemv_t_kernel_L999 | |||
| ble .Lgemv_t_kernel_L999 | |||
| cmp M, xzr | |||
| ble gemv_t_kernel_L999 | |||
| ble .Lgemv_t_kernel_L999 | |||
| lsl LDA, LDA, #SHZ | |||
| lsl INC_Y, INC_Y, #SHZ | |||
| mov J, N | |||
| cmp INC_X, #1 | |||
| bne gemv_t_kernel_S_BEGIN | |||
| bne .Lgemv_t_kernel_S_BEGIN | |||
| gemv_t_kernel_F_LOOP: | |||
| .Lgemv_t_kernel_F_LOOP: | |||
| fmov TEMP, REG0 | |||
| fmov TEMP1, REG0 | |||
| @@ -254,64 +254,64 @@ gemv_t_kernel_F_LOOP: | |||
| mov A_PTR, A | |||
| mov X_PTR, X | |||
| gemv_t_kernel_F32: | |||
| .Lgemv_t_kernel_F32: | |||
| asr I, M, #5 | |||
| cmp I, xzr | |||
| beq gemv_t_kernel_F4 | |||
| beq .Lgemv_t_kernel_F4 | |||
| gemv_t_kernel_F320: | |||
| .Lgemv_t_kernel_F320: | |||
| KERNEL_F32 | |||
| subs I, I, #1 | |||
| bne gemv_t_kernel_F320 | |||
| bne .Lgemv_t_kernel_F320 | |||
| KERNEL_F32_FINALIZE | |||
| gemv_t_kernel_F4: | |||
| .Lgemv_t_kernel_F4: | |||
| ands I, M, #31 | |||
| asr I, I, #2 | |||
| cmp I, xzr | |||
| beq gemv_t_kernel_F1 | |||
| beq .Lgemv_t_kernel_F1 | |||
| gemv_t_kernel_F40: | |||
| .Lgemv_t_kernel_F40: | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne gemv_t_kernel_F40 | |||
| bne .Lgemv_t_kernel_F40 | |||
| gemv_t_kernel_F1: | |||
| .Lgemv_t_kernel_F1: | |||
| KERNEL_F4_FINALIZE | |||
| ands I, M, #3 | |||
| ble gemv_t_kernel_F_END | |||
| ble .Lgemv_t_kernel_F_END | |||
| gemv_t_kernel_F10: | |||
| .Lgemv_t_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne gemv_t_kernel_F10 | |||
| bne .Lgemv_t_kernel_F10 | |||
| gemv_t_kernel_F_END: | |||
| .Lgemv_t_kernel_F_END: | |||
| ld1 TMPV1, [Y] | |||
| add A, A, LDA | |||
| subs J, J, #1 | |||
| fmadd TMP1, ALPHA, TEMP, TMP1 | |||
| st1 TMPV1, [Y], INC_Y | |||
| bne gemv_t_kernel_F_LOOP | |||
| bne .Lgemv_t_kernel_F_LOOP | |||
| b gemv_t_kernel_L999 | |||
| b .Lgemv_t_kernel_L999 | |||
| gemv_t_kernel_S_BEGIN: | |||
| .Lgemv_t_kernel_S_BEGIN: | |||
| INIT_S | |||
| gemv_t_kernel_S_LOOP: | |||
| .Lgemv_t_kernel_S_LOOP: | |||
| fmov TEMP, REG0 | |||
| mov A_PTR, A | |||
| @@ -319,9 +319,9 @@ gemv_t_kernel_S_LOOP: | |||
| asr I, M, #2 | |||
| cmp I, xzr | |||
| ble gemv_t_kernel_S1 | |||
| ble .Lgemv_t_kernel_S1 | |||
| gemv_t_kernel_S4: | |||
| .Lgemv_t_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -329,30 +329,30 @@ gemv_t_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne gemv_t_kernel_S4 | |||
| bne .Lgemv_t_kernel_S4 | |||
| gemv_t_kernel_S1: | |||
| .Lgemv_t_kernel_S1: | |||
| ands I, M, #3 | |||
| ble gemv_t_kernel_S_END | |||
| ble .Lgemv_t_kernel_S_END | |||
| gemv_t_kernel_S10: | |||
| .Lgemv_t_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne gemv_t_kernel_S10 | |||
| bne .Lgemv_t_kernel_S10 | |||
| gemv_t_kernel_S_END: | |||
| .Lgemv_t_kernel_S_END: | |||
| ld1 TMPV1, [Y] | |||
| add A, A, LDA | |||
| subs J, J, #1 | |||
| fmadd TMP1, ALPHA, TEMP, TMP1 | |||
| st1 TMPV1, [Y], INC_Y | |||
| bne gemv_t_kernel_S_LOOP | |||
| bne .Lgemv_t_kernel_S_LOOP | |||
| gemv_t_kernel_L999: | |||
| .Lgemv_t_kernel_L999: | |||
| RESTORE_REGS | |||
| @@ -230,62 +230,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| cmp N, xzr | |||
| ble iamax_kernel_zero | |||
| ble .Liamax_kernel_zero | |||
| cmp INC_X, xzr | |||
| ble iamax_kernel_zero | |||
| ble .Liamax_kernel_zero | |||
| cmp INC_X, #1 | |||
| bne iamax_kernel_S_BEGIN | |||
| bne .Liamax_kernel_S_BEGIN | |||
| mov x7, X | |||
| iamax_kernel_F_BEGIN: | |||
| .Liamax_kernel_F_BEGIN: | |||
| INIT_S | |||
| subs N, N, #1 | |||
| ble iamax_kernel_L999 | |||
| ble .Liamax_kernel_L999 | |||
| asr I, N, #3 | |||
| cmp I, xzr | |||
| beq iamax_kernel_F1 | |||
| beq .Liamax_kernel_F1 | |||
| add Z, Z, #1 | |||
| iamax_kernel_F8: | |||
| .Liamax_kernel_F8: | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne iamax_kernel_F8 | |||
| bne .Liamax_kernel_F8 | |||
| KERNEL_F8_FINALIZE | |||
| sub Z, Z, #1 | |||
| iamax_kernel_F1: | |||
| .Liamax_kernel_F1: | |||
| ands I, N, #7 | |||
| ble iamax_kernel_L999 | |||
| ble .Liamax_kernel_L999 | |||
| iamax_kernel_F10: | |||
| .Liamax_kernel_F10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne iamax_kernel_F10 | |||
| bne .Liamax_kernel_F10 | |||
| b iamax_kernel_L999 | |||
| b .Liamax_kernel_L999 | |||
| iamax_kernel_S_BEGIN: | |||
| .Liamax_kernel_S_BEGIN: | |||
| INIT_S | |||
| subs N, N, #1 | |||
| ble iamax_kernel_L999 | |||
| ble .Liamax_kernel_L999 | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble iamax_kernel_S1 | |||
| ble .Liamax_kernel_S1 | |||
| iamax_kernel_S4: | |||
| .Liamax_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -293,25 +293,25 @@ iamax_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne iamax_kernel_S4 | |||
| bne .Liamax_kernel_S4 | |||
| iamax_kernel_S1: | |||
| .Liamax_kernel_S1: | |||
| ands I, N, #3 | |||
| ble iamax_kernel_L999 | |||
| ble .Liamax_kernel_L999 | |||
| iamax_kernel_S10: | |||
| .Liamax_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne iamax_kernel_S10 | |||
| bne .Liamax_kernel_S10 | |||
| iamax_kernel_L999: | |||
| .Liamax_kernel_L999: | |||
| mov x0, INDEX | |||
| ret | |||
| iamax_kernel_zero: | |||
| .Liamax_kernel_zero: | |||
| mov x0, xzr | |||
| ret | |||
| @@ -276,64 +276,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| cmp N, xzr | |||
| ble iamax_kernel_zero | |||
| ble .Lizamax_kernel_zero | |||
| cmp INC_X, xzr | |||
| ble iamax_kernel_zero | |||
| ble .Lizamax_kernel_zero | |||
| cmp INC_X, #1 | |||
| bne iamax_kernel_S_BEGIN | |||
| bne .Lizamax_kernel_S_BEGIN | |||
| mov x7, X | |||
| iamax_kernel_F_BEGIN: | |||
| .Lizamax_kernel_F_BEGIN: | |||
| INIT_S | |||
| subs N, N, #1 | |||
| ble iamax_kernel_L999 | |||
| ble .Lizamax_kernel_L999 | |||
| asr I, N, #3 | |||
| cmp I, xzr | |||
| ble iamax_kernel_F1 | |||
| ble .Lizamax_kernel_F1 | |||
| add Z, Z, #1 | |||
| iamax_kernel_F8: | |||
| .Lizamax_kernel_F8: | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne iamax_kernel_F8 | |||
| bne .Lizamax_kernel_F8 | |||
| KERNEL_F8_FINALIZE | |||
| sub Z, Z, #1 | |||
| iamax_kernel_F1: | |||
| .Lizamax_kernel_F1: | |||
| ands I, N, #7 | |||
| ble iamax_kernel_L999 | |||
| ble .Lizamax_kernel_L999 | |||
| iamax_kernel_F10: | |||
| .Lizamax_kernel_F10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne iamax_kernel_F10 | |||
| bne .Lizamax_kernel_F10 | |||
| b iamax_kernel_L999 | |||
| b .Lizamax_kernel_L999 | |||
| iamax_kernel_S_BEGIN: | |||
| .Lizamax_kernel_S_BEGIN: | |||
| INIT_S | |||
| subs N, N, #1 | |||
| ble iamax_kernel_L999 | |||
| ble .Lizamax_kernel_L999 | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble iamax_kernel_S1 | |||
| ble .Lizamax_kernel_S1 | |||
| iamax_kernel_S4: | |||
| .Lizamax_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -341,26 +341,26 @@ iamax_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne iamax_kernel_S4 | |||
| bne .Lizamax_kernel_S4 | |||
| iamax_kernel_S1: | |||
| .Lizamax_kernel_S1: | |||
| ands I, N, #3 | |||
| ble iamax_kernel_L999 | |||
| ble .Lizamax_kernel_L999 | |||
| iamax_kernel_S10: | |||
| .Lizamax_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne iamax_kernel_S10 | |||
| bne .Lizamax_kernel_S10 | |||
| iamax_kernel_L999: | |||
| .Lizamax_kernel_L999: | |||
| mov x0, INDEX | |||
| ret | |||
| iamax_kernel_zero: | |||
| .Lizamax_kernel_zero: | |||
| mov x0, xzr | |||
| ret | |||
| @@ -162,44 +162,44 @@ KERNEL_S1_NEXT: | |||
| INIT | |||
| cmp N, #0 | |||
| ble nrm2_kernel_L999 | |||
| ble .Lnrm2_kernel_L999 | |||
| cmp INC_X, #0 | |||
| beq nrm2_kernel_L999 | |||
| beq .Lnrm2_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne nrm2_kernel_S_BEGIN | |||
| bne .Lnrm2_kernel_S_BEGIN | |||
| nrm2_kernel_F_BEGIN: | |||
| .Lnrm2_kernel_F_BEGIN: | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, xzr | |||
| ble nrm2_kernel_F1 | |||
| ble .Lnrm2_kernel_F1 | |||
| nrm2_kernel_F8: | |||
| .Lnrm2_kernel_F8: | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne nrm2_kernel_F8 | |||
| bne .Lnrm2_kernel_F8 | |||
| nrm2_kernel_F1: | |||
| .Lnrm2_kernel_F1: | |||
| ands I, N, #7 | |||
| ble nrm2_kernel_L999 | |||
| ble .Lnrm2_kernel_L999 | |||
| nrm2_kernel_F10: | |||
| .Lnrm2_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne nrm2_kernel_F10 | |||
| bne .Lnrm2_kernel_F10 | |||
| b nrm2_kernel_L999 | |||
| b .Lnrm2_kernel_L999 | |||
| nrm2_kernel_S_BEGIN: | |||
| .Lnrm2_kernel_S_BEGIN: | |||
| INIT_S | |||
| @@ -207,15 +207,15 @@ nrm2_kernel_S_BEGIN: | |||
| .align 5 | |||
| nrm2_kernel_S10: | |||
| .Lnrm2_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne nrm2_kernel_S10 | |||
| bne .Lnrm2_kernel_S10 | |||
| nrm2_kernel_L999: | |||
| .Lnrm2_kernel_L999: | |||
| fsqrt SSQ, SSQ | |||
| fmul SSQ, SCALE, SSQ | |||
| @@ -165,48 +165,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| cmp N, xzr | |||
| ble rot_kernel_L999 | |||
| ble .Lrot_kernel_L999 | |||
| INIT | |||
| cmp INC_X, #1 | |||
| bne rot_kernel_S_BEGIN | |||
| bne .Lrot_kernel_S_BEGIN | |||
| cmp INC_Y, #1 | |||
| bne rot_kernel_S_BEGIN | |||
| bne .Lrot_kernel_S_BEGIN | |||
| rot_kernel_F_BEGIN: | |||
| .Lrot_kernel_F_BEGIN: | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| beq rot_kernel_F1 | |||
| beq .Lrot_kernel_F1 | |||
| KERNEL_INIT_F4 | |||
| rot_kernel_F4: | |||
| .Lrot_kernel_F4: | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne rot_kernel_F4 | |||
| bne .Lrot_kernel_F4 | |||
| rot_kernel_F1: | |||
| .Lrot_kernel_F1: | |||
| ands I, N, #3 | |||
| ble rot_kernel_L999 | |||
| ble .Lrot_kernel_L999 | |||
| INIT_F1 | |||
| rot_kernel_F10: | |||
| .Lrot_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne rot_kernel_F10 | |||
| bne .Lrot_kernel_F10 | |||
| mov w0, wzr | |||
| ret | |||
| rot_kernel_S_BEGIN: | |||
| .Lrot_kernel_S_BEGIN: | |||
| INIT_S | |||
| INIT_F1 | |||
| @@ -214,9 +214,9 @@ rot_kernel_S_BEGIN: | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble rot_kernel_S1 | |||
| ble .Lrot_kernel_S1 | |||
| rot_kernel_S4: | |||
| .Lrot_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -224,22 +224,22 @@ rot_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne rot_kernel_S4 | |||
| bne .Lrot_kernel_S4 | |||
| rot_kernel_S1: | |||
| .Lrot_kernel_S1: | |||
| ands I, N, #3 | |||
| ble rot_kernel_L999 | |||
| ble .Lrot_kernel_L999 | |||
| rot_kernel_S10: | |||
| .Lrot_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne rot_kernel_S10 | |||
| bne .Lrot_kernel_S10 | |||
| rot_kernel_L999: | |||
| .Lrot_kernel_L999: | |||
| mov w0, wzr | |||
| ret | |||
| @@ -166,86 +166,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| cmp N, xzr | |||
| ble scal_kernel_L999 | |||
| ble .Lscal_kernel_L999 | |||
| fcmp DA, #0.0 | |||
| beq scal_kernel_zero | |||
| beq .Lscal_kernel_zero | |||
| cmp INC_X, #1 | |||
| bne scal_kernel_S_BEGIN | |||
| bne .Lscal_kernel_S_BEGIN | |||
| scal_kernel_F_BEGIN: | |||
| .Lscal_kernel_F_BEGIN: | |||
| asr I, N, #3 | |||
| cmp I, xzr | |||
| beq scal_kernel_F1 | |||
| beq .Lscal_kernel_F1 | |||
| KERNEL_INIT_F8 | |||
| scal_kernel_F8: | |||
| .Lscal_kernel_F8: | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne scal_kernel_F8 | |||
| bne .Lscal_kernel_F8 | |||
| scal_kernel_F1: | |||
| .Lscal_kernel_F1: | |||
| ands I, N, #7 | |||
| ble scal_kernel_L999 | |||
| ble .Lscal_kernel_L999 | |||
| scal_kernel_F10: | |||
| .Lscal_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne scal_kernel_F10 | |||
| bne .Lscal_kernel_F10 | |||
| mov w0, wzr | |||
| ret | |||
| scal_kernel_S_BEGIN: | |||
| .Lscal_kernel_S_BEGIN: | |||
| INIT_S | |||
| mov X_COPY, X | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble scal_kernel_S1 | |||
| ble .Lscal_kernel_S1 | |||
| scal_kernel_S4: | |||
| .Lscal_kernel_S4: | |||
| KERNEL_S4 | |||
| subs I, I, #1 | |||
| bne scal_kernel_S4 | |||
| bne .Lscal_kernel_S4 | |||
| scal_kernel_S1: | |||
| .Lscal_kernel_S1: | |||
| ands I, N, #3 | |||
| ble scal_kernel_L999 | |||
| ble .Lscal_kernel_L999 | |||
| scal_kernel_S10: | |||
| .Lscal_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne scal_kernel_S10 | |||
| bne .Lscal_kernel_S10 | |||
| scal_kernel_L999: | |||
| .Lscal_kernel_L999: | |||
| mov w0, wzr | |||
| ret | |||
| scal_kernel_zero: | |||
| .Lscal_kernel_zero: | |||
| INIT_S | |||
| scal_kernel_Z1: | |||
| .Lscal_kernel_Z1: | |||
| st1 DAV, [X], INC_X | |||
| subs N, N, #1 | |||
| bne scal_kernel_Z1 | |||
| bne .Lscal_kernel_Z1 | |||
| mov w0, wzr | |||
| ret | |||
| @@ -892,11 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #2 // J = J / 4 | |||
| cmp counterJ, #0 | |||
| ble sgemm_kernel_L2_BEGIN | |||
| ble .Lsgemm_kernel_L2_BEGIN | |||
| /******************************************************************************/ | |||
| sgemm_kernel_L4_BEGIN: | |||
| .Lsgemm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #2 | |||
| @@ -906,73 +906,73 @@ sgemm_kernel_L4_BEGIN: | |||
| add pA_2, temp, pA_1 | |||
| add pA_3, temp, pA_2 | |||
| sgemm_kernel_L4_M16_BEGIN: | |||
| .Lsgemm_kernel_L4_M16_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #4 // counterI = counterI / 16 | |||
| cmp counterI, #0 | |||
| ble sgemm_kernel_L4_M8_BEGIN | |||
| ble .Lsgemm_kernel_L4_M8_BEGIN | |||
| sgemm_kernel_L4_M16_20: | |||
| .Lsgemm_kernel_L4_M16_20: | |||
| mov pB, origPB | |||
| asr counterL , origK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| blt sgemm_kernel_L4_M16_32 | |||
| blt .Lsgemm_kernel_L4_M16_32 | |||
| KERNEL16x4_I // do one in the K | |||
| KERNEL16x4_M2 // do another in the K | |||
| subs counterL, counterL, #2 | |||
| ble sgemm_kernel_L4_M16_22a | |||
| ble .Lsgemm_kernel_L4_M16_22a | |||
| .align 5 | |||
| sgemm_kernel_L4_M16_22: | |||
| .Lsgemm_kernel_L4_M16_22: | |||
| KERNEL16x4_M1 | |||
| KERNEL16x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L4_M16_22 | |||
| bgt .Lsgemm_kernel_L4_M16_22 | |||
| sgemm_kernel_L4_M16_22a: | |||
| .Lsgemm_kernel_L4_M16_22a: | |||
| KERNEL16x4_M1 | |||
| KERNEL16x4_E | |||
| b sgemm_kernel_L4_M16_44 | |||
| b .Lsgemm_kernel_L4_M16_44 | |||
| sgemm_kernel_L4_M16_32: | |||
| .Lsgemm_kernel_L4_M16_32: | |||
| tst counterL, #1 | |||
| ble sgemm_kernel_L4_M16_40 | |||
| ble .Lsgemm_kernel_L4_M16_40 | |||
| KERNEL16x4_I | |||
| KERNEL16x4_E | |||
| b sgemm_kernel_L4_M16_44 | |||
| b .Lsgemm_kernel_L4_M16_44 | |||
| sgemm_kernel_L4_M16_40: | |||
| .Lsgemm_kernel_L4_M16_40: | |||
| INIT16x4 | |||
| sgemm_kernel_L4_M16_44: | |||
| .Lsgemm_kernel_L4_M16_44: | |||
| ands counterL , origK, #1 | |||
| ble sgemm_kernel_L4_M16_100 | |||
| ble .Lsgemm_kernel_L4_M16_100 | |||
| sgemm_kernel_L4_M16_46: | |||
| .Lsgemm_kernel_L4_M16_46: | |||
| KERNEL16x4_SUB | |||
| sgemm_kernel_L4_M16_100: | |||
| .Lsgemm_kernel_L4_M16_100: | |||
| SAVE16x4 | |||
| sgemm_kernel_L4_M16_END: | |||
| .Lsgemm_kernel_L4_M16_END: | |||
| lsl temp, origK, #4 // k * 4 * 4 = Four rows of A | |||
| add pA_0, pA_0, temp | |||
| add pA_0, pA_0, temp | |||
| @@ -981,26 +981,26 @@ sgemm_kernel_L4_M16_END: | |||
| add pA_2, pA_1, temp | |||
| add pA_3, pA_2, temp | |||
| subs counterI, counterI, #1 | |||
| bne sgemm_kernel_L4_M16_20 | |||
| bne .Lsgemm_kernel_L4_M16_20 | |||
| sgemm_kernel_L4_M8_BEGIN: | |||
| .Lsgemm_kernel_L4_M8_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #15 | |||
| ble sgemm_kernel_L4_END | |||
| ble .Lsgemm_kernel_L4_END | |||
| tst counterI, #8 | |||
| ble sgemm_kernel_L4_M4_BEGIN | |||
| ble .Lsgemm_kernel_L4_M4_BEGIN | |||
| sgemm_kernel_L4_M8_20: | |||
| .Lsgemm_kernel_L4_M8_20: | |||
| INIT8x4 | |||
| mov pB, origPB | |||
| asr counterL, origK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble sgemm_kernel_L4_M8_40 | |||
| ble .Lsgemm_kernel_L4_M8_40 | |||
| sgemm_kernel_L4_M8_22: | |||
| .Lsgemm_kernel_L4_M8_22: | |||
| KERNEL8x4_SUB | |||
| KERNEL8x4_SUB | |||
| @@ -1013,47 +1013,47 @@ sgemm_kernel_L4_M8_22: | |||
| KERNEL8x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L4_M8_22 | |||
| bgt .Lsgemm_kernel_L4_M8_22 | |||
| sgemm_kernel_L4_M8_40: | |||
| .Lsgemm_kernel_L4_M8_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble sgemm_kernel_L4_M8_100 | |||
| ble .Lsgemm_kernel_L4_M8_100 | |||
| sgemm_kernel_L4_M8_42: | |||
| .Lsgemm_kernel_L4_M8_42: | |||
| KERNEL8x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L4_M8_42 | |||
| bgt .Lsgemm_kernel_L4_M8_42 | |||
| sgemm_kernel_L4_M8_100: | |||
| .Lsgemm_kernel_L4_M8_100: | |||
| SAVE8x4 | |||
| sgemm_kernel_L4_M8_END: | |||
| .Lsgemm_kernel_L4_M8_END: | |||
| lsl temp, origK, #4 // k * 4 * 4 | |||
| add pA_0, pA_0, temp | |||
| sgemm_kernel_L4_M4_BEGIN: | |||
| .Lsgemm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #7 | |||
| ble sgemm_kernel_L4_END | |||
| ble .Lsgemm_kernel_L4_END | |||
| tst counterI, #4 | |||
| ble sgemm_kernel_L4_M2_BEGIN | |||
| ble .Lsgemm_kernel_L4_M2_BEGIN | |||
| sgemm_kernel_L4_M4_20: | |||
| .Lsgemm_kernel_L4_M4_20: | |||
| INIT4x4 | |||
| mov pB, origPB | |||
| asr counterL, origK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble sgemm_kernel_L4_M4_40 | |||
| ble .Lsgemm_kernel_L4_M4_40 | |||
| sgemm_kernel_L4_M4_22: | |||
| .Lsgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_SUB | |||
| KERNEL4x4_SUB | |||
| @@ -1066,47 +1066,47 @@ sgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L4_M4_22 | |||
| bgt .Lsgemm_kernel_L4_M4_22 | |||
| sgemm_kernel_L4_M4_40: | |||
| .Lsgemm_kernel_L4_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble sgemm_kernel_L4_M4_100 | |||
| ble .Lsgemm_kernel_L4_M4_100 | |||
| sgemm_kernel_L4_M4_42: | |||
| .Lsgemm_kernel_L4_M4_42: | |||
| KERNEL4x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L4_M4_42 | |||
| bgt .Lsgemm_kernel_L4_M4_42 | |||
| sgemm_kernel_L4_M4_100: | |||
| .Lsgemm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| sgemm_kernel_L4_M4_END: | |||
| .Lsgemm_kernel_L4_M4_END: | |||
| sgemm_kernel_L4_M2_BEGIN: | |||
| .Lsgemm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble sgemm_kernel_L4_END | |||
| ble .Lsgemm_kernel_L4_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble sgemm_kernel_L4_M1_BEGIN | |||
| ble .Lsgemm_kernel_L4_M1_BEGIN | |||
| sgemm_kernel_L4_M2_20: | |||
| .Lsgemm_kernel_L4_M2_20: | |||
| INIT2x4 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble sgemm_kernel_L4_M2_40 | |||
| ble .Lsgemm_kernel_L4_M2_40 | |||
| sgemm_kernel_L4_M2_22: | |||
| .Lsgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| KERNEL2x4_SUB | |||
| @@ -1119,43 +1119,43 @@ sgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L4_M2_22 | |||
| bgt .Lsgemm_kernel_L4_M2_22 | |||
| sgemm_kernel_L4_M2_40: | |||
| .Lsgemm_kernel_L4_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble sgemm_kernel_L4_M2_100 | |||
| ble .Lsgemm_kernel_L4_M2_100 | |||
| sgemm_kernel_L4_M2_42: | |||
| .Lsgemm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L4_M2_42 | |||
| bgt .Lsgemm_kernel_L4_M2_42 | |||
| sgemm_kernel_L4_M2_100: | |||
| .Lsgemm_kernel_L4_M2_100: | |||
| SAVE2x4 | |||
| sgemm_kernel_L4_M2_END: | |||
| .Lsgemm_kernel_L4_M2_END: | |||
| sgemm_kernel_L4_M1_BEGIN: | |||
| .Lsgemm_kernel_L4_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble sgemm_kernel_L4_END | |||
| ble .Lsgemm_kernel_L4_END | |||
| sgemm_kernel_L4_M1_20: | |||
| .Lsgemm_kernel_L4_M1_20: | |||
| INIT1x4 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble sgemm_kernel_L4_M1_40 | |||
| ble .Lsgemm_kernel_L4_M1_40 | |||
| sgemm_kernel_L4_M1_22: | |||
| .Lsgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| @@ -1167,45 +1167,45 @@ sgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L4_M1_22 | |||
| bgt .Lsgemm_kernel_L4_M1_22 | |||
| sgemm_kernel_L4_M1_40: | |||
| .Lsgemm_kernel_L4_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble sgemm_kernel_L4_M1_100 | |||
| ble .Lsgemm_kernel_L4_M1_100 | |||
| sgemm_kernel_L4_M1_42: | |||
| .Lsgemm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L4_M1_42 | |||
| bgt .Lsgemm_kernel_L4_M1_42 | |||
| sgemm_kernel_L4_M1_100: | |||
| .Lsgemm_kernel_L4_M1_100: | |||
| SAVE1x4 | |||
| sgemm_kernel_L4_END: | |||
| .Lsgemm_kernel_L4_END: | |||
| lsl temp, origK, #4 | |||
| add origPB, origPB, temp // B = B + K * 4 * 4 | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt sgemm_kernel_L4_BEGIN | |||
| bgt .Lsgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| sgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| .Lsgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble sgemm_kernel_L999 | |||
| ble .Lsgemm_kernel_L999 | |||
| tst counterJ , #2 | |||
| ble sgemm_kernel_L1_BEGIN | |||
| ble .Lsgemm_kernel_L1_BEGIN | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| @@ -1215,24 +1215,24 @@ sgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| sgemm_kernel_L2_M4_BEGIN: | |||
| .Lsgemm_kernel_L2_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI,#0 | |||
| ble sgemm_kernel_L2_M2_BEGIN | |||
| ble .Lsgemm_kernel_L2_M2_BEGIN | |||
| sgemm_kernel_L2_M4_20: | |||
| .Lsgemm_kernel_L2_M4_20: | |||
| INIT4x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble sgemm_kernel_L2_M4_40 | |||
| ble .Lsgemm_kernel_L2_M4_40 | |||
| .align 5 | |||
| sgemm_kernel_L2_M4_22: | |||
| .Lsgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| @@ -1244,50 +1244,50 @@ sgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L2_M4_22 | |||
| bgt .Lsgemm_kernel_L2_M4_22 | |||
| sgemm_kernel_L2_M4_40: | |||
| .Lsgemm_kernel_L2_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble sgemm_kernel_L2_M4_100 | |||
| ble .Lsgemm_kernel_L2_M4_100 | |||
| sgemm_kernel_L2_M4_42: | |||
| .Lsgemm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L2_M4_42 | |||
| bgt .Lsgemm_kernel_L2_M4_42 | |||
| sgemm_kernel_L2_M4_100: | |||
| .Lsgemm_kernel_L2_M4_100: | |||
| SAVE4x2 | |||
| sgemm_kernel_L2_M4_END: | |||
| .Lsgemm_kernel_L2_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt sgemm_kernel_L2_M4_20 | |||
| bgt .Lsgemm_kernel_L2_M4_20 | |||
| sgemm_kernel_L2_M2_BEGIN: | |||
| .Lsgemm_kernel_L2_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble sgemm_kernel_L2_END | |||
| ble .Lsgemm_kernel_L2_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble sgemm_kernel_L2_M1_BEGIN | |||
| ble .Lsgemm_kernel_L2_M1_BEGIN | |||
| sgemm_kernel_L2_M2_20: | |||
| .Lsgemm_kernel_L2_M2_20: | |||
| INIT2x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble sgemm_kernel_L2_M2_40 | |||
| ble .Lsgemm_kernel_L2_M2_40 | |||
| sgemm_kernel_L2_M2_22: | |||
| .Lsgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| @@ -1300,43 +1300,43 @@ sgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L2_M2_22 | |||
| bgt .Lsgemm_kernel_L2_M2_22 | |||
| sgemm_kernel_L2_M2_40: | |||
| .Lsgemm_kernel_L2_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble sgemm_kernel_L2_M2_100 | |||
| ble .Lsgemm_kernel_L2_M2_100 | |||
| sgemm_kernel_L2_M2_42: | |||
| .Lsgemm_kernel_L2_M2_42: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L2_M2_42 | |||
| bgt .Lsgemm_kernel_L2_M2_42 | |||
| sgemm_kernel_L2_M2_100: | |||
| .Lsgemm_kernel_L2_M2_100: | |||
| SAVE2x2 | |||
| sgemm_kernel_L2_M2_END: | |||
| .Lsgemm_kernel_L2_M2_END: | |||
| sgemm_kernel_L2_M1_BEGIN: | |||
| .Lsgemm_kernel_L2_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble sgemm_kernel_L2_END | |||
| ble .Lsgemm_kernel_L2_END | |||
| sgemm_kernel_L2_M1_20: | |||
| .Lsgemm_kernel_L2_M1_20: | |||
| INIT1x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble sgemm_kernel_L2_M1_40 | |||
| ble .Lsgemm_kernel_L2_M1_40 | |||
| sgemm_kernel_L2_M1_22: | |||
| .Lsgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| @@ -1348,36 +1348,36 @@ sgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L2_M1_22 | |||
| bgt .Lsgemm_kernel_L2_M1_22 | |||
| sgemm_kernel_L2_M1_40: | |||
| .Lsgemm_kernel_L2_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble sgemm_kernel_L2_M1_100 | |||
| ble .Lsgemm_kernel_L2_M1_100 | |||
| sgemm_kernel_L2_M1_42: | |||
| .Lsgemm_kernel_L2_M1_42: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L2_M1_42 | |||
| bgt .Lsgemm_kernel_L2_M1_42 | |||
| sgemm_kernel_L2_M1_100: | |||
| .Lsgemm_kernel_L2_M1_100: | |||
| SAVE1x2 | |||
| sgemm_kernel_L2_END: | |||
| .Lsgemm_kernel_L2_END: | |||
| add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 | |||
| /******************************************************************************/ | |||
| sgemm_kernel_L1_BEGIN: | |||
| .Lsgemm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble sgemm_kernel_L999 // done | |||
| ble .Lsgemm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| @@ -1387,24 +1387,24 @@ sgemm_kernel_L1_BEGIN: | |||
| sgemm_kernel_L1_M4_BEGIN: | |||
| .Lsgemm_kernel_L1_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble sgemm_kernel_L1_M2_BEGIN | |||
| ble .Lsgemm_kernel_L1_M2_BEGIN | |||
| sgemm_kernel_L1_M4_20: | |||
| .Lsgemm_kernel_L1_M4_20: | |||
| INIT4x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble sgemm_kernel_L1_M4_40 | |||
| ble .Lsgemm_kernel_L1_M4_40 | |||
| .align 5 | |||
| sgemm_kernel_L1_M4_22: | |||
| .Lsgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| @@ -1416,50 +1416,50 @@ sgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L1_M4_22 | |||
| bgt .Lsgemm_kernel_L1_M4_22 | |||
| sgemm_kernel_L1_M4_40: | |||
| .Lsgemm_kernel_L1_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble sgemm_kernel_L1_M4_100 | |||
| ble .Lsgemm_kernel_L1_M4_100 | |||
| sgemm_kernel_L1_M4_42: | |||
| .Lsgemm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L1_M4_42 | |||
| bgt .Lsgemm_kernel_L1_M4_42 | |||
| sgemm_kernel_L1_M4_100: | |||
| .Lsgemm_kernel_L1_M4_100: | |||
| SAVE4x1 | |||
| sgemm_kernel_L1_M4_END: | |||
| .Lsgemm_kernel_L1_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt sgemm_kernel_L1_M4_20 | |||
| bgt .Lsgemm_kernel_L1_M4_20 | |||
| sgemm_kernel_L1_M2_BEGIN: | |||
| .Lsgemm_kernel_L1_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble sgemm_kernel_L1_END | |||
| ble .Lsgemm_kernel_L1_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble sgemm_kernel_L1_M1_BEGIN | |||
| ble .Lsgemm_kernel_L1_M1_BEGIN | |||
| sgemm_kernel_L1_M2_20: | |||
| .Lsgemm_kernel_L1_M2_20: | |||
| INIT2x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble sgemm_kernel_L1_M2_40 | |||
| ble .Lsgemm_kernel_L1_M2_40 | |||
| sgemm_kernel_L1_M2_22: | |||
| .Lsgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| @@ -1472,43 +1472,43 @@ sgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L1_M2_22 | |||
| bgt .Lsgemm_kernel_L1_M2_22 | |||
| sgemm_kernel_L1_M2_40: | |||
| .Lsgemm_kernel_L1_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble sgemm_kernel_L1_M2_100 | |||
| ble .Lsgemm_kernel_L1_M2_100 | |||
| sgemm_kernel_L1_M2_42: | |||
| .Lsgemm_kernel_L1_M2_42: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L1_M2_42 | |||
| bgt .Lsgemm_kernel_L1_M2_42 | |||
| sgemm_kernel_L1_M2_100: | |||
| .Lsgemm_kernel_L1_M2_100: | |||
| SAVE2x1 | |||
| sgemm_kernel_L1_M2_END: | |||
| .Lsgemm_kernel_L1_M2_END: | |||
| sgemm_kernel_L1_M1_BEGIN: | |||
| .Lsgemm_kernel_L1_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble sgemm_kernel_L1_END | |||
| ble .Lsgemm_kernel_L1_END | |||
| sgemm_kernel_L1_M1_20: | |||
| .Lsgemm_kernel_L1_M1_20: | |||
| INIT1x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble sgemm_kernel_L1_M1_40 | |||
| ble .Lsgemm_kernel_L1_M1_40 | |||
| sgemm_kernel_L1_M1_22: | |||
| .Lsgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| @@ -1520,30 +1520,30 @@ sgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L1_M1_22 | |||
| bgt .Lsgemm_kernel_L1_M1_22 | |||
| sgemm_kernel_L1_M1_40: | |||
| .Lsgemm_kernel_L1_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble sgemm_kernel_L1_M1_100 | |||
| ble .Lsgemm_kernel_L1_M1_100 | |||
| sgemm_kernel_L1_M1_42: | |||
| .Lsgemm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt sgemm_kernel_L1_M1_42 | |||
| bgt .Lsgemm_kernel_L1_M1_42 | |||
| sgemm_kernel_L1_M1_100: | |||
| .Lsgemm_kernel_L1_M1_100: | |||
| SAVE1x1 | |||
| sgemm_kernel_L1_END: | |||
| .Lsgemm_kernel_L1_END: | |||
| sgemm_kernel_L999: | |||
| .Lsgemm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| @@ -507,7 +507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| strmm_kernel_begin: | |||
| .Lstrmm_kernel_begin: | |||
| .align 5 | |||
| add sp, sp, #-(11 * 16) | |||
| @@ -539,11 +539,11 @@ strmm_kernel_begin: | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #2 // J = J / 4 | |||
| cmp counterJ, #0 | |||
| ble strmm_kernel_L2_BEGIN | |||
| ble .Lstrmm_kernel_L2_BEGIN | |||
| /******************************************************************************/ | |||
| strmm_kernel_L4_BEGIN: | |||
| .Lstrmm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #2 | |||
| @@ -553,14 +553,14 @@ strmm_kernel_L4_BEGIN: | |||
| mov pA, origPA // pA = start of A array | |||
| strmm_kernel_L4_M4_BEGIN: | |||
| .Lstrmm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble strmm_kernel_L4_M2_BEGIN | |||
| ble .Lstrmm_kernel_L4_M2_BEGIN | |||
| strmm_kernel_L4_M4_20: | |||
| .Lstrmm_kernel_L4_M4_20: | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| mov pB, origPB | |||
| @@ -581,54 +581,54 @@ strmm_kernel_L4_M4_20: | |||
| asr counterL , tempK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| blt strmm_kernel_L4_M4_32 | |||
| blt .Lstrmm_kernel_L4_M4_32 | |||
| KERNEL4x4_I // do one in the K | |||
| KERNEL4x4_M2 // do another in the K | |||
| subs counterL, counterL, #2 | |||
| ble strmm_kernel_L4_M4_22a | |||
| ble .Lstrmm_kernel_L4_M4_22a | |||
| .align 5 | |||
| strmm_kernel_L4_M4_22: | |||
| .Lstrmm_kernel_L4_M4_22: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt strmm_kernel_L4_M4_22 | |||
| bgt .Lstrmm_kernel_L4_M4_22 | |||
| strmm_kernel_L4_M4_22a: | |||
| .Lstrmm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b strmm_kernel_L4_M4_44 | |||
| b .Lstrmm_kernel_L4_M4_44 | |||
| strmm_kernel_L4_M4_32: | |||
| .Lstrmm_kernel_L4_M4_32: | |||
| tst counterL, #1 | |||
| ble strmm_kernel_L4_M4_40 | |||
| ble .Lstrmm_kernel_L4_M4_40 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_E | |||
| b strmm_kernel_L4_M4_44 | |||
| b .Lstrmm_kernel_L4_M4_44 | |||
| strmm_kernel_L4_M4_40: | |||
| .Lstrmm_kernel_L4_M4_40: | |||
| INIT4x4 | |||
| strmm_kernel_L4_M4_44: | |||
| .Lstrmm_kernel_L4_M4_44: | |||
| ands counterL , tempK, #1 | |||
| ble strmm_kernel_L4_M4_100 | |||
| ble .Lstrmm_kernel_L4_M4_100 | |||
| strmm_kernel_L4_M4_46: | |||
| .Lstrmm_kernel_L4_M4_46: | |||
| KERNEL4x4_SUB | |||
| strmm_kernel_L4_M4_100: | |||
| .Lstrmm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| @@ -647,20 +647,20 @@ strmm_kernel_L4_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| strmm_kernel_L4_M4_END: | |||
| .Lstrmm_kernel_L4_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bne strmm_kernel_L4_M4_20 | |||
| bne .Lstrmm_kernel_L4_M4_20 | |||
| strmm_kernel_L4_M2_BEGIN: | |||
| .Lstrmm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble strmm_kernel_L4_END | |||
| ble .Lstrmm_kernel_L4_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble strmm_kernel_L4_M1_BEGIN | |||
| ble .Lstrmm_kernel_L4_M1_BEGIN | |||
| strmm_kernel_L4_M2_20: | |||
| .Lstrmm_kernel_L4_M2_20: | |||
| INIT2x4 | |||
| @@ -684,9 +684,9 @@ strmm_kernel_L4_M2_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble strmm_kernel_L4_M2_40 | |||
| ble .Lstrmm_kernel_L4_M2_40 | |||
| strmm_kernel_L4_M2_22: | |||
| .Lstrmm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| KERNEL2x4_SUB | |||
| @@ -699,22 +699,22 @@ strmm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt strmm_kernel_L4_M2_22 | |||
| bgt .Lstrmm_kernel_L4_M2_22 | |||
| strmm_kernel_L4_M2_40: | |||
| .Lstrmm_kernel_L4_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble strmm_kernel_L4_M2_100 | |||
| ble .Lstrmm_kernel_L4_M2_100 | |||
| strmm_kernel_L4_M2_42: | |||
| .Lstrmm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt strmm_kernel_L4_M2_42 | |||
| bgt .Lstrmm_kernel_L4_M2_42 | |||
| strmm_kernel_L4_M2_100: | |||
| .Lstrmm_kernel_L4_M2_100: | |||
| SAVE2x4 | |||
| @@ -735,15 +735,15 @@ strmm_kernel_L4_M2_100: | |||
| #endif | |||
| strmm_kernel_L4_M2_END: | |||
| .Lstrmm_kernel_L4_M2_END: | |||
| strmm_kernel_L4_M1_BEGIN: | |||
| .Lstrmm_kernel_L4_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble strmm_kernel_L4_END | |||
| ble .Lstrmm_kernel_L4_END | |||
| strmm_kernel_L4_M1_20: | |||
| .Lstrmm_kernel_L4_M1_20: | |||
| INIT1x4 | |||
| @@ -767,9 +767,9 @@ strmm_kernel_L4_M1_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble strmm_kernel_L4_M1_40 | |||
| ble .Lstrmm_kernel_L4_M1_40 | |||
| strmm_kernel_L4_M1_22: | |||
| .Lstrmm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| @@ -781,22 +781,22 @@ strmm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt strmm_kernel_L4_M1_22 | |||
| bgt .Lstrmm_kernel_L4_M1_22 | |||
| strmm_kernel_L4_M1_40: | |||
| .Lstrmm_kernel_L4_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble strmm_kernel_L4_M1_100 | |||
| ble .Lstrmm_kernel_L4_M1_100 | |||
| strmm_kernel_L4_M1_42: | |||
| .Lstrmm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt strmm_kernel_L4_M1_42 | |||
| bgt .Lstrmm_kernel_L4_M1_42 | |||
| strmm_kernel_L4_M1_100: | |||
| .Lstrmm_kernel_L4_M1_100: | |||
| SAVE1x4 | |||
| @@ -817,7 +817,7 @@ strmm_kernel_L4_M1_100: | |||
| #endif | |||
| strmm_kernel_L4_END: | |||
| .Lstrmm_kernel_L4_END: | |||
| add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 | |||
| #if !defined(LEFT) | |||
| @@ -825,19 +825,19 @@ strmm_kernel_L4_END: | |||
| #endif | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt strmm_kernel_L4_BEGIN | |||
| bgt .Lstrmm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| strmm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| .Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble strmm_kernel_L999 | |||
| ble .Lstrmm_kernel_L999 | |||
| tst counterJ , #2 | |||
| ble strmm_kernel_L1_BEGIN | |||
| ble .Lstrmm_kernel_L1_BEGIN | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| @@ -849,14 +849,14 @@ strmm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov pA, origPA // pA = A | |||
| strmm_kernel_L2_M4_BEGIN: | |||
| .Lstrmm_kernel_L2_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI,#0 | |||
| ble strmm_kernel_L2_M2_BEGIN | |||
| ble .Lstrmm_kernel_L2_M2_BEGIN | |||
| strmm_kernel_L2_M4_20: | |||
| .Lstrmm_kernel_L2_M4_20: | |||
| INIT4x2 | |||
| @@ -880,10 +880,10 @@ strmm_kernel_L2_M4_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble strmm_kernel_L2_M4_40 | |||
| ble .Lstrmm_kernel_L2_M4_40 | |||
| .align 5 | |||
| strmm_kernel_L2_M4_22: | |||
| .Lstrmm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| @@ -895,22 +895,22 @@ strmm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt strmm_kernel_L2_M4_22 | |||
| bgt .Lstrmm_kernel_L2_M4_22 | |||
| strmm_kernel_L2_M4_40: | |||
| .Lstrmm_kernel_L2_M4_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble strmm_kernel_L2_M4_100 | |||
| ble .Lstrmm_kernel_L2_M4_100 | |||
| strmm_kernel_L2_M4_42: | |||
| .Lstrmm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt strmm_kernel_L2_M4_42 | |||
| bgt .Lstrmm_kernel_L2_M4_42 | |||
| strmm_kernel_L2_M4_100: | |||
| .Lstrmm_kernel_L2_M4_100: | |||
| SAVE4x2 | |||
| @@ -930,22 +930,22 @@ strmm_kernel_L2_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| strmm_kernel_L2_M4_END: | |||
| .Lstrmm_kernel_L2_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt strmm_kernel_L2_M4_20 | |||
| bgt .Lstrmm_kernel_L2_M4_20 | |||
| strmm_kernel_L2_M2_BEGIN: | |||
| .Lstrmm_kernel_L2_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble strmm_kernel_L2_END | |||
| ble .Lstrmm_kernel_L2_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble strmm_kernel_L2_M1_BEGIN | |||
| ble .Lstrmm_kernel_L2_M1_BEGIN | |||
| strmm_kernel_L2_M2_20: | |||
| .Lstrmm_kernel_L2_M2_20: | |||
| INIT2x2 | |||
| @@ -969,9 +969,9 @@ strmm_kernel_L2_M2_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble strmm_kernel_L2_M2_40 | |||
| ble .Lstrmm_kernel_L2_M2_40 | |||
| strmm_kernel_L2_M2_22: | |||
| .Lstrmm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| @@ -984,22 +984,22 @@ strmm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt strmm_kernel_L2_M2_22 | |||
| bgt .Lstrmm_kernel_L2_M2_22 | |||
| strmm_kernel_L2_M2_40: | |||
| .Lstrmm_kernel_L2_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble strmm_kernel_L2_M2_100 | |||
| ble .Lstrmm_kernel_L2_M2_100 | |||
| strmm_kernel_L2_M2_42: | |||
| .Lstrmm_kernel_L2_M2_42: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt strmm_kernel_L2_M2_42 | |||
| bgt .Lstrmm_kernel_L2_M2_42 | |||
| strmm_kernel_L2_M2_100: | |||
| .Lstrmm_kernel_L2_M2_100: | |||
| SAVE2x2 | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| @@ -1018,15 +1018,15 @@ strmm_kernel_L2_M2_100: | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| strmm_kernel_L2_M2_END: | |||
| .Lstrmm_kernel_L2_M2_END: | |||
| strmm_kernel_L2_M1_BEGIN: | |||
| .Lstrmm_kernel_L2_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble strmm_kernel_L2_END | |||
| ble .Lstrmm_kernel_L2_END | |||
| strmm_kernel_L2_M1_20: | |||
| .Lstrmm_kernel_L2_M1_20: | |||
| INIT1x2 | |||
| @@ -1050,9 +1050,9 @@ strmm_kernel_L2_M1_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble strmm_kernel_L2_M1_40 | |||
| ble .Lstrmm_kernel_L2_M1_40 | |||
| strmm_kernel_L2_M1_22: | |||
| .Lstrmm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| @@ -1064,22 +1064,22 @@ strmm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt strmm_kernel_L2_M1_22 | |||
| bgt .Lstrmm_kernel_L2_M1_22 | |||
| strmm_kernel_L2_M1_40: | |||
| .Lstrmm_kernel_L2_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble strmm_kernel_L2_M1_100 | |||
| ble .Lstrmm_kernel_L2_M1_100 | |||
| strmm_kernel_L2_M1_42: | |||
| .Lstrmm_kernel_L2_M1_42: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt strmm_kernel_L2_M1_42 | |||
| bgt .Lstrmm_kernel_L2_M1_42 | |||
| strmm_kernel_L2_M1_100: | |||
| .Lstrmm_kernel_L2_M1_100: | |||
| SAVE1x2 | |||
| @@ -1099,7 +1099,7 @@ strmm_kernel_L2_M1_100: | |||
| add tempOffset, tempOffset, #1 | |||
| #endif | |||
| strmm_kernel_L2_END: | |||
| .Lstrmm_kernel_L2_END: | |||
| #if !defined(LEFT) | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| @@ -1107,11 +1107,11 @@ strmm_kernel_L2_END: | |||
| /******************************************************************************/ | |||
| strmm_kernel_L1_BEGIN: | |||
| .Lstrmm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble strmm_kernel_L999 // done | |||
| ble .Lstrmm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| @@ -1123,14 +1123,14 @@ strmm_kernel_L1_BEGIN: | |||
| mov pA, origPA // pA = A | |||
| strmm_kernel_L1_M4_BEGIN: | |||
| .Lstrmm_kernel_L1_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble strmm_kernel_L1_M2_BEGIN | |||
| ble .Lstrmm_kernel_L1_M2_BEGIN | |||
| strmm_kernel_L1_M4_20: | |||
| .Lstrmm_kernel_L1_M4_20: | |||
| INIT4x1 | |||
| @@ -1154,10 +1154,10 @@ strmm_kernel_L1_M4_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble strmm_kernel_L1_M4_40 | |||
| ble .Lstrmm_kernel_L1_M4_40 | |||
| .align 5 | |||
| strmm_kernel_L1_M4_22: | |||
| .Lstrmm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| @@ -1169,22 +1169,22 @@ strmm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt strmm_kernel_L1_M4_22 | |||
| bgt .Lstrmm_kernel_L1_M4_22 | |||
| strmm_kernel_L1_M4_40: | |||
| .Lstrmm_kernel_L1_M4_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble strmm_kernel_L1_M4_100 | |||
| ble .Lstrmm_kernel_L1_M4_100 | |||
| strmm_kernel_L1_M4_42: | |||
| .Lstrmm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt strmm_kernel_L1_M4_42 | |||
| bgt .Lstrmm_kernel_L1_M4_42 | |||
| strmm_kernel_L1_M4_100: | |||
| .Lstrmm_kernel_L1_M4_100: | |||
| SAVE4x1 | |||
| @@ -1204,22 +1204,22 @@ strmm_kernel_L1_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| strmm_kernel_L1_M4_END: | |||
| .Lstrmm_kernel_L1_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt strmm_kernel_L1_M4_20 | |||
| bgt .Lstrmm_kernel_L1_M4_20 | |||
| strmm_kernel_L1_M2_BEGIN: | |||
| .Lstrmm_kernel_L1_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble strmm_kernel_L1_END | |||
| ble .Lstrmm_kernel_L1_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble strmm_kernel_L1_M1_BEGIN | |||
| ble .Lstrmm_kernel_L1_M1_BEGIN | |||
| strmm_kernel_L1_M2_20: | |||
| .Lstrmm_kernel_L1_M2_20: | |||
| INIT2x1 | |||
| @@ -1243,9 +1243,9 @@ strmm_kernel_L1_M2_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble strmm_kernel_L1_M2_40 | |||
| ble .Lstrmm_kernel_L1_M2_40 | |||
| strmm_kernel_L1_M2_22: | |||
| .Lstrmm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| @@ -1258,22 +1258,22 @@ strmm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt strmm_kernel_L1_M2_22 | |||
| bgt .Lstrmm_kernel_L1_M2_22 | |||
| strmm_kernel_L1_M2_40: | |||
| .Lstrmm_kernel_L1_M2_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble strmm_kernel_L1_M2_100 | |||
| ble .Lstrmm_kernel_L1_M2_100 | |||
| strmm_kernel_L1_M2_42: | |||
| .Lstrmm_kernel_L1_M2_42: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt strmm_kernel_L1_M2_42 | |||
| bgt .Lstrmm_kernel_L1_M2_42 | |||
| strmm_kernel_L1_M2_100: | |||
| .Lstrmm_kernel_L1_M2_100: | |||
| SAVE2x1 | |||
| @@ -1294,15 +1294,15 @@ strmm_kernel_L1_M2_100: | |||
| #endif | |||
| strmm_kernel_L1_M2_END: | |||
| .Lstrmm_kernel_L1_M2_END: | |||
| strmm_kernel_L1_M1_BEGIN: | |||
| .Lstrmm_kernel_L1_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble strmm_kernel_L1_END | |||
| ble .Lstrmm_kernel_L1_END | |||
| strmm_kernel_L1_M1_20: | |||
| .Lstrmm_kernel_L1_M1_20: | |||
| INIT1x1 | |||
| @@ -1326,9 +1326,9 @@ strmm_kernel_L1_M1_20: | |||
| asr counterL , tempK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble strmm_kernel_L1_M1_40 | |||
| ble .Lstrmm_kernel_L1_M1_40 | |||
| strmm_kernel_L1_M1_22: | |||
| .Lstrmm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| @@ -1340,22 +1340,22 @@ strmm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt strmm_kernel_L1_M1_22 | |||
| bgt .Lstrmm_kernel_L1_M1_22 | |||
| strmm_kernel_L1_M1_40: | |||
| .Lstrmm_kernel_L1_M1_40: | |||
| ands counterL , tempK, #7 // counterL = counterL % 8 | |||
| ble strmm_kernel_L1_M1_100 | |||
| ble .Lstrmm_kernel_L1_M1_100 | |||
| strmm_kernel_L1_M1_42: | |||
| .Lstrmm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt strmm_kernel_L1_M1_42 | |||
| bgt .Lstrmm_kernel_L1_M1_42 | |||
| strmm_kernel_L1_M1_100: | |||
| .Lstrmm_kernel_L1_M1_100: | |||
| SAVE1x1 | |||
| @@ -1377,7 +1377,7 @@ strmm_kernel_L1_M1_100: | |||
| #endif | |||
| #endif | |||
| strmm_kernel_L1_END: | |||
| .Lstrmm_kernel_L1_END: | |||
| #if 0 | |||
| #if !defined(LEFT) | |||
| @@ -1385,7 +1385,7 @@ strmm_kernel_L1_END: | |||
| #endif | |||
| #endif | |||
| strmm_kernel_L999: | |||
| .Lstrmm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| @@ -193,50 +193,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| cmp N, xzr | |||
| ble swap_kernel_L999 | |||
| ble .Lswap_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne swap_kernel_S_BEGIN | |||
| bne .Lswap_kernel_S_BEGIN | |||
| cmp INC_Y, #1 | |||
| bne swap_kernel_S_BEGIN | |||
| bne .Lswap_kernel_S_BEGIN | |||
| swap_kernel_F_BEGIN: | |||
| .Lswap_kernel_F_BEGIN: | |||
| asr I, N, #3 | |||
| cmp I, xzr | |||
| beq swap_kernel_F1 | |||
| beq .Lswap_kernel_F1 | |||
| swap_kernel_F8: | |||
| .Lswap_kernel_F8: | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne swap_kernel_F8 | |||
| bne .Lswap_kernel_F8 | |||
| swap_kernel_F1: | |||
| .Lswap_kernel_F1: | |||
| ands I, N, #7 | |||
| ble swap_kernel_L999 | |||
| ble .Lswap_kernel_L999 | |||
| swap_kernel_F10: | |||
| .Lswap_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne swap_kernel_F10 | |||
| bne .Lswap_kernel_F10 | |||
| b swap_kernel_L999 | |||
| b .Lswap_kernel_L999 | |||
| swap_kernel_S_BEGIN: | |||
| .Lswap_kernel_S_BEGIN: | |||
| INIT_S | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble swap_kernel_S1 | |||
| ble .Lswap_kernel_S1 | |||
| swap_kernel_S4: | |||
| .Lswap_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -244,21 +244,21 @@ swap_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne swap_kernel_S4 | |||
| bne .Lswap_kernel_S4 | |||
| swap_kernel_S1: | |||
| .Lswap_kernel_S1: | |||
| ands I, N, #3 | |||
| ble swap_kernel_L999 | |||
| ble .Lswap_kernel_L999 | |||
| swap_kernel_S10: | |||
| .Lswap_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne swap_kernel_S10 | |||
| bne .Lswap_kernel_S10 | |||
| swap_kernel_L999: | |||
| .Lswap_kernel_L999: | |||
| mov w0, wzr | |||
| ret | |||
| @@ -184,62 +184,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| cmp N, xzr | |||
| ble amax_kernel_zero | |||
| ble .Lzamax_kernel_zero | |||
| cmp INC_X, xzr | |||
| ble amax_kernel_zero | |||
| ble .Lzamax_kernel_zero | |||
| cmp INC_X, #1 | |||
| bne amax_kernel_S_BEGIN | |||
| bne .Lzamax_kernel_S_BEGIN | |||
| amax_kernel_F_BEGIN: | |||
| .Lzamax_kernel_F_BEGIN: | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| beq amax_kernel_F1_INIT | |||
| beq .Lzamax_kernel_F1_INIT | |||
| INIT_F4 | |||
| subs I, I, #1 | |||
| beq amax_kernel_F1 | |||
| beq .Lzamax_kernel_F1 | |||
| amax_kernel_F4: | |||
| .Lzamax_kernel_F4: | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne amax_kernel_F4 | |||
| bne .Lzamax_kernel_F4 | |||
| amax_kernel_F1: | |||
| .Lzamax_kernel_F1: | |||
| ands I, N, #3 | |||
| ble amax_kernel_L999 | |||
| ble .Lzamax_kernel_L999 | |||
| amax_kernel_F10: | |||
| .Lzamax_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne amax_kernel_F10 | |||
| bne .Lzamax_kernel_F10 | |||
| ret | |||
| amax_kernel_F1_INIT: | |||
| .Lzamax_kernel_F1_INIT: | |||
| INIT_F1 | |||
| subs N, N, #1 | |||
| b amax_kernel_F1 | |||
| b .Lzamax_kernel_F1 | |||
| amax_kernel_S_BEGIN: | |||
| .Lzamax_kernel_S_BEGIN: | |||
| INIT_S | |||
| subs N, N, #1 | |||
| ble amax_kernel_L999 | |||
| ble .Lzamax_kernel_L999 | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble amax_kernel_S1 | |||
| ble .Lzamax_kernel_S1 | |||
| amax_kernel_S4: | |||
| .Lzamax_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -247,25 +247,25 @@ amax_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne amax_kernel_S4 | |||
| bne .Lzamax_kernel_S4 | |||
| amax_kernel_S1: | |||
| .Lzamax_kernel_S1: | |||
| ands I, N, #3 | |||
| ble amax_kernel_L999 | |||
| ble .Lzamax_kernel_L999 | |||
| amax_kernel_S10: | |||
| .Lzamax_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne amax_kernel_S10 | |||
| bne .Lzamax_kernel_S10 | |||
| amax_kernel_L999: | |||
| .Lzamax_kernel_L999: | |||
| ret | |||
| amax_kernel_zero: | |||
| .Lzamax_kernel_zero: | |||
| fmov MAXF, REG0 | |||
| ret | |||
| @@ -92,52 +92,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fmov SUMF, REG0 | |||
| cmp N, xzr | |||
| ble asum_kernel_L999 | |||
| ble .Lzasum_kernel_L999 | |||
| cmp INC_X, xzr | |||
| ble asum_kernel_L999 | |||
| ble .Lzasum_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne asum_kernel_S_BEGIN | |||
| bne .Lzasum_kernel_S_BEGIN | |||
| asum_kernel_F_BEGIN: | |||
| .Lzasum_kernel_F_BEGIN: | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| beq asum_kernel_F1 | |||
| beq .Lzasum_kernel_F1 | |||
| asum_kernel_F4: | |||
| .Lzasum_kernel_F4: | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne asum_kernel_F4 | |||
| bne .Lzasum_kernel_F4 | |||
| KERNEL_F4_FINALIZE | |||
| asum_kernel_F1: | |||
| .Lzasum_kernel_F1: | |||
| ands I, N, #3 | |||
| ble asum_kernel_L999 | |||
| ble .Lzasum_kernel_L999 | |||
| asum_kernel_F10: | |||
| .Lzasum_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne asum_kernel_F10 | |||
| bne .Lzasum_kernel_F10 | |||
| asum_kernel_L999: | |||
| .Lzasum_kernel_L999: | |||
| ret | |||
| asum_kernel_S_BEGIN: | |||
| .Lzasum_kernel_S_BEGIN: | |||
| INIT_S | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble asum_kernel_S1 | |||
| ble .Lzasum_kernel_S1 | |||
| asum_kernel_S4: | |||
| .Lzasum_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -145,19 +145,19 @@ asum_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne asum_kernel_S4 | |||
| bne .Lzasum_kernel_S4 | |||
| asum_kernel_S1: | |||
| .Lzasum_kernel_S1: | |||
| ands I, N, #3 | |||
| ble asum_kernel_L999 | |||
| ble .Lzasum_kernel_L999 | |||
| asum_kernel_S10: | |||
| .Lzasum_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne asum_kernel_S10 | |||
| bne .Lzasum_kernel_S10 | |||
| ret | |||
| @@ -241,62 +241,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| cmp N, xzr | |||
| ble zaxpy_kernel_L999 | |||
| ble .Lzaxpy_kernel_L999 | |||
| mov Y_COPY, Y | |||
| fcmp DA_R, #0.0 | |||
| bne .L1 | |||
| fcmp DA_I, #0.0 | |||
| beq zaxpy_kernel_L999 | |||
| beq .Lzaxpy_kernel_L999 | |||
| .L1: | |||
| INIT | |||
| cmp INC_X, #1 | |||
| bne zaxpy_kernel_S_BEGIN | |||
| bne .Lzaxpy_kernel_S_BEGIN | |||
| cmp INC_Y, #1 | |||
| bne zaxpy_kernel_S_BEGIN | |||
| bne .Lzaxpy_kernel_S_BEGIN | |||
| zaxpy_kernel_F_BEGIN: | |||
| .Lzaxpy_kernel_F_BEGIN: | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| beq zaxpy_kernel_F1 | |||
| beq .Lzaxpy_kernel_F1 | |||
| KERNEL_INIT_F4 | |||
| zaxpy_kernel_F4: | |||
| .Lzaxpy_kernel_F4: | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne zaxpy_kernel_F4 | |||
| bne .Lzaxpy_kernel_F4 | |||
| zaxpy_kernel_F1: | |||
| .Lzaxpy_kernel_F1: | |||
| ands I, N, #3 | |||
| ble zaxpy_kernel_L999 | |||
| ble .Lzaxpy_kernel_L999 | |||
| zaxpy_kernel_F10: | |||
| .Lzaxpy_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne zaxpy_kernel_F10 | |||
| bne .Lzaxpy_kernel_F10 | |||
| mov w0, wzr | |||
| ret | |||
| zaxpy_kernel_S_BEGIN: | |||
| .Lzaxpy_kernel_S_BEGIN: | |||
| INIT_S | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble zaxpy_kernel_S1 | |||
| ble .Lzaxpy_kernel_S1 | |||
| zaxpy_kernel_S4: | |||
| .Lzaxpy_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -304,21 +304,21 @@ zaxpy_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne zaxpy_kernel_S4 | |||
| bne .Lzaxpy_kernel_S4 | |||
| zaxpy_kernel_S1: | |||
| .Lzaxpy_kernel_S1: | |||
| ands I, N, #3 | |||
| ble zaxpy_kernel_L999 | |||
| ble .Lzaxpy_kernel_L999 | |||
| zaxpy_kernel_S10: | |||
| .Lzaxpy_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne zaxpy_kernel_S10 | |||
| bne .Lzaxpy_kernel_S10 | |||
| zaxpy_kernel_L999: | |||
| .Lzaxpy_kernel_L999: | |||
| mov w0, wzr | |||
| ret | |||
| @@ -229,51 +229,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| cmp N, xzr | |||
| ble dot_kernel_L999 | |||
| ble .Lzdot_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne dot_kernel_S_BEGIN | |||
| bne .Lzdot_kernel_S_BEGIN | |||
| cmp INC_Y, #1 | |||
| bne dot_kernel_S_BEGIN | |||
| bne .Lzdot_kernel_S_BEGIN | |||
| dot_kernel_F_BEGIN: | |||
| .Lzdot_kernel_F_BEGIN: | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| beq dot_kernel_F1 | |||
| beq .Lzdot_kernel_F1 | |||
| dot_kernel_F4: | |||
| .Lzdot_kernel_F4: | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne dot_kernel_F4 | |||
| bne .Lzdot_kernel_F4 | |||
| KERNEL_F4_FINALIZE | |||
| dot_kernel_F1: | |||
| .Lzdot_kernel_F1: | |||
| ands I, N, #3 | |||
| ble dot_kernel_L999 | |||
| ble .Lzdot_kernel_L999 | |||
| dot_kernel_F10: | |||
| .Lzdot_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne dot_kernel_F10 | |||
| bne .Lzdot_kernel_F10 | |||
| ret | |||
| dot_kernel_S_BEGIN: | |||
| .Lzdot_kernel_S_BEGIN: | |||
| INIT_S | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble dot_kernel_S1 | |||
| ble .Lzdot_kernel_S1 | |||
| dot_kernel_S4: | |||
| .Lzdot_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -281,21 +281,21 @@ dot_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne dot_kernel_S4 | |||
| bne .Lzdot_kernel_S4 | |||
| dot_kernel_S1: | |||
| .Lzdot_kernel_S1: | |||
| ands I, N, #3 | |||
| ble dot_kernel_L999 | |||
| ble .Lzdot_kernel_L999 | |||
| dot_kernel_S10: | |||
| .Lzdot_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne dot_kernel_S10 | |||
| bne .Lzdot_kernel_S10 | |||
| dot_kernel_L999: | |||
| .Lzdot_kernel_L999: | |||
| ret | |||
| @@ -1099,9 +1099,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #2 // J = J / 4 | |||
| cmp counterJ, #0 | |||
| ble zgemm_kernel_L2_BEGIN | |||
| ble .Lzgemm_kernel_L2_BEGIN | |||
| zgemm_kernel_L4_BEGIN: | |||
| .Lzgemm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| @@ -1111,20 +1111,20 @@ zgemm_kernel_L4_BEGIN: | |||
| mov pA, origPA // pA = start of A array | |||
| zgemm_kernel_L4_M4_BEGIN: | |||
| .Lzgemm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble zgemm_kernel_L4_M2_BEGIN | |||
| ble .Lzgemm_kernel_L4_M2_BEGIN | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_20: | |||
| .Lzgemm_kernel_L4_M4_20: | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 | |||
| cmp counterL , #2 | |||
| blt zgemm_kernel_L4_M4_32 | |||
| blt .Lzgemm_kernel_L4_M4_32 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| @@ -1136,10 +1136,10 @@ zgemm_kernel_L4_M4_20: | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble zgemm_kernel_L4_M4_22a | |||
| ble .Lzgemm_kernel_L4_M4_22a | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_22: | |||
| .Lzgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| @@ -1151,10 +1151,10 @@ zgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L4_M4_22 | |||
| bgt .Lzgemm_kernel_L4_M4_22 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_22a: | |||
| .Lzgemm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| @@ -1165,13 +1165,13 @@ zgemm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b zgemm_kernel_L4_M4_44 | |||
| b .Lzgemm_kernel_L4_M4_44 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_32: | |||
| .Lzgemm_kernel_L4_M4_32: | |||
| tst counterL, #1 | |||
| ble zgemm_kernel_L4_M4_40 | |||
| ble .Lzgemm_kernel_L4_M4_40 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| @@ -1182,55 +1182,55 @@ zgemm_kernel_L4_M4_32: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b zgemm_kernel_L4_M4_44 | |||
| b .Lzgemm_kernel_L4_M4_44 | |||
| zgemm_kernel_L4_M4_40: | |||
| .Lzgemm_kernel_L4_M4_40: | |||
| INIT4x4 | |||
| zgemm_kernel_L4_M4_44: | |||
| .Lzgemm_kernel_L4_M4_44: | |||
| ands counterL , origK, #7 | |||
| ble zgemm_kernel_L4_M4_100 | |||
| ble .Lzgemm_kernel_L4_M4_100 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_46: | |||
| .Lzgemm_kernel_L4_M4_46: | |||
| KERNEL4x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne zgemm_kernel_L4_M4_46 | |||
| bne .Lzgemm_kernel_L4_M4_46 | |||
| zgemm_kernel_L4_M4_100: | |||
| .Lzgemm_kernel_L4_M4_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVE4x4 | |||
| zgemm_kernel_L4_M4_END: | |||
| .Lzgemm_kernel_L4_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bne zgemm_kernel_L4_M4_20 | |||
| bne .Lzgemm_kernel_L4_M4_20 | |||
| zgemm_kernel_L4_M2_BEGIN: | |||
| .Lzgemm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble zgemm_kernel_L4_END | |||
| ble .Lzgemm_kernel_L4_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble zgemm_kernel_L4_M1_BEGIN | |||
| ble .Lzgemm_kernel_L4_M1_BEGIN | |||
| zgemm_kernel_L4_M2_20: | |||
| .Lzgemm_kernel_L4_M2_20: | |||
| INIT2x4 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble zgemm_kernel_L4_M2_40 | |||
| ble .Lzgemm_kernel_L4_M2_40 | |||
| zgemm_kernel_L4_M2_22: | |||
| .Lzgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| KERNEL2x4_SUB | |||
| @@ -1243,43 +1243,43 @@ zgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L4_M2_22 | |||
| bgt .Lzgemm_kernel_L4_M2_22 | |||
| zgemm_kernel_L4_M2_40: | |||
| .Lzgemm_kernel_L4_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble zgemm_kernel_L4_M2_100 | |||
| ble .Lzgemm_kernel_L4_M2_100 | |||
| zgemm_kernel_L4_M2_42: | |||
| .Lzgemm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L4_M2_42 | |||
| bgt .Lzgemm_kernel_L4_M2_42 | |||
| zgemm_kernel_L4_M2_100: | |||
| .Lzgemm_kernel_L4_M2_100: | |||
| SAVE2x4 | |||
| zgemm_kernel_L4_M2_END: | |||
| .Lzgemm_kernel_L4_M2_END: | |||
| zgemm_kernel_L4_M1_BEGIN: | |||
| .Lzgemm_kernel_L4_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble zgemm_kernel_L4_END | |||
| ble .Lzgemm_kernel_L4_END | |||
| zgemm_kernel_L4_M1_20: | |||
| .Lzgemm_kernel_L4_M1_20: | |||
| INIT1x4 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble zgemm_kernel_L4_M1_40 | |||
| ble .Lzgemm_kernel_L4_M1_40 | |||
| zgemm_kernel_L4_M1_22: | |||
| .Lzgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| @@ -1291,45 +1291,45 @@ zgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L4_M1_22 | |||
| bgt .Lzgemm_kernel_L4_M1_22 | |||
| zgemm_kernel_L4_M1_40: | |||
| .Lzgemm_kernel_L4_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble zgemm_kernel_L4_M1_100 | |||
| ble .Lzgemm_kernel_L4_M1_100 | |||
| zgemm_kernel_L4_M1_42: | |||
| .Lzgemm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L4_M1_42 | |||
| bgt .Lzgemm_kernel_L4_M1_42 | |||
| zgemm_kernel_L4_M1_100: | |||
| .Lzgemm_kernel_L4_M1_100: | |||
| SAVE1x4 | |||
| zgemm_kernel_L4_END: | |||
| .Lzgemm_kernel_L4_END: | |||
| lsl temp, origK, #6 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 * 2 | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt zgemm_kernel_L4_BEGIN | |||
| bgt .Lzgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| zgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| .Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble zgemm_kernel_L999 | |||
| ble .Lzgemm_kernel_L999 | |||
| tst counterJ , #2 | |||
| ble zgemm_kernel_L1_BEGIN | |||
| ble .Lzgemm_kernel_L1_BEGIN | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| @@ -1339,24 +1339,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| zgemm_kernel_L2_M4_BEGIN: | |||
| .Lzgemm_kernel_L2_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI,#0 | |||
| ble zgemm_kernel_L2_M2_BEGIN | |||
| ble .Lzgemm_kernel_L2_M2_BEGIN | |||
| zgemm_kernel_L2_M4_20: | |||
| .Lzgemm_kernel_L2_M4_20: | |||
| INIT4x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble zgemm_kernel_L2_M4_40 | |||
| ble .Lzgemm_kernel_L2_M4_40 | |||
| .align 5 | |||
| zgemm_kernel_L2_M4_22: | |||
| .Lzgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| @@ -1368,50 +1368,50 @@ zgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L2_M4_22 | |||
| bgt .Lzgemm_kernel_L2_M4_22 | |||
| zgemm_kernel_L2_M4_40: | |||
| .Lzgemm_kernel_L2_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble zgemm_kernel_L2_M4_100 | |||
| ble .Lzgemm_kernel_L2_M4_100 | |||
| zgemm_kernel_L2_M4_42: | |||
| .Lzgemm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L2_M4_42 | |||
| bgt .Lzgemm_kernel_L2_M4_42 | |||
| zgemm_kernel_L2_M4_100: | |||
| .Lzgemm_kernel_L2_M4_100: | |||
| SAVE4x2 | |||
| zgemm_kernel_L2_M4_END: | |||
| .Lzgemm_kernel_L2_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt zgemm_kernel_L2_M4_20 | |||
| bgt .Lzgemm_kernel_L2_M4_20 | |||
| zgemm_kernel_L2_M2_BEGIN: | |||
| .Lzgemm_kernel_L2_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble zgemm_kernel_L2_END | |||
| ble .Lzgemm_kernel_L2_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble zgemm_kernel_L2_M1_BEGIN | |||
| ble .Lzgemm_kernel_L2_M1_BEGIN | |||
| zgemm_kernel_L2_M2_20: | |||
| .Lzgemm_kernel_L2_M2_20: | |||
| INIT2x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble zgemm_kernel_L2_M2_40 | |||
| ble .Lzgemm_kernel_L2_M2_40 | |||
| zgemm_kernel_L2_M2_22: | |||
| .Lzgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| @@ -1424,43 +1424,43 @@ zgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L2_M2_22 | |||
| bgt .Lzgemm_kernel_L2_M2_22 | |||
| zgemm_kernel_L2_M2_40: | |||
| .Lzgemm_kernel_L2_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble zgemm_kernel_L2_M2_100 | |||
| ble .Lzgemm_kernel_L2_M2_100 | |||
| zgemm_kernel_L2_M2_42: | |||
| .Lzgemm_kernel_L2_M2_42: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L2_M2_42 | |||
| bgt .Lzgemm_kernel_L2_M2_42 | |||
| zgemm_kernel_L2_M2_100: | |||
| .Lzgemm_kernel_L2_M2_100: | |||
| SAVE2x2 | |||
| zgemm_kernel_L2_M2_END: | |||
| .Lzgemm_kernel_L2_M2_END: | |||
| zgemm_kernel_L2_M1_BEGIN: | |||
| .Lzgemm_kernel_L2_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble zgemm_kernel_L2_END | |||
| ble .Lzgemm_kernel_L2_END | |||
| zgemm_kernel_L2_M1_20: | |||
| .Lzgemm_kernel_L2_M1_20: | |||
| INIT1x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL, #0 | |||
| ble zgemm_kernel_L2_M1_40 | |||
| ble .Lzgemm_kernel_L2_M1_40 | |||
| zgemm_kernel_L2_M1_22: | |||
| .Lzgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| @@ -1472,37 +1472,37 @@ zgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L2_M1_22 | |||
| bgt .Lzgemm_kernel_L2_M1_22 | |||
| zgemm_kernel_L2_M1_40: | |||
| .Lzgemm_kernel_L2_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble zgemm_kernel_L2_M1_100 | |||
| ble .Lzgemm_kernel_L2_M1_100 | |||
| zgemm_kernel_L2_M1_42: | |||
| .Lzgemm_kernel_L2_M1_42: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L2_M1_42 | |||
| bgt .Lzgemm_kernel_L2_M1_42 | |||
| zgemm_kernel_L2_M1_100: | |||
| .Lzgemm_kernel_L2_M1_100: | |||
| SAVE1x2 | |||
| zgemm_kernel_L2_END: | |||
| .Lzgemm_kernel_L2_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 2 * 8 * 2 | |||
| /******************************************************************************/ | |||
| zgemm_kernel_L1_BEGIN: | |||
| .Lzgemm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble zgemm_kernel_L999 // done | |||
| ble .Lzgemm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| @@ -1512,24 +1512,24 @@ zgemm_kernel_L1_BEGIN: | |||
| zgemm_kernel_L1_M4_BEGIN: | |||
| .Lzgemm_kernel_L1_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble zgemm_kernel_L1_M2_BEGIN | |||
| ble .Lzgemm_kernel_L1_M2_BEGIN | |||
| zgemm_kernel_L1_M4_20: | |||
| .Lzgemm_kernel_L1_M4_20: | |||
| INIT4x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble zgemm_kernel_L1_M4_40 | |||
| ble .Lzgemm_kernel_L1_M4_40 | |||
| .align 5 | |||
| zgemm_kernel_L1_M4_22: | |||
| .Lzgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| @@ -1541,50 +1541,50 @@ zgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L1_M4_22 | |||
| bgt .Lzgemm_kernel_L1_M4_22 | |||
| zgemm_kernel_L1_M4_40: | |||
| .Lzgemm_kernel_L1_M4_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble zgemm_kernel_L1_M4_100 | |||
| ble .Lzgemm_kernel_L1_M4_100 | |||
| zgemm_kernel_L1_M4_42: | |||
| .Lzgemm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L1_M4_42 | |||
| bgt .Lzgemm_kernel_L1_M4_42 | |||
| zgemm_kernel_L1_M4_100: | |||
| .Lzgemm_kernel_L1_M4_100: | |||
| SAVE4x1 | |||
| zgemm_kernel_L1_M4_END: | |||
| .Lzgemm_kernel_L1_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt zgemm_kernel_L1_M4_20 | |||
| bgt .Lzgemm_kernel_L1_M4_20 | |||
| zgemm_kernel_L1_M2_BEGIN: | |||
| .Lzgemm_kernel_L1_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble zgemm_kernel_L1_END | |||
| ble .Lzgemm_kernel_L1_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble zgemm_kernel_L1_M1_BEGIN | |||
| ble .Lzgemm_kernel_L1_M1_BEGIN | |||
| zgemm_kernel_L1_M2_20: | |||
| .Lzgemm_kernel_L1_M2_20: | |||
| INIT2x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble zgemm_kernel_L1_M2_40 | |||
| ble .Lzgemm_kernel_L1_M2_40 | |||
| zgemm_kernel_L1_M2_22: | |||
| .Lzgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| @@ -1597,43 +1597,43 @@ zgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L1_M2_22 | |||
| bgt .Lzgemm_kernel_L1_M2_22 | |||
| zgemm_kernel_L1_M2_40: | |||
| .Lzgemm_kernel_L1_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble zgemm_kernel_L1_M2_100 | |||
| ble .Lzgemm_kernel_L1_M2_100 | |||
| zgemm_kernel_L1_M2_42: | |||
| .Lzgemm_kernel_L1_M2_42: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L1_M2_42 | |||
| bgt .Lzgemm_kernel_L1_M2_42 | |||
| zgemm_kernel_L1_M2_100: | |||
| .Lzgemm_kernel_L1_M2_100: | |||
| SAVE2x1 | |||
| zgemm_kernel_L1_M2_END: | |||
| .Lzgemm_kernel_L1_M2_END: | |||
| zgemm_kernel_L1_M1_BEGIN: | |||
| .Lzgemm_kernel_L1_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble zgemm_kernel_L1_END | |||
| ble .Lzgemm_kernel_L1_END | |||
| zgemm_kernel_L1_M1_20: | |||
| .Lzgemm_kernel_L1_M1_20: | |||
| INIT1x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble zgemm_kernel_L1_M1_40 | |||
| ble .Lzgemm_kernel_L1_M1_40 | |||
| zgemm_kernel_L1_M1_22: | |||
| .Lzgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| @@ -1645,30 +1645,30 @@ zgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L1_M1_22 | |||
| bgt .Lzgemm_kernel_L1_M1_22 | |||
| zgemm_kernel_L1_M1_40: | |||
| .Lzgemm_kernel_L1_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble zgemm_kernel_L1_M1_100 | |||
| ble .Lzgemm_kernel_L1_M1_100 | |||
| zgemm_kernel_L1_M1_42: | |||
| .Lzgemm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L1_M1_42 | |||
| bgt .Lzgemm_kernel_L1_M1_42 | |||
| zgemm_kernel_L1_M1_100: | |||
| .Lzgemm_kernel_L1_M1_100: | |||
| SAVE1x1 | |||
| zgemm_kernel_L1_END: | |||
| .Lzgemm_kernel_L1_END: | |||
| zgemm_kernel_L999: | |||
| .Lzgemm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| @@ -1109,9 +1109,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #2 // J = J / 4 | |||
| cmp counterJ, #0 | |||
| ble zgemm_kernel_L2_BEGIN | |||
| ble .Lzgemm_kernel_L2_BEGIN | |||
| zgemm_kernel_L4_BEGIN: | |||
| .Lzgemm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| @@ -1121,20 +1121,20 @@ zgemm_kernel_L4_BEGIN: | |||
| mov pA, origPA // pA = start of A array | |||
| zgemm_kernel_L4_M4_BEGIN: | |||
| .Lzgemm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble zgemm_kernel_L4_M2_BEGIN | |||
| ble .Lzgemm_kernel_L4_M2_BEGIN | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_20: | |||
| .Lzgemm_kernel_L4_M4_20: | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 | |||
| cmp counterL , #2 | |||
| blt zgemm_kernel_L4_M4_32 | |||
| blt .Lzgemm_kernel_L4_M4_32 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| @@ -1146,10 +1146,10 @@ zgemm_kernel_L4_M4_20: | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble zgemm_kernel_L4_M4_22a | |||
| ble .Lzgemm_kernel_L4_M4_22a | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_22: | |||
| .Lzgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| @@ -1161,10 +1161,10 @@ zgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L4_M4_22 | |||
| bgt .Lzgemm_kernel_L4_M4_22 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_22a: | |||
| .Lzgemm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| @@ -1175,13 +1175,13 @@ zgemm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b zgemm_kernel_L4_M4_44 | |||
| b .Lzgemm_kernel_L4_M4_44 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_32: | |||
| .Lzgemm_kernel_L4_M4_32: | |||
| tst counterL, #1 | |||
| ble zgemm_kernel_L4_M4_40 | |||
| ble .Lzgemm_kernel_L4_M4_40 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| @@ -1192,55 +1192,55 @@ zgemm_kernel_L4_M4_32: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b zgemm_kernel_L4_M4_44 | |||
| b .Lzgemm_kernel_L4_M4_44 | |||
| zgemm_kernel_L4_M4_40: | |||
| .Lzgemm_kernel_L4_M4_40: | |||
| INIT4x4 | |||
| zgemm_kernel_L4_M4_44: | |||
| .Lzgemm_kernel_L4_M4_44: | |||
| ands counterL , origK, #7 | |||
| ble zgemm_kernel_L4_M4_100 | |||
| ble .Lzgemm_kernel_L4_M4_100 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_46: | |||
| .Lzgemm_kernel_L4_M4_46: | |||
| KERNEL4x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne zgemm_kernel_L4_M4_46 | |||
| bne .Lzgemm_kernel_L4_M4_46 | |||
| zgemm_kernel_L4_M4_100: | |||
| .Lzgemm_kernel_L4_M4_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVE4x4 | |||
| zgemm_kernel_L4_M4_END: | |||
| .Lzgemm_kernel_L4_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bne zgemm_kernel_L4_M4_20 | |||
| bne .Lzgemm_kernel_L4_M4_20 | |||
| zgemm_kernel_L4_M2_BEGIN: | |||
| .Lzgemm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble zgemm_kernel_L4_END | |||
| ble .Lzgemm_kernel_L4_END | |||
| tst counterI, #2 // counterI = counterI / 2 | |||
| ble zgemm_kernel_L4_M1_BEGIN | |||
| ble .Lzgemm_kernel_L4_M1_BEGIN | |||
| zgemm_kernel_L4_M2_20: | |||
| .Lzgemm_kernel_L4_M2_20: | |||
| INIT2x4 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble zgemm_kernel_L4_M2_40 | |||
| ble .Lzgemm_kernel_L4_M2_40 | |||
| zgemm_kernel_L4_M2_22: | |||
| .Lzgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| KERNEL2x4_SUB | |||
| @@ -1253,43 +1253,43 @@ zgemm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L4_M2_22 | |||
| bgt .Lzgemm_kernel_L4_M2_22 | |||
| zgemm_kernel_L4_M2_40: | |||
| .Lzgemm_kernel_L4_M2_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble zgemm_kernel_L4_M2_100 | |||
| ble .Lzgemm_kernel_L4_M2_100 | |||
| zgemm_kernel_L4_M2_42: | |||
| .Lzgemm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L4_M2_42 | |||
| bgt .Lzgemm_kernel_L4_M2_42 | |||
| zgemm_kernel_L4_M2_100: | |||
| .Lzgemm_kernel_L4_M2_100: | |||
| SAVE2x4 | |||
| zgemm_kernel_L4_M2_END: | |||
| .Lzgemm_kernel_L4_M2_END: | |||
| zgemm_kernel_L4_M1_BEGIN: | |||
| .Lzgemm_kernel_L4_M1_BEGIN: | |||
| tst counterI, #1 // counterI = counterI % 2 | |||
| ble zgemm_kernel_L4_END | |||
| ble .Lzgemm_kernel_L4_END | |||
| zgemm_kernel_L4_M1_20: | |||
| .Lzgemm_kernel_L4_M1_20: | |||
| INIT1x4 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL , #0 | |||
| ble zgemm_kernel_L4_M1_40 | |||
| ble .Lzgemm_kernel_L4_M1_40 | |||
| zgemm_kernel_L4_M1_22: | |||
| .Lzgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| @@ -1301,45 +1301,45 @@ zgemm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L4_M1_22 | |||
| bgt .Lzgemm_kernel_L4_M1_22 | |||
| zgemm_kernel_L4_M1_40: | |||
| .Lzgemm_kernel_L4_M1_40: | |||
| ands counterL , origK, #7 // counterL = counterL % 8 | |||
| ble zgemm_kernel_L4_M1_100 | |||
| ble .Lzgemm_kernel_L4_M1_100 | |||
| zgemm_kernel_L4_M1_42: | |||
| .Lzgemm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L4_M1_42 | |||
| bgt .Lzgemm_kernel_L4_M1_42 | |||
| zgemm_kernel_L4_M1_100: | |||
| .Lzgemm_kernel_L4_M1_100: | |||
| SAVE1x4 | |||
| zgemm_kernel_L4_END: | |||
| .Lzgemm_kernel_L4_END: | |||
| lsl temp, origK, #6 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 * 2 | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt zgemm_kernel_L4_BEGIN | |||
| bgt .Lzgemm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| zgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| .Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble zgemm_kernel_L999 | |||
| ble .Lzgemm_kernel_L999 | |||
| tst counterJ , #2 | |||
| ble zgemm_kernel_L1_BEGIN | |||
| ble .Lzgemm_kernel_L1_BEGIN | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| @@ -1349,24 +1349,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| zgemm_kernel_L2_M4_BEGIN: | |||
| .Lzgemm_kernel_L2_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI,#0 | |||
| ble zgemm_kernel_L2_M2_BEGIN | |||
| ble .Lzgemm_kernel_L2_M2_BEGIN | |||
| zgemm_kernel_L2_M4_20: | |||
| .Lzgemm_kernel_L2_M4_20: | |||
| INIT4x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = counterL / 8 | |||
| cmp counterL,#0 | |||
| ble zgemm_kernel_L2_M4_40 | |||
| ble .Lzgemm_kernel_L2_M4_40 | |||
| .align 5 | |||
| zgemm_kernel_L2_M4_22: | |||
| .Lzgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| @@ -1378,50 +1378,50 @@ zgemm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L2_M4_22 | |||
| bgt .Lzgemm_kernel_L2_M4_22 | |||
| zgemm_kernel_L2_M4_40: | |||
| .Lzgemm_kernel_L2_M4_40: | |||
| ands counterL , origK, #7 // counterL = origK % 8 | |||
| ble zgemm_kernel_L2_M4_100 | |||
| ble .Lzgemm_kernel_L2_M4_100 | |||
| zgemm_kernel_L2_M4_42: | |||
| .Lzgemm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L2_M4_42 | |||
| bgt .Lzgemm_kernel_L2_M4_42 | |||
| zgemm_kernel_L2_M4_100: | |||
| .Lzgemm_kernel_L2_M4_100: | |||
| SAVE4x2 | |||
| zgemm_kernel_L2_M4_END: | |||
| .Lzgemm_kernel_L2_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt zgemm_kernel_L2_M4_20 | |||
| bgt .Lzgemm_kernel_L2_M4_20 | |||
| zgemm_kernel_L2_M2_BEGIN: | |||
| .Lzgemm_kernel_L2_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble zgemm_kernel_L2_END | |||
| ble .Lzgemm_kernel_L2_END | |||
| tst counterI, #2 // test bit 1 of counterI (sets flags only; counterI unchanged) | |||
| ble zgemm_kernel_L2_M1_BEGIN | |||
| ble .Lzgemm_kernel_L2_M1_BEGIN | |||
| zgemm_kernel_L2_M2_20: | |||
| .Lzgemm_kernel_L2_M2_20: | |||
| INIT2x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = origK / 8 | |||
| cmp counterL,#0 | |||
| ble zgemm_kernel_L2_M2_40 | |||
| ble .Lzgemm_kernel_L2_M2_40 | |||
| zgemm_kernel_L2_M2_22: | |||
| .Lzgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| @@ -1434,43 +1434,43 @@ zgemm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L2_M2_22 | |||
| bgt .Lzgemm_kernel_L2_M2_22 | |||
| zgemm_kernel_L2_M2_40: | |||
| .Lzgemm_kernel_L2_M2_40: | |||
| ands counterL , origK, #7 // counterL = origK % 8 | |||
| ble zgemm_kernel_L2_M2_100 | |||
| ble .Lzgemm_kernel_L2_M2_100 | |||
| zgemm_kernel_L2_M2_42: | |||
| .Lzgemm_kernel_L2_M2_42: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L2_M2_42 | |||
| bgt .Lzgemm_kernel_L2_M2_42 | |||
| zgemm_kernel_L2_M2_100: | |||
| .Lzgemm_kernel_L2_M2_100: | |||
| SAVE2x2 | |||
| zgemm_kernel_L2_M2_END: | |||
| .Lzgemm_kernel_L2_M2_END: | |||
| zgemm_kernel_L2_M1_BEGIN: | |||
| .Lzgemm_kernel_L2_M1_BEGIN: | |||
| tst counterI, #1 // test bit 0 of counterI (sets flags only; counterI unchanged) | |||
| ble zgemm_kernel_L2_END | |||
| ble .Lzgemm_kernel_L2_END | |||
| zgemm_kernel_L2_M1_20: | |||
| .Lzgemm_kernel_L2_M1_20: | |||
| INIT1x2 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = origK / 8 | |||
| cmp counterL, #0 | |||
| ble zgemm_kernel_L2_M1_40 | |||
| ble .Lzgemm_kernel_L2_M1_40 | |||
| zgemm_kernel_L2_M1_22: | |||
| .Lzgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| @@ -1482,37 +1482,37 @@ zgemm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L2_M1_22 | |||
| bgt .Lzgemm_kernel_L2_M1_22 | |||
| zgemm_kernel_L2_M1_40: | |||
| .Lzgemm_kernel_L2_M1_40: | |||
| ands counterL , origK, #7 // counterL = origK % 8 | |||
| ble zgemm_kernel_L2_M1_100 | |||
| ble .Lzgemm_kernel_L2_M1_100 | |||
| zgemm_kernel_L2_M1_42: | |||
| .Lzgemm_kernel_L2_M1_42: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L2_M1_42 | |||
| bgt .Lzgemm_kernel_L2_M1_42 | |||
| zgemm_kernel_L2_M1_100: | |||
| .Lzgemm_kernel_L2_M1_100: | |||
| SAVE1x2 | |||
| zgemm_kernel_L2_END: | |||
| .Lzgemm_kernel_L2_END: | |||
| lsl temp, origK, #5 | |||
| add origPB, origPB, temp // B = B + K * 2 * 8 * 2 | |||
| /******************************************************************************/ | |||
| zgemm_kernel_L1_BEGIN: | |||
| .Lzgemm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble zgemm_kernel_L999 // done | |||
| ble .Lzgemm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| @@ -1522,24 +1522,24 @@ zgemm_kernel_L1_BEGIN: | |||
| zgemm_kernel_L1_M4_BEGIN: | |||
| .Lzgemm_kernel_L1_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble zgemm_kernel_L1_M2_BEGIN | |||
| ble .Lzgemm_kernel_L1_M2_BEGIN | |||
| zgemm_kernel_L1_M4_20: | |||
| .Lzgemm_kernel_L1_M4_20: | |||
| INIT4x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = origK / 8 | |||
| cmp counterL , #0 | |||
| ble zgemm_kernel_L1_M4_40 | |||
| ble .Lzgemm_kernel_L1_M4_40 | |||
| .align 5 | |||
| zgemm_kernel_L1_M4_22: | |||
| .Lzgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| @@ -1551,50 +1551,50 @@ zgemm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L1_M4_22 | |||
| bgt .Lzgemm_kernel_L1_M4_22 | |||
| zgemm_kernel_L1_M4_40: | |||
| .Lzgemm_kernel_L1_M4_40: | |||
| ands counterL , origK, #7 // counterL = origK % 8 | |||
| ble zgemm_kernel_L1_M4_100 | |||
| ble .Lzgemm_kernel_L1_M4_100 | |||
| zgemm_kernel_L1_M4_42: | |||
| .Lzgemm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L1_M4_42 | |||
| bgt .Lzgemm_kernel_L1_M4_42 | |||
| zgemm_kernel_L1_M4_100: | |||
| .Lzgemm_kernel_L1_M4_100: | |||
| SAVE4x1 | |||
| zgemm_kernel_L1_M4_END: | |||
| .Lzgemm_kernel_L1_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt zgemm_kernel_L1_M4_20 | |||
| bgt .Lzgemm_kernel_L1_M4_20 | |||
| zgemm_kernel_L1_M2_BEGIN: | |||
| .Lzgemm_kernel_L1_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble zgemm_kernel_L1_END | |||
| ble .Lzgemm_kernel_L1_END | |||
| tst counterI, #2 // test bit 1 of counterI (sets flags only; counterI unchanged) | |||
| ble zgemm_kernel_L1_M1_BEGIN | |||
| ble .Lzgemm_kernel_L1_M1_BEGIN | |||
| zgemm_kernel_L1_M2_20: | |||
| .Lzgemm_kernel_L1_M2_20: | |||
| INIT2x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = origK / 8 | |||
| cmp counterL , #0 | |||
| ble zgemm_kernel_L1_M2_40 | |||
| ble .Lzgemm_kernel_L1_M2_40 | |||
| zgemm_kernel_L1_M2_22: | |||
| .Lzgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| @@ -1607,43 +1607,43 @@ zgemm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L1_M2_22 | |||
| bgt .Lzgemm_kernel_L1_M2_22 | |||
| zgemm_kernel_L1_M2_40: | |||
| .Lzgemm_kernel_L1_M2_40: | |||
| ands counterL , origK, #7 // counterL = origK % 8 | |||
| ble zgemm_kernel_L1_M2_100 | |||
| ble .Lzgemm_kernel_L1_M2_100 | |||
| zgemm_kernel_L1_M2_42: | |||
| .Lzgemm_kernel_L1_M2_42: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L1_M2_42 | |||
| bgt .Lzgemm_kernel_L1_M2_42 | |||
| zgemm_kernel_L1_M2_100: | |||
| .Lzgemm_kernel_L1_M2_100: | |||
| SAVE2x1 | |||
| zgemm_kernel_L1_M2_END: | |||
| .Lzgemm_kernel_L1_M2_END: | |||
| zgemm_kernel_L1_M1_BEGIN: | |||
| .Lzgemm_kernel_L1_M1_BEGIN: | |||
| tst counterI, #1 // test bit 0 of counterI (sets flags only; counterI unchanged) | |||
| ble zgemm_kernel_L1_END | |||
| ble .Lzgemm_kernel_L1_END | |||
| zgemm_kernel_L1_M1_20: | |||
| .Lzgemm_kernel_L1_M1_20: | |||
| INIT1x1 | |||
| mov pB, origPB | |||
| asr counterL , origK, #3 // counterL = origK / 8 | |||
| cmp counterL , #0 | |||
| ble zgemm_kernel_L1_M1_40 | |||
| ble .Lzgemm_kernel_L1_M1_40 | |||
| zgemm_kernel_L1_M1_22: | |||
| .Lzgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| @@ -1655,30 +1655,30 @@ zgemm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L1_M1_22 | |||
| bgt .Lzgemm_kernel_L1_M1_22 | |||
| zgemm_kernel_L1_M1_40: | |||
| .Lzgemm_kernel_L1_M1_40: | |||
| ands counterL , origK, #7 // counterL = origK % 8 | |||
| ble zgemm_kernel_L1_M1_100 | |||
| ble .Lzgemm_kernel_L1_M1_100 | |||
| zgemm_kernel_L1_M1_42: | |||
| .Lzgemm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L1_M1_42 | |||
| bgt .Lzgemm_kernel_L1_M1_42 | |||
| zgemm_kernel_L1_M1_100: | |||
| .Lzgemm_kernel_L1_M1_100: | |||
| SAVE1x1 | |||
| zgemm_kernel_L1_END: | |||
| .Lzgemm_kernel_L1_END: | |||
| zgemm_kernel_L999: | |||
| .Lzgemm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| @@ -364,9 +364,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| SAVE_REGS | |||
| cmp N, xzr | |||
| ble zgemv_n_kernel_L999 | |||
| ble .Lzgemv_n_kernel_L999 | |||
| cmp M, xzr | |||
| ble zgemv_n_kernel_L999 | |||
| ble .Lzgemv_n_kernel_L999 | |||
| lsl LDA, LDA, #SHZ | |||
| lsl INC_X, INC_X, #SHZ | |||
| @@ -375,9 +375,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| INIT | |||
| cmp INC_Y, #1 | |||
| bne zgemv_n_kernel_S_BEGIN | |||
| bne .Lzgemv_n_kernel_S_BEGIN | |||
| zgemv_n_kernel_F_LOOP: | |||
| .Lzgemv_n_kernel_F_LOOP: | |||
| mov A_PTR, A | |||
| mov Y_IPTR, Y | |||
| mov Y_OPTR, Y | |||
| @@ -387,40 +387,40 @@ zgemv_n_kernel_F_LOOP: | |||
| asr I, M, #2 | |||
| cmp I, xzr | |||
| beq zgemv_n_kernel_F1 | |||
| beq .Lzgemv_n_kernel_F1 | |||
| zgemv_n_kernel_F4: | |||
| .Lzgemv_n_kernel_F4: | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne zgemv_n_kernel_F4 | |||
| bne .Lzgemv_n_kernel_F4 | |||
| zgemv_n_kernel_F1: | |||
| .Lzgemv_n_kernel_F1: | |||
| ands I, M, #3 | |||
| ble zgemv_n_kernel_F_END | |||
| ble .Lzgemv_n_kernel_F_END | |||
| zgemv_n_kernel_F10: | |||
| .Lzgemv_n_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne zgemv_n_kernel_F10 | |||
| bne .Lzgemv_n_kernel_F10 | |||
| zgemv_n_kernel_F_END: | |||
| .Lzgemv_n_kernel_F_END: | |||
| add A, A, LDA | |||
| subs J, J, #1 | |||
| bne zgemv_n_kernel_F_LOOP | |||
| bne .Lzgemv_n_kernel_F_LOOP | |||
| b zgemv_n_kernel_L999 | |||
| b .Lzgemv_n_kernel_L999 | |||
| zgemv_n_kernel_S_BEGIN: | |||
| .Lzgemv_n_kernel_S_BEGIN: | |||
| INIT_S | |||
| zgemv_n_kernel_S_LOOP: | |||
| .Lzgemv_n_kernel_S_LOOP: | |||
| mov A_PTR, A | |||
| mov Y_IPTR, Y | |||
| mov Y_OPTR, Y | |||
| @@ -430,9 +430,9 @@ zgemv_n_kernel_S_LOOP: | |||
| asr I, M, #2 | |||
| cmp I, xzr | |||
| ble zgemv_n_kernel_S1 | |||
| ble .Lzgemv_n_kernel_S1 | |||
| zgemv_n_kernel_S4: | |||
| .Lzgemv_n_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -440,27 +440,27 @@ zgemv_n_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne zgemv_n_kernel_S4 | |||
| bne .Lzgemv_n_kernel_S4 | |||
| zgemv_n_kernel_S1: | |||
| .Lzgemv_n_kernel_S1: | |||
| ands I, M, #3 | |||
| ble zgemv_n_kernel_S_END | |||
| ble .Lzgemv_n_kernel_S_END | |||
| zgemv_n_kernel_S10: | |||
| .Lzgemv_n_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne zgemv_n_kernel_S10 | |||
| bne .Lzgemv_n_kernel_S10 | |||
| zgemv_n_kernel_S_END: | |||
| .Lzgemv_n_kernel_S_END: | |||
| add A, A, LDA | |||
| subs J, J, #1 | |||
| bne zgemv_n_kernel_S_LOOP | |||
| bne .Lzgemv_n_kernel_S_LOOP | |||
| zgemv_n_kernel_L999: | |||
| .Lzgemv_n_kernel_L999: | |||
| RESTORE_REGS | |||
| mov w0, wzr | |||
| @@ -292,9 +292,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| SAVE_REGS | |||
| cmp N, xzr | |||
| ble zgemv_t_kernel_L999 | |||
| ble .Lzgemv_t_kernel_L999 | |||
| cmp M, xzr | |||
| ble zgemv_t_kernel_L999 | |||
| ble .Lzgemv_t_kernel_L999 | |||
| lsl LDA, LDA, #SHZ | |||
| lsl INC_Y, INC_Y, #SHZ | |||
| @@ -303,9 +303,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| INIT | |||
| cmp INC_X, #1 | |||
| bne zgemv_t_kernel_S_BEGIN | |||
| bne .Lzgemv_t_kernel_S_BEGIN | |||
| zgemv_t_kernel_F_LOOP: | |||
| .Lzgemv_t_kernel_F_LOOP: | |||
| mov A_PTR, A | |||
| mov X_PTR, X | |||
| @@ -314,30 +314,30 @@ zgemv_t_kernel_F_LOOP: | |||
| asr I, M, #2 | |||
| cmp I, xzr | |||
| beq zgemv_t_kernel_F1 | |||
| beq .Lzgemv_t_kernel_F1 | |||
| zgemv_t_kernel_F4: | |||
| .Lzgemv_t_kernel_F4: | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne zgemv_t_kernel_F4 | |||
| bne .Lzgemv_t_kernel_F4 | |||
| KERNEL_F4_FINALIZE | |||
| zgemv_t_kernel_F1: | |||
| .Lzgemv_t_kernel_F1: | |||
| ands I, M, #3 | |||
| ble zgemv_t_kernel_F_END | |||
| ble .Lzgemv_t_kernel_F_END | |||
| zgemv_t_kernel_F10: | |||
| .Lzgemv_t_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne zgemv_t_kernel_F10 | |||
| bne .Lzgemv_t_kernel_F10 | |||
| zgemv_t_kernel_F_END: | |||
| .Lzgemv_t_kernel_F_END: | |||
| #if !defined(DOUBLE) | |||
| ld1 {v4.2s}, [Y] | |||
| @@ -355,15 +355,15 @@ zgemv_t_kernel_F_END: | |||
| add A, A, LDA | |||
| subs J, J, #1 | |||
| bne zgemv_t_kernel_F_LOOP | |||
| bne .Lzgemv_t_kernel_F_LOOP | |||
| b zgemv_t_kernel_L999 | |||
| b .Lzgemv_t_kernel_L999 | |||
| zgemv_t_kernel_S_BEGIN: | |||
| .Lzgemv_t_kernel_S_BEGIN: | |||
| INIT_S | |||
| zgemv_t_kernel_S_LOOP: | |||
| .Lzgemv_t_kernel_S_LOOP: | |||
| mov A_PTR, A | |||
| mov X_PTR, X | |||
| @@ -371,9 +371,9 @@ zgemv_t_kernel_S_LOOP: | |||
| asr I, M, #2 | |||
| cmp I, xzr | |||
| ble zgemv_t_kernel_S1 | |||
| ble .Lzgemv_t_kernel_S1 | |||
| zgemv_t_kernel_S4: | |||
| .Lzgemv_t_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -381,21 +381,21 @@ zgemv_t_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne zgemv_t_kernel_S4 | |||
| bne .Lzgemv_t_kernel_S4 | |||
| zgemv_t_kernel_S1: | |||
| .Lzgemv_t_kernel_S1: | |||
| ands I, M, #3 | |||
| ble zgemv_t_kernel_S_END | |||
| ble .Lzgemv_t_kernel_S_END | |||
| zgemv_t_kernel_S10: | |||
| .Lzgemv_t_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne zgemv_t_kernel_S10 | |||
| bne .Lzgemv_t_kernel_S10 | |||
| zgemv_t_kernel_S_END: | |||
| .Lzgemv_t_kernel_S_END: | |||
| #if !defined(DOUBLE) | |||
| ld1 {v4.2s}, [Y] | |||
| @@ -413,9 +413,9 @@ zgemv_t_kernel_S_END: | |||
| add A, A, LDA | |||
| subs J, J, #1 | |||
| bne zgemv_t_kernel_S_LOOP | |||
| bne .Lzgemv_t_kernel_S_LOOP | |||
| zgemv_t_kernel_L999: | |||
| .Lzgemv_t_kernel_L999: | |||
| RESTORE_REGS | |||
| mov w0, wzr | |||
| ret | |||
| @@ -226,43 +226,43 @@ KERNEL_S1_END_\@: | |||
| INIT | |||
| cmp N, #0 | |||
| ble nrm2_kernel_L999 | |||
| ble .Lznrm2_kernel_L999 | |||
| cmp INC_X, #0 | |||
| beq nrm2_kernel_L999 | |||
| beq .Lznrm2_kernel_L999 | |||
| cmp INC_X, #1 | |||
| bne nrm2_kernel_S_BEGIN | |||
| bne .Lznrm2_kernel_S_BEGIN | |||
| nrm2_kernel_F_BEGIN: | |||
| .Lznrm2_kernel_F_BEGIN: | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, xzr | |||
| ble nrm2_kernel_F1 | |||
| ble .Lznrm2_kernel_F1 | |||
| nrm2_kernel_F8: | |||
| .Lznrm2_kernel_F8: | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne nrm2_kernel_F8 | |||
| bne .Lznrm2_kernel_F8 | |||
| nrm2_kernel_F1: | |||
| .Lznrm2_kernel_F1: | |||
| ands I, N, #7 | |||
| ble nrm2_kernel_L999 | |||
| ble .Lznrm2_kernel_L999 | |||
| nrm2_kernel_F10: | |||
| .Lznrm2_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne nrm2_kernel_F10 | |||
| bne .Lznrm2_kernel_F10 | |||
| b nrm2_kernel_L999 | |||
| b .Lznrm2_kernel_L999 | |||
| nrm2_kernel_S_BEGIN: | |||
| .Lznrm2_kernel_S_BEGIN: | |||
| INIT_S | |||
| @@ -270,15 +270,15 @@ nrm2_kernel_S_BEGIN: | |||
| .align 5 | |||
| nrm2_kernel_S10: | |||
| .Lznrm2_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne nrm2_kernel_S10 | |||
| bne .Lznrm2_kernel_S10 | |||
| nrm2_kernel_L999: | |||
| .Lznrm2_kernel_L999: | |||
| fsqrt SSQ, SSQ | |||
| fmul SSQ, SCALE, SSQ | |||
| @@ -181,54 +181,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| cmp N, xzr | |||
| ble rot_kernel_L999 | |||
| ble .Lzrot_kernel_L999 | |||
| INIT | |||
| cmp INC_X, #1 | |||
| bne rot_kernel_S_BEGIN | |||
| bne .Lzrot_kernel_S_BEGIN | |||
| cmp INC_Y, #1 | |||
| bne rot_kernel_S_BEGIN | |||
| bne .Lzrot_kernel_S_BEGIN | |||
| rot_kernel_F_BEGIN: | |||
| .Lzrot_kernel_F_BEGIN: | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| beq rot_kernel_F1 | |||
| beq .Lzrot_kernel_F1 | |||
| KERNEL_INIT_F4 | |||
| rot_kernel_F4: | |||
| .Lzrot_kernel_F4: | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne rot_kernel_F4 | |||
| bne .Lzrot_kernel_F4 | |||
| rot_kernel_F1: | |||
| .Lzrot_kernel_F1: | |||
| ands I, N, #3 | |||
| ble rot_kernel_L999 | |||
| ble .Lzrot_kernel_L999 | |||
| rot_kernel_F10: | |||
| .Lzrot_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne rot_kernel_F10 | |||
| bne .Lzrot_kernel_F10 | |||
| mov w0, wzr | |||
| ret | |||
| rot_kernel_S_BEGIN: | |||
| .Lzrot_kernel_S_BEGIN: | |||
| INIT_S | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble rot_kernel_S1 | |||
| ble .Lzrot_kernel_S1 | |||
| rot_kernel_S4: | |||
| .Lzrot_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -236,21 +236,21 @@ rot_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne rot_kernel_S4 | |||
| bne .Lzrot_kernel_S4 | |||
| rot_kernel_S1: | |||
| .Lzrot_kernel_S1: | |||
| ands I, N, #3 | |||
| ble rot_kernel_L999 | |||
| ble .Lzrot_kernel_L999 | |||
| rot_kernel_S10: | |||
| .Lzrot_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne rot_kernel_S10 | |||
| bne .Lzrot_kernel_S10 | |||
| rot_kernel_L999: | |||
| .Lzrot_kernel_L999: | |||
| mov w0, wzr | |||
| ret | |||
| @@ -215,71 +215,71 @@ zscal_begin: | |||
| mov X_COPY, X | |||
| cmp N, xzr | |||
| ble zscal_kernel_L999 | |||
| ble .Lzscal_kernel_L999 | |||
| fcmp DA_R, #0.0 | |||
| bne zscal_kernel_R_non_zero | |||
| bne .Lzscal_kernel_R_non_zero | |||
| fcmp DA_I, #0.0 | |||
| beq zscal_kernel_RI_zero | |||
| beq .Lzscal_kernel_RI_zero | |||
| b zscal_kernel_R_zero | |||
| b .Lzscal_kernel_R_zero | |||
| zscal_kernel_R_non_zero: | |||
| .Lzscal_kernel_R_non_zero: | |||
| fcmp DA_I, #0.0 | |||
| beq zscal_kernel_I_zero | |||
| beq .Lzscal_kernel_I_zero | |||
| /******************************************************************************* | |||
| * A_R != 0 && A_I != 0 | |||
| *******************************************************************************/ | |||
| zscal_kernel_RI_non_zero: | |||
| .Lzscal_kernel_RI_non_zero: | |||
| INIT | |||
| cmp INC_X, #1 | |||
| bne zscal_kernel_S_BEGIN | |||
| bne .Lzscal_kernel_S_BEGIN | |||
| zscal_kernel_F_BEGIN: | |||
| .Lzscal_kernel_F_BEGIN: | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| beq zscal_kernel_F1 | |||
| beq .Lzscal_kernel_F1 | |||
| KERNEL_INIT_F4 | |||
| zscal_kernel_F4: | |||
| .Lzscal_kernel_F4: | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne zscal_kernel_F4 | |||
| bne .Lzscal_kernel_F4 | |||
| zscal_kernel_F1: | |||
| .Lzscal_kernel_F1: | |||
| ands I, N, #3 | |||
| ble zscal_kernel_L999 | |||
| ble .Lzscal_kernel_L999 | |||
| zscal_kernel_F10: | |||
| .Lzscal_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne zscal_kernel_F10 | |||
| bne .Lzscal_kernel_F10 | |||
| mov w0, wzr | |||
| ret | |||
| zscal_kernel_S_BEGIN: | |||
| .Lzscal_kernel_S_BEGIN: | |||
| INIT_S | |||
| asr I, N, #2 | |||
| cmp I, xzr | |||
| ble zscal_kernel_S1 | |||
| ble .Lzscal_kernel_S1 | |||
| zscal_kernel_S4: | |||
| .Lzscal_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| @@ -287,21 +287,21 @@ zscal_kernel_S4: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne zscal_kernel_S4 | |||
| bne .Lzscal_kernel_S4 | |||
| zscal_kernel_S1: | |||
| .Lzscal_kernel_S1: | |||
| ands I, N, #3 | |||
| ble zscal_kernel_L999 | |||
| ble .Lzscal_kernel_L999 | |||
| zscal_kernel_S10: | |||
| .Lzscal_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne zscal_kernel_S10 | |||
| bne .Lzscal_kernel_S10 | |||
| zscal_kernel_L999: | |||
| .Lzscal_kernel_L999: | |||
| mov w0, wzr | |||
| ret | |||
| @@ -310,7 +310,7 @@ zscal_kernel_L999: | |||
| * A_R == 0 && A_I != 0 | |||
| *******************************************************************************/ | |||
| zscal_kernel_R_zero: | |||
| .Lzscal_kernel_R_zero: | |||
| INIT_S | |||
| #if !defined(DOUBLE) | |||
| @@ -323,7 +323,7 @@ zscal_kernel_R_zero: | |||
| ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I | |||
| #endif | |||
| zscal_kernel_R_zero_1: | |||
| .Lzscal_kernel_R_zero_1: | |||
| #if !defined(DOUBLE) | |||
| ld1 {v2.2s}, [X] // X1, X0 | |||
| fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0 | |||
| @@ -337,7 +337,7 @@ zscal_kernel_R_zero_1: | |||
| #endif | |||
| add X, X, INC_X | |||
| subs N, N, #1 | |||
| bne zscal_kernel_R_zero_1 | |||
| bne .Lzscal_kernel_R_zero_1 | |||
| mov w0, wzr | |||
| ret | |||
| @@ -346,7 +346,7 @@ zscal_kernel_R_zero_1: | |||
| * A_R != 0 && A_I == 0 | |||
| *******************************************************************************/ | |||
| zscal_kernel_I_zero: | |||
| .Lzscal_kernel_I_zero: | |||
| INIT_S | |||
| #if !defined(DOUBLE) | |||
| ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R | |||
| @@ -354,7 +354,7 @@ zscal_kernel_I_zero: | |||
| ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R | |||
| #endif | |||
| zscal_kernel_I_zero_1: | |||
| .Lzscal_kernel_I_zero_1: | |||
| #if !defined(DOUBLE) | |||
| ld1 {v2.2s}, [X] // X1, X0 | |||
| fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 | |||
| @@ -366,7 +366,7 @@ zscal_kernel_I_zero_1: | |||
| #endif | |||
| add X, X, INC_X | |||
| subs N, N, #1 | |||
| bne zscal_kernel_I_zero_1 | |||
| bne .Lzscal_kernel_I_zero_1 | |||
| mov w0, wzr | |||
| ret | |||
| @@ -375,16 +375,16 @@ zscal_kernel_I_zero_1: | |||
| * A_R == 0 && A_I == 0 | |||
| *******************************************************************************/ | |||
| zscal_kernel_RI_zero: | |||
| .Lzscal_kernel_RI_zero: | |||
| INIT_S | |||
| zscal_kernel_RI_zero_1: | |||
| .Lzscal_kernel_RI_zero_1: | |||
| stp DA_R, DA_I, [X] | |||
| add X, X, INC_X | |||
| subs N, N, #1 | |||
| bne zscal_kernel_RI_zero_1 | |||
| bne .Lzscal_kernel_RI_zero_1 | |||
| mov w0, wzr | |||
| ret | |||
| @@ -1078,9 +1078,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| mov counterJ, origN | |||
| asr counterJ, counterJ, #2 // J = J / 4 | |||
| cmp counterJ, #0 | |||
| ble ztrmm_kernel_L2_BEGIN | |||
| ble .Lztrmm_kernel_L2_BEGIN | |||
| ztrmm_kernel_L4_BEGIN: | |||
| .Lztrmm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| @@ -1094,15 +1094,15 @@ ztrmm_kernel_L4_BEGIN: | |||
| #endif | |||
| mov pA, origPA // pA = start of A array | |||
| ztrmm_kernel_L4_M4_BEGIN: | |||
| .Lztrmm_kernel_L4_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble ztrmm_kernel_L4_M2_BEGIN | |||
| ble .Lztrmm_kernel_L4_M2_BEGIN | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_20: | |||
| .Lztrmm_kernel_L4_M4_20: | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| mov pB, origPB | |||
| @@ -1123,7 +1123,7 @@ ztrmm_kernel_L4_M4_20: | |||
| asr counterL , tempK, #3 | |||
| cmp counterL , #2 | |||
| blt ztrmm_kernel_L4_M4_32 | |||
| blt .Lztrmm_kernel_L4_M4_32 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| @@ -1135,10 +1135,10 @@ ztrmm_kernel_L4_M4_20: | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #2 | |||
| ble ztrmm_kernel_L4_M4_22a | |||
| ble .Lztrmm_kernel_L4_M4_22a | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_22: | |||
| .Lztrmm_kernel_L4_M4_22: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| @@ -1150,10 +1150,10 @@ ztrmm_kernel_L4_M4_22: | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L4_M4_22 | |||
| bgt .Lztrmm_kernel_L4_M4_22 | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_22a: | |||
| .Lztrmm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| @@ -1164,13 +1164,13 @@ ztrmm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b ztrmm_kernel_L4_M4_44 | |||
| b .Lztrmm_kernel_L4_M4_44 | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_32: | |||
| .Lztrmm_kernel_L4_M4_32: | |||
| tst counterL, #1 | |||
| ble ztrmm_kernel_L4_M4_40 | |||
| ble .Lztrmm_kernel_L4_M4_40 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| @@ -1181,26 +1181,26 @@ ztrmm_kernel_L4_M4_32: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b ztrmm_kernel_L4_M4_44 | |||
| b .Lztrmm_kernel_L4_M4_44 | |||
| ztrmm_kernel_L4_M4_40: | |||
| .Lztrmm_kernel_L4_M4_40: | |||
| INIT4x4 | |||
| ztrmm_kernel_L4_M4_44: | |||
| .Lztrmm_kernel_L4_M4_44: | |||
| ands counterL , tempK, #7 | |||
| ble ztrmm_kernel_L4_M4_100 | |||
| ble .Lztrmm_kernel_L4_M4_100 | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_46: | |||
| .Lztrmm_kernel_L4_M4_46: | |||
| KERNEL4x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne ztrmm_kernel_L4_M4_46 | |||
| bne .Lztrmm_kernel_L4_M4_46 | |||
| ztrmm_kernel_L4_M4_100: | |||
| .Lztrmm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| @@ -1223,20 +1223,20 @@ ztrmm_kernel_L4_M4_100: | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| ztrmm_kernel_L4_M4_END: | |||
| .Lztrmm_kernel_L4_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bne ztrmm_kernel_L4_M4_20 | |||
| bne .Lztrmm_kernel_L4_M4_20 | |||
| ztrmm_kernel_L4_M2_BEGIN: | |||
| .Lztrmm_kernel_L4_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble ztrmm_kernel_L4_END | |||
| ble .Lztrmm_kernel_L4_END | |||
| tst counterI, #2 // test bit 1 of counterI (sets flags only; counterI unchanged) | |||
| ble ztrmm_kernel_L4_M1_BEGIN | |||
| ble .Lztrmm_kernel_L4_M1_BEGIN | |||
| ztrmm_kernel_L4_M2_20: | |||
| .Lztrmm_kernel_L4_M2_20: | |||
| INIT2x4 | |||
| @@ -1260,9 +1260,9 @@ ztrmm_kernel_L4_M2_20: | |||
| asr counterL , tempK, #3 // counterL = tempK / 8 | |||
| cmp counterL , #0 | |||
| ble ztrmm_kernel_L4_M2_40 | |||
| ble .Lztrmm_kernel_L4_M2_40 | |||
| ztrmm_kernel_L4_M2_22: | |||
| .Lztrmm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| KERNEL2x4_SUB | |||
| @@ -1275,22 +1275,22 @@ ztrmm_kernel_L4_M2_22: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L4_M2_22 | |||
| bgt .Lztrmm_kernel_L4_M2_22 | |||
| ztrmm_kernel_L4_M2_40: | |||
| .Lztrmm_kernel_L4_M2_40: | |||
| ands counterL , tempK, #7 // counterL = tempK % 8 | |||
| ble ztrmm_kernel_L4_M2_100 | |||
| ble .Lztrmm_kernel_L4_M2_100 | |||
| ztrmm_kernel_L4_M2_42: | |||
| .Lztrmm_kernel_L4_M2_42: | |||
| KERNEL2x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L4_M2_42 | |||
| bgt .Lztrmm_kernel_L4_M2_42 | |||
| ztrmm_kernel_L4_M2_100: | |||
| .Lztrmm_kernel_L4_M2_100: | |||
| SAVE2x4 | |||
| @@ -1310,15 +1310,15 @@ ztrmm_kernel_L4_M2_100: | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| ztrmm_kernel_L4_M2_END: | |||
| .Lztrmm_kernel_L4_M2_END: | |||
| ztrmm_kernel_L4_M1_BEGIN: | |||
| .Lztrmm_kernel_L4_M1_BEGIN: | |||
| tst counterI, #1 // test bit 0 of counterI (sets flags only; counterI unchanged) | |||
| ble ztrmm_kernel_L4_END | |||
| ble .Lztrmm_kernel_L4_END | |||
| ztrmm_kernel_L4_M1_20: | |||
| .Lztrmm_kernel_L4_M1_20: | |||
| INIT1x4 | |||
| @@ -1342,9 +1342,9 @@ ztrmm_kernel_L4_M1_20: | |||
| asr counterL , tempK, #3 // counterL = tempK / 8 | |||
| cmp counterL , #0 | |||
| ble ztrmm_kernel_L4_M1_40 | |||
| ble .Lztrmm_kernel_L4_M1_40 | |||
| ztrmm_kernel_L4_M1_22: | |||
| .Lztrmm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| KERNEL1x4_SUB | |||
| @@ -1356,22 +1356,22 @@ ztrmm_kernel_L4_M1_22: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L4_M1_22 | |||
| bgt .Lztrmm_kernel_L4_M1_22 | |||
| ztrmm_kernel_L4_M1_40: | |||
| .Lztrmm_kernel_L4_M1_40: | |||
| ands counterL , tempK, #7 // counterL = tempK % 8 | |||
| ble ztrmm_kernel_L4_M1_100 | |||
| ble .Lztrmm_kernel_L4_M1_100 | |||
| ztrmm_kernel_L4_M1_42: | |||
| .Lztrmm_kernel_L4_M1_42: | |||
| KERNEL1x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L4_M1_42 | |||
| bgt .Lztrmm_kernel_L4_M1_42 | |||
| ztrmm_kernel_L4_M1_100: | |||
| .Lztrmm_kernel_L4_M1_100: | |||
| SAVE1x4 | |||
| @@ -1392,7 +1392,7 @@ ztrmm_kernel_L4_M1_100: | |||
| #endif | |||
| ztrmm_kernel_L4_END: | |||
| .Lztrmm_kernel_L4_END: | |||
| lsl temp, origK, #6 | |||
| add origPB, origPB, temp // B = B + K * 4 * 8 * 2 | |||
| @@ -1402,19 +1402,19 @@ ztrmm_kernel_L4_END: | |||
| #endif | |||
| subs counterJ, counterJ , #1 // j-- | |||
| bgt ztrmm_kernel_L4_BEGIN | |||
| bgt .Lztrmm_kernel_L4_BEGIN | |||
| /******************************************************************************/ | |||
| ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| .Lztrmm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov counterJ , origN | |||
| tst counterJ , #3 | |||
| ble ztrmm_kernel_L999 // N % 4 == 0: no columns left, done | |||
| ble .Lztrmm_kernel_L999 // N % 4 == 0: no columns left, done | |||
| tst counterJ , #2 | |||
| ble ztrmm_kernel_L1_BEGIN | |||
| ble .Lztrmm_kernel_L1_BEGIN | |||
| mov pCRow0, pC // pCRow0 = pC | |||
| @@ -1426,14 +1426,14 @@ ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction | |||
| mov pA, origPA // pA = A | |||
| ztrmm_kernel_L2_M4_BEGIN: | |||
| .Lztrmm_kernel_L2_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI,#0 | |||
| ble ztrmm_kernel_L2_M2_BEGIN | |||
| ble .Lztrmm_kernel_L2_M2_BEGIN | |||
| ztrmm_kernel_L2_M4_20: | |||
| .Lztrmm_kernel_L2_M4_20: | |||
| INIT4x2 | |||
| @@ -1457,10 +1457,10 @@ ztrmm_kernel_L2_M4_20: | |||
| asr counterL , tempK, #3 // counterL = tempK / 8 | |||
| cmp counterL,#0 | |||
| ble ztrmm_kernel_L2_M4_40 | |||
| ble .Lztrmm_kernel_L2_M4_40 | |||
| .align 5 | |||
| ztrmm_kernel_L2_M4_22: | |||
| .Lztrmm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| KERNEL4x2_SUB | |||
| @@ -1472,22 +1472,22 @@ ztrmm_kernel_L2_M4_22: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L2_M4_22 | |||
| bgt .Lztrmm_kernel_L2_M4_22 | |||
| ztrmm_kernel_L2_M4_40: | |||
| .Lztrmm_kernel_L2_M4_40: | |||
| ands counterL , tempK, #7 // counterL = tempK % 8 | |||
| ble ztrmm_kernel_L2_M4_100 | |||
| ble .Lztrmm_kernel_L2_M4_100 | |||
| ztrmm_kernel_L2_M4_42: | |||
| .Lztrmm_kernel_L2_M4_42: | |||
| KERNEL4x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L2_M4_42 | |||
| bgt .Lztrmm_kernel_L2_M4_42 | |||
| ztrmm_kernel_L2_M4_100: | |||
| .Lztrmm_kernel_L2_M4_100: | |||
| SAVE4x2 | |||
| @@ -1507,22 +1507,22 @@ ztrmm_kernel_L2_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| ztrmm_kernel_L2_M4_END: | |||
| .Lztrmm_kernel_L2_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt ztrmm_kernel_L2_M4_20 | |||
| bgt .Lztrmm_kernel_L2_M4_20 | |||
| ztrmm_kernel_L2_M2_BEGIN: | |||
| .Lztrmm_kernel_L2_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble ztrmm_kernel_L2_END | |||
| ble .Lztrmm_kernel_L2_END | |||
| tst counterI, #2 // test bit 1 of counterI (sets flags only; counterI unchanged) | |||
| ble ztrmm_kernel_L2_M1_BEGIN | |||
| ble .Lztrmm_kernel_L2_M1_BEGIN | |||
| ztrmm_kernel_L2_M2_20: | |||
| .Lztrmm_kernel_L2_M2_20: | |||
| INIT2x2 | |||
| @@ -1546,9 +1546,9 @@ ztrmm_kernel_L2_M2_20: | |||
| asr counterL , tempK, #3 // counterL = tempK / 8 | |||
| cmp counterL,#0 | |||
| ble ztrmm_kernel_L2_M2_40 | |||
| ble .Lztrmm_kernel_L2_M2_40 | |||
| ztrmm_kernel_L2_M2_22: | |||
| .Lztrmm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| KERNEL2x2_SUB | |||
| @@ -1561,22 +1561,22 @@ ztrmm_kernel_L2_M2_22: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L2_M2_22 | |||
| bgt .Lztrmm_kernel_L2_M2_22 | |||
| ztrmm_kernel_L2_M2_40: | |||
| .Lztrmm_kernel_L2_M2_40: | |||
| ands counterL , tempK, #7 // counterL = tempK % 8 | |||
| ble ztrmm_kernel_L2_M2_100 | |||
| ble .Lztrmm_kernel_L2_M2_100 | |||
| ztrmm_kernel_L2_M2_42: | |||
| .Lztrmm_kernel_L2_M2_42: | |||
| KERNEL2x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L2_M2_42 | |||
| bgt .Lztrmm_kernel_L2_M2_42 | |||
| ztrmm_kernel_L2_M2_100: | |||
| .Lztrmm_kernel_L2_M2_100: | |||
| SAVE2x2 | |||
| @@ -1596,15 +1596,15 @@ ztrmm_kernel_L2_M2_100: | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| ztrmm_kernel_L2_M2_END: | |||
| .Lztrmm_kernel_L2_M2_END: | |||
| ztrmm_kernel_L2_M1_BEGIN: | |||
| .Lztrmm_kernel_L2_M1_BEGIN: | |||
| tst counterI, #1 // test bit 0 of counterI, i.e. counterI % 2 (counterI is not modified) | |||
| ble ztrmm_kernel_L2_END | |||
| ble .Lztrmm_kernel_L2_END | |||
| ztrmm_kernel_L2_M1_20: | |||
| .Lztrmm_kernel_L2_M1_20: | |||
| INIT1x2 | |||
| @@ -1628,9 +1628,9 @@ ztrmm_kernel_L2_M1_20: | |||
| asr counterL , tempK, #3 // counterL = tempK / 8 | |||
| cmp counterL, #0 | |||
| ble ztrmm_kernel_L2_M1_40 | |||
| ble .Lztrmm_kernel_L2_M1_40 | |||
| ztrmm_kernel_L2_M1_22: | |||
| .Lztrmm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| KERNEL1x2_SUB | |||
| @@ -1642,22 +1642,22 @@ ztrmm_kernel_L2_M1_22: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L2_M1_22 | |||
| bgt .Lztrmm_kernel_L2_M1_22 | |||
| ztrmm_kernel_L2_M1_40: | |||
| .Lztrmm_kernel_L2_M1_40: | |||
| ands counterL , tempK, #7 // counterL = tempK % 8 | |||
| ble ztrmm_kernel_L2_M1_100 | |||
| ble .Lztrmm_kernel_L2_M1_100 | |||
| ztrmm_kernel_L2_M1_42: | |||
| .Lztrmm_kernel_L2_M1_42: | |||
| KERNEL1x2_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L2_M1_42 | |||
| bgt .Lztrmm_kernel_L2_M1_42 | |||
| ztrmm_kernel_L2_M1_100: | |||
| .Lztrmm_kernel_L2_M1_100: | |||
| SAVE1x2 | |||
| @@ -1678,7 +1678,7 @@ ztrmm_kernel_L2_M1_100: | |||
| #endif | |||
| ztrmm_kernel_L2_END: | |||
| .Lztrmm_kernel_L2_END: | |||
| #if !defined(LEFT) | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| @@ -1688,11 +1688,11 @@ ztrmm_kernel_L2_END: | |||
| /******************************************************************************/ | |||
| ztrmm_kernel_L1_BEGIN: | |||
| .Lztrmm_kernel_L1_BEGIN: | |||
| mov counterJ , origN | |||
| tst counterJ , #1 | |||
| ble ztrmm_kernel_L999 // done | |||
| ble .Lztrmm_kernel_L999 // done | |||
| mov pCRow0, pC // pCRow0 = C | |||
| @@ -1706,14 +1706,14 @@ ztrmm_kernel_L1_BEGIN: | |||
| ztrmm_kernel_L1_M4_BEGIN: | |||
| .Lztrmm_kernel_L1_M4_BEGIN: | |||
| mov counterI, origM | |||
| asr counterI, counterI, #2 // counterI = counterI / 4 | |||
| cmp counterI, #0 | |||
| ble ztrmm_kernel_L1_M2_BEGIN | |||
| ble .Lztrmm_kernel_L1_M2_BEGIN | |||
| ztrmm_kernel_L1_M4_20: | |||
| .Lztrmm_kernel_L1_M4_20: | |||
| INIT4x1 | |||
| @@ -1737,10 +1737,10 @@ ztrmm_kernel_L1_M4_20: | |||
| asr counterL , tempK, #3 // counterL = tempK / 8 | |||
| cmp counterL , #0 | |||
| ble ztrmm_kernel_L1_M4_40 | |||
| ble .Lztrmm_kernel_L1_M4_40 | |||
| .align 5 | |||
| ztrmm_kernel_L1_M4_22: | |||
| .Lztrmm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| KERNEL4x1_SUB | |||
| @@ -1752,22 +1752,22 @@ ztrmm_kernel_L1_M4_22: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L1_M4_22 | |||
| bgt .Lztrmm_kernel_L1_M4_22 | |||
| ztrmm_kernel_L1_M4_40: | |||
| .Lztrmm_kernel_L1_M4_40: | |||
| ands counterL , tempK, #7 // counterL = tempK % 8 | |||
| ble ztrmm_kernel_L1_M4_100 | |||
| ble .Lztrmm_kernel_L1_M4_100 | |||
| ztrmm_kernel_L1_M4_42: | |||
| .Lztrmm_kernel_L1_M4_42: | |||
| KERNEL4x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L1_M4_42 | |||
| bgt .Lztrmm_kernel_L1_M4_42 | |||
| ztrmm_kernel_L1_M4_100: | |||
| .Lztrmm_kernel_L1_M4_100: | |||
| SAVE4x1 | |||
| @@ -1787,22 +1787,22 @@ ztrmm_kernel_L1_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| ztrmm_kernel_L1_M4_END: | |||
| .Lztrmm_kernel_L1_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bgt ztrmm_kernel_L1_M4_20 | |||
| bgt .Lztrmm_kernel_L1_M4_20 | |||
| ztrmm_kernel_L1_M2_BEGIN: | |||
| .Lztrmm_kernel_L1_M2_BEGIN: | |||
| mov counterI, origM | |||
| tst counterI , #3 | |||
| ble ztrmm_kernel_L1_END | |||
| ble .Lztrmm_kernel_L1_END | |||
| tst counterI, #2 // test bit 1 of counterI (flags only; counterI is not modified) | |||
| ble ztrmm_kernel_L1_M1_BEGIN | |||
| ble .Lztrmm_kernel_L1_M1_BEGIN | |||
| ztrmm_kernel_L1_M2_20: | |||
| .Lztrmm_kernel_L1_M2_20: | |||
| INIT2x1 | |||
| @@ -1826,9 +1826,9 @@ ztrmm_kernel_L1_M2_20: | |||
| asr counterL , tempK, #3 // counterL = tempK / 8 | |||
| cmp counterL , #0 | |||
| ble ztrmm_kernel_L1_M2_40 | |||
| ble .Lztrmm_kernel_L1_M2_40 | |||
| ztrmm_kernel_L1_M2_22: | |||
| .Lztrmm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| KERNEL2x1_SUB | |||
| @@ -1841,22 +1841,22 @@ ztrmm_kernel_L1_M2_22: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L1_M2_22 | |||
| bgt .Lztrmm_kernel_L1_M2_22 | |||
| ztrmm_kernel_L1_M2_40: | |||
| .Lztrmm_kernel_L1_M2_40: | |||
| ands counterL , tempK, #7 // counterL = tempK % 8 | |||
| ble ztrmm_kernel_L1_M2_100 | |||
| ble .Lztrmm_kernel_L1_M2_100 | |||
| ztrmm_kernel_L1_M2_42: | |||
| .Lztrmm_kernel_L1_M2_42: | |||
| KERNEL2x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L1_M2_42 | |||
| bgt .Lztrmm_kernel_L1_M2_42 | |||
| ztrmm_kernel_L1_M2_100: | |||
| .Lztrmm_kernel_L1_M2_100: | |||
| SAVE2x1 | |||
| @@ -1876,15 +1876,15 @@ ztrmm_kernel_L1_M2_100: | |||
| add tempOffset, tempOffset, #2 | |||
| #endif | |||
| ztrmm_kernel_L1_M2_END: | |||
| .Lztrmm_kernel_L1_M2_END: | |||
| ztrmm_kernel_L1_M1_BEGIN: | |||
| .Lztrmm_kernel_L1_M1_BEGIN: | |||
| tst counterI, #1 // test bit 0 of counterI, i.e. counterI % 2 (counterI is not modified) | |||
| ble ztrmm_kernel_L1_END | |||
| ble .Lztrmm_kernel_L1_END | |||
| ztrmm_kernel_L1_M1_20: | |||
| .Lztrmm_kernel_L1_M1_20: | |||
| INIT1x1 | |||
| @@ -1908,9 +1908,9 @@ ztrmm_kernel_L1_M1_20: | |||
| asr counterL , tempK, #3 // counterL = tempK / 8 | |||
| cmp counterL , #0 | |||
| ble ztrmm_kernel_L1_M1_40 | |||
| ble .Lztrmm_kernel_L1_M1_40 | |||
| ztrmm_kernel_L1_M1_22: | |||
| .Lztrmm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| KERNEL1x1_SUB | |||
| @@ -1922,30 +1922,30 @@ ztrmm_kernel_L1_M1_22: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L1_M1_22 | |||
| bgt .Lztrmm_kernel_L1_M1_22 | |||
| ztrmm_kernel_L1_M1_40: | |||
| .Lztrmm_kernel_L1_M1_40: | |||
| ands counterL , tempK, #7 // counterL = tempK % 8 | |||
| ble ztrmm_kernel_L1_M1_100 | |||
| ble .Lztrmm_kernel_L1_M1_100 | |||
| ztrmm_kernel_L1_M1_42: | |||
| .Lztrmm_kernel_L1_M1_42: | |||
| KERNEL1x1_SUB | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L1_M1_42 | |||
| bgt .Lztrmm_kernel_L1_M1_42 | |||
| ztrmm_kernel_L1_M1_100: | |||
| .Lztrmm_kernel_L1_M1_100: | |||
| SAVE1x1 | |||
| ztrmm_kernel_L1_END: | |||
| .Lztrmm_kernel_L1_END: | |||
| ztrmm_kernel_L999: | |||
| .Lztrmm_kernel_L999: | |||
| mov x0, #0 // set return value | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||