Browse Source

Merge pull request #1334 from ashwinyes/develop_aarch64_20171024_addlocallabels

ARM64: Convert all labels to local labels
tags/v0.3.0
Martin Kroeker GitHub 8 years ago
parent
commit
b71f4fe681
50 changed files with 4469 additions and 4469 deletions
  1. +25
    -25
      kernel/arm64/amax.S
  2. +20
    -20
      kernel/arm64/asum.S
  3. +21
    -21
      kernel/arm64/axpy.S
  4. +20
    -20
      kernel/arm64/casum.S
  5. +142
    -142
      kernel/arm64/cgemm_kernel_4x4.S
  6. +175
    -175
      kernel/arm64/cgemm_kernel_8x4.S
  7. +175
    -175
      kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S
  8. +20
    -20
      kernel/arm64/copy.S
  9. +129
    -129
      kernel/arm64/ctrmm_kernel_4x4.S
  10. +175
    -175
      kernel/arm64/ctrmm_kernel_8x4.S
  11. +22
    -22
      kernel/arm64/daxpy_thunderx2t99.S
  12. +143
    -143
      kernel/arm64/dgemm_kernel_4x4.S
  13. +176
    -176
      kernel/arm64/dgemm_kernel_4x8.S
  14. +169
    -169
      kernel/arm64/dgemm_kernel_8x4.S
  15. +169
    -169
      kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S
  16. +36
    -36
      kernel/arm64/dgemm_ncopy_4.S
  17. +48
    -48
      kernel/arm64/dgemm_ncopy_8.S
  18. +36
    -36
      kernel/arm64/dgemm_tcopy_4.S
  19. +56
    -56
      kernel/arm64/dgemm_tcopy_8.S
  20. +20
    -20
      kernel/arm64/dot.S
  21. +129
    -129
      kernel/arm64/dtrmm_kernel_4x4.S
  22. +176
    -176
      kernel/arm64/dtrmm_kernel_4x8.S
  23. +169
    -169
      kernel/arm64/dtrmm_kernel_8x4.S
  24. +31
    -31
      kernel/arm64/gemv_n.S
  25. +31
    -31
      kernel/arm64/gemv_t.S
  26. +24
    -24
      kernel/arm64/iamax.S
  27. +24
    -24
      kernel/arm64/izamax.S
  28. +16
    -16
      kernel/arm64/nrm2.S
  29. +20
    -20
      kernel/arm64/rot.S
  30. +23
    -23
      kernel/arm64/scal.S
  31. +221
    -221
      kernel/arm64/sgemm_kernel_16x4.S
  32. +221
    -221
      kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S
  33. +155
    -155
      kernel/arm64/sgemm_kernel_4x4.S
  34. +241
    -241
      kernel/arm64/sgemm_kernel_8x8.S
  35. +221
    -221
      kernel/arm64/strmm_kernel_16x4.S
  36. +130
    -130
      kernel/arm64/strmm_kernel_4x4.S
  37. +241
    -241
      kernel/arm64/strmm_kernel_8x8.S
  38. +21
    -21
      kernel/arm64/swap.S
  39. +25
    -25
      kernel/arm64/zamax.S
  40. +20
    -20
      kernel/arm64/zasum.S
  41. +21
    -21
      kernel/arm64/zaxpy.S
  42. +20
    -20
      kernel/arm64/zdot.S
  43. +130
    -130
      kernel/arm64/zgemm_kernel_4x4.S
  44. +130
    -130
      kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S
  45. +26
    -26
      kernel/arm64/zgemv_n.S
  46. +26
    -26
      kernel/arm64/zgemv_t.S
  47. +16
    -16
      kernel/arm64/znrm2.S
  48. +20
    -20
      kernel/arm64/zrot.S
  49. +34
    -34
      kernel/arm64/zscal.S
  50. +130
    -130
      kernel/arm64/ztrmm_kernel_4x4.S

+ 25
- 25
kernel/arm64/amax.S View File

@@ -160,62 +160,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble amax_kernel_zero
ble .Lamax_kernel_zero
cmp INC_X, xzr cmp INC_X, xzr
ble amax_kernel_zero
ble .Lamax_kernel_zero


cmp INC_X, #1 cmp INC_X, #1
bne amax_kernel_S_BEGIN
bne .Lamax_kernel_S_BEGIN


amax_kernel_F_BEGIN:
.Lamax_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq amax_kernel_F1_INIT
beq .Lamax_kernel_F1_INIT


INIT_F4 INIT_F4
subs I, I, #1 subs I, I, #1
beq amax_kernel_F1
beq .Lamax_kernel_F1


amax_kernel_F4:
.Lamax_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne amax_kernel_F4
bne .Lamax_kernel_F4


amax_kernel_F1:
.Lamax_kernel_F1:


ands I, N, #3 ands I, N, #3
ble amax_kernel_L999
ble .Lamax_kernel_L999


amax_kernel_F10:
.Lamax_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne amax_kernel_F10
bne .Lamax_kernel_F10


ret ret


amax_kernel_F1_INIT:
.Lamax_kernel_F1_INIT:


INIT_F1 INIT_F1
subs N, N, #1 subs N, N, #1
b amax_kernel_F1
b .Lamax_kernel_F1


amax_kernel_S_BEGIN:
.Lamax_kernel_S_BEGIN:


INIT_S INIT_S


subs N, N, #1 subs N, N, #1
ble amax_kernel_L999
ble .Lamax_kernel_L999


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble amax_kernel_S1
ble .Lamax_kernel_S1


amax_kernel_S4:
.Lamax_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -223,25 +223,25 @@ amax_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne amax_kernel_S4
bne .Lamax_kernel_S4


amax_kernel_S1:
.Lamax_kernel_S1:


ands I, N, #3 ands I, N, #3
ble amax_kernel_L999
ble .Lamax_kernel_L999


amax_kernel_S10:
.Lamax_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne amax_kernel_S10
bne .Lamax_kernel_S10


amax_kernel_L999:
.Lamax_kernel_L999:


ret ret


amax_kernel_zero:
.Lamax_kernel_zero:


fmov MAXF, REG0 fmov MAXF, REG0
ret ret


+ 20
- 20
kernel/arm64/asum.S View File

@@ -122,52 +122,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif


cmp N, xzr cmp N, xzr
ble asum_kernel_L999
ble .Lasum_kernel_L999
cmp INC_X, xzr cmp INC_X, xzr
ble asum_kernel_L999
ble .Lasum_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne asum_kernel_S_BEGIN
bne .Lasum_kernel_S_BEGIN


asum_kernel_F_BEGIN:
.Lasum_kernel_F_BEGIN:


asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq asum_kernel_F1
beq .Lasum_kernel_F1


asum_kernel_F8:
.Lasum_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne asum_kernel_F8
bne .Lasum_kernel_F8


KERNEL_F8_FINALIZE KERNEL_F8_FINALIZE


asum_kernel_F1:
.Lasum_kernel_F1:


ands I, N, #7 ands I, N, #7
ble asum_kernel_L999
ble .Lasum_kernel_L999


asum_kernel_F10:
.Lasum_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne asum_kernel_F10
bne .Lasum_kernel_F10


asum_kernel_L999:
.Lasum_kernel_L999:
ret ret


asum_kernel_S_BEGIN:
.Lasum_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble asum_kernel_S1
ble .Lasum_kernel_S1


asum_kernel_S4:
.Lasum_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -175,19 +175,19 @@ asum_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne asum_kernel_S4
bne .Lasum_kernel_S4


asum_kernel_S1:
.Lasum_kernel_S1:


ands I, N, #3 ands I, N, #3
ble asum_kernel_L999
ble .Lasum_kernel_L999


asum_kernel_S10:
.Lasum_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne asum_kernel_S10
bne .Lasum_kernel_S10


ret ret




+ 21
- 21
kernel/arm64/axpy.S View File

@@ -135,53 +135,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble axpy_kernel_L999
ble .Laxpy_kernel_L999


fcmp DA, #0.0 fcmp DA, #0.0
beq axpy_kernel_L999
beq .Laxpy_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne axpy_kernel_S_BEGIN
bne .Laxpy_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne axpy_kernel_S_BEGIN
bne .Laxpy_kernel_S_BEGIN


axpy_kernel_F_BEGIN:
.Laxpy_kernel_F_BEGIN:


asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq axpy_kernel_F1
beq .Laxpy_kernel_F1


axpy_kernel_F8:
.Laxpy_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne axpy_kernel_F8
bne .Laxpy_kernel_F8


axpy_kernel_F1:
.Laxpy_kernel_F1:


ands I, N, #7 ands I, N, #7
ble axpy_kernel_L999
ble .Laxpy_kernel_L999


axpy_kernel_F10:
.Laxpy_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne axpy_kernel_F10
bne .Laxpy_kernel_F10


mov w0, wzr mov w0, wzr
ret ret


axpy_kernel_S_BEGIN:
.Laxpy_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble axpy_kernel_S1
ble .Laxpy_kernel_S1


axpy_kernel_S4:
.Laxpy_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -189,21 +189,21 @@ axpy_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne axpy_kernel_S4
bne .Laxpy_kernel_S4


axpy_kernel_S1:
.Laxpy_kernel_S1:


ands I, N, #3 ands I, N, #3
ble axpy_kernel_L999
ble .Laxpy_kernel_L999


axpy_kernel_S10:
.Laxpy_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne axpy_kernel_S10
bne .Laxpy_kernel_S10


axpy_kernel_L999:
.Laxpy_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret

+ 20
- 20
kernel/arm64/casum.S View File

@@ -98,52 +98,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmov s1, SUMF fmov s1, SUMF


cmp N, xzr cmp N, xzr
ble asum_kernel_L999
ble .Lcasum_kernel_L999
cmp INC_X, xzr cmp INC_X, xzr
ble asum_kernel_L999
ble .Lcasum_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne asum_kernel_S_BEGIN
bne .Lcasum_kernel_S_BEGIN


asum_kernel_F_BEGIN:
.Lcasum_kernel_F_BEGIN:


asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq asum_kernel_F1
beq .Lcasum_kernel_F1


asum_kernel_F8:
.Lcasum_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne asum_kernel_F8
bne .Lcasum_kernel_F8


KERNEL_F8_FINALIZE KERNEL_F8_FINALIZE


asum_kernel_F1:
.Lcasum_kernel_F1:


ands I, N, #7 ands I, N, #7
ble asum_kernel_L999
ble .Lcasum_kernel_L999


asum_kernel_F10:
.Lcasum_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne asum_kernel_F10
bne .Lcasum_kernel_F10


asum_kernel_L999:
.Lcasum_kernel_L999:
ret ret


asum_kernel_S_BEGIN:
.Lcasum_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble asum_kernel_S1
ble .Lcasum_kernel_S1


asum_kernel_S4:
.Lcasum_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -151,19 +151,19 @@ asum_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne asum_kernel_S4
bne .Lcasum_kernel_S4


asum_kernel_S1:
.Lcasum_kernel_S1:


ands I, N, #3 ands I, N, #3
ble asum_kernel_L999
ble .Lcasum_kernel_L999


asum_kernel_S10:
.Lcasum_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne asum_kernel_S10
bne .Lcasum_kernel_S10


ret ret




+ 142
- 142
kernel/arm64/cgemm_kernel_4x4.S View File

@@ -1072,11 +1072,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN
ble .Lcgemm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L4_BEGIN:
.Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2


@@ -1084,96 +1084,96 @@ cgemm_kernel_L4_BEGIN:
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array
add ppA, temp, pA add ppA, temp, pA


cgemm_kernel_L4_M8_BEGIN:
.Lcgemm_kernel_L4_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN
ble .Lcgemm_kernel_L4_M4_BEGIN


cgemm_kernel_L4_M8_20:
.Lcgemm_kernel_L4_M8_20:


mov pB, origPB mov pB, origPB
asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt cgemm_kernel_L4_M8_32
blt .Lcgemm_kernel_L4_M8_32


KERNEL8x4_I // do one in the K KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K KERNEL8x4_M2 // do another in the K


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a
ble .Lcgemm_kernel_L4_M8_22a
.align 5 .align 5


cgemm_kernel_L4_M8_22:
.Lcgemm_kernel_L4_M8_22:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22
bgt .Lcgemm_kernel_L4_M8_22




cgemm_kernel_L4_M8_22a:
.Lcgemm_kernel_L4_M8_22a:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44


cgemm_kernel_L4_M8_32:
.Lcgemm_kernel_L4_M8_32:


tst counterL, #1 tst counterL, #1
ble cgemm_kernel_L4_M8_40
ble .Lcgemm_kernel_L4_M8_40


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_E KERNEL8x4_E


b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44




cgemm_kernel_L4_M8_40:
.Lcgemm_kernel_L4_M8_40:


INIT8x4 INIT8x4


cgemm_kernel_L4_M8_44:
.Lcgemm_kernel_L4_M8_44:


ands counterL , origK, #1 ands counterL , origK, #1
ble cgemm_kernel_L4_M8_100
ble .Lcgemm_kernel_L4_M8_100


cgemm_kernel_L4_M8_46:
.Lcgemm_kernel_L4_M8_46:
KERNEL8x4_SUB KERNEL8x4_SUB


cgemm_kernel_L4_M8_100:
.Lcgemm_kernel_L4_M8_100:


SAVE8x4 SAVE8x4


cgemm_kernel_L4_M8_END:
.Lcgemm_kernel_L4_M8_END:
lsl temp, origK, #5 // k * 4 * 8 lsl temp, origK, #5 // k * 4 * 8
add pA, pA, temp add pA, pA, temp
add ppA, ppA, temp add ppA, ppA, temp
subs counterI, counterI, #1 subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20
bne .Lcgemm_kernel_L4_M8_20




cgemm_kernel_L4_M4_BEGIN:
.Lcgemm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble cgemm_kernel_L4_M2_BEGIN
ble .Lcgemm_kernel_L4_M2_BEGIN


cgemm_kernel_L4_M4_20:
.Lcgemm_kernel_L4_M4_20:


INIT4x4 INIT4x4


mov pB, origPB mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8 asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble cgemm_kernel_L4_M4_40
ble .Lcgemm_kernel_L4_M4_40


cgemm_kernel_L4_M4_22:
.Lcgemm_kernel_L4_M4_22:


KERNEL4x4_SUB KERNEL4x4_SUB
KERNEL4x4_SUB KERNEL4x4_SUB
@@ -1186,47 +1186,47 @@ cgemm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22
bgt .Lcgemm_kernel_L4_M4_22




cgemm_kernel_L4_M4_40:
.Lcgemm_kernel_L4_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M4_100
ble .Lcgemm_kernel_L4_M4_100


cgemm_kernel_L4_M4_42:
.Lcgemm_kernel_L4_M4_42:


KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_42
bgt .Lcgemm_kernel_L4_M4_42


cgemm_kernel_L4_M4_100:
.Lcgemm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


cgemm_kernel_L4_M4_END:
.Lcgemm_kernel_L4_M4_END:




cgemm_kernel_L4_M2_BEGIN:
.Lcgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L4_M1_BEGIN
ble .Lcgemm_kernel_L4_M1_BEGIN


cgemm_kernel_L4_M2_20:
.Lcgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M2_40
ble .Lcgemm_kernel_L4_M2_40


cgemm_kernel_L4_M2_22:
.Lcgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1239,43 +1239,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22
bgt .Lcgemm_kernel_L4_M2_22




cgemm_kernel_L4_M2_40:
.Lcgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100
ble .Lcgemm_kernel_L4_M2_100


cgemm_kernel_L4_M2_42:
.Lcgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42
bgt .Lcgemm_kernel_L4_M2_42


cgemm_kernel_L4_M2_100:
.Lcgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


cgemm_kernel_L4_M2_END:
.Lcgemm_kernel_L4_M2_END:




cgemm_kernel_L4_M1_BEGIN:
.Lcgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


cgemm_kernel_L4_M1_20:
.Lcgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M1_40
ble .Lcgemm_kernel_L4_M1_40


cgemm_kernel_L4_M1_22:
.Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1287,45 +1287,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22
bgt .Lcgemm_kernel_L4_M1_22




cgemm_kernel_L4_M1_40:
.Lcgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100
ble .Lcgemm_kernel_L4_M1_100


cgemm_kernel_L4_M1_42:
.Lcgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42
bgt .Lcgemm_kernel_L4_M1_42


cgemm_kernel_L4_M1_100:
.Lcgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4




cgemm_kernel_L4_END:
.Lcgemm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN
bgt .Lcgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble cgemm_kernel_L999 // error, N was less than 4?
ble .Lcgemm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble cgemm_kernel_L1_BEGIN
ble .Lcgemm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1335,24 +1335,24 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction






cgemm_kernel_L2_M4_BEGIN:
.Lcgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble cgemm_kernel_L2_M2_BEGIN
ble .Lcgemm_kernel_L2_M2_BEGIN


cgemm_kernel_L2_M4_20:
.Lcgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M4_40
ble .Lcgemm_kernel_L2_M4_40
.align 5 .align 5


cgemm_kernel_L2_M4_22:
.Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1364,50 +1364,50 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22
bgt .Lcgemm_kernel_L2_M4_22




cgemm_kernel_L2_M4_40:
.Lcgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100
ble .Lcgemm_kernel_L2_M4_100


cgemm_kernel_L2_M4_42:
.Lcgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42
bgt .Lcgemm_kernel_L2_M4_42


cgemm_kernel_L2_M4_100:
.Lcgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


cgemm_kernel_L2_M4_END:
.Lcgemm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L2_M4_20
bgt .Lcgemm_kernel_L2_M4_20




cgemm_kernel_L2_M2_BEGIN:
.Lcgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L2_M1_BEGIN
ble .Lcgemm_kernel_L2_M1_BEGIN


cgemm_kernel_L2_M2_20:
.Lcgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M2_40
ble .Lcgemm_kernel_L2_M2_40


cgemm_kernel_L2_M2_22:
.Lcgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1420,43 +1420,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22
bgt .Lcgemm_kernel_L2_M2_22




cgemm_kernel_L2_M2_40:
.Lcgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100
ble .Lcgemm_kernel_L2_M2_100


cgemm_kernel_L2_M2_42:
.Lcgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42
bgt .Lcgemm_kernel_L2_M2_42


cgemm_kernel_L2_M2_100:
.Lcgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


cgemm_kernel_L2_M2_END:
.Lcgemm_kernel_L2_M2_END:




cgemm_kernel_L2_M1_BEGIN:
.Lcgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END


cgemm_kernel_L2_M1_20:
.Lcgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble cgemm_kernel_L2_M1_40
ble .Lcgemm_kernel_L2_M1_40


cgemm_kernel_L2_M1_22:
.Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1468,36 +1468,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22
bgt .Lcgemm_kernel_L2_M1_22




cgemm_kernel_L2_M1_40:
.Lcgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100
ble .Lcgemm_kernel_L2_M1_100


cgemm_kernel_L2_M1_42:
.Lcgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42
bgt .Lcgemm_kernel_L2_M1_42


cgemm_kernel_L2_M1_100:
.Lcgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2




cgemm_kernel_L2_END:
.Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8


/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L1_BEGIN:
.Lcgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble cgemm_kernel_L999 // done
ble .Lcgemm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1507,24 +1507,24 @@ cgemm_kernel_L1_BEGIN:






cgemm_kernel_L1_M4_BEGIN:
.Lcgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L1_M2_BEGIN
ble .Lcgemm_kernel_L1_M2_BEGIN


cgemm_kernel_L1_M4_20:
.Lcgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M4_40
ble .Lcgemm_kernel_L1_M4_40
.align 5 .align 5


cgemm_kernel_L1_M4_22:
.Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1536,50 +1536,50 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22
bgt .Lcgemm_kernel_L1_M4_22




cgemm_kernel_L1_M4_40:
.Lcgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100
ble .Lcgemm_kernel_L1_M4_100


cgemm_kernel_L1_M4_42:
.Lcgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42
bgt .Lcgemm_kernel_L1_M4_42


cgemm_kernel_L1_M4_100:
.Lcgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


cgemm_kernel_L1_M4_END:
.Lcgemm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L1_M4_20
bgt .Lcgemm_kernel_L1_M4_20




cgemm_kernel_L1_M2_BEGIN:
.Lcgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L1_M1_BEGIN
ble .Lcgemm_kernel_L1_M1_BEGIN


cgemm_kernel_L1_M2_20:
.Lcgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M2_40
ble .Lcgemm_kernel_L1_M2_40


cgemm_kernel_L1_M2_22:
.Lcgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1592,43 +1592,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22
bgt .Lcgemm_kernel_L1_M2_22




cgemm_kernel_L1_M2_40:
.Lcgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100
ble .Lcgemm_kernel_L1_M2_100


cgemm_kernel_L1_M2_42:
.Lcgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42
bgt .Lcgemm_kernel_L1_M2_42


cgemm_kernel_L1_M2_100:
.Lcgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


cgemm_kernel_L1_M2_END:
.Lcgemm_kernel_L1_M2_END:




cgemm_kernel_L1_M1_BEGIN:
.Lcgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END


cgemm_kernel_L1_M1_20:
.Lcgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M1_40
ble .Lcgemm_kernel_L1_M1_40


cgemm_kernel_L1_M1_22:
.Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1640,30 +1640,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22
bgt .Lcgemm_kernel_L1_M1_22




cgemm_kernel_L1_M1_40:
.Lcgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100
ble .Lcgemm_kernel_L1_M1_100


cgemm_kernel_L1_M1_42:
.Lcgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42
bgt .Lcgemm_kernel_L1_M1_42


cgemm_kernel_L1_M1_100:
.Lcgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




cgemm_kernel_L1_END:
.Lcgemm_kernel_L1_END:




cgemm_kernel_L999:
.Lcgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 175
- 175
kernel/arm64/cgemm_kernel_8x4.S View File

@@ -1407,11 +1407,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN
ble .Lcgemm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L4_BEGIN:
.Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -1421,21 +1421,21 @@ cgemm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


cgemm_kernel_L4_M8_BEGIN:
.Lcgemm_kernel_L4_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN
ble .Lcgemm_kernel_L4_M4_BEGIN


.align 5 .align 5
cgemm_kernel_L4_M8_20:
.Lcgemm_kernel_L4_M8_20:


mov pB, origPB mov pB, origPB


asr counterL , origK, #3 asr counterL , origK, #3
cmp counterL , #2 cmp counterL , #2
blt cgemm_kernel_L4_M8_32
blt .Lcgemm_kernel_L4_M8_32


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1447,10 +1447,10 @@ cgemm_kernel_L4_M8_20:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a
ble .Lcgemm_kernel_L4_M8_22a


.align 5 .align 5
cgemm_kernel_L4_M8_22:
.Lcgemm_kernel_L4_M8_22:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1462,10 +1462,10 @@ cgemm_kernel_L4_M8_22:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22
bgt .Lcgemm_kernel_L4_M8_22


.align 5 .align 5
cgemm_kernel_L4_M8_22a:
.Lcgemm_kernel_L4_M8_22a:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1476,13 +1476,13 @@ cgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44


.align 5 .align 5
cgemm_kernel_L4_M8_32:
.Lcgemm_kernel_L4_M8_32:


tst counterL, #1 tst counterL, #1
ble cgemm_kernel_L4_M8_40
ble .Lcgemm_kernel_L4_M8_40


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1493,116 +1493,116 @@ cgemm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44


cgemm_kernel_L4_M8_40:
.Lcgemm_kernel_L4_M8_40:


INIT8x4 INIT8x4


cgemm_kernel_L4_M8_44:
.Lcgemm_kernel_L4_M8_44:


ands counterL , origK, #7 ands counterL , origK, #7
ble cgemm_kernel_L4_M8_100
ble .Lcgemm_kernel_L4_M8_100


.align 5 .align 5
cgemm_kernel_L4_M8_46:
.Lcgemm_kernel_L4_M8_46:


KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne cgemm_kernel_L4_M8_46
bne .Lcgemm_kernel_L4_M8_46


cgemm_kernel_L4_M8_100:
.Lcgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]


SAVE8x4 SAVE8x4


cgemm_kernel_L4_M8_END:
.Lcgemm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20
bne .Lcgemm_kernel_L4_M8_20


cgemm_kernel_L4_M4_BEGIN:
.Lcgemm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble cgemm_kernel_L4_M2_BEGIN
ble .Lcgemm_kernel_L4_M2_BEGIN




cgemm_kernel_L4_M4_20:
.Lcgemm_kernel_L4_M4_20:


mov pB, origPB mov pB, origPB
asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt cgemm_kernel_L4_M4_32
blt .Lcgemm_kernel_L4_M4_32


KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble cgemm_kernel_L4_M4_22a
ble .Lcgemm_kernel_L4_M4_22a
.align 5 .align 5




cgemm_kernel_L4_M4_22:
.Lcgemm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22
bgt .Lcgemm_kernel_L4_M4_22


cgemm_kernel_L4_M4_22a:
.Lcgemm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_32:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_32:
tst counterL, #1 tst counterL, #1
ble cgemm_kernel_L4_M4_40
ble .Lcgemm_kernel_L4_M4_40
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_40:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_40:


INIT4x4 INIT4x4


cgemm_kernel_L4_M4_44:
.Lcgemm_kernel_L4_M4_44:
ands counterL , origK, #1 ands counterL , origK, #1
ble cgemm_kernel_L4_M4_100
ble .Lcgemm_kernel_L4_M4_100


cgemm_kernel_L4_M4_46:
.Lcgemm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB


cgemm_kernel_L4_M4_100:
.Lcgemm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


cgemm_kernel_L4_M4_END:
.Lcgemm_kernel_L4_M4_END:


cgemm_kernel_L4_M2_BEGIN:
.Lcgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L4_M1_BEGIN
ble .Lcgemm_kernel_L4_M1_BEGIN


cgemm_kernel_L4_M2_20:
.Lcgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M2_40
ble .Lcgemm_kernel_L4_M2_40


cgemm_kernel_L4_M2_22:
.Lcgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1615,43 +1615,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22
bgt .Lcgemm_kernel_L4_M2_22




cgemm_kernel_L4_M2_40:
.Lcgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100
ble .Lcgemm_kernel_L4_M2_100


cgemm_kernel_L4_M2_42:
.Lcgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42
bgt .Lcgemm_kernel_L4_M2_42


cgemm_kernel_L4_M2_100:
.Lcgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


cgemm_kernel_L4_M2_END:
.Lcgemm_kernel_L4_M2_END:




cgemm_kernel_L4_M1_BEGIN:
.Lcgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


cgemm_kernel_L4_M1_20:
.Lcgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M1_40
ble .Lcgemm_kernel_L4_M1_40


cgemm_kernel_L4_M1_22:
.Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1663,45 +1663,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22
bgt .Lcgemm_kernel_L4_M1_22




cgemm_kernel_L4_M1_40:
.Lcgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100
ble .Lcgemm_kernel_L4_M1_100


cgemm_kernel_L4_M1_42:
.Lcgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42
bgt .Lcgemm_kernel_L4_M1_42


cgemm_kernel_L4_M1_100:
.Lcgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4




cgemm_kernel_L4_END:
.Lcgemm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN
bgt .Lcgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble cgemm_kernel_L999 // error, N was less than 4?
ble .Lcgemm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble cgemm_kernel_L1_BEGIN
ble .Lcgemm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1710,14 +1710,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A




cgemm_kernel_L2_M8_BEGIN:
.Lcgemm_kernel_L2_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L2_M4_BEGIN
ble .Lcgemm_kernel_L2_M4_BEGIN


cgemm_kernel_L2_M8_20:
.Lcgemm_kernel_L2_M8_20:


INIT8x2 INIT8x2


@@ -1725,10 +1725,10 @@ cgemm_kernel_L2_M8_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M8_40
ble .Lcgemm_kernel_L2_M8_40
.align 5 .align 5


cgemm_kernel_L2_M8_22:
.Lcgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
@@ -1740,50 +1740,50 @@ cgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_22
bgt .Lcgemm_kernel_L2_M8_22




cgemm_kernel_L2_M8_40:
.Lcgemm_kernel_L2_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M8_100
ble .Lcgemm_kernel_L2_M8_100


cgemm_kernel_L2_M8_42:
.Lcgemm_kernel_L2_M8_42:


KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_42
bgt .Lcgemm_kernel_L2_M8_42


cgemm_kernel_L2_M8_100:
.Lcgemm_kernel_L2_M8_100:


SAVE8x2 SAVE8x2


cgemm_kernel_L2_M8_END:
.Lcgemm_kernel_L2_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L2_M8_20
bgt .Lcgemm_kernel_L2_M8_20


cgemm_kernel_L2_M4_BEGIN:
.Lcgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble cgemm_kernel_L2_M2_BEGIN
ble .Lcgemm_kernel_L2_M2_BEGIN


cgemm_kernel_L2_M4_20:
.Lcgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M4_40
ble .Lcgemm_kernel_L2_M4_40
.align 5 .align 5


cgemm_kernel_L2_M4_22:
.Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1795,46 +1795,46 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22
bgt .Lcgemm_kernel_L2_M4_22




cgemm_kernel_L2_M4_40:
.Lcgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100
ble .Lcgemm_kernel_L2_M4_100


cgemm_kernel_L2_M4_42:
.Lcgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42
bgt .Lcgemm_kernel_L2_M4_42


cgemm_kernel_L2_M4_100:
.Lcgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


cgemm_kernel_L2_M4_END:
.Lcgemm_kernel_L2_M4_END:


cgemm_kernel_L2_M2_BEGIN:
.Lcgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L2_M1_BEGIN
ble .Lcgemm_kernel_L2_M1_BEGIN


cgemm_kernel_L2_M2_20:
.Lcgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M2_40
ble .Lcgemm_kernel_L2_M2_40


cgemm_kernel_L2_M2_22:
.Lcgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1847,43 +1847,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22
bgt .Lcgemm_kernel_L2_M2_22




cgemm_kernel_L2_M2_40:
.Lcgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100
ble .Lcgemm_kernel_L2_M2_100


cgemm_kernel_L2_M2_42:
.Lcgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42
bgt .Lcgemm_kernel_L2_M2_42


cgemm_kernel_L2_M2_100:
.Lcgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


cgemm_kernel_L2_M2_END:
.Lcgemm_kernel_L2_M2_END:




cgemm_kernel_L2_M1_BEGIN:
.Lcgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END


cgemm_kernel_L2_M1_20:
.Lcgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble cgemm_kernel_L2_M1_40
ble .Lcgemm_kernel_L2_M1_40


cgemm_kernel_L2_M1_22:
.Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1895,36 +1895,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22
bgt .Lcgemm_kernel_L2_M1_22




cgemm_kernel_L2_M1_40:
.Lcgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100
ble .Lcgemm_kernel_L2_M1_100


cgemm_kernel_L2_M1_42:
.Lcgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42
bgt .Lcgemm_kernel_L2_M1_42


cgemm_kernel_L2_M1_100:
.Lcgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2




cgemm_kernel_L2_END:
.Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8


/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L1_BEGIN:
.Lcgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble cgemm_kernel_L999 // done
ble .Lcgemm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1933,24 +1933,24 @@ cgemm_kernel_L1_BEGIN:
mov pA, origPA // pA = A mov pA, origPA // pA = A




cgemm_kernel_L1_M8_BEGIN:
.Lcgemm_kernel_L1_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L1_M4_BEGIN
ble .Lcgemm_kernel_L1_M4_BEGIN


cgemm_kernel_L1_M8_20:
.Lcgemm_kernel_L1_M8_20:


INIT8x1 INIT8x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M8_40
ble .Lcgemm_kernel_L1_M8_40
.align 5 .align 5


cgemm_kernel_L1_M8_22:
.Lcgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@@ -1962,51 +1962,51 @@ cgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_22
bgt .Lcgemm_kernel_L1_M8_22




cgemm_kernel_L1_M8_40:
.Lcgemm_kernel_L1_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M8_100
ble .Lcgemm_kernel_L1_M8_100


cgemm_kernel_L1_M8_42:
.Lcgemm_kernel_L1_M8_42:


KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_42
bgt .Lcgemm_kernel_L1_M8_42


cgemm_kernel_L1_M8_100:
.Lcgemm_kernel_L1_M8_100:


SAVE8x1 SAVE8x1


cgemm_kernel_L1_M8_END:
.Lcgemm_kernel_L1_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L1_M8_20
bgt .Lcgemm_kernel_L1_M8_20


cgemm_kernel_L1_M4_BEGIN:
.Lcgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble cgemm_kernel_L1_M2_BEGIN
ble .Lcgemm_kernel_L1_M2_BEGIN




cgemm_kernel_L1_M4_20:
.Lcgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M4_40
ble .Lcgemm_kernel_L1_M4_40
.align 5 .align 5


cgemm_kernel_L1_M4_22:
.Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -2018,47 +2018,47 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22
bgt .Lcgemm_kernel_L1_M4_22




cgemm_kernel_L1_M4_40:
.Lcgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100
ble .Lcgemm_kernel_L1_M4_100


cgemm_kernel_L1_M4_42:
.Lcgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42
bgt .Lcgemm_kernel_L1_M4_42


cgemm_kernel_L1_M4_100:
.Lcgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


cgemm_kernel_L1_M4_END:
.Lcgemm_kernel_L1_M4_END:




cgemm_kernel_L1_M2_BEGIN:
.Lcgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L1_M1_BEGIN
ble .Lcgemm_kernel_L1_M1_BEGIN


cgemm_kernel_L1_M2_20:
.Lcgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M2_40
ble .Lcgemm_kernel_L1_M2_40


cgemm_kernel_L1_M2_22:
.Lcgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -2071,43 +2071,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22
bgt .Lcgemm_kernel_L1_M2_22




cgemm_kernel_L1_M2_40:
.Lcgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100
ble .Lcgemm_kernel_L1_M2_100


cgemm_kernel_L1_M2_42:
.Lcgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42
bgt .Lcgemm_kernel_L1_M2_42


cgemm_kernel_L1_M2_100:
.Lcgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


cgemm_kernel_L1_M2_END:
.Lcgemm_kernel_L1_M2_END:




cgemm_kernel_L1_M1_BEGIN:
.Lcgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END


cgemm_kernel_L1_M1_20:
.Lcgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M1_40
ble .Lcgemm_kernel_L1_M1_40


cgemm_kernel_L1_M1_22:
.Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -2119,30 +2119,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22
bgt .Lcgemm_kernel_L1_M1_22




cgemm_kernel_L1_M1_40:
.Lcgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100
ble .Lcgemm_kernel_L1_M1_100


cgemm_kernel_L1_M1_42:
.Lcgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42
bgt .Lcgemm_kernel_L1_M1_42


cgemm_kernel_L1_M1_100:
.Lcgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




cgemm_kernel_L1_END:
.Lcgemm_kernel_L1_END:




cgemm_kernel_L999:
.Lcgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 175
- 175
kernel/arm64/cgemm_kernel_8x4_thunderx2t99.S View File

@@ -1432,11 +1432,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble cgemm_kernel_L2_BEGIN
ble .Lcgemm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L4_BEGIN:
.Lcgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -1446,21 +1446,21 @@ cgemm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


cgemm_kernel_L4_M8_BEGIN:
.Lcgemm_kernel_L4_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L4_M4_BEGIN
ble .Lcgemm_kernel_L4_M4_BEGIN


.align 5 .align 5
cgemm_kernel_L4_M8_20:
.Lcgemm_kernel_L4_M8_20:


mov pB, origPB mov pB, origPB


asr counterL , origK, #5 // origK / 32 asr counterL , origK, #5 // origK / 32
cmp counterL , #2 cmp counterL , #2
blt cgemm_kernel_L4_M8_32
blt .Lcgemm_kernel_L4_M8_32


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1470,18 +1470,18 @@ cgemm_kernel_L4_M8_20:
KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x8


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble cgemm_kernel_L4_M8_22a
ble .Lcgemm_kernel_L4_M8_22a


.align 5 .align 5
cgemm_kernel_L4_M8_22:
.Lcgemm_kernel_L4_M8_22:


KERNEL8x4_M1_M2_x16 KERNEL8x4_M1_M2_x16


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M8_22
bgt .Lcgemm_kernel_L4_M8_22


.align 5 .align 5
cgemm_kernel_L4_M8_22a:
.Lcgemm_kernel_L4_M8_22a:


KERNEL8x4_M1_M2_x8 KERNEL8x4_M1_M2_x8
KERNEL8x4_M1_M2_x4 KERNEL8x4_M1_M2_x4
@@ -1490,13 +1490,13 @@ cgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44


.align 5 .align 5
cgemm_kernel_L4_M8_32:
.Lcgemm_kernel_L4_M8_32:


tst counterL, #1 tst counterL, #1
ble cgemm_kernel_L4_M8_40
ble .Lcgemm_kernel_L4_M8_40


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1506,116 +1506,116 @@ cgemm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b cgemm_kernel_L4_M8_44
b .Lcgemm_kernel_L4_M8_44


cgemm_kernel_L4_M8_40:
.Lcgemm_kernel_L4_M8_40:


INIT8x4 INIT8x4


cgemm_kernel_L4_M8_44:
.Lcgemm_kernel_L4_M8_44:


ands counterL , origK, #31 ands counterL , origK, #31
ble cgemm_kernel_L4_M8_100
ble .Lcgemm_kernel_L4_M8_100


.align 5 .align 5
cgemm_kernel_L4_M8_46:
.Lcgemm_kernel_L4_M8_46:


KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne cgemm_kernel_L4_M8_46
bne .Lcgemm_kernel_L4_M8_46


cgemm_kernel_L4_M8_100:
.Lcgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]


SAVE8x4 SAVE8x4


cgemm_kernel_L4_M8_END:
.Lcgemm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne cgemm_kernel_L4_M8_20
bne .Lcgemm_kernel_L4_M8_20


cgemm_kernel_L4_M4_BEGIN:
.Lcgemm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble cgemm_kernel_L4_M2_BEGIN
ble .Lcgemm_kernel_L4_M2_BEGIN




cgemm_kernel_L4_M4_20:
.Lcgemm_kernel_L4_M4_20:


mov pB, origPB mov pB, origPB
asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt cgemm_kernel_L4_M4_32
blt .Lcgemm_kernel_L4_M4_32


KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble cgemm_kernel_L4_M4_22a
ble .Lcgemm_kernel_L4_M4_22a
.align 5 .align 5




cgemm_kernel_L4_M4_22:
.Lcgemm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M4_22
bgt .Lcgemm_kernel_L4_M4_22


cgemm_kernel_L4_M4_22a:
.Lcgemm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_32:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_32:
tst counterL, #1 tst counterL, #1
ble cgemm_kernel_L4_M4_40
ble .Lcgemm_kernel_L4_M4_40
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E
b cgemm_kernel_L4_M4_44
cgemm_kernel_L4_M4_40:
b .Lcgemm_kernel_L4_M4_44
.Lcgemm_kernel_L4_M4_40:


INIT4x4 INIT4x4


cgemm_kernel_L4_M4_44:
.Lcgemm_kernel_L4_M4_44:
ands counterL , origK, #1 ands counterL , origK, #1
ble cgemm_kernel_L4_M4_100
ble .Lcgemm_kernel_L4_M4_100


cgemm_kernel_L4_M4_46:
.Lcgemm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB


cgemm_kernel_L4_M4_100:
.Lcgemm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


cgemm_kernel_L4_M4_END:
.Lcgemm_kernel_L4_M4_END:


cgemm_kernel_L4_M2_BEGIN:
.Lcgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L4_M1_BEGIN
ble .Lcgemm_kernel_L4_M1_BEGIN


cgemm_kernel_L4_M2_20:
.Lcgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M2_40
ble .Lcgemm_kernel_L4_M2_40


cgemm_kernel_L4_M2_22:
.Lcgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1628,43 +1628,43 @@ cgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_22
bgt .Lcgemm_kernel_L4_M2_22




cgemm_kernel_L4_M2_40:
.Lcgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M2_100
ble .Lcgemm_kernel_L4_M2_100


cgemm_kernel_L4_M2_42:
.Lcgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M2_42
bgt .Lcgemm_kernel_L4_M2_42


cgemm_kernel_L4_M2_100:
.Lcgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


cgemm_kernel_L4_M2_END:
.Lcgemm_kernel_L4_M2_END:




cgemm_kernel_L4_M1_BEGIN:
.Lcgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L4_END
ble .Lcgemm_kernel_L4_END


cgemm_kernel_L4_M1_20:
.Lcgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L4_M1_40
ble .Lcgemm_kernel_L4_M1_40


cgemm_kernel_L4_M1_22:
.Lcgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1676,45 +1676,45 @@ cgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_22
bgt .Lcgemm_kernel_L4_M1_22




cgemm_kernel_L4_M1_40:
.Lcgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L4_M1_100
ble .Lcgemm_kernel_L4_M1_100


cgemm_kernel_L4_M1_42:
.Lcgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L4_M1_42
bgt .Lcgemm_kernel_L4_M1_42


cgemm_kernel_L4_M1_100:
.Lcgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4




cgemm_kernel_L4_END:
.Lcgemm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt cgemm_kernel_L4_BEGIN
bgt .Lcgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lcgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble cgemm_kernel_L999 // error, N was less than 4?
ble .Lcgemm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble cgemm_kernel_L1_BEGIN
ble .Lcgemm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1723,14 +1723,14 @@ cgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A




cgemm_kernel_L2_M8_BEGIN:
.Lcgemm_kernel_L2_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L2_M4_BEGIN
ble .Lcgemm_kernel_L2_M4_BEGIN


cgemm_kernel_L2_M8_20:
.Lcgemm_kernel_L2_M8_20:


INIT8x2 INIT8x2


@@ -1738,10 +1738,10 @@ cgemm_kernel_L2_M8_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M8_40
ble .Lcgemm_kernel_L2_M8_40
.align 5 .align 5


cgemm_kernel_L2_M8_22:
.Lcgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
@@ -1753,50 +1753,50 @@ cgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_22
bgt .Lcgemm_kernel_L2_M8_22




cgemm_kernel_L2_M8_40:
.Lcgemm_kernel_L2_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M8_100
ble .Lcgemm_kernel_L2_M8_100


cgemm_kernel_L2_M8_42:
.Lcgemm_kernel_L2_M8_42:


KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M8_42
bgt .Lcgemm_kernel_L2_M8_42


cgemm_kernel_L2_M8_100:
.Lcgemm_kernel_L2_M8_100:


SAVE8x2 SAVE8x2


cgemm_kernel_L2_M8_END:
.Lcgemm_kernel_L2_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L2_M8_20
bgt .Lcgemm_kernel_L2_M8_20


cgemm_kernel_L2_M4_BEGIN:
.Lcgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble cgemm_kernel_L2_M2_BEGIN
ble .Lcgemm_kernel_L2_M2_BEGIN


cgemm_kernel_L2_M4_20:
.Lcgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M4_40
ble .Lcgemm_kernel_L2_M4_40
.align 5 .align 5


cgemm_kernel_L2_M4_22:
.Lcgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1808,46 +1808,46 @@ cgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_22
bgt .Lcgemm_kernel_L2_M4_22




cgemm_kernel_L2_M4_40:
.Lcgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M4_100
ble .Lcgemm_kernel_L2_M4_100


cgemm_kernel_L2_M4_42:
.Lcgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M4_42
bgt .Lcgemm_kernel_L2_M4_42


cgemm_kernel_L2_M4_100:
.Lcgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


cgemm_kernel_L2_M4_END:
.Lcgemm_kernel_L2_M4_END:


cgemm_kernel_L2_M2_BEGIN:
.Lcgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L2_M1_BEGIN
ble .Lcgemm_kernel_L2_M1_BEGIN


cgemm_kernel_L2_M2_20:
.Lcgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble cgemm_kernel_L2_M2_40
ble .Lcgemm_kernel_L2_M2_40


cgemm_kernel_L2_M2_22:
.Lcgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1860,43 +1860,43 @@ cgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_22
bgt .Lcgemm_kernel_L2_M2_22




cgemm_kernel_L2_M2_40:
.Lcgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M2_100
ble .Lcgemm_kernel_L2_M2_100


cgemm_kernel_L2_M2_42:
.Lcgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M2_42
bgt .Lcgemm_kernel_L2_M2_42


cgemm_kernel_L2_M2_100:
.Lcgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


cgemm_kernel_L2_M2_END:
.Lcgemm_kernel_L2_M2_END:




cgemm_kernel_L2_M1_BEGIN:
.Lcgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L2_END
ble .Lcgemm_kernel_L2_END


cgemm_kernel_L2_M1_20:
.Lcgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble cgemm_kernel_L2_M1_40
ble .Lcgemm_kernel_L2_M1_40


cgemm_kernel_L2_M1_22:
.Lcgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1908,36 +1908,36 @@ cgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_22
bgt .Lcgemm_kernel_L2_M1_22




cgemm_kernel_L2_M1_40:
.Lcgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L2_M1_100
ble .Lcgemm_kernel_L2_M1_100


cgemm_kernel_L2_M1_42:
.Lcgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L2_M1_42
bgt .Lcgemm_kernel_L2_M1_42


cgemm_kernel_L2_M1_100:
.Lcgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2




cgemm_kernel_L2_END:
.Lcgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8


/******************************************************************************/ /******************************************************************************/


cgemm_kernel_L1_BEGIN:
.Lcgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble cgemm_kernel_L999 // done
ble .Lcgemm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1946,24 +1946,24 @@ cgemm_kernel_L1_BEGIN:
mov pA, origPA // pA = A mov pA, origPA // pA = A




cgemm_kernel_L1_M8_BEGIN:
.Lcgemm_kernel_L1_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble cgemm_kernel_L1_M4_BEGIN
ble .Lcgemm_kernel_L1_M4_BEGIN


cgemm_kernel_L1_M8_20:
.Lcgemm_kernel_L1_M8_20:


INIT8x1 INIT8x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M8_40
ble .Lcgemm_kernel_L1_M8_40
.align 5 .align 5


cgemm_kernel_L1_M8_22:
.Lcgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@@ -1975,51 +1975,51 @@ cgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_22
bgt .Lcgemm_kernel_L1_M8_22




cgemm_kernel_L1_M8_40:
.Lcgemm_kernel_L1_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M8_100
ble .Lcgemm_kernel_L1_M8_100


cgemm_kernel_L1_M8_42:
.Lcgemm_kernel_L1_M8_42:


KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M8_42
bgt .Lcgemm_kernel_L1_M8_42


cgemm_kernel_L1_M8_100:
.Lcgemm_kernel_L1_M8_100:


SAVE8x1 SAVE8x1


cgemm_kernel_L1_M8_END:
.Lcgemm_kernel_L1_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt cgemm_kernel_L1_M8_20
bgt .Lcgemm_kernel_L1_M8_20


cgemm_kernel_L1_M4_BEGIN:
.Lcgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble cgemm_kernel_L1_M2_BEGIN
ble .Lcgemm_kernel_L1_M2_BEGIN




cgemm_kernel_L1_M4_20:
.Lcgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M4_40
ble .Lcgemm_kernel_L1_M4_40
.align 5 .align 5


cgemm_kernel_L1_M4_22:
.Lcgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -2031,47 +2031,47 @@ cgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_22
bgt .Lcgemm_kernel_L1_M4_22




cgemm_kernel_L1_M4_40:
.Lcgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M4_100
ble .Lcgemm_kernel_L1_M4_100


cgemm_kernel_L1_M4_42:
.Lcgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M4_42
bgt .Lcgemm_kernel_L1_M4_42


cgemm_kernel_L1_M4_100:
.Lcgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


cgemm_kernel_L1_M4_END:
.Lcgemm_kernel_L1_M4_END:




cgemm_kernel_L1_M2_BEGIN:
.Lcgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble cgemm_kernel_L1_M1_BEGIN
ble .Lcgemm_kernel_L1_M1_BEGIN


cgemm_kernel_L1_M2_20:
.Lcgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M2_40
ble .Lcgemm_kernel_L1_M2_40


cgemm_kernel_L1_M2_22:
.Lcgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -2084,43 +2084,43 @@ cgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_22
bgt .Lcgemm_kernel_L1_M2_22




cgemm_kernel_L1_M2_40:
.Lcgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M2_100
ble .Lcgemm_kernel_L1_M2_100


cgemm_kernel_L1_M2_42:
.Lcgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M2_42
bgt .Lcgemm_kernel_L1_M2_42


cgemm_kernel_L1_M2_100:
.Lcgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


cgemm_kernel_L1_M2_END:
.Lcgemm_kernel_L1_M2_END:




cgemm_kernel_L1_M1_BEGIN:
.Lcgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble cgemm_kernel_L1_END
ble .Lcgemm_kernel_L1_END


cgemm_kernel_L1_M1_20:
.Lcgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble cgemm_kernel_L1_M1_40
ble .Lcgemm_kernel_L1_M1_40


cgemm_kernel_L1_M1_22:
.Lcgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -2132,30 +2132,30 @@ cgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_22
bgt .Lcgemm_kernel_L1_M1_22




cgemm_kernel_L1_M1_40:
.Lcgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble cgemm_kernel_L1_M1_100
ble .Lcgemm_kernel_L1_M1_100


cgemm_kernel_L1_M1_42:
.Lcgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt cgemm_kernel_L1_M1_42
bgt .Lcgemm_kernel_L1_M1_42


cgemm_kernel_L1_M1_100:
.Lcgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




cgemm_kernel_L1_END:
.Lcgemm_kernel_L1_END:




cgemm_kernel_L999:
.Lcgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 20
- 20
kernel/arm64/copy.S View File

@@ -159,50 +159,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble copy_kernel_L999
ble .Lcopy_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne copy_kernel_S_BEGIN
bne .Lcopy_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne copy_kernel_S_BEGIN
bne .Lcopy_kernel_S_BEGIN


copy_kernel_F_BEGIN:
.Lcopy_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq copy_kernel_F1
beq .Lcopy_kernel_F1


copy_kernel_F4:
.Lcopy_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne copy_kernel_F4
bne .Lcopy_kernel_F4


copy_kernel_F1:
.Lcopy_kernel_F1:


ands I, N, #3 ands I, N, #3
ble copy_kernel_L999
ble .Lcopy_kernel_L999


copy_kernel_F10:
.Lcopy_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne copy_kernel_F10
bne .Lcopy_kernel_F10


mov w0, wzr mov w0, wzr
ret ret


copy_kernel_S_BEGIN:
.Lcopy_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble copy_kernel_S1
ble .Lcopy_kernel_S1


copy_kernel_S4:
.Lcopy_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -210,21 +210,21 @@ copy_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne copy_kernel_S4
bne .Lcopy_kernel_S4


copy_kernel_S1:
.Lcopy_kernel_S1:


ands I, N, #3 ands I, N, #3
ble copy_kernel_L999
ble .Lcopy_kernel_L999


copy_kernel_S10:
.Lcopy_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne copy_kernel_S10
bne .Lcopy_kernel_S10


copy_kernel_L999:
.Lcopy_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret


+ 129
- 129
kernel/arm64/ctrmm_kernel_4x4.S View File

@@ -785,11 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble ctrmm_kernel_L2_BEGIN
ble .Lctrmm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


ctrmm_kernel_L4_BEGIN:
.Lctrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2


@@ -798,14 +798,14 @@ ctrmm_kernel_L4_BEGIN:
#endif #endif
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


ctrmm_kernel_L4_M4_BEGIN:
.Lctrmm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble ctrmm_kernel_L4_M2_BEGIN
ble .Lctrmm_kernel_L4_M2_BEGIN


ctrmm_kernel_L4_M4_20:
.Lctrmm_kernel_L4_M4_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -826,55 +826,55 @@ ctrmm_kernel_L4_M4_20:


asr counterL , tempK, #1 // L = K / 2 asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt ctrmm_kernel_L4_M4_32
blt .Lctrmm_kernel_L4_M4_32


KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble ctrmm_kernel_L4_M4_22a
ble .Lctrmm_kernel_L4_M4_22a
.align 5 .align 5


ctrmm_kernel_L4_M4_22:
.Lctrmm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M4_22
bgt .Lctrmm_kernel_L4_M4_22




ctrmm_kernel_L4_M4_22a:
.Lctrmm_kernel_L4_M4_22a:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b ctrmm_kernel_L4_M4_44
b .Lctrmm_kernel_L4_M4_44


ctrmm_kernel_L4_M4_32:
.Lctrmm_kernel_L4_M4_32:


tst counterL, #1 tst counterL, #1
ble ctrmm_kernel_L4_M4_40
ble .Lctrmm_kernel_L4_M4_40


KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E


b ctrmm_kernel_L4_M4_44
b .Lctrmm_kernel_L4_M4_44




ctrmm_kernel_L4_M4_40:
.Lctrmm_kernel_L4_M4_40:


INIT4x4 INIT4x4


ctrmm_kernel_L4_M4_44:
.Lctrmm_kernel_L4_M4_44:


ands counterL , tempK, #1 ands counterL , tempK, #1
ble ctrmm_kernel_L4_M4_100
ble .Lctrmm_kernel_L4_M4_100


ctrmm_kernel_L4_M4_46:
.Lctrmm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB


ctrmm_kernel_L4_M4_100:
.Lctrmm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


@@ -893,20 +893,20 @@ ctrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


ctrmm_kernel_L4_M4_END:
.Lctrmm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne ctrmm_kernel_L4_M4_20
bne .Lctrmm_kernel_L4_M4_20


ctrmm_kernel_L4_M2_BEGIN:
.Lctrmm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L4_M1_BEGIN
ble .Lctrmm_kernel_L4_M1_BEGIN


ctrmm_kernel_L4_M2_20:
.Lctrmm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -930,9 +930,9 @@ ctrmm_kernel_L4_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L4_M2_40
ble .Lctrmm_kernel_L4_M2_40


ctrmm_kernel_L4_M2_22:
.Lctrmm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -945,22 +945,22 @@ ctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_22
bgt .Lctrmm_kernel_L4_M2_22




ctrmm_kernel_L4_M2_40:
.Lctrmm_kernel_L4_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M2_100
ble .Lctrmm_kernel_L4_M2_100


ctrmm_kernel_L4_M2_42:
.Lctrmm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_42
bgt .Lctrmm_kernel_L4_M2_42


ctrmm_kernel_L4_M2_100:
.Lctrmm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


@@ -980,15 +980,15 @@ ctrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ctrmm_kernel_L4_M2_END:
.Lctrmm_kernel_L4_M2_END:




ctrmm_kernel_L4_M1_BEGIN:
.Lctrmm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END


ctrmm_kernel_L4_M1_20:
.Lctrmm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -1012,9 +1012,9 @@ ctrmm_kernel_L4_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L4_M1_40
ble .Lctrmm_kernel_L4_M1_40


ctrmm_kernel_L4_M1_22:
.Lctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1026,22 +1026,22 @@ ctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_22
bgt .Lctrmm_kernel_L4_M1_22




ctrmm_kernel_L4_M1_40:
.Lctrmm_kernel_L4_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M1_100
ble .Lctrmm_kernel_L4_M1_100


ctrmm_kernel_L4_M1_42:
.Lctrmm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_42
bgt .Lctrmm_kernel_L4_M1_42


ctrmm_kernel_L4_M1_100:
.Lctrmm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


@@ -1061,7 +1061,7 @@ ctrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


ctrmm_kernel_L4_END:
.Lctrmm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
@@ -1071,19 +1071,19 @@ ctrmm_kernel_L4_END:
#endif #endif


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt ctrmm_kernel_L4_BEGIN
bgt .Lctrmm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble ctrmm_kernel_L999 // error, N was less than 4?
ble .Lctrmm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble ctrmm_kernel_L1_BEGIN
ble .Lctrmm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1095,14 +1095,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov pA, origPA // pA = A mov pA, origPA // pA = A


ctrmm_kernel_L2_M4_BEGIN:
.Lctrmm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble ctrmm_kernel_L2_M2_BEGIN
ble .Lctrmm_kernel_L2_M2_BEGIN


ctrmm_kernel_L2_M4_20:
.Lctrmm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -1126,10 +1126,10 @@ ctrmm_kernel_L2_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ctrmm_kernel_L2_M4_40
ble .Lctrmm_kernel_L2_M4_40
.align 5 .align 5


ctrmm_kernel_L2_M4_22:
.Lctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1141,22 +1141,22 @@ ctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_22
bgt .Lctrmm_kernel_L2_M4_22




ctrmm_kernel_L2_M4_40:
.Lctrmm_kernel_L2_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M4_100
ble .Lctrmm_kernel_L2_M4_100


ctrmm_kernel_L2_M4_42:
.Lctrmm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_42
bgt .Lctrmm_kernel_L2_M4_42


ctrmm_kernel_L2_M4_100:
.Lctrmm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


@@ -1176,22 +1176,22 @@ ctrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


ctrmm_kernel_L2_M4_END:
.Lctrmm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ctrmm_kernel_L2_M4_20
bgt .Lctrmm_kernel_L2_M4_20




ctrmm_kernel_L2_M2_BEGIN:
.Lctrmm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L2_M1_BEGIN
ble .Lctrmm_kernel_L2_M1_BEGIN


ctrmm_kernel_L2_M2_20:
.Lctrmm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -1215,9 +1215,9 @@ ctrmm_kernel_L2_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ctrmm_kernel_L2_M2_40
ble .Lctrmm_kernel_L2_M2_40


ctrmm_kernel_L2_M2_22:
.Lctrmm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1230,22 +1230,22 @@ ctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_22
bgt .Lctrmm_kernel_L2_M2_22




ctrmm_kernel_L2_M2_40:
.Lctrmm_kernel_L2_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M2_100
ble .Lctrmm_kernel_L2_M2_100


ctrmm_kernel_L2_M2_42:
.Lctrmm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_42
bgt .Lctrmm_kernel_L2_M2_42


ctrmm_kernel_L2_M2_100:
.Lctrmm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


@@ -1265,15 +1265,15 @@ ctrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ctrmm_kernel_L2_M2_END:
.Lctrmm_kernel_L2_M2_END:




ctrmm_kernel_L2_M1_BEGIN:
.Lctrmm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END


ctrmm_kernel_L2_M1_20:
.Lctrmm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1297,9 +1297,9 @@ ctrmm_kernel_L2_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble ctrmm_kernel_L2_M1_40
ble .Lctrmm_kernel_L2_M1_40


ctrmm_kernel_L2_M1_22:
.Lctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1311,22 +1311,22 @@ ctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_22
bgt .Lctrmm_kernel_L2_M1_22




ctrmm_kernel_L2_M1_40:
.Lctrmm_kernel_L2_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M1_100
ble .Lctrmm_kernel_L2_M1_100


ctrmm_kernel_L2_M1_42:
.Lctrmm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_42
bgt .Lctrmm_kernel_L2_M1_42


ctrmm_kernel_L2_M1_100:
.Lctrmm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


@@ -1346,7 +1346,7 @@ ctrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


ctrmm_kernel_L2_END:
.Lctrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@@ -1354,11 +1354,11 @@ ctrmm_kernel_L2_END:


/******************************************************************************/ /******************************************************************************/


ctrmm_kernel_L1_BEGIN:
.Lctrmm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble ctrmm_kernel_L999 // done
ble .Lctrmm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1370,14 +1370,14 @@ ctrmm_kernel_L1_BEGIN:


mov pA, origPA // pA = A mov pA, origPA // pA = A


ctrmm_kernel_L1_M4_BEGIN:
.Lctrmm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble ctrmm_kernel_L1_M2_BEGIN
ble .Lctrmm_kernel_L1_M2_BEGIN


ctrmm_kernel_L1_M4_20:
.Lctrmm_kernel_L1_M4_20:


INIT4x1 INIT4x1


@@ -1401,10 +1401,10 @@ ctrmm_kernel_L1_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M4_40
ble .Lctrmm_kernel_L1_M4_40
.align 5 .align 5


ctrmm_kernel_L1_M4_22:
.Lctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1416,22 +1416,22 @@ ctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_22
bgt .Lctrmm_kernel_L1_M4_22




ctrmm_kernel_L1_M4_40:
.Lctrmm_kernel_L1_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M4_100
ble .Lctrmm_kernel_L1_M4_100


ctrmm_kernel_L1_M4_42:
.Lctrmm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_42
bgt .Lctrmm_kernel_L1_M4_42


ctrmm_kernel_L1_M4_100:
.Lctrmm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


@@ -1451,22 +1451,22 @@ ctrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


ctrmm_kernel_L1_M4_END:
.Lctrmm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ctrmm_kernel_L1_M4_20
bgt .Lctrmm_kernel_L1_M4_20




ctrmm_kernel_L1_M2_BEGIN:
.Lctrmm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L1_M1_BEGIN
ble .Lctrmm_kernel_L1_M1_BEGIN


ctrmm_kernel_L1_M2_20:
.Lctrmm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1490,9 +1490,9 @@ ctrmm_kernel_L1_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M2_40
ble .Lctrmm_kernel_L1_M2_40


ctrmm_kernel_L1_M2_22:
.Lctrmm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1505,22 +1505,22 @@ ctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_22
bgt .Lctrmm_kernel_L1_M2_22




ctrmm_kernel_L1_M2_40:
.Lctrmm_kernel_L1_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M2_100
ble .Lctrmm_kernel_L1_M2_100


ctrmm_kernel_L1_M2_42:
.Lctrmm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_42
bgt .Lctrmm_kernel_L1_M2_42


ctrmm_kernel_L1_M2_100:
.Lctrmm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


@@ -1540,15 +1540,15 @@ ctrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ctrmm_kernel_L1_M2_END:
.Lctrmm_kernel_L1_M2_END:




ctrmm_kernel_L1_M1_BEGIN:
.Lctrmm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END


ctrmm_kernel_L1_M1_20:
.Lctrmm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1572,9 +1572,9 @@ ctrmm_kernel_L1_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M1_40
ble .Lctrmm_kernel_L1_M1_40


ctrmm_kernel_L1_M1_22:
.Lctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1586,30 +1586,30 @@ ctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_22
bgt .Lctrmm_kernel_L1_M1_22




ctrmm_kernel_L1_M1_40:
.Lctrmm_kernel_L1_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M1_100
ble .Lctrmm_kernel_L1_M1_100


ctrmm_kernel_L1_M1_42:
.Lctrmm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_42
bgt .Lctrmm_kernel_L1_M1_42


ctrmm_kernel_L1_M1_100:
.Lctrmm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




ctrmm_kernel_L1_END:
.Lctrmm_kernel_L1_END:




ctrmm_kernel_L999:
.Lctrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 175
- 175
kernel/arm64/ctrmm_kernel_8x4.S View File

@@ -1405,11 +1405,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble ctrmm_kernel_L2_BEGIN
ble .Lctrmm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


ctrmm_kernel_L4_BEGIN:
.Lctrmm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -1423,14 +1423,14 @@ ctrmm_kernel_L4_BEGIN:
#endif #endif
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


ctrmm_kernel_L4_M8_BEGIN:
.Lctrmm_kernel_L4_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble ctrmm_kernel_L4_M4_BEGIN
ble .Lctrmm_kernel_L4_M4_BEGIN


ctrmm_kernel_L4_M8_20:
.Lctrmm_kernel_L4_M8_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -1452,7 +1452,7 @@ ctrmm_kernel_L4_M8_20:


asr counterL , tempK, #3 asr counterL , tempK, #3
cmp counterL , #2 cmp counterL , #2
blt ctrmm_kernel_L4_M8_32
blt .Lctrmm_kernel_L4_M8_32


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1464,10 +1464,10 @@ ctrmm_kernel_L4_M8_20:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble ctrmm_kernel_L4_M8_22a
ble .Lctrmm_kernel_L4_M8_22a


.align 5 .align 5
ctrmm_kernel_L4_M8_22:
.Lctrmm_kernel_L4_M8_22:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1479,10 +1479,10 @@ ctrmm_kernel_L4_M8_22:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M8_22
bgt .Lctrmm_kernel_L4_M8_22


.align 5 .align 5
ctrmm_kernel_L4_M8_22a:
.Lctrmm_kernel_L4_M8_22a:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1493,13 +1493,13 @@ ctrmm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b ctrmm_kernel_L4_M8_44
b .Lctrmm_kernel_L4_M8_44


.align 5 .align 5
ctrmm_kernel_L4_M8_32:
.Lctrmm_kernel_L4_M8_32:


tst counterL, #1 tst counterL, #1
ble ctrmm_kernel_L4_M8_40
ble .Lctrmm_kernel_L4_M8_40


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1510,26 +1510,26 @@ ctrmm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b ctrmm_kernel_L4_M8_44
b .Lctrmm_kernel_L4_M8_44


ctrmm_kernel_L4_M8_40:
.Lctrmm_kernel_L4_M8_40:


INIT8x4 INIT8x4


ctrmm_kernel_L4_M8_44:
.Lctrmm_kernel_L4_M8_44:


ands counterL , tempK, #7 ands counterL , tempK, #7
ble ctrmm_kernel_L4_M8_100
ble .Lctrmm_kernel_L4_M8_100


.align 5 .align 5
ctrmm_kernel_L4_M8_46:
.Lctrmm_kernel_L4_M8_46:


KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne ctrmm_kernel_L4_M8_46
bne .Lctrmm_kernel_L4_M8_46


ctrmm_kernel_L4_M8_100:
.Lctrmm_kernel_L4_M8_100:


SAVE8x4 SAVE8x4


@@ -1552,21 +1552,21 @@ ctrmm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]


ctrmm_kernel_L4_M8_END:
.Lctrmm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne ctrmm_kernel_L4_M8_20
bne .Lctrmm_kernel_L4_M8_20


ctrmm_kernel_L4_M4_BEGIN:
.Lctrmm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble ctrmm_kernel_L4_M2_BEGIN
ble .Lctrmm_kernel_L4_M2_BEGIN




ctrmm_kernel_L4_M4_20:
.Lctrmm_kernel_L4_M4_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -1587,46 +1587,46 @@ ctrmm_kernel_L4_M4_20:


asr counterL , tempK, #1 // L = K / 2 asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt ctrmm_kernel_L4_M4_32
blt .Lctrmm_kernel_L4_M4_32


KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble ctrmm_kernel_L4_M4_22a
ble .Lctrmm_kernel_L4_M4_22a
.align 5 .align 5




ctrmm_kernel_L4_M4_22:
.Lctrmm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M4_22
bgt .Lctrmm_kernel_L4_M4_22


ctrmm_kernel_L4_M4_22a:
.Lctrmm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E
b ctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_32:
b .Lctrmm_kernel_L4_M4_44
.Lctrmm_kernel_L4_M4_32:
tst counterL, #1 tst counterL, #1
ble ctrmm_kernel_L4_M4_40
ble .Lctrmm_kernel_L4_M4_40
KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E
b ctrmm_kernel_L4_M4_44
ctrmm_kernel_L4_M4_40:
b .Lctrmm_kernel_L4_M4_44
.Lctrmm_kernel_L4_M4_40:


INIT4x4 INIT4x4


ctrmm_kernel_L4_M4_44:
.Lctrmm_kernel_L4_M4_44:
ands counterL , tempK, #1 ands counterL , tempK, #1
ble ctrmm_kernel_L4_M4_100
ble .Lctrmm_kernel_L4_M4_100


ctrmm_kernel_L4_M4_46:
.Lctrmm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB


ctrmm_kernel_L4_M4_100:
.Lctrmm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


@@ -1645,18 +1645,18 @@ ctrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


ctrmm_kernel_L4_M4_END:
.Lctrmm_kernel_L4_M4_END:


ctrmm_kernel_L4_M2_BEGIN:
.Lctrmm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L4_M1_BEGIN
ble .Lctrmm_kernel_L4_M1_BEGIN


ctrmm_kernel_L4_M2_20:
.Lctrmm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -1679,9 +1679,9 @@ ctrmm_kernel_L4_M2_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L4_M2_40
ble .Lctrmm_kernel_L4_M2_40


ctrmm_kernel_L4_M2_22:
.Lctrmm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1694,22 +1694,22 @@ ctrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_22
bgt .Lctrmm_kernel_L4_M2_22




ctrmm_kernel_L4_M2_40:
.Lctrmm_kernel_L4_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M2_100
ble .Lctrmm_kernel_L4_M2_100


ctrmm_kernel_L4_M2_42:
.Lctrmm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M2_42
bgt .Lctrmm_kernel_L4_M2_42


ctrmm_kernel_L4_M2_100:
.Lctrmm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


@@ -1729,15 +1729,15 @@ ctrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ctrmm_kernel_L4_M2_END:
.Lctrmm_kernel_L4_M2_END:




ctrmm_kernel_L4_M1_BEGIN:
.Lctrmm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L4_END
ble .Lctrmm_kernel_L4_END


ctrmm_kernel_L4_M1_20:
.Lctrmm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -1761,9 +1761,9 @@ ctrmm_kernel_L4_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L4_M1_40
ble .Lctrmm_kernel_L4_M1_40


ctrmm_kernel_L4_M1_22:
.Lctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1775,22 +1775,22 @@ ctrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_22
bgt .Lctrmm_kernel_L4_M1_22




ctrmm_kernel_L4_M1_40:
.Lctrmm_kernel_L4_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L4_M1_100
ble .Lctrmm_kernel_L4_M1_100


ctrmm_kernel_L4_M1_42:
.Lctrmm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L4_M1_42
bgt .Lctrmm_kernel_L4_M1_42


ctrmm_kernel_L4_M1_100:
.Lctrmm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


@@ -1810,7 +1810,7 @@ ctrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


ctrmm_kernel_L4_END:
.Lctrmm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
@@ -1820,19 +1820,19 @@ ctrmm_kernel_L4_END:
#endif #endif


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt ctrmm_kernel_L4_BEGIN
bgt .Lctrmm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lctrmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble ctrmm_kernel_L999 // error, N was less than 4?
ble .Lctrmm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble ctrmm_kernel_L1_BEGIN
ble .Lctrmm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1843,14 +1843,14 @@ ctrmm_kernel_L2_BEGIN: // less than 2 left in N direction
#endif #endif
mov pA, origPA // pA = A mov pA, origPA // pA = A


ctrmm_kernel_L2_M8_BEGIN:
.Lctrmm_kernel_L2_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble ctrmm_kernel_L2_M4_BEGIN
ble .Lctrmm_kernel_L2_M4_BEGIN


ctrmm_kernel_L2_M8_20:
.Lctrmm_kernel_L2_M8_20:


INIT8x2 INIT8x2


@@ -1874,10 +1874,10 @@ ctrmm_kernel_L2_M8_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ctrmm_kernel_L2_M8_40
ble .Lctrmm_kernel_L2_M8_40
.align 5 .align 5


ctrmm_kernel_L2_M8_22:
.Lctrmm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
@@ -1889,22 +1889,22 @@ ctrmm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M8_22
bgt .Lctrmm_kernel_L2_M8_22




ctrmm_kernel_L2_M8_40:
.Lctrmm_kernel_L2_M8_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M8_100
ble .Lctrmm_kernel_L2_M8_100


ctrmm_kernel_L2_M8_42:
.Lctrmm_kernel_L2_M8_42:


KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M8_42
bgt .Lctrmm_kernel_L2_M8_42


ctrmm_kernel_L2_M8_100:
.Lctrmm_kernel_L2_M8_100:


SAVE8x2 SAVE8x2


@@ -1924,21 +1924,21 @@ ctrmm_kernel_L2_M8_100:
add tempOffset, tempOffset, #8 add tempOffset, tempOffset, #8
#endif #endif


ctrmm_kernel_L2_M8_END:
.Lctrmm_kernel_L2_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ctrmm_kernel_L2_M8_20
bgt .Lctrmm_kernel_L2_M8_20


ctrmm_kernel_L2_M4_BEGIN:
.Lctrmm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble ctrmm_kernel_L2_M2_BEGIN
ble .Lctrmm_kernel_L2_M2_BEGIN


ctrmm_kernel_L2_M4_20:
.Lctrmm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -1962,10 +1962,10 @@ ctrmm_kernel_L2_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ctrmm_kernel_L2_M4_40
ble .Lctrmm_kernel_L2_M4_40
.align 5 .align 5


ctrmm_kernel_L2_M4_22:
.Lctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1977,22 +1977,22 @@ ctrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_22
bgt .Lctrmm_kernel_L2_M4_22




ctrmm_kernel_L2_M4_40:
.Lctrmm_kernel_L2_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M4_100
ble .Lctrmm_kernel_L2_M4_100


ctrmm_kernel_L2_M4_42:
.Lctrmm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M4_42
bgt .Lctrmm_kernel_L2_M4_42


ctrmm_kernel_L2_M4_100:
.Lctrmm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


@@ -2012,19 +2012,19 @@ ctrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


ctrmm_kernel_L2_M4_END:
.Lctrmm_kernel_L2_M4_END:




ctrmm_kernel_L2_M2_BEGIN:
.Lctrmm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L2_M1_BEGIN
ble .Lctrmm_kernel_L2_M1_BEGIN


ctrmm_kernel_L2_M2_20:
.Lctrmm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -2048,9 +2048,9 @@ ctrmm_kernel_L2_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ctrmm_kernel_L2_M2_40
ble .Lctrmm_kernel_L2_M2_40


ctrmm_kernel_L2_M2_22:
.Lctrmm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -2063,22 +2063,22 @@ ctrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_22
bgt .Lctrmm_kernel_L2_M2_22




ctrmm_kernel_L2_M2_40:
.Lctrmm_kernel_L2_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M2_100
ble .Lctrmm_kernel_L2_M2_100


ctrmm_kernel_L2_M2_42:
.Lctrmm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M2_42
bgt .Lctrmm_kernel_L2_M2_42


ctrmm_kernel_L2_M2_100:
.Lctrmm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


@@ -2098,15 +2098,15 @@ ctrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ctrmm_kernel_L2_M2_END:
.Lctrmm_kernel_L2_M2_END:




ctrmm_kernel_L2_M1_BEGIN:
.Lctrmm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L2_END
ble .Lctrmm_kernel_L2_END


ctrmm_kernel_L2_M1_20:
.Lctrmm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -2130,9 +2130,9 @@ ctrmm_kernel_L2_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble ctrmm_kernel_L2_M1_40
ble .Lctrmm_kernel_L2_M1_40


ctrmm_kernel_L2_M1_22:
.Lctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -2144,22 +2144,22 @@ ctrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_22
bgt .Lctrmm_kernel_L2_M1_22




ctrmm_kernel_L2_M1_40:
.Lctrmm_kernel_L2_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L2_M1_100
ble .Lctrmm_kernel_L2_M1_100


ctrmm_kernel_L2_M1_42:
.Lctrmm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L2_M1_42
bgt .Lctrmm_kernel_L2_M1_42


ctrmm_kernel_L2_M1_100:
.Lctrmm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


@@ -2179,7 +2179,7 @@ ctrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


ctrmm_kernel_L2_END:
.Lctrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@@ -2187,11 +2187,11 @@ ctrmm_kernel_L2_END:


/******************************************************************************/ /******************************************************************************/


ctrmm_kernel_L1_BEGIN:
.Lctrmm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble ctrmm_kernel_L999 // done
ble .Lctrmm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next add pC , pC , LDC // Update pC to point to next
@@ -2201,14 +2201,14 @@ ctrmm_kernel_L1_BEGIN:
#endif #endif
mov pA, origPA // pA = A mov pA, origPA // pA = A


ctrmm_kernel_L1_M8_BEGIN:
.Lctrmm_kernel_L1_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble ctrmm_kernel_L1_M4_BEGIN
ble .Lctrmm_kernel_L1_M4_BEGIN


ctrmm_kernel_L1_M8_20:
.Lctrmm_kernel_L1_M8_20:


INIT8x1 INIT8x1


@@ -2232,10 +2232,10 @@ ctrmm_kernel_L1_M8_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M8_40
ble .Lctrmm_kernel_L1_M8_40
.align 5 .align 5


ctrmm_kernel_L1_M8_22:
.Lctrmm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@@ -2247,22 +2247,22 @@ ctrmm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M8_22
bgt .Lctrmm_kernel_L1_M8_22




ctrmm_kernel_L1_M8_40:
.Lctrmm_kernel_L1_M8_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M8_100
ble .Lctrmm_kernel_L1_M8_100


ctrmm_kernel_L1_M8_42:
.Lctrmm_kernel_L1_M8_42:


KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M8_42
bgt .Lctrmm_kernel_L1_M8_42


ctrmm_kernel_L1_M8_100:
.Lctrmm_kernel_L1_M8_100:


SAVE8x1 SAVE8x1


@@ -2282,21 +2282,21 @@ ctrmm_kernel_L1_M8_100:
add tempOffset, tempOffset, #8 add tempOffset, tempOffset, #8
#endif #endif


ctrmm_kernel_L1_M8_END:
.Lctrmm_kernel_L1_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ctrmm_kernel_L1_M8_20
bgt .Lctrmm_kernel_L1_M8_20


ctrmm_kernel_L1_M4_BEGIN:
.Lctrmm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble ctrmm_kernel_L1_M2_BEGIN
ble .Lctrmm_kernel_L1_M2_BEGIN


ctrmm_kernel_L1_M4_20:
.Lctrmm_kernel_L1_M4_20:


INIT4x1 INIT4x1


@@ -2319,10 +2319,10 @@ ctrmm_kernel_L1_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M4_40
ble .Lctrmm_kernel_L1_M4_40
.align 5 .align 5


ctrmm_kernel_L1_M4_22:
.Lctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -2334,22 +2334,22 @@ ctrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_22
bgt .Lctrmm_kernel_L1_M4_22




ctrmm_kernel_L1_M4_40:
.Lctrmm_kernel_L1_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M4_100
ble .Lctrmm_kernel_L1_M4_100


ctrmm_kernel_L1_M4_42:
.Lctrmm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M4_42
bgt .Lctrmm_kernel_L1_M4_42


ctrmm_kernel_L1_M4_100:
.Lctrmm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


@@ -2369,18 +2369,18 @@ ctrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


ctrmm_kernel_L1_M4_END:
.Lctrmm_kernel_L1_M4_END:


ctrmm_kernel_L1_M2_BEGIN:
.Lctrmm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ctrmm_kernel_L1_M1_BEGIN
ble .Lctrmm_kernel_L1_M1_BEGIN


ctrmm_kernel_L1_M2_20:
.Lctrmm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -2404,9 +2404,9 @@ ctrmm_kernel_L1_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M2_40
ble .Lctrmm_kernel_L1_M2_40


ctrmm_kernel_L1_M2_22:
.Lctrmm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -2419,22 +2419,22 @@ ctrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_22
bgt .Lctrmm_kernel_L1_M2_22




ctrmm_kernel_L1_M2_40:
.Lctrmm_kernel_L1_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M2_100
ble .Lctrmm_kernel_L1_M2_100


ctrmm_kernel_L1_M2_42:
.Lctrmm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M2_42
bgt .Lctrmm_kernel_L1_M2_42


ctrmm_kernel_L1_M2_100:
.Lctrmm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


@@ -2454,15 +2454,15 @@ ctrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ctrmm_kernel_L1_M2_END:
.Lctrmm_kernel_L1_M2_END:




ctrmm_kernel_L1_M1_BEGIN:
.Lctrmm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ctrmm_kernel_L1_END
ble .Lctrmm_kernel_L1_END


ctrmm_kernel_L1_M1_20:
.Lctrmm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -2486,9 +2486,9 @@ ctrmm_kernel_L1_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ctrmm_kernel_L1_M1_40
ble .Lctrmm_kernel_L1_M1_40


ctrmm_kernel_L1_M1_22:
.Lctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -2500,30 +2500,30 @@ ctrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_22
bgt .Lctrmm_kernel_L1_M1_22




ctrmm_kernel_L1_M1_40:
.Lctrmm_kernel_L1_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ctrmm_kernel_L1_M1_100
ble .Lctrmm_kernel_L1_M1_100


ctrmm_kernel_L1_M1_42:
.Lctrmm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ctrmm_kernel_L1_M1_42
bgt .Lctrmm_kernel_L1_M1_42


ctrmm_kernel_L1_M1_100:
.Lctrmm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




ctrmm_kernel_L1_END:
.Lctrmm_kernel_L1_END:




ctrmm_kernel_L999:
.Lctrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 22
- 22
kernel/arm64/daxpy_thunderx2t99.S View File

@@ -122,53 +122,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble axpy_kernel_L999
ble .Ldaxpy_kernel_L999


fcmp DA, #0.0 fcmp DA, #0.0
beq axpy_kernel_L999
beq .Ldaxpy_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne axpy_kernel_S_BEGIN
bne .Ldaxpy_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne axpy_kernel_S_BEGIN
bne .Ldaxpy_kernel_S_BEGIN


axpy_kernel_F_BEGIN:
.Ldaxpy_kernel_F_BEGIN:


asr I, N, #5 asr I, N, #5
cmp I, xzr cmp I, xzr
beq axpy_kernel_F1
beq .Ldaxpy_kernel_F1


.align 5 .align 5
axpy_kernel_F32:
.Ldaxpy_kernel_F32:


KERNEL_F32 KERNEL_F32


subs I, I, #1 subs I, I, #1
bne axpy_kernel_F32
bne .Ldaxpy_kernel_F32


axpy_kernel_F1:
.Ldaxpy_kernel_F1:


ands I, N, #31 ands I, N, #31
ble axpy_kernel_L999
ble .Ldaxpy_kernel_L999


axpy_kernel_F10:
.Ldaxpy_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne axpy_kernel_F10
bne .Ldaxpy_kernel_F10


b axpy_kernel_L999
b .Ldaxpy_kernel_L999


axpy_kernel_S_BEGIN:
.Ldaxpy_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble axpy_kernel_S1
ble .Ldaxpy_kernel_S1


axpy_kernel_S4:
.Ldaxpy_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -176,21 +176,21 @@ axpy_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne axpy_kernel_S4
bne .Ldaxpy_kernel_S4


axpy_kernel_S1:
.Ldaxpy_kernel_S1:


ands I, N, #3 ands I, N, #3
ble axpy_kernel_L999
ble .Ldaxpy_kernel_L999


axpy_kernel_S10:
.Ldaxpy_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne axpy_kernel_S10
bne .Ldaxpy_kernel_S10


axpy_kernel_L999:
.Ldaxpy_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret

+ 143
- 143
kernel/arm64/dgemm_kernel_4x4.S View File

@@ -775,9 +775,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN


dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -791,20 +791,20 @@ dgemm_kernel_L4_BEGIN:


//------------------------------------------------------------------------------ //------------------------------------------------------------------------------


dgemm_kernel_L4_M8_BEGIN:
.Ldgemm_kernel_L4_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
ble .Ldgemm_kernel_L4_M4_BEGIN


.align 5 .align 5
dgemm_kernel_L4_M8_20:
.Ldgemm_kernel_L4_M8_20:


mov pB, origPB mov pB, origPB
asr counterL , origK, #2 // L = K / 4 asr counterL , origK, #2 // L = K / 4
cmp counterL , #2 cmp counterL , #2
blt dgemm_kernel_L4_M8_32
blt .Ldgemm_kernel_L4_M8_32


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -812,60 +812,60 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
ble .Ldgemm_kernel_L4_M8_22a


.align 5 .align 5
dgemm_kernel_L4_M8_22:
.Ldgemm_kernel_L4_M8_22:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
bgt .Ldgemm_kernel_L4_M8_22


.align 5 .align 5
dgemm_kernel_L4_M8_22a:
.Ldgemm_kernel_L4_M8_22a:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44


.align 5 .align 5
dgemm_kernel_L4_M8_32:
.Ldgemm_kernel_L4_M8_32:


tst counterL, #1 tst counterL, #1
ble dgemm_kernel_L4_M8_40
ble .Ldgemm_kernel_L4_M8_40


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44




dgemm_kernel_L4_M8_40:
.Ldgemm_kernel_L4_M8_40:


INIT8x4 INIT8x4


dgemm_kernel_L4_M8_44:
.Ldgemm_kernel_L4_M8_44:


ands counterL , origK, #3 ands counterL , origK, #3
ble dgemm_kernel_L4_M8_100
ble .Ldgemm_kernel_L4_M8_100


.align 5 .align 5
dgemm_kernel_L4_M8_46:
.Ldgemm_kernel_L4_M8_46:


KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46
bne .Ldgemm_kernel_L4_M8_46


dgemm_kernel_L4_M8_100:
.Ldgemm_kernel_L4_M8_100:
lsl temp, origK, #5 lsl temp, origK, #5
prfm PLDL1KEEP, [pA, temp] prfm PLDL1KEEP, [pA, temp]
prfm PLDL1KEEP, [ppA, temp] prfm PLDL1KEEP, [ppA, temp]
@@ -873,31 +873,31 @@ dgemm_kernel_L4_M8_100:


SAVE8x4 SAVE8x4


dgemm_kernel_L4_M8_END:
.Ldgemm_kernel_L4_M8_END:
lsl temp, origK, #5 // k * 4 * 8 lsl temp, origK, #5 // k * 4 * 8
add pA, pA, temp add pA, pA, temp
add ppA, ppA, temp add ppA, ppA, temp
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
bne .Ldgemm_kernel_L4_M8_20


dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN


dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:


INIT4x4 INIT4x4


mov pB, origPB mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8 asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40


dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:


KERNEL4x4_SUB KERNEL4x4_SUB
KERNEL4x4_SUB KERNEL4x4_SUB
@@ -910,47 +910,47 @@ dgemm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22




dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100


dgemm_kernel_L4_M4_42:
.Ldgemm_kernel_L4_M4_42:


KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
bgt .Ldgemm_kernel_L4_M4_42


dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:




dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN


dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40


dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -963,43 +963,43 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22




dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100


dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42


dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:




dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40


dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1011,45 +1011,45 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22




dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100


dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42


dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4




dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN
bgt .Ldgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1059,24 +1059,24 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction






dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN


dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40
.align 5 .align 5


dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1088,50 +1088,50 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22




dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100


dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42


dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L2_M4_20
bgt .Ldgemm_kernel_L2_M4_20




dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN


dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40


dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1144,43 +1144,43 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22




dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100


dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42


dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:




dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40


dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1192,36 +1192,36 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22




dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100


dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42


dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2




dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8


/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1231,24 +1231,24 @@ dgemm_kernel_L1_BEGIN:






dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN


dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40
.align 5 .align 5


dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1260,50 +1260,50 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22




dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100


dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42


dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L1_M4_20
bgt .Ldgemm_kernel_L1_M4_20




dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN


dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40


dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1316,43 +1316,43 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22




dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100


dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42


dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:




dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40


dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1364,30 +1364,30 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22




dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100


dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42


dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:




dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 176
- 176
kernel/arm64/dgemm_kernel_4x8.S View File

@@ -938,98 +938,98 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8 asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0 cmp counterJ, #0
ble dgemm_kernel_L4_BEGIN
ble .Ldgemm_kernel_L4_BEGIN


/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L8_BEGIN:
.Ldgemm_kernel_L8_BEGIN:


mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #3 add pC, pC, LDC, lsl #3


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


dgemm_kernel_L8_M4_BEGIN:
.Ldgemm_kernel_L8_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L8_M2_BEGIN
ble .Ldgemm_kernel_L8_M2_BEGIN


dgemm_kernel_L8_M4_20:
.Ldgemm_kernel_L8_M4_20:


mov pB, origPB mov pB, origPB


asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L8_M4_32
blt .Ldgemm_kernel_L8_M4_32


KERNEL4x8_I // do one in the K KERNEL4x8_I // do one in the K
KERNEL4x8_M2 // do another in the K KERNEL4x8_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble dgemm_kernel_L8_M4_22a
ble .Ldgemm_kernel_L8_M4_22a
.align 5 .align 5


dgemm_kernel_L8_M4_22:
.Ldgemm_kernel_L8_M4_22:


KERNEL4x8_M1 KERNEL4x8_M1
KERNEL4x8_M2 KERNEL4x8_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L8_M4_22
bgt .Ldgemm_kernel_L8_M4_22




dgemm_kernel_L8_M4_22a:
.Ldgemm_kernel_L8_M4_22a:


KERNEL4x8_M1 KERNEL4x8_M1
KERNEL4x8_E KERNEL4x8_E


b dgemm_kernel_L8_M4_44
b .Ldgemm_kernel_L8_M4_44


dgemm_kernel_L8_M4_32:
.Ldgemm_kernel_L8_M4_32:


tst counterL, #1 tst counterL, #1
ble dgemm_kernel_L8_M4_40
ble .Ldgemm_kernel_L8_M4_40


KERNEL4x8_I KERNEL4x8_I


KERNEL4x8_E KERNEL4x8_E


b dgemm_kernel_L8_M4_44
b .Ldgemm_kernel_L8_M4_44




dgemm_kernel_L8_M4_40:
.Ldgemm_kernel_L8_M4_40:


INIT4x8 INIT4x8


dgemm_kernel_L8_M4_44:
.Ldgemm_kernel_L8_M4_44:


ands counterL , origK, #1 ands counterL , origK, #1
ble dgemm_kernel_L8_M4_100
ble .Ldgemm_kernel_L8_M4_100


dgemm_kernel_L8_M4_46:
.Ldgemm_kernel_L8_M4_46:


KERNEL4x8_SUB KERNEL4x8_SUB


dgemm_kernel_L8_M4_100:
.Ldgemm_kernel_L8_M4_100:


SAVE4x8 SAVE4x8


dgemm_kernel_L8_M4_END:
.Ldgemm_kernel_L8_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dgemm_kernel_L8_M4_20
bne .Ldgemm_kernel_L8_M4_20


dgemm_kernel_L8_M2_BEGIN:
.Ldgemm_kernel_L8_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L8_END
ble .Ldgemm_kernel_L8_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L8_M1_BEGIN
ble .Ldgemm_kernel_L8_M1_BEGIN


dgemm_kernel_L8_M2_20:
.Ldgemm_kernel_L8_M2_20:


INIT2x8 INIT2x8


@@ -1037,9 +1037,9 @@ dgemm_kernel_L8_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L8_M2_40
ble .Ldgemm_kernel_L8_M2_40


dgemm_kernel_L8_M2_22:
.Ldgemm_kernel_L8_M2_22:


KERNEL2x8_SUB KERNEL2x8_SUB
KERNEL2x8_SUB KERNEL2x8_SUB
@@ -1052,34 +1052,34 @@ dgemm_kernel_L8_M2_22:
KERNEL2x8_SUB KERNEL2x8_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L8_M2_22
bgt .Ldgemm_kernel_L8_M2_22




dgemm_kernel_L8_M2_40:
.Ldgemm_kernel_L8_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L8_M2_100
ble .Ldgemm_kernel_L8_M2_100


dgemm_kernel_L8_M2_42:
.Ldgemm_kernel_L8_M2_42:


KERNEL2x8_SUB KERNEL2x8_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L8_M2_42
bgt .Ldgemm_kernel_L8_M2_42


dgemm_kernel_L8_M2_100:
.Ldgemm_kernel_L8_M2_100:


SAVE2x8 SAVE2x8


dgemm_kernel_L8_M2_END:
.Ldgemm_kernel_L8_M2_END:




dgemm_kernel_L8_M1_BEGIN:
.Ldgemm_kernel_L8_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L8_END
ble .Ldgemm_kernel_L8_END


dgemm_kernel_L8_M1_20:
.Ldgemm_kernel_L8_M1_20:


INIT1x8 INIT1x8


@@ -1087,9 +1087,9 @@ dgemm_kernel_L8_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L8_M1_40
ble .Ldgemm_kernel_L8_M1_40


dgemm_kernel_L8_M1_22:
.Ldgemm_kernel_L8_M1_22:
KERNEL1x8_SUB KERNEL1x8_SUB
KERNEL1x8_SUB KERNEL1x8_SUB
KERNEL1x8_SUB KERNEL1x8_SUB
@@ -1101,131 +1101,131 @@ dgemm_kernel_L8_M1_22:
KERNEL1x8_SUB KERNEL1x8_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L8_M1_22
bgt .Ldgemm_kernel_L8_M1_22




dgemm_kernel_L8_M1_40:
.Ldgemm_kernel_L8_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L8_M1_100
ble .Ldgemm_kernel_L8_M1_100


dgemm_kernel_L8_M1_42:
.Ldgemm_kernel_L8_M1_42:


KERNEL1x8_SUB KERNEL1x8_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L8_M1_42
bgt .Ldgemm_kernel_L8_M1_42


dgemm_kernel_L8_M1_100:
.Ldgemm_kernel_L8_M1_100:


SAVE1x8 SAVE1x8


dgemm_kernel_L8_END:
.Ldgemm_kernel_L8_END:


lsl temp, origK, #6 lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 8 * 8 add origPB, origPB, temp // B = B + K * 8 * 8


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L8_BEGIN
bgt .Ldgemm_kernel_L8_BEGIN




/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #7 tst counterJ , #7
ble dgemm_kernel_L999
ble .Ldgemm_kernel_L999


tst counterJ , #4 tst counterJ , #4
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN


mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN


dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:


mov pB, origPB mov pB, origPB


asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M4_32
blt .Ldgemm_kernel_L4_M4_32


KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble dgemm_kernel_L4_M4_22a
ble .Ldgemm_kernel_L4_M4_22a
.align 5 .align 5


dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22




dgemm_kernel_L4_M4_22a:
.Ldgemm_kernel_L4_M4_22a:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b dgemm_kernel_L4_M4_44
b .Ldgemm_kernel_L4_M4_44


dgemm_kernel_L4_M4_32:
.Ldgemm_kernel_L4_M4_32:


tst counterL, #1 tst counterL, #1
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40


KERNEL4x4_I KERNEL4x4_I


KERNEL4x4_E KERNEL4x4_E


b dgemm_kernel_L4_M4_44
b .Ldgemm_kernel_L4_M4_44




dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:


INIT4x4 INIT4x4


dgemm_kernel_L4_M4_44:
.Ldgemm_kernel_L4_M4_44:


ands counterL , origK, #1 ands counterL , origK, #1
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100


dgemm_kernel_L4_M4_46:
.Ldgemm_kernel_L4_M4_46:


KERNEL4x4_SUB KERNEL4x4_SUB


dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dgemm_kernel_L4_M4_20
bne .Ldgemm_kernel_L4_M4_20


dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN


dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -1233,9 +1233,9 @@ dgemm_kernel_L4_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40


dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1248,34 +1248,34 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22




dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100


dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42


dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:




dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -1283,9 +1283,9 @@ dgemm_kernel_L4_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40


dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1297,40 +1297,40 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22




dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100


dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42


dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8


/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1339,14 +1339,14 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A




dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN


dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -1354,10 +1354,10 @@ dgemm_kernel_L2_M4_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40
.align 5 .align 5


dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1369,41 +1369,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22




dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100


dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42


dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L2_M4_20
bgt .Ldgemm_kernel_L2_M4_20




dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN


dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -1411,9 +1411,9 @@ dgemm_kernel_L2_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40


dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1426,34 +1426,34 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22




dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100


dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42


dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:




dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1461,9 +1461,9 @@ dgemm_kernel_L2_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40


dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1475,35 +1475,35 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22




dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100


dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42


dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8


/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1511,24 +1511,24 @@ dgemm_kernel_L1_BEGIN:


mov pA, origPA // pA = A mov pA, origPA // pA = A


dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN


dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40
.align 5 .align 5


dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1540,41 +1540,41 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22




dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100


dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42


dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L1_M4_20
bgt .Ldgemm_kernel_L1_M4_20




dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN


dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1582,9 +1582,9 @@ dgemm_kernel_L1_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40


dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1597,34 +1597,34 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22




dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100


dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42


dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:




dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1632,9 +1632,9 @@ dgemm_kernel_L1_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40


dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1646,30 +1646,30 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22




dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100


dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42


dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:




dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 169
- 169
kernel/arm64/dgemm_kernel_8x4.S View File

@@ -885,12 +885,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


.align 5 .align 5
dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -900,21 +900,21 @@ dgemm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


dgemm_kernel_L4_M8_BEGIN:
.Ldgemm_kernel_L4_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
ble .Ldgemm_kernel_L4_M4_BEGIN


.align 5 .align 5
dgemm_kernel_L4_M8_20:
.Ldgemm_kernel_L4_M8_20:


mov pB, origPB mov pB, origPB


asr counterL , origK, #3 // L = K / 8 asr counterL , origK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M8_32
blt .Ldgemm_kernel_L4_M8_32


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -926,10 +926,10 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
ble .Ldgemm_kernel_L4_M8_22a


.align 5 .align 5
dgemm_kernel_L4_M8_22:
.Ldgemm_kernel_L4_M8_22:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@@ -941,10 +941,10 @@ dgemm_kernel_L4_M8_22:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
bgt .Ldgemm_kernel_L4_M8_22


.align 5 .align 5
dgemm_kernel_L4_M8_22a:
.Ldgemm_kernel_L4_M8_22a:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@@ -955,13 +955,13 @@ dgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44


.align 5 .align 5
dgemm_kernel_L4_M8_32:
.Ldgemm_kernel_L4_M8_32:


tst counterL, #1 tst counterL, #1
ble dgemm_kernel_L4_M8_40
ble .Ldgemm_kernel_L4_M8_40


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -972,46 +972,46 @@ dgemm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44


dgemm_kernel_L4_M8_40:
.Ldgemm_kernel_L4_M8_40:


INIT8x4 INIT8x4


dgemm_kernel_L4_M8_44:
.Ldgemm_kernel_L4_M8_44:


ands counterL , origK, #7 ands counterL , origK, #7
ble dgemm_kernel_L4_M8_100
ble .Ldgemm_kernel_L4_M8_100


.align 5 .align 5
dgemm_kernel_L4_M8_46:
.Ldgemm_kernel_L4_M8_46:


KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46
bne .Ldgemm_kernel_L4_M8_46


dgemm_kernel_L4_M8_100:
.Ldgemm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]


SAVE8x4 SAVE8x4


dgemm_kernel_L4_M8_END:
.Ldgemm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
bne .Ldgemm_kernel_L4_M8_20


dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN


dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:


INIT4x4 INIT4x4


@@ -1019,10 +1019,10 @@ dgemm_kernel_L4_M4_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40


.align 5 .align 5
dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:


KERNEL4x4_SUB KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1043,38 +1043,38 @@ dgemm_kernel_L4_M4_22:
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22


dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100


dgemm_kernel_L4_M4_42:
.Ldgemm_kernel_L4_M4_42:


KERNEL4x4_SUB KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
bgt .Ldgemm_kernel_L4_M4_42


dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:


dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN


dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -1082,10 +1082,10 @@ dgemm_kernel_L4_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40


.align 5 .align 5
dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1104,37 +1104,37 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22




dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100


prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42


dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:




dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -1142,10 +1142,10 @@ dgemm_kernel_L4_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40


.align 5 .align 5
dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1163,46 +1163,46 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22




dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100


prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42


dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN
bgt .Ldgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN


mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
@@ -1211,15 +1211,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov pA, origPA // pA = A mov pA, origPA // pA = A


dgemm_kernel_L2_M8_BEGIN:
.Ldgemm_kernel_L2_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L2_M4_BEGIN
ble .Ldgemm_kernel_L2_M4_BEGIN


.align 5 .align 5
dgemm_kernel_L2_M8_20:
.Ldgemm_kernel_L2_M8_20:


INIT8x2 INIT8x2


@@ -1227,10 +1227,10 @@ dgemm_kernel_L2_M8_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M8_40
ble .Ldgemm_kernel_L2_M8_40


.align 5 .align 5
dgemm_kernel_L2_M8_22:
.Ldgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1244,41 +1244,41 @@ dgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_22
bgt .Ldgemm_kernel_L2_M8_22


dgemm_kernel_L2_M8_40:
.Ldgemm_kernel_L2_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M8_100
ble .Ldgemm_kernel_L2_M8_100


prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M8_42:
.Ldgemm_kernel_L2_M8_42:


KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_42
bgt .Ldgemm_kernel_L2_M8_42


dgemm_kernel_L2_M8_100:
.Ldgemm_kernel_L2_M8_100:


SAVE8x2 SAVE8x2


dgemm_kernel_L2_M8_END:
.Ldgemm_kernel_L2_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L2_M8_20
bgt .Ldgemm_kernel_L2_M8_20


dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN


dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -1286,10 +1286,10 @@ dgemm_kernel_L2_M4_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40


.align 5 .align 5
dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1307,41 +1307,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22




dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100


prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42


dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:




dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN


dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -1349,9 +1349,9 @@ dgemm_kernel_L2_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40


dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1368,37 +1368,37 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22


prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100


dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42


dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:




dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1406,9 +1406,9 @@ dgemm_kernel_L2_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40


dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
@@ -1424,62 +1424,62 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22


prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64]
dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100


dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42


dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8


/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next add pC , pC , LDC // Update pC to point to next


mov pA, origPA // pA = A mov pA, origPA // pA = A


dgemm_kernel_L1_M8_BEGIN:
.Ldgemm_kernel_L1_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L1_M4_BEGIN
ble .Ldgemm_kernel_L1_M4_BEGIN


.align 5 .align 5
dgemm_kernel_L1_M8_20:
.Ldgemm_kernel_L1_M8_20:


INIT8x1 INIT8x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M8_40
ble .Ldgemm_kernel_L1_M8_40


.align 5 .align 5
dgemm_kernel_L1_M8_22:
.Ldgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@@ -1493,51 +1493,51 @@ dgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_22
bgt .Ldgemm_kernel_L1_M8_22




dgemm_kernel_L1_M8_40:
.Ldgemm_kernel_L1_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M8_100
ble .Ldgemm_kernel_L1_M8_100


prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M8_42:
.Ldgemm_kernel_L1_M8_42:


KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_42
bgt .Ldgemm_kernel_L1_M8_42


dgemm_kernel_L1_M8_100:
.Ldgemm_kernel_L1_M8_100:


SAVE8x1 SAVE8x1


dgemm_kernel_L1_M8_END:
.Ldgemm_kernel_L1_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L1_M8_20
bgt .Ldgemm_kernel_L1_M8_20


dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN


dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40


.align 5 .align 5
dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1555,39 +1555,39 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22




dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100


prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42


dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:


dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN


dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1595,9 +1595,9 @@ dgemm_kernel_L1_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40


dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1614,36 +1614,36 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22


prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100


dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42


dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:




dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1651,10 +1651,10 @@ dgemm_kernel_L1_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40




dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
@@ -1668,32 +1668,32 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22




dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100


prfm PLDL1KEEP, [pA, #A_PRE_SIZE] prfm PLDL1KEEP, [pA, #A_PRE_SIZE]
prfm PLDL1KEEP, [pB, #B_PRE_SIZE] prfm PLDL1KEEP, [pB, #B_PRE_SIZE]
dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42


dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:




dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 169
- 169
kernel/arm64/dgemm_kernel_8x4_thunderx2t99.S View File

@@ -962,12 +962,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble dgemm_kernel_L2_BEGIN
ble .Ldgemm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


.align 5 .align 5
dgemm_kernel_L4_BEGIN:
.Ldgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -977,21 +977,21 @@ dgemm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


dgemm_kernel_L4_M8_BEGIN:
.Ldgemm_kernel_L4_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L4_M4_BEGIN
ble .Ldgemm_kernel_L4_M4_BEGIN


.align 5 .align 5
dgemm_kernel_L4_M8_20:
.Ldgemm_kernel_L4_M8_20:


mov pB, origPB mov pB, origPB


asr counterL , origK, #7 // L = K / 128 asr counterL , origK, #7 // L = K / 128
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dgemm_kernel_L4_M8_32
blt .Ldgemm_kernel_L4_M8_32


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1003,18 +1003,18 @@ dgemm_kernel_L4_M8_20:
KERNEL8x4_M1_M2_x1 KERNEL8x4_M1_M2_x1


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble dgemm_kernel_L4_M8_22a
ble .Ldgemm_kernel_L4_M8_22a


.align 5 .align 5
dgemm_kernel_L4_M8_22:
.Ldgemm_kernel_L4_M8_22:


KERNEL8x4_M1_M2_x64 KERNEL8x4_M1_M2_x64


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M8_22
bgt .Ldgemm_kernel_L4_M8_22


.align 5 .align 5
dgemm_kernel_L4_M8_22a:
.Ldgemm_kernel_L4_M8_22a:


KERNEL8x4_M1_M2_x32 KERNEL8x4_M1_M2_x32
KERNEL8x4_M1_M2_x16 KERNEL8x4_M1_M2_x16
@@ -1025,13 +1025,13 @@ dgemm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44


.align 5 .align 5
dgemm_kernel_L4_M8_32:
.Ldgemm_kernel_L4_M8_32:


tst counterL, #1 tst counterL, #1
ble dgemm_kernel_L4_M8_40
ble .Ldgemm_kernel_L4_M8_40


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -1043,26 +1043,26 @@ dgemm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b dgemm_kernel_L4_M8_44
b .Ldgemm_kernel_L4_M8_44


dgemm_kernel_L4_M8_40:
.Ldgemm_kernel_L4_M8_40:


INIT8x4 INIT8x4


dgemm_kernel_L4_M8_44:
.Ldgemm_kernel_L4_M8_44:


ands counterL , origK, #127 ands counterL , origK, #127
ble dgemm_kernel_L4_M8_100
ble .Ldgemm_kernel_L4_M8_100


.align 5 .align 5
dgemm_kernel_L4_M8_46:
.Ldgemm_kernel_L4_M8_46:


KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne dgemm_kernel_L4_M8_46
bne .Ldgemm_kernel_L4_M8_46


dgemm_kernel_L4_M8_100:
.Ldgemm_kernel_L4_M8_100:
prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow0, C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow1, C_PRE_SIZE]
prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE] prfm PLDL2KEEP, [pCRow2, C_PRE_SIZE]
@@ -1073,20 +1073,20 @@ dgemm_kernel_L4_M8_100:


SAVE8x4 SAVE8x4


dgemm_kernel_L4_M8_END:
.Ldgemm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dgemm_kernel_L4_M8_20
bne .Ldgemm_kernel_L4_M8_20


dgemm_kernel_L4_M4_BEGIN:
.Ldgemm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble dgemm_kernel_L4_M2_BEGIN
ble .Ldgemm_kernel_L4_M2_BEGIN


dgemm_kernel_L4_M4_20:
.Ldgemm_kernel_L4_M4_20:


INIT4x4 INIT4x4


@@ -1094,10 +1094,10 @@ dgemm_kernel_L4_M4_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M4_40
ble .Ldgemm_kernel_L4_M4_40


.align 5 .align 5
dgemm_kernel_L4_M4_22:
.Ldgemm_kernel_L4_M4_22:


KERNEL4x4_SUB KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1118,38 +1118,38 @@ dgemm_kernel_L4_M4_22:
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_22
bgt .Ldgemm_kernel_L4_M4_22


dgemm_kernel_L4_M4_40:
.Ldgemm_kernel_L4_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M4_100
ble .Ldgemm_kernel_L4_M4_100


dgemm_kernel_L4_M4_42:
.Ldgemm_kernel_L4_M4_42:


KERNEL4x4_SUB KERNEL4x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M4_42
bgt .Ldgemm_kernel_L4_M4_42


dgemm_kernel_L4_M4_100:
.Ldgemm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


dgemm_kernel_L4_M4_END:
.Ldgemm_kernel_L4_M4_END:


dgemm_kernel_L4_M2_BEGIN:
.Ldgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L4_M1_BEGIN
ble .Ldgemm_kernel_L4_M1_BEGIN


dgemm_kernel_L4_M2_20:
.Ldgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -1157,10 +1157,10 @@ dgemm_kernel_L4_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M2_40
ble .Ldgemm_kernel_L4_M2_40


.align 5 .align 5
dgemm_kernel_L4_M2_22:
.Ldgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1179,37 +1179,37 @@ dgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_22
bgt .Ldgemm_kernel_L4_M2_22




dgemm_kernel_L4_M2_40:
.Ldgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M2_100
ble .Ldgemm_kernel_L4_M2_100


prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
dgemm_kernel_L4_M2_42:
.Ldgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M2_42
bgt .Ldgemm_kernel_L4_M2_42


dgemm_kernel_L4_M2_100:
.Ldgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


dgemm_kernel_L4_M2_END:
.Ldgemm_kernel_L4_M2_END:




dgemm_kernel_L4_M1_BEGIN:
.Ldgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L4_END
ble .Ldgemm_kernel_L4_END


dgemm_kernel_L4_M1_20:
.Ldgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -1217,10 +1217,10 @@ dgemm_kernel_L4_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L4_M1_40
ble .Ldgemm_kernel_L4_M1_40


.align 5 .align 5
dgemm_kernel_L4_M1_22:
.Ldgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1238,46 +1238,46 @@ dgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_22
bgt .Ldgemm_kernel_L4_M1_22




dgemm_kernel_L4_M1_40:
.Ldgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L4_M1_100
ble .Ldgemm_kernel_L4_M1_100


prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
dgemm_kernel_L4_M1_42:
.Ldgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L4_M1_42
bgt .Ldgemm_kernel_L4_M1_42


dgemm_kernel_L4_M1_100:
.Ldgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


dgemm_kernel_L4_END:
.Ldgemm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dgemm_kernel_L4_BEGIN
bgt .Ldgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dgemm_kernel_L999 // error, N was less than 4?
ble .Ldgemm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble dgemm_kernel_L1_BEGIN
ble .Ldgemm_kernel_L1_BEGIN


mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
@@ -1286,15 +1286,15 @@ dgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov pA, origPA // pA = A mov pA, origPA // pA = A


dgemm_kernel_L2_M8_BEGIN:
.Ldgemm_kernel_L2_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L2_M4_BEGIN
ble .Ldgemm_kernel_L2_M4_BEGIN


.align 5 .align 5
dgemm_kernel_L2_M8_20:
.Ldgemm_kernel_L2_M8_20:


INIT8x2 INIT8x2


@@ -1302,10 +1302,10 @@ dgemm_kernel_L2_M8_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M8_40
ble .Ldgemm_kernel_L2_M8_40


.align 5 .align 5
dgemm_kernel_L2_M8_22:
.Ldgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1319,41 +1319,41 @@ dgemm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_22
bgt .Ldgemm_kernel_L2_M8_22


dgemm_kernel_L2_M8_40:
.Ldgemm_kernel_L2_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M8_100
ble .Ldgemm_kernel_L2_M8_100


prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M8_42:
.Ldgemm_kernel_L2_M8_42:


KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M8_42
bgt .Ldgemm_kernel_L2_M8_42


dgemm_kernel_L2_M8_100:
.Ldgemm_kernel_L2_M8_100:


SAVE8x2 SAVE8x2


dgemm_kernel_L2_M8_END:
.Ldgemm_kernel_L2_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L2_M8_20
bgt .Ldgemm_kernel_L2_M8_20


dgemm_kernel_L2_M4_BEGIN:
.Ldgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L2_M2_BEGIN
ble .Ldgemm_kernel_L2_M2_BEGIN


dgemm_kernel_L2_M4_20:
.Ldgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -1361,10 +1361,10 @@ dgemm_kernel_L2_M4_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M4_40
ble .Ldgemm_kernel_L2_M4_40


.align 5 .align 5
dgemm_kernel_L2_M4_22:
.Ldgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1382,41 +1382,41 @@ dgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_22
bgt .Ldgemm_kernel_L2_M4_22




dgemm_kernel_L2_M4_40:
.Ldgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M4_100
ble .Ldgemm_kernel_L2_M4_100


prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M4_42:
.Ldgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M4_42
bgt .Ldgemm_kernel_L2_M4_42


dgemm_kernel_L2_M4_100:
.Ldgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


dgemm_kernel_L2_M4_END:
.Ldgemm_kernel_L2_M4_END:




dgemm_kernel_L2_M2_BEGIN:
.Ldgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L2_M1_BEGIN
ble .Ldgemm_kernel_L2_M1_BEGIN


dgemm_kernel_L2_M2_20:
.Ldgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -1424,9 +1424,9 @@ dgemm_kernel_L2_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dgemm_kernel_L2_M2_40
ble .Ldgemm_kernel_L2_M2_40


dgemm_kernel_L2_M2_22:
.Ldgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1443,37 +1443,37 @@ dgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_22
bgt .Ldgemm_kernel_L2_M2_22


prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M2_40:
.Ldgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M2_100
ble .Ldgemm_kernel_L2_M2_100


dgemm_kernel_L2_M2_42:
.Ldgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M2_42
bgt .Ldgemm_kernel_L2_M2_42


dgemm_kernel_L2_M2_100:
.Ldgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


dgemm_kernel_L2_M2_END:
.Ldgemm_kernel_L2_M2_END:




dgemm_kernel_L2_M1_BEGIN:
.Ldgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L2_END
ble .Ldgemm_kernel_L2_END


dgemm_kernel_L2_M1_20:
.Ldgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1481,9 +1481,9 @@ dgemm_kernel_L2_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dgemm_kernel_L2_M1_40
ble .Ldgemm_kernel_L2_M1_40


dgemm_kernel_L2_M1_22:
.Ldgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
@@ -1499,62 +1499,62 @@ dgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_22
bgt .Ldgemm_kernel_L2_M1_22


prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE_64] prfm PLDL1KEEP, [pB, B_PRE_SIZE_64]
dgemm_kernel_L2_M1_40:
.Ldgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L2_M1_100
ble .Ldgemm_kernel_L2_M1_100


dgemm_kernel_L2_M1_42:
.Ldgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L2_M1_42
bgt .Ldgemm_kernel_L2_M1_42


dgemm_kernel_L2_M1_100:
.Ldgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


dgemm_kernel_L2_END:
.Ldgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8


/******************************************************************************/ /******************************************************************************/


dgemm_kernel_L1_BEGIN:
.Ldgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dgemm_kernel_L999 // done
ble .Ldgemm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next add pC , pC , LDC // Update pC to point to next


mov pA, origPA // pA = A mov pA, origPA // pA = A


dgemm_kernel_L1_M8_BEGIN:
.Ldgemm_kernel_L1_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dgemm_kernel_L1_M4_BEGIN
ble .Ldgemm_kernel_L1_M4_BEGIN


.align 5 .align 5
dgemm_kernel_L1_M8_20:
.Ldgemm_kernel_L1_M8_20:


INIT8x1 INIT8x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M8_40
ble .Ldgemm_kernel_L1_M8_40


.align 5 .align 5
dgemm_kernel_L1_M8_22:
.Ldgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@@ -1568,51 +1568,51 @@ dgemm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_22
bgt .Ldgemm_kernel_L1_M8_22




dgemm_kernel_L1_M8_40:
.Ldgemm_kernel_L1_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M8_100
ble .Ldgemm_kernel_L1_M8_100


prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M8_42:
.Ldgemm_kernel_L1_M8_42:


KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M8_42
bgt .Ldgemm_kernel_L1_M8_42


dgemm_kernel_L1_M8_100:
.Ldgemm_kernel_L1_M8_100:


SAVE8x1 SAVE8x1


dgemm_kernel_L1_M8_END:
.Ldgemm_kernel_L1_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dgemm_kernel_L1_M8_20
bgt .Ldgemm_kernel_L1_M8_20


dgemm_kernel_L1_M4_BEGIN:
.Ldgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble dgemm_kernel_L1_M2_BEGIN
ble .Ldgemm_kernel_L1_M2_BEGIN


dgemm_kernel_L1_M4_20:
.Ldgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M4_40
ble .Ldgemm_kernel_L1_M4_40


.align 5 .align 5
dgemm_kernel_L1_M4_22:
.Ldgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1630,39 +1630,39 @@ dgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_22
bgt .Ldgemm_kernel_L1_M4_22




dgemm_kernel_L1_M4_40:
.Ldgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M4_100
ble .Ldgemm_kernel_L1_M4_100


prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M4_42:
.Ldgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M4_42
bgt .Ldgemm_kernel_L1_M4_42


dgemm_kernel_L1_M4_100:
.Ldgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


dgemm_kernel_L1_M4_END:
.Ldgemm_kernel_L1_M4_END:


dgemm_kernel_L1_M2_BEGIN:
.Ldgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dgemm_kernel_L1_M1_BEGIN
ble .Ldgemm_kernel_L1_M1_BEGIN


dgemm_kernel_L1_M2_20:
.Ldgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1670,9 +1670,9 @@ dgemm_kernel_L1_M2_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M2_40
ble .Ldgemm_kernel_L1_M2_40


dgemm_kernel_L1_M2_22:
.Ldgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1689,36 +1689,36 @@ dgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_22
bgt .Ldgemm_kernel_L1_M2_22


prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pA, A_PRE_SIZE_64] prfm PLDL1KEEP, [pA, A_PRE_SIZE_64]
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M2_40:
.Ldgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M2_100
ble .Ldgemm_kernel_L1_M2_100


dgemm_kernel_L1_M2_42:
.Ldgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M2_42
bgt .Ldgemm_kernel_L1_M2_42


dgemm_kernel_L1_M2_100:
.Ldgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


dgemm_kernel_L1_M2_END:
.Ldgemm_kernel_L1_M2_END:




dgemm_kernel_L1_M1_BEGIN:
.Ldgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dgemm_kernel_L1_END
ble .Ldgemm_kernel_L1_END


dgemm_kernel_L1_M1_20:
.Ldgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1726,10 +1726,10 @@ dgemm_kernel_L1_M1_20:


asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dgemm_kernel_L1_M1_40
ble .Ldgemm_kernel_L1_M1_40




dgemm_kernel_L1_M1_22:
.Ldgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
@@ -1743,32 +1743,32 @@ dgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_22
bgt .Ldgemm_kernel_L1_M1_22




dgemm_kernel_L1_M1_40:
.Ldgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble dgemm_kernel_L1_M1_100
ble .Ldgemm_kernel_L1_M1_100


prfm PLDL1KEEP, [pA, A_PRE_SIZE] prfm PLDL1KEEP, [pA, A_PRE_SIZE]
prfm PLDL1KEEP, [pB, B_PRE_SIZE] prfm PLDL1KEEP, [pB, B_PRE_SIZE]
dgemm_kernel_L1_M1_42:
.Ldgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dgemm_kernel_L1_M1_42
bgt .Ldgemm_kernel_L1_M1_42


dgemm_kernel_L1_M1_100:
.Ldgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




dgemm_kernel_L1_END:
.Ldgemm_kernel_L1_END:




dgemm_kernel_L999:
.Ldgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 36
- 36
kernel/arm64/dgemm_ncopy_4.S View File

@@ -192,14 +192,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


lsl LDA, LDA, #3 // LDA = LDA * SIZE lsl LDA, LDA, #3 // LDA = LDA * SIZE


dgemm_ncopy_L4_BEGIN:
.Ldgemm_ncopy_L4_BEGIN:


asr J, N, #2 // J = N / 4 asr J, N, #2 // J = N / 4
cmp J, #0 cmp J, #0
ble dgemm_ncopy_L2_BEGIN
ble .Ldgemm_ncopy_L2_BEGIN


.align 5 .align 5
dgemm_ncopy_L4_M4_BEGIN:
.Ldgemm_ncopy_L4_M4_BEGIN:


mov A01, A00 mov A01, A00
add A02, A01, LDA add A02, A01, LDA
@@ -209,128 +209,128 @@ dgemm_ncopy_L4_M4_BEGIN:


asr I, M, #2 // I = M / 4 asr I, M, #2 // I = M / 4
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L4_M4_40
ble .Ldgemm_ncopy_L4_M4_40


.align 5 .align 5
dgemm_ncopy_L4_M4_20:
.Ldgemm_ncopy_L4_M4_20:


COPY4x4 COPY4x4


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L4_M4_20
bne .Ldgemm_ncopy_L4_M4_20




dgemm_ncopy_L4_M4_40:
.Ldgemm_ncopy_L4_M4_40:


and I, M , #3 and I, M , #3
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L4_M4_END
ble .Ldgemm_ncopy_L4_M4_END


.align 5 .align 5
dgemm_ncopy_L4_M4_60:
.Ldgemm_ncopy_L4_M4_60:


COPY1x4 COPY1x4


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L4_M4_60
bne .Ldgemm_ncopy_L4_M4_60




dgemm_ncopy_L4_M4_END:
.Ldgemm_ncopy_L4_M4_END:


subs J , J, #1 // j-- subs J , J, #1 // j--
bne dgemm_ncopy_L4_M4_BEGIN
bne .Ldgemm_ncopy_L4_M4_BEGIN






/*********************************************************************************************/ /*********************************************************************************************/


dgemm_ncopy_L2_BEGIN:
.Ldgemm_ncopy_L2_BEGIN:


tst N, #3 tst N, #3
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999


tst N, #2 tst N, #2
ble dgemm_ncopy_L1_BEGIN
ble .Ldgemm_ncopy_L1_BEGIN


dgemm_ncopy_L2_M4_BEGIN:
.Ldgemm_ncopy_L2_M4_BEGIN:
mov A01, A00 mov A01, A00
add A02, A01, LDA add A02, A01, LDA
add A00, A02, LDA add A00, A02, LDA


asr I, M, #2 // I = M / 4 asr I, M, #2 // I = M / 4
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L2_M4_40
ble .Ldgemm_ncopy_L2_M4_40


.align 5 .align 5
dgemm_ncopy_L2_M4_20:
.Ldgemm_ncopy_L2_M4_20:


COPY4x2 COPY4x2


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L2_M4_20
bne .Ldgemm_ncopy_L2_M4_20




dgemm_ncopy_L2_M4_40:
.Ldgemm_ncopy_L2_M4_40:


and I, M , #3 and I, M , #3
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L2_M4_END
ble .Ldgemm_ncopy_L2_M4_END


.align 5 .align 5
dgemm_ncopy_L2_M4_60:
.Ldgemm_ncopy_L2_M4_60:


COPY1x2 COPY1x2


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L2_M4_60
bne .Ldgemm_ncopy_L2_M4_60




dgemm_ncopy_L2_M4_END:
.Ldgemm_ncopy_L2_M4_END:




/*********************************************************************************************/ /*********************************************************************************************/


dgemm_ncopy_L1_BEGIN:
.Ldgemm_ncopy_L1_BEGIN:


tst N, #1 tst N, #1
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999




dgemm_ncopy_L1_M4_BEGIN:
.Ldgemm_ncopy_L1_M4_BEGIN:


mov A01, A00 mov A01, A00


asr I, M, #2 // I = M / 4 asr I, M, #2 // I = M / 4
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L1_M4_40
ble .Ldgemm_ncopy_L1_M4_40


.align 5 .align 5
dgemm_ncopy_L1_M4_20:
.Ldgemm_ncopy_L1_M4_20:


COPY4x1 COPY4x1


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L1_M4_20
bne .Ldgemm_ncopy_L1_M4_20




dgemm_ncopy_L1_M4_40:
.Ldgemm_ncopy_L1_M4_40:


and I, M , #3 and I, M , #3
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L1_M4_END
ble .Ldgemm_ncopy_L1_M4_END


.align 5 .align 5
dgemm_ncopy_L1_M4_60:
.Ldgemm_ncopy_L1_M4_60:


COPY1x1 COPY1x1


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L1_M4_60
bne .Ldgemm_ncopy_L1_M4_60




dgemm_ncopy_L1_M4_END:
.Ldgemm_ncopy_L1_M4_END:


dgemm_ncopy_L999:
.Ldgemm_ncopy_L999:


mov x0, #0 mov x0, #0
RESTORE_REGS RESTORE_REGS


+ 48
- 48
kernel/arm64/dgemm_ncopy_8.S View File

@@ -353,13 +353,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


lsl LDA, LDA, #3 // LDA = LDA * SIZE lsl LDA, LDA, #3 // LDA = LDA * SIZE


dgemm_ncopy_L8_BEGIN:
.Ldgemm_ncopy_L8_BEGIN:


asr J, N, #3 // J = N / 8 asr J, N, #3 // J = N / 8
cmp J, #0 cmp J, #0
ble dgemm_ncopy_L4_BEGIN
ble .Ldgemm_ncopy_L4_BEGIN


dgemm_ncopy_L8_M8_BEGIN:
.Ldgemm_ncopy_L8_M8_BEGIN:


mov A01, A00 mov A01, A00
add A02, A01, LDA add A02, A01, LDA
@@ -374,46 +374,46 @@ dgemm_ncopy_L8_M8_BEGIN:


asr I, M, #3 // I = M / 8 asr I, M, #3 // I = M / 8
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L8_M8_40
ble .Ldgemm_ncopy_L8_M8_40


dgemm_ncopy_L8_M8_20:
.Ldgemm_ncopy_L8_M8_20:


COPY8x8 COPY8x8


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L8_M8_20
bne .Ldgemm_ncopy_L8_M8_20




dgemm_ncopy_L8_M8_40:
.Ldgemm_ncopy_L8_M8_40:


and I, M , #7 and I, M , #7
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L8_M8_END
ble .Ldgemm_ncopy_L8_M8_END


dgemm_ncopy_L8_M8_60:
.Ldgemm_ncopy_L8_M8_60:


COPY1x8 COPY1x8


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L8_M8_60
bne .Ldgemm_ncopy_L8_M8_60




dgemm_ncopy_L8_M8_END:
.Ldgemm_ncopy_L8_M8_END:


subs J , J, #1 // j-- subs J , J, #1 // j--
bne dgemm_ncopy_L8_M8_BEGIN
bne .Ldgemm_ncopy_L8_M8_BEGIN


/*********************************************************************************************/ /*********************************************************************************************/


dgemm_ncopy_L4_BEGIN:
.Ldgemm_ncopy_L4_BEGIN:


tst N, #7 tst N, #7
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999


tst N, #4 tst N, #4
ble dgemm_ncopy_L2_BEGIN
ble .Ldgemm_ncopy_L2_BEGIN


dgemm_ncopy_L4_M8_BEGIN:
.Ldgemm_ncopy_L4_M8_BEGIN:


mov A01, A00 mov A01, A00
add A02, A01, LDA add A02, A01, LDA
@@ -423,118 +423,118 @@ dgemm_ncopy_L4_M8_BEGIN:


asr I, M, #3 // I = M / 8 asr I, M, #3 // I = M / 8
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L4_M8_40
ble .Ldgemm_ncopy_L4_M8_40


dgemm_ncopy_L4_M8_20:
.Ldgemm_ncopy_L4_M8_20:


COPY8x4 COPY8x4


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L4_M8_20
bne .Ldgemm_ncopy_L4_M8_20




dgemm_ncopy_L4_M8_40:
.Ldgemm_ncopy_L4_M8_40:


and I, M , #7 and I, M , #7
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L4_M8_END
ble .Ldgemm_ncopy_L4_M8_END


dgemm_ncopy_L4_M8_60:
.Ldgemm_ncopy_L4_M8_60:


COPY1x4 COPY1x4


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L4_M8_60
bne .Ldgemm_ncopy_L4_M8_60




dgemm_ncopy_L4_M8_END:
.Ldgemm_ncopy_L4_M8_END:




/*********************************************************************************************/ /*********************************************************************************************/


dgemm_ncopy_L2_BEGIN:
.Ldgemm_ncopy_L2_BEGIN:


tst N, #3 tst N, #3
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999


tst N, #2 tst N, #2
ble dgemm_ncopy_L1_BEGIN
ble .Ldgemm_ncopy_L1_BEGIN


dgemm_ncopy_L2_M8_BEGIN:
.Ldgemm_ncopy_L2_M8_BEGIN:
mov A01, A00 mov A01, A00
add A02, A01, LDA add A02, A01, LDA
add A00, A02, LDA add A00, A02, LDA


asr I, M, #3 // I = M / 8 asr I, M, #3 // I = M / 8
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L2_M8_40
ble .Ldgemm_ncopy_L2_M8_40


dgemm_ncopy_L2_M8_20:
.Ldgemm_ncopy_L2_M8_20:


COPY8x2 COPY8x2


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L2_M8_20
bne .Ldgemm_ncopy_L2_M8_20




dgemm_ncopy_L2_M8_40:
.Ldgemm_ncopy_L2_M8_40:


and I, M , #7 and I, M , #7
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L2_M8_END
ble .Ldgemm_ncopy_L2_M8_END


dgemm_ncopy_L2_M8_60:
.Ldgemm_ncopy_L2_M8_60:


COPY1x2 COPY1x2


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L2_M8_60
bne .Ldgemm_ncopy_L2_M8_60




dgemm_ncopy_L2_M8_END:
.Ldgemm_ncopy_L2_M8_END:




/*********************************************************************************************/ /*********************************************************************************************/


dgemm_ncopy_L1_BEGIN:
.Ldgemm_ncopy_L1_BEGIN:


tst N, #1 tst N, #1
ble dgemm_ncopy_L999
ble .Ldgemm_ncopy_L999




dgemm_ncopy_L1_M8_BEGIN:
.Ldgemm_ncopy_L1_M8_BEGIN:


mov A01, A00 mov A01, A00


asr I, M, #3 // I = M / 8 asr I, M, #3 // I = M / 8
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L1_M8_40
ble .Ldgemm_ncopy_L1_M8_40


dgemm_ncopy_L1_M8_20:
.Ldgemm_ncopy_L1_M8_20:


COPY8x1 COPY8x1


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L1_M8_20
bne .Ldgemm_ncopy_L1_M8_20




dgemm_ncopy_L1_M8_40:
.Ldgemm_ncopy_L1_M8_40:


and I, M , #7 and I, M , #7
cmp I, #0 cmp I, #0
ble dgemm_ncopy_L1_M8_END
ble .Ldgemm_ncopy_L1_M8_END


dgemm_ncopy_L1_M8_60:
.Ldgemm_ncopy_L1_M8_60:


COPY1x1 COPY1x1


subs I , I , #1 subs I , I , #1
bne dgemm_ncopy_L1_M8_60
bne .Ldgemm_ncopy_L1_M8_60




dgemm_ncopy_L1_M8_END:
.Ldgemm_ncopy_L1_M8_END:


dgemm_ncopy_L999:
.Ldgemm_ncopy_L999:


mov x0, #0 mov x0, #0
RESTORE_REGS RESTORE_REGS


+ 36
- 36
kernel/arm64/dgemm_tcopy_4.S View File

@@ -247,13 +247,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


lsl M4, M, #5 // M4 = M * 4 * SIZE lsl M4, M, #5 // M4 = M * 4 * SIZE


dgemm_tcopy_L4_BEGIN:
.Ldgemm_tcopy_L4_BEGIN:
asr J, M, #2 // J = M / 4 asr J, M, #2 // J = M / 4
cmp J, #0 cmp J, #0
ble dgemm_tcopy_L2_BEGIN
ble .Ldgemm_tcopy_L2_BEGIN


.align 5 .align 5
dgemm_tcopy_L4_M4_BEGIN:
.Ldgemm_tcopy_L4_M4_BEGIN:


mov A01, A mov A01, A
add A02, A01, LDA add A02, A01, LDA
@@ -266,51 +266,51 @@ dgemm_tcopy_L4_M4_BEGIN:


asr I, N, #2 // I = N / 4 asr I, N, #2 // I = N / 4
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L4_M4_40
ble .Ldgemm_tcopy_L4_M4_40


.align 5 .align 5
dgemm_tcopy_L4_M4_20:
.Ldgemm_tcopy_L4_M4_20:


COPY4x4 COPY4x4


subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L4_M4_20
bne .Ldgemm_tcopy_L4_M4_20




dgemm_tcopy_L4_M4_40:
.Ldgemm_tcopy_L4_M4_40:


tst N , #2 tst N , #2
ble dgemm_tcopy_L4_M4_60
ble .Ldgemm_tcopy_L4_M4_60


COPY2x4 COPY2x4




dgemm_tcopy_L4_M4_60:
.Ldgemm_tcopy_L4_M4_60:


tst N, #1 tst N, #1
ble dgemm_tcopy_L4_M4_END
ble .Ldgemm_tcopy_L4_M4_END


COPY1x4 COPY1x4




dgemm_tcopy_L4_M4_END:
.Ldgemm_tcopy_L4_M4_END:


subs J , J, #1 // j-- subs J , J, #1 // j--
bne dgemm_tcopy_L4_M4_BEGIN
bne .Ldgemm_tcopy_L4_M4_BEGIN






/*********************************************************************************************/ /*********************************************************************************************/


dgemm_tcopy_L2_BEGIN:
.Ldgemm_tcopy_L2_BEGIN:


tst M, #3 tst M, #3
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999


tst M, #2 tst M, #2
ble dgemm_tcopy_L1_BEGIN
ble .Ldgemm_tcopy_L1_BEGIN


dgemm_tcopy_L2_M4_BEGIN:
.Ldgemm_tcopy_L2_M4_BEGIN:
mov A01, A mov A01, A
add A02, A01, LDA add A02, A01, LDA
add A, A02, LDA add A, A02, LDA
@@ -320,80 +320,80 @@ dgemm_tcopy_L2_M4_BEGIN:


asr I, N, #2 // I = N / 4 asr I, N, #2 // I = N / 4
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L2_M4_40
ble .Ldgemm_tcopy_L2_M4_40


.align 5 .align 5
dgemm_tcopy_L2_M4_20:
.Ldgemm_tcopy_L2_M4_20:


COPY4x2 COPY4x2


subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L2_M4_20
bne .Ldgemm_tcopy_L2_M4_20




dgemm_tcopy_L2_M4_40:
.Ldgemm_tcopy_L2_M4_40:


tst N , #2 tst N , #2
ble dgemm_tcopy_L2_M4_60
ble .Ldgemm_tcopy_L2_M4_60


COPY2x2 COPY2x2


dgemm_tcopy_L2_M4_60:
.Ldgemm_tcopy_L2_M4_60:


tst N , #1 tst N , #1
ble dgemm_tcopy_L2_M4_END
ble .Ldgemm_tcopy_L2_M4_END


COPY1x2 COPY1x2




dgemm_tcopy_L2_M4_END:
.Ldgemm_tcopy_L2_M4_END:




/*********************************************************************************************/ /*********************************************************************************************/


dgemm_tcopy_L1_BEGIN:
.Ldgemm_tcopy_L1_BEGIN:


tst M, #1 tst M, #1
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999




dgemm_tcopy_L1_M4_BEGIN:
.Ldgemm_tcopy_L1_M4_BEGIN:


mov A01, A // A01 = A mov A01, A // A01 = A
mov B01, B mov B01, B


asr I, N, #2 // I = M / 4 asr I, N, #2 // I = M / 4
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L1_M4_40
ble .Ldgemm_tcopy_L1_M4_40


.align 5 .align 5
dgemm_tcopy_L1_M4_20:
.Ldgemm_tcopy_L1_M4_20:


COPY4x1 COPY4x1


subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L1_M4_20
bne .Ldgemm_tcopy_L1_M4_20




dgemm_tcopy_L1_M4_40:
.Ldgemm_tcopy_L1_M4_40:


tst N , #2 tst N , #2
ble dgemm_tcopy_L1_M4_60
ble .Ldgemm_tcopy_L1_M4_60


COPY2x1 COPY2x1


dgemm_tcopy_L1_M4_60:
.Ldgemm_tcopy_L1_M4_60:


tst N , #1 tst N , #1
ble dgemm_tcopy_L1_M4_END
ble .Ldgemm_tcopy_L1_M4_END


COPY1x1 COPY1x1




dgemm_tcopy_L1_M4_END:
.Ldgemm_tcopy_L1_M4_END:




dgemm_tcopy_L999:
.Ldgemm_tcopy_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
RESTORE_REGS RESTORE_REGS
ret ret


+ 56
- 56
kernel/arm64/dgemm_tcopy_8.S View File

@@ -454,13 +454,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


lsl M8, M, #6 // M8 = M * 8 * SIZE lsl M8, M, #6 // M8 = M * 8 * SIZE


dgemm_tcopy_L8_BEGIN:
.Ldgemm_tcopy_L8_BEGIN:
asr J, M, #3 // J = M / 4 asr J, M, #3 // J = M / 4
cmp J, #0 cmp J, #0
ble dgemm_tcopy_L4_BEGIN
ble .Ldgemm_tcopy_L4_BEGIN


.align 5 .align 5
dgemm_tcopy_L8_M8_BEGIN:
.Ldgemm_tcopy_L8_M8_BEGIN:


mov A01, A mov A01, A
add A02, A01, LDA add A02, A01, LDA
@@ -477,53 +477,53 @@ dgemm_tcopy_L8_M8_BEGIN:


asr I, N, #3 // I = N / 8 asr I, N, #3 // I = N / 8
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L8_M8_40
ble .Ldgemm_tcopy_L8_M8_40


.align 5 .align 5
dgemm_tcopy_L8_M8_20:
.Ldgemm_tcopy_L8_M8_20:


COPY8x8 COPY8x8


subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L8_M8_20
bne .Ldgemm_tcopy_L8_M8_20


dgemm_tcopy_L8_M8_40:
.Ldgemm_tcopy_L8_M8_40:
tst N , #4 tst N , #4
ble dgemm_tcopy_L8_M8_60
ble .Ldgemm_tcopy_L8_M8_60


COPY4x8 COPY4x8


dgemm_tcopy_L8_M8_60:
.Ldgemm_tcopy_L8_M8_60:


tst N , #2 tst N , #2
ble dgemm_tcopy_L8_M8_80
ble .Ldgemm_tcopy_L8_M8_80


COPY2x8 COPY2x8




dgemm_tcopy_L8_M8_80:
.Ldgemm_tcopy_L8_M8_80:


tst N, #1 tst N, #1
ble dgemm_tcopy_L8_M8_END
ble .Ldgemm_tcopy_L8_M8_END


COPY1x8 COPY1x8




dgemm_tcopy_L8_M8_END:
.Ldgemm_tcopy_L8_M8_END:


subs J , J, #1 // j-- subs J , J, #1 // j--
bne dgemm_tcopy_L8_M8_BEGIN
bne .Ldgemm_tcopy_L8_M8_BEGIN


/*********************************************************************************************/ /*********************************************************************************************/


dgemm_tcopy_L4_BEGIN:
.Ldgemm_tcopy_L4_BEGIN:
tst M, #7 tst M, #7
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999


tst M, #4 tst M, #4
ble dgemm_tcopy_L2_BEGIN
ble .Ldgemm_tcopy_L2_BEGIN


dgemm_tcopy_L4_M8_BEGIN:
.Ldgemm_tcopy_L4_M8_BEGIN:


mov A01, A mov A01, A
add A02, A01, LDA add A02, A01, LDA
@@ -536,51 +536,51 @@ dgemm_tcopy_L4_M8_BEGIN:


asr I, N, #3 // I = N / 8 asr I, N, #3 // I = N / 8
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L4_M8_40
ble .Ldgemm_tcopy_L4_M8_40


.align 5 .align 5
dgemm_tcopy_L4_M8_20:
.Ldgemm_tcopy_L4_M8_20:


COPY8x4 COPY8x4


subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L4_M8_20
bne .Ldgemm_tcopy_L4_M8_20


dgemm_tcopy_L4_M8_40:
.Ldgemm_tcopy_L4_M8_40:
tst N , #4 tst N , #4
ble dgemm_tcopy_L4_M8_60
ble .Ldgemm_tcopy_L4_M8_60


COPY4x4 COPY4x4


dgemm_tcopy_L4_M8_60:
.Ldgemm_tcopy_L4_M8_60:


tst N , #2 tst N , #2
ble dgemm_tcopy_L4_M8_80
ble .Ldgemm_tcopy_L4_M8_80


COPY2x4 COPY2x4




dgemm_tcopy_L4_M8_80:
.Ldgemm_tcopy_L4_M8_80:


tst N, #1 tst N, #1
ble dgemm_tcopy_L4_M8_END
ble .Ldgemm_tcopy_L4_M8_END


COPY1x4 COPY1x4




dgemm_tcopy_L4_M8_END:
.Ldgemm_tcopy_L4_M8_END:


/*********************************************************************************************/ /*********************************************************************************************/


dgemm_tcopy_L2_BEGIN:
.Ldgemm_tcopy_L2_BEGIN:


tst M, #3 tst M, #3
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999


tst M, #2 tst M, #2
ble dgemm_tcopy_L1_BEGIN
ble .Ldgemm_tcopy_L1_BEGIN


dgemm_tcopy_L2_M8_BEGIN:
.Ldgemm_tcopy_L2_M8_BEGIN:
mov A01, A mov A01, A
add A02, A01, LDA add A02, A01, LDA
add A, A02, LDA add A, A02, LDA
@@ -590,90 +590,90 @@ dgemm_tcopy_L2_M8_BEGIN:


asr I, N, #3 // I = N / 8 asr I, N, #3 // I = N / 8
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L2_M8_40
ble .Ldgemm_tcopy_L2_M8_40


.align 5 .align 5
dgemm_tcopy_L2_M8_20:
.Ldgemm_tcopy_L2_M8_20:


COPY8x2 COPY8x2


subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L2_M8_20
bne .Ldgemm_tcopy_L2_M8_20


dgemm_tcopy_L2_M8_40:
.Ldgemm_tcopy_L2_M8_40:
tst N , #4 tst N , #4
ble dgemm_tcopy_L2_M8_60
ble .Ldgemm_tcopy_L2_M8_60


COPY4x2 COPY4x2


dgemm_tcopy_L2_M8_60:
.Ldgemm_tcopy_L2_M8_60:


tst N , #2 tst N , #2
ble dgemm_tcopy_L2_M8_80
ble .Ldgemm_tcopy_L2_M8_80


COPY2x2 COPY2x2


dgemm_tcopy_L2_M8_80:
.Ldgemm_tcopy_L2_M8_80:


tst N , #1 tst N , #1
ble dgemm_tcopy_L2_M8_END
ble .Ldgemm_tcopy_L2_M8_END


COPY1x2 COPY1x2




dgemm_tcopy_L2_M8_END:
.Ldgemm_tcopy_L2_M8_END:




/*********************************************************************************************/ /*********************************************************************************************/


dgemm_tcopy_L1_BEGIN:
.Ldgemm_tcopy_L1_BEGIN:


tst M, #1 tst M, #1
ble dgemm_tcopy_L999
ble .Ldgemm_tcopy_L999




dgemm_tcopy_L1_M8_BEGIN:
.Ldgemm_tcopy_L1_M8_BEGIN:


mov A01, A // A01 = A mov A01, A // A01 = A
mov B01, B mov B01, B


asr I, N, #3 // I = M / 8 asr I, N, #3 // I = M / 8
cmp I, #0 cmp I, #0
ble dgemm_tcopy_L1_M8_40
ble .Ldgemm_tcopy_L1_M8_40


.align 5 .align 5
dgemm_tcopy_L1_M8_20:
.Ldgemm_tcopy_L1_M8_20:


COPY8x1 COPY8x1


subs I , I , #1 subs I , I , #1
bne dgemm_tcopy_L1_M8_20
bne .Ldgemm_tcopy_L1_M8_20


dgemm_tcopy_L1_M8_40:
.Ldgemm_tcopy_L1_M8_40:
tst N , #4 tst N , #4
ble dgemm_tcopy_L1_M8_60
ble .Ldgemm_tcopy_L1_M8_60


COPY4x1 COPY4x1


dgemm_tcopy_L1_M8_60:
.Ldgemm_tcopy_L1_M8_60:


tst N , #2 tst N , #2
ble dgemm_tcopy_L1_M8_80
ble .Ldgemm_tcopy_L1_M8_80


COPY2x1 COPY2x1


dgemm_tcopy_L1_M8_80:
.Ldgemm_tcopy_L1_M8_80:


tst N , #1 tst N , #1
ble dgemm_tcopy_L1_M8_END
ble .Ldgemm_tcopy_L1_M8_END


COPY1x1 COPY1x1




dgemm_tcopy_L1_M8_END:
.Ldgemm_tcopy_L1_M8_END:




dgemm_tcopy_L999:
.Ldgemm_tcopy_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
RESTORE_REGS RESTORE_REGS
ret ret


+ 20
- 20
kernel/arm64/dot.S View File

@@ -154,51 +154,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif


cmp N, xzr cmp N, xzr
ble dot_kernel_L999
ble .Ldot_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne dot_kernel_S_BEGIN
bne .Ldot_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne dot_kernel_S_BEGIN
bne .Ldot_kernel_S_BEGIN


dot_kernel_F_BEGIN:
.Ldot_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq dot_kernel_F1
beq .Ldot_kernel_F1


dot_kernel_F4:
.Ldot_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne dot_kernel_F4
bne .Ldot_kernel_F4


KERNEL_F4_FINALIZE KERNEL_F4_FINALIZE


dot_kernel_F1:
.Ldot_kernel_F1:


ands I, N, #3 ands I, N, #3
ble dot_kernel_L999
ble .Ldot_kernel_L999


dot_kernel_F10:
.Ldot_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne dot_kernel_F10
bne .Ldot_kernel_F10


ret ret


dot_kernel_S_BEGIN:
.Ldot_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble dot_kernel_S1
ble .Ldot_kernel_S1


dot_kernel_S4:
.Ldot_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -206,21 +206,21 @@ dot_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne dot_kernel_S4
bne .Ldot_kernel_S4


dot_kernel_S1:
.Ldot_kernel_S1:


ands I, N, #3 ands I, N, #3
ble dot_kernel_L999
ble .Ldot_kernel_L999


dot_kernel_S10:
.Ldot_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne dot_kernel_S10
bne .Ldot_kernel_S10


dot_kernel_L999:
.Ldot_kernel_L999:


ret ret




+ 129
- 129
kernel/arm64/dtrmm_kernel_4x4.S View File

@@ -549,11 +549,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble dtrmm_kernel_L2_BEGIN
ble .Ldtrmm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L4_BEGIN:
.Ldtrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2


@@ -563,14 +563,14 @@ dtrmm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


dtrmm_kernel_L4_M4_BEGIN:
.Ldtrmm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L4_M2_BEGIN
ble .Ldtrmm_kernel_L4_M2_BEGIN


dtrmm_kernel_L4_M4_20:
.Ldtrmm_kernel_L4_M4_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -591,57 +591,57 @@ dtrmm_kernel_L4_M4_20:


asr counterL , tempK, #1 // L = K / 2 asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M4_32
blt .Ldtrmm_kernel_L4_M4_32


KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble dtrmm_kernel_L4_M4_22a
ble .Ldtrmm_kernel_L4_M4_22a
.align 5 .align 5


dtrmm_kernel_L4_M4_22:
.Ldtrmm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_22
bgt .Ldtrmm_kernel_L4_M4_22




dtrmm_kernel_L4_M4_22a:
.Ldtrmm_kernel_L4_M4_22a:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44


dtrmm_kernel_L4_M4_32:
.Ldtrmm_kernel_L4_M4_32:


tst counterL, #1 tst counterL, #1
ble dtrmm_kernel_L4_M4_40
ble .Ldtrmm_kernel_L4_M4_40


KERNEL4x4_I KERNEL4x4_I


KERNEL4x4_E KERNEL4x4_E


b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44




dtrmm_kernel_L4_M4_40:
.Ldtrmm_kernel_L4_M4_40:


INIT4x4 INIT4x4


dtrmm_kernel_L4_M4_44:
.Ldtrmm_kernel_L4_M4_44:


ands counterL , tempK, #1 ands counterL , tempK, #1
ble dtrmm_kernel_L4_M4_100
ble .Ldtrmm_kernel_L4_M4_100


dtrmm_kernel_L4_M4_46:
.Ldtrmm_kernel_L4_M4_46:


KERNEL4x4_SUB KERNEL4x4_SUB


dtrmm_kernel_L4_M4_100:
.Ldtrmm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


@@ -660,20 +660,20 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L4_M4_END:
.Ldtrmm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dtrmm_kernel_L4_M4_20
bne .Ldtrmm_kernel_L4_M4_20


dtrmm_kernel_L4_M2_BEGIN:
.Ldtrmm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L4_M1_BEGIN
ble .Ldtrmm_kernel_L4_M1_BEGIN


dtrmm_kernel_L4_M2_20:
.Ldtrmm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -697,9 +697,9 @@ dtrmm_kernel_L4_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M2_40
ble .Ldtrmm_kernel_L4_M2_40


dtrmm_kernel_L4_M2_22:
.Ldtrmm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -712,22 +712,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22
bgt .Ldtrmm_kernel_L4_M2_22




dtrmm_kernel_L4_M2_40:
.Ldtrmm_kernel_L4_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M2_100
ble .Ldtrmm_kernel_L4_M2_100


dtrmm_kernel_L4_M2_42:
.Ldtrmm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_42
bgt .Ldtrmm_kernel_L4_M2_42


dtrmm_kernel_L4_M2_100:
.Ldtrmm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


@@ -747,15 +747,15 @@ dtrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


dtrmm_kernel_L4_M2_END:
.Ldtrmm_kernel_L4_M2_END:




dtrmm_kernel_L4_M1_BEGIN:
.Ldtrmm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END


dtrmm_kernel_L4_M1_20:
.Ldtrmm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -779,9 +779,9 @@ dtrmm_kernel_L4_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M1_40
ble .Ldtrmm_kernel_L4_M1_40


dtrmm_kernel_L4_M1_22:
.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -793,22 +793,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22
bgt .Ldtrmm_kernel_L4_M1_22




dtrmm_kernel_L4_M1_40:
.Ldtrmm_kernel_L4_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M1_100
ble .Ldtrmm_kernel_L4_M1_100


dtrmm_kernel_L4_M1_42:
.Ldtrmm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_42
bgt .Ldtrmm_kernel_L4_M1_42


dtrmm_kernel_L4_M1_100:
.Ldtrmm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


@@ -828,7 +828,7 @@ dtrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


dtrmm_kernel_L4_END:
.Ldtrmm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
@@ -838,19 +838,19 @@ dtrmm_kernel_L4_END:
#endif #endif


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L4_BEGIN
bgt .Ldtrmm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dtrmm_kernel_L999 // error, N was less than 4?
ble .Ldtrmm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN
ble .Ldtrmm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -863,14 +863,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A




dtrmm_kernel_L2_M4_BEGIN:
.Ldtrmm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble dtrmm_kernel_L2_M2_BEGIN
ble .Ldtrmm_kernel_L2_M2_BEGIN


dtrmm_kernel_L2_M4_20:
.Ldtrmm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -894,10 +894,10 @@ dtrmm_kernel_L2_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M4_40
ble .Ldtrmm_kernel_L2_M4_40
.align 5 .align 5


dtrmm_kernel_L2_M4_22:
.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -909,22 +909,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22
bgt .Ldtrmm_kernel_L2_M4_22




dtrmm_kernel_L2_M4_40:
.Ldtrmm_kernel_L2_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M4_100
ble .Ldtrmm_kernel_L2_M4_100


dtrmm_kernel_L2_M4_42:
.Ldtrmm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_42
bgt .Ldtrmm_kernel_L2_M4_42


dtrmm_kernel_L2_M4_100:
.Ldtrmm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


@@ -944,22 +944,22 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L2_M4_END:
.Ldtrmm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L2_M4_20
bgt .Ldtrmm_kernel_L2_M4_20




dtrmm_kernel_L2_M2_BEGIN:
.Ldtrmm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L2_M1_BEGIN
ble .Ldtrmm_kernel_L2_M1_BEGIN


dtrmm_kernel_L2_M2_20:
.Ldtrmm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -983,9 +983,9 @@ dtrmm_kernel_L2_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M2_40
ble .Ldtrmm_kernel_L2_M2_40


dtrmm_kernel_L2_M2_22:
.Ldtrmm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -998,22 +998,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22
bgt .Ldtrmm_kernel_L2_M2_22




dtrmm_kernel_L2_M2_40:
.Ldtrmm_kernel_L2_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M2_100
ble .Ldtrmm_kernel_L2_M2_100


dtrmm_kernel_L2_M2_42:
.Ldtrmm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_42
bgt .Ldtrmm_kernel_L2_M2_42


dtrmm_kernel_L2_M2_100:
.Ldtrmm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


@@ -1033,15 +1033,15 @@ dtrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


dtrmm_kernel_L2_M2_END:
.Ldtrmm_kernel_L2_M2_END:




dtrmm_kernel_L2_M1_BEGIN:
.Ldtrmm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END


dtrmm_kernel_L2_M1_20:
.Ldtrmm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1065,9 +1065,9 @@ dtrmm_kernel_L2_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dtrmm_kernel_L2_M1_40
ble .Ldtrmm_kernel_L2_M1_40


dtrmm_kernel_L2_M1_22:
.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1079,22 +1079,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22
bgt .Ldtrmm_kernel_L2_M1_22




dtrmm_kernel_L2_M1_40:
.Ldtrmm_kernel_L2_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M1_100
ble .Ldtrmm_kernel_L2_M1_100


dtrmm_kernel_L2_M1_42:
.Ldtrmm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_42
bgt .Ldtrmm_kernel_L2_M1_42


dtrmm_kernel_L2_M1_100:
.Ldtrmm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


@@ -1114,7 +1114,7 @@ dtrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


dtrmm_kernel_L2_END:
.Ldtrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@@ -1122,11 +1122,11 @@ dtrmm_kernel_L2_END:


/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L1_BEGIN:
.Ldtrmm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dtrmm_kernel_L999 // done
ble .Ldtrmm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1138,14 +1138,14 @@ dtrmm_kernel_L1_BEGIN:


mov pA, origPA // pA = A mov pA, origPA // pA = A


dtrmm_kernel_L1_M4_BEGIN:
.Ldtrmm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L1_M2_BEGIN
ble .Ldtrmm_kernel_L1_M2_BEGIN


dtrmm_kernel_L1_M4_20:
.Ldtrmm_kernel_L1_M4_20:


INIT4x1 INIT4x1


@@ -1169,10 +1169,10 @@ dtrmm_kernel_L1_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M4_40
ble .Ldtrmm_kernel_L1_M4_40
.align 5 .align 5


dtrmm_kernel_L1_M4_22:
.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1184,22 +1184,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22
bgt .Ldtrmm_kernel_L1_M4_22




dtrmm_kernel_L1_M4_40:
.Ldtrmm_kernel_L1_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M4_100
ble .Ldtrmm_kernel_L1_M4_100


dtrmm_kernel_L1_M4_42:
.Ldtrmm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_42
bgt .Ldtrmm_kernel_L1_M4_42


dtrmm_kernel_L1_M4_100:
.Ldtrmm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


@@ -1220,22 +1220,22 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L1_M4_END:
.Ldtrmm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L1_M4_20
bgt .Ldtrmm_kernel_L1_M4_20




dtrmm_kernel_L1_M2_BEGIN:
.Ldtrmm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L1_M1_BEGIN
ble .Ldtrmm_kernel_L1_M1_BEGIN


dtrmm_kernel_L1_M2_20:
.Ldtrmm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1259,9 +1259,9 @@ dtrmm_kernel_L1_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M2_40
ble .Ldtrmm_kernel_L1_M2_40


dtrmm_kernel_L1_M2_22:
.Ldtrmm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1274,22 +1274,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22
bgt .Ldtrmm_kernel_L1_M2_22




dtrmm_kernel_L1_M2_40:
.Ldtrmm_kernel_L1_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M2_100
ble .Ldtrmm_kernel_L1_M2_100


dtrmm_kernel_L1_M2_42:
.Ldtrmm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_42
bgt .Ldtrmm_kernel_L1_M2_42


dtrmm_kernel_L1_M2_100:
.Ldtrmm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


@@ -1309,15 +1309,15 @@ dtrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


dtrmm_kernel_L1_M2_END:
.Ldtrmm_kernel_L1_M2_END:




dtrmm_kernel_L1_M1_BEGIN:
.Ldtrmm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END


dtrmm_kernel_L1_M1_20:
.Ldtrmm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1341,9 +1341,9 @@ dtrmm_kernel_L1_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M1_40
ble .Ldtrmm_kernel_L1_M1_40


dtrmm_kernel_L1_M1_22:
.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1355,30 +1355,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22
bgt .Ldtrmm_kernel_L1_M1_22




dtrmm_kernel_L1_M1_40:
.Ldtrmm_kernel_L1_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M1_100
ble .Ldtrmm_kernel_L1_M1_100


dtrmm_kernel_L1_M1_42:
.Ldtrmm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_42
bgt .Ldtrmm_kernel_L1_M1_42


dtrmm_kernel_L1_M1_100:
.Ldtrmm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




dtrmm_kernel_L1_END:
.Ldtrmm_kernel_L1_END:




dtrmm_kernel_L999:
.Ldtrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 176
- 176
kernel/arm64/dtrmm_kernel_4x8.S View File

@@ -900,11 +900,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #3 // J = J / 8 asr counterJ, counterJ, #3 // J = J / 8
cmp counterJ, #0 cmp counterJ, #0
ble dtrmm_kernel_L4_BEGIN
ble .Ldtrmm_kernel_L4_BEGIN


/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L8_BEGIN:
.Ldtrmm_kernel_L8_BEGIN:


mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #3 add pC, pC, LDC, lsl #3
@@ -915,14 +915,14 @@ dtrmm_kernel_L8_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


dtrmm_kernel_L8_M4_BEGIN:
.Ldtrmm_kernel_L8_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L8_M2_BEGIN
ble .Ldtrmm_kernel_L8_M2_BEGIN


dtrmm_kernel_L8_M4_20:
.Ldtrmm_kernel_L8_M4_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -944,57 +944,57 @@ dtrmm_kernel_L8_M4_20:


asr counterL, tempK, #1 // L = K / 2 asr counterL, tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L8_M4_32
blt .Ldtrmm_kernel_L8_M4_32


KERNEL4x8_I // do one in the K KERNEL4x8_I // do one in the K
KERNEL4x8_M2 // do another in the K KERNEL4x8_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble dtrmm_kernel_L8_M4_22a
ble .Ldtrmm_kernel_L8_M4_22a
.align 5 .align 5


dtrmm_kernel_L8_M4_22:
.Ldtrmm_kernel_L8_M4_22:


KERNEL4x8_M1 KERNEL4x8_M1
KERNEL4x8_M2 KERNEL4x8_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M4_22
bgt .Ldtrmm_kernel_L8_M4_22




dtrmm_kernel_L8_M4_22a:
.Ldtrmm_kernel_L8_M4_22a:


KERNEL4x8_M1 KERNEL4x8_M1
KERNEL4x8_E KERNEL4x8_E


b dtrmm_kernel_L8_M4_44
b .Ldtrmm_kernel_L8_M4_44


dtrmm_kernel_L8_M4_32:
.Ldtrmm_kernel_L8_M4_32:


tst counterL, #1 tst counterL, #1
ble dtrmm_kernel_L8_M4_40
ble .Ldtrmm_kernel_L8_M4_40


KERNEL4x8_I KERNEL4x8_I


KERNEL4x8_E KERNEL4x8_E


b dtrmm_kernel_L8_M4_44
b .Ldtrmm_kernel_L8_M4_44




dtrmm_kernel_L8_M4_40:
.Ldtrmm_kernel_L8_M4_40:


INIT4x8 INIT4x8


dtrmm_kernel_L8_M4_44:
.Ldtrmm_kernel_L8_M4_44:


ands counterL, tempK, #1 ands counterL, tempK, #1
ble dtrmm_kernel_L8_M4_100
ble .Ldtrmm_kernel_L8_M4_100


dtrmm_kernel_L8_M4_46:
.Ldtrmm_kernel_L8_M4_46:


KERNEL4x8_SUB KERNEL4x8_SUB


dtrmm_kernel_L8_M4_100:
.Ldtrmm_kernel_L8_M4_100:


SAVE4x8 SAVE4x8


@@ -1014,20 +1014,20 @@ dtrmm_kernel_L8_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L8_M4_END:
.Ldtrmm_kernel_L8_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dtrmm_kernel_L8_M4_20
bne .Ldtrmm_kernel_L8_M4_20


dtrmm_kernel_L8_M2_BEGIN:
.Ldtrmm_kernel_L8_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L8_END
ble .Ldtrmm_kernel_L8_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L8_M1_BEGIN
ble .Ldtrmm_kernel_L8_M1_BEGIN


dtrmm_kernel_L8_M2_20:
.Ldtrmm_kernel_L8_M2_20:


INIT2x8 INIT2x8


@@ -1051,9 +1051,9 @@ dtrmm_kernel_L8_M2_20:


asr counterL, tempK, #3 // counterL = counterL / 8 asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L8_M2_40
ble .Ldtrmm_kernel_L8_M2_40


dtrmm_kernel_L8_M2_22:
.Ldtrmm_kernel_L8_M2_22:


KERNEL2x8_SUB KERNEL2x8_SUB
KERNEL2x8_SUB KERNEL2x8_SUB
@@ -1066,22 +1066,22 @@ dtrmm_kernel_L8_M2_22:
KERNEL2x8_SUB KERNEL2x8_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M2_22
bgt .Ldtrmm_kernel_L8_M2_22




dtrmm_kernel_L8_M2_40:
.Ldtrmm_kernel_L8_M2_40:


ands counterL, tempK, #7 // counterL = counterL % 8 ands counterL, tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L8_M2_100
ble .Ldtrmm_kernel_L8_M2_100


dtrmm_kernel_L8_M2_42:
.Ldtrmm_kernel_L8_M2_42:


KERNEL2x8_SUB KERNEL2x8_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M2_42
bgt .Ldtrmm_kernel_L8_M2_42


dtrmm_kernel_L8_M2_100:
.Ldtrmm_kernel_L8_M2_100:


SAVE2x8 SAVE2x8


@@ -1102,15 +1102,15 @@ dtrmm_kernel_L8_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


dtrmm_kernel_L8_M2_END:
.Ldtrmm_kernel_L8_M2_END:




dtrmm_kernel_L8_M1_BEGIN:
.Ldtrmm_kernel_L8_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L8_END
ble .Ldtrmm_kernel_L8_END


dtrmm_kernel_L8_M1_20:
.Ldtrmm_kernel_L8_M1_20:


INIT1x8 INIT1x8


@@ -1134,9 +1134,9 @@ dtrmm_kernel_L8_M1_20:


asr counterL, tempK, #3 // counterL = counterL / 8 asr counterL, tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L8_M1_40
ble .Ldtrmm_kernel_L8_M1_40


dtrmm_kernel_L8_M1_22:
.Ldtrmm_kernel_L8_M1_22:
KERNEL1x8_SUB KERNEL1x8_SUB
KERNEL1x8_SUB KERNEL1x8_SUB
KERNEL1x8_SUB KERNEL1x8_SUB
@@ -1148,22 +1148,22 @@ dtrmm_kernel_L8_M1_22:
KERNEL1x8_SUB KERNEL1x8_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M1_22
bgt .Ldtrmm_kernel_L8_M1_22




dtrmm_kernel_L8_M1_40:
.Ldtrmm_kernel_L8_M1_40:


ands counterL, tempK, #7 // counterL = counterL % 8 ands counterL, tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L8_M1_100
ble .Ldtrmm_kernel_L8_M1_100


dtrmm_kernel_L8_M1_42:
.Ldtrmm_kernel_L8_M1_42:


KERNEL1x8_SUB KERNEL1x8_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L8_M1_42
bgt .Ldtrmm_kernel_L8_M1_42


dtrmm_kernel_L8_M1_100:
.Ldtrmm_kernel_L8_M1_100:


SAVE1x8 SAVE1x8


@@ -1183,7 +1183,7 @@ dtrmm_kernel_L8_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


dtrmm_kernel_L8_END:
.Ldtrmm_kernel_L8_END:


lsl temp, origK, #6 lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 8 * 8 add origPB, origPB, temp // B = B + K * 8 * 8
@@ -1193,19 +1193,19 @@ dtrmm_kernel_L8_END:
#endif #endif


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L8_BEGIN
bgt .Ldtrmm_kernel_L8_BEGIN




/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L4_BEGIN:
.Ldtrmm_kernel_L4_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #7 tst counterJ , #7
ble dtrmm_kernel_L999
ble .Ldtrmm_kernel_L999


tst counterJ , #4 tst counterJ , #4
ble dtrmm_kernel_L2_BEGIN
ble .Ldtrmm_kernel_L2_BEGIN


mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2
@@ -1216,14 +1216,14 @@ dtrmm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


dtrmm_kernel_L4_M4_BEGIN:
.Ldtrmm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L4_M2_BEGIN
ble .Ldtrmm_kernel_L4_M2_BEGIN


dtrmm_kernel_L4_M4_20:
.Ldtrmm_kernel_L4_M4_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -1244,57 +1244,57 @@ dtrmm_kernel_L4_M4_20:


asr counterL, tempK, #1 // L = K / 2 asr counterL, tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M4_32
blt .Ldtrmm_kernel_L4_M4_32


KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble dtrmm_kernel_L4_M4_22a
ble .Ldtrmm_kernel_L4_M4_22a
.align 5 .align 5


dtrmm_kernel_L4_M4_22:
.Ldtrmm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_22
bgt .Ldtrmm_kernel_L4_M4_22




dtrmm_kernel_L4_M4_22a:
.Ldtrmm_kernel_L4_M4_22a:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44


dtrmm_kernel_L4_M4_32:
.Ldtrmm_kernel_L4_M4_32:


tst counterL, #1 tst counterL, #1
ble dtrmm_kernel_L4_M4_40
ble .Ldtrmm_kernel_L4_M4_40


KERNEL4x4_I KERNEL4x4_I


KERNEL4x4_E KERNEL4x4_E


b dtrmm_kernel_L4_M4_44
b .Ldtrmm_kernel_L4_M4_44




dtrmm_kernel_L4_M4_40:
.Ldtrmm_kernel_L4_M4_40:


INIT4x4 INIT4x4


dtrmm_kernel_L4_M4_44:
.Ldtrmm_kernel_L4_M4_44:


ands counterL , tempK, #1 ands counterL , tempK, #1
ble dtrmm_kernel_L4_M4_100
ble .Ldtrmm_kernel_L4_M4_100


dtrmm_kernel_L4_M4_46:
.Ldtrmm_kernel_L4_M4_46:


KERNEL4x4_SUB KERNEL4x4_SUB


dtrmm_kernel_L4_M4_100:
.Ldtrmm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1312,20 +1312,20 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L4_M4_END:
.Ldtrmm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dtrmm_kernel_L4_M4_20
bne .Ldtrmm_kernel_L4_M4_20


dtrmm_kernel_L4_M2_BEGIN:
.Ldtrmm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L4_M1_BEGIN
ble .Ldtrmm_kernel_L4_M1_BEGIN


dtrmm_kernel_L4_M2_20:
.Ldtrmm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -1348,9 +1348,9 @@ dtrmm_kernel_L4_M2_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M2_40
ble .Ldtrmm_kernel_L4_M2_40


dtrmm_kernel_L4_M2_22:
.Ldtrmm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1363,22 +1363,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22
bgt .Ldtrmm_kernel_L4_M2_22




dtrmm_kernel_L4_M2_40:
.Ldtrmm_kernel_L4_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M2_100
ble .Ldtrmm_kernel_L4_M2_100


dtrmm_kernel_L4_M2_42:
.Ldtrmm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_42
bgt .Ldtrmm_kernel_L4_M2_42


dtrmm_kernel_L4_M2_100:
.Ldtrmm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


@@ -1397,15 +1397,15 @@ dtrmm_kernel_L4_M2_100:
#if defined(LEFT) #if defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
dtrmm_kernel_L4_M2_END:
.Ldtrmm_kernel_L4_M2_END:




dtrmm_kernel_L4_M1_BEGIN:
.Ldtrmm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END


dtrmm_kernel_L4_M1_20:
.Ldtrmm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -1428,9 +1428,9 @@ dtrmm_kernel_L4_M1_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M1_40
ble .Ldtrmm_kernel_L4_M1_40


dtrmm_kernel_L4_M1_22:
.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1442,22 +1442,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22
bgt .Ldtrmm_kernel_L4_M1_22




dtrmm_kernel_L4_M1_40:
.Ldtrmm_kernel_L4_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M1_100
ble .Ldtrmm_kernel_L4_M1_100


dtrmm_kernel_L4_M1_42:
.Ldtrmm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_42
bgt .Ldtrmm_kernel_L4_M1_42


dtrmm_kernel_L4_M1_100:
.Ldtrmm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


@@ -1476,7 +1476,7 @@ dtrmm_kernel_L4_M1_100:
#if defined(LEFT) #if defined(LEFT)
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif
dtrmm_kernel_L4_END:
.Ldtrmm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
@@ -1486,14 +1486,14 @@ dtrmm_kernel_L4_END:


/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dtrmm_kernel_L999 // error, N was less than 4?
ble .Ldtrmm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN
ble .Ldtrmm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1505,14 +1505,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
mov pA, origPA // pA = A mov pA, origPA // pA = A




dtrmm_kernel_L2_M4_BEGIN:
.Ldtrmm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble dtrmm_kernel_L2_M2_BEGIN
ble .Ldtrmm_kernel_L2_M2_BEGIN


dtrmm_kernel_L2_M4_20:
.Ldtrmm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -1535,10 +1535,10 @@ dtrmm_kernel_L2_M4_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M4_40
ble .Ldtrmm_kernel_L2_M4_40
.align 5 .align 5


dtrmm_kernel_L2_M4_22:
.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1550,22 +1550,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22
bgt .Ldtrmm_kernel_L2_M4_22




dtrmm_kernel_L2_M4_40:
.Ldtrmm_kernel_L2_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M4_100
ble .Ldtrmm_kernel_L2_M4_100


dtrmm_kernel_L2_M4_42:
.Ldtrmm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_42
bgt .Ldtrmm_kernel_L2_M4_42


dtrmm_kernel_L2_M4_100:
.Ldtrmm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1584,22 +1584,22 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L2_M4_END:
.Ldtrmm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L2_M4_20
bgt .Ldtrmm_kernel_L2_M4_20




dtrmm_kernel_L2_M2_BEGIN:
.Ldtrmm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L2_M1_BEGIN
ble .Ldtrmm_kernel_L2_M1_BEGIN


dtrmm_kernel_L2_M2_20:
.Ldtrmm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -1622,9 +1622,9 @@ dtrmm_kernel_L2_M2_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M2_40
ble .Ldtrmm_kernel_L2_M2_40


dtrmm_kernel_L2_M2_22:
.Ldtrmm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1637,22 +1637,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22
bgt .Ldtrmm_kernel_L2_M2_22




dtrmm_kernel_L2_M2_40:
.Ldtrmm_kernel_L2_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M2_100
ble .Ldtrmm_kernel_L2_M2_100


dtrmm_kernel_L2_M2_42:
.Ldtrmm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_42
bgt .Ldtrmm_kernel_L2_M2_42


dtrmm_kernel_L2_M2_100:
.Ldtrmm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


@@ -1671,15 +1671,15 @@ dtrmm_kernel_L2_M2_100:
#if defined(LEFT) #if defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
dtrmm_kernel_L2_M2_END:
.Ldtrmm_kernel_L2_M2_END:




dtrmm_kernel_L2_M1_BEGIN:
.Ldtrmm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END


dtrmm_kernel_L2_M1_20:
.Ldtrmm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1702,9 +1702,9 @@ dtrmm_kernel_L2_M1_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dtrmm_kernel_L2_M1_40
ble .Ldtrmm_kernel_L2_M1_40


dtrmm_kernel_L2_M1_22:
.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1716,22 +1716,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22
bgt .Ldtrmm_kernel_L2_M1_22




dtrmm_kernel_L2_M1_40:
.Ldtrmm_kernel_L2_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M1_100
ble .Ldtrmm_kernel_L2_M1_100


dtrmm_kernel_L2_M1_42:
.Ldtrmm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_42
bgt .Ldtrmm_kernel_L2_M1_42


dtrmm_kernel_L2_M1_100:
.Ldtrmm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


@@ -1750,7 +1750,7 @@ dtrmm_kernel_L2_M1_100:
#if defined(LEFT) #if defined(LEFT)
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif
dtrmm_kernel_L2_END:
.Ldtrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@@ -1758,11 +1758,11 @@ dtrmm_kernel_L2_END:


/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L1_BEGIN:
.Ldtrmm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dtrmm_kernel_L999 // done
ble .Ldtrmm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1773,14 +1773,14 @@ dtrmm_kernel_L1_BEGIN:
#endif #endif
mov pA, origPA // pA = A mov pA, origPA // pA = A


dtrmm_kernel_L1_M4_BEGIN:
.Ldtrmm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L1_M2_BEGIN
ble .Ldtrmm_kernel_L1_M2_BEGIN


dtrmm_kernel_L1_M4_20:
.Ldtrmm_kernel_L1_M4_20:


INIT4x1 INIT4x1


@@ -1802,10 +1802,10 @@ dtrmm_kernel_L1_M4_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M4_40
ble .Ldtrmm_kernel_L1_M4_40
.align 5 .align 5


dtrmm_kernel_L1_M4_22:
.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1817,22 +1817,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22
bgt .Ldtrmm_kernel_L1_M4_22




dtrmm_kernel_L1_M4_40:
.Ldtrmm_kernel_L1_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M4_100
ble .Ldtrmm_kernel_L1_M4_100


dtrmm_kernel_L1_M4_42:
.Ldtrmm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_42
bgt .Ldtrmm_kernel_L1_M4_42


dtrmm_kernel_L1_M4_100:
.Ldtrmm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1851,22 +1851,22 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L1_M4_END:
.Ldtrmm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L1_M4_20
bgt .Ldtrmm_kernel_L1_M4_20




dtrmm_kernel_L1_M2_BEGIN:
.Ldtrmm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L1_M1_BEGIN
ble .Ldtrmm_kernel_L1_M1_BEGIN


dtrmm_kernel_L1_M2_20:
.Ldtrmm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1889,9 +1889,9 @@ dtrmm_kernel_L1_M2_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M2_40
ble .Ldtrmm_kernel_L1_M2_40


dtrmm_kernel_L1_M2_22:
.Ldtrmm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1904,22 +1904,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22
bgt .Ldtrmm_kernel_L1_M2_22




dtrmm_kernel_L1_M2_40:
.Ldtrmm_kernel_L1_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M2_100
ble .Ldtrmm_kernel_L1_M2_100


dtrmm_kernel_L1_M2_42:
.Ldtrmm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_42
bgt .Ldtrmm_kernel_L1_M2_42


dtrmm_kernel_L1_M2_100:
.Ldtrmm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


@@ -1938,15 +1938,15 @@ dtrmm_kernel_L1_M2_100:
#if defined(LEFT) #if defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
dtrmm_kernel_L1_M2_END:
.Ldtrmm_kernel_L1_M2_END:




dtrmm_kernel_L1_M1_BEGIN:
.Ldtrmm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END


dtrmm_kernel_L1_M1_20:
.Ldtrmm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1969,9 +1969,9 @@ dtrmm_kernel_L1_M1_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M1_40
ble .Ldtrmm_kernel_L1_M1_40


dtrmm_kernel_L1_M1_22:
.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1983,30 +1983,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22
bgt .Ldtrmm_kernel_L1_M1_22




dtrmm_kernel_L1_M1_40:
.Ldtrmm_kernel_L1_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M1_100
ble .Ldtrmm_kernel_L1_M1_100


dtrmm_kernel_L1_M1_42:
.Ldtrmm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_42
bgt .Ldtrmm_kernel_L1_M1_42


dtrmm_kernel_L1_M1_100:
.Ldtrmm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




dtrmm_kernel_L1_END:
.Ldtrmm_kernel_L1_END:




dtrmm_kernel_L999:
.Ldtrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 169
- 169
kernel/arm64/dtrmm_kernel_8x4.S View File

@@ -829,11 +829,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble dtrmm_kernel_L2_BEGIN
ble .Ldtrmm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L4_BEGIN:
.Ldtrmm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -847,15 +847,15 @@ dtrmm_kernel_L4_BEGIN:
#endif #endif
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


dtrmm_kernel_L4_M8_BEGIN:
.Ldtrmm_kernel_L4_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L4_M4_BEGIN
ble .Ldtrmm_kernel_L4_M4_BEGIN


.align 5 .align 5
dtrmm_kernel_L4_M8_20:
.Ldtrmm_kernel_L4_M8_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -877,7 +877,7 @@ dtrmm_kernel_L4_M8_20:


asr counterL , tempK, #3 // L = K / 8 asr counterL , tempK, #3 // L = K / 8
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt dtrmm_kernel_L4_M8_32
blt .Ldtrmm_kernel_L4_M8_32


KERNEL8x4_I // do one in the K KERNEL8x4_I // do one in the K
KERNEL8x4_M2 // do another in the K KERNEL8x4_M2 // do another in the K
@@ -889,10 +889,10 @@ dtrmm_kernel_L4_M8_20:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble dtrmm_kernel_L4_M8_22a
ble .Ldtrmm_kernel_L4_M8_22a


.align 5 .align 5
dtrmm_kernel_L4_M8_22:
.Ldtrmm_kernel_L4_M8_22:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@@ -904,10 +904,10 @@ dtrmm_kernel_L4_M8_22:
KERNEL8x4_M2 KERNEL8x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M8_22
bgt .Ldtrmm_kernel_L4_M8_22


.align 5 .align 5
dtrmm_kernel_L4_M8_22a:
.Ldtrmm_kernel_L4_M8_22a:


KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_M2 KERNEL8x4_M2
@@ -918,13 +918,13 @@ dtrmm_kernel_L4_M8_22a:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b dtrmm_kernel_L4_M8_44
b .Ldtrmm_kernel_L4_M8_44


.align 5 .align 5
dtrmm_kernel_L4_M8_32:
.Ldtrmm_kernel_L4_M8_32:


tst counterL, #1 tst counterL, #1
ble dtrmm_kernel_L4_M8_40
ble .Ldtrmm_kernel_L4_M8_40


KERNEL8x4_I KERNEL8x4_I
KERNEL8x4_M2 KERNEL8x4_M2
@@ -935,26 +935,26 @@ dtrmm_kernel_L4_M8_32:
KERNEL8x4_M1 KERNEL8x4_M1
KERNEL8x4_E KERNEL8x4_E


b dtrmm_kernel_L4_M8_44
b .Ldtrmm_kernel_L4_M8_44


dtrmm_kernel_L4_M8_40:
.Ldtrmm_kernel_L4_M8_40:


INIT8x4 INIT8x4


dtrmm_kernel_L4_M8_44:
.Ldtrmm_kernel_L4_M8_44:


ands counterL , tempK, #7 ands counterL , tempK, #7
ble dtrmm_kernel_L4_M8_100
ble .Ldtrmm_kernel_L4_M8_100


.align 5 .align 5
dtrmm_kernel_L4_M8_46:
.Ldtrmm_kernel_L4_M8_46:


KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne dtrmm_kernel_L4_M8_46
bne .Ldtrmm_kernel_L4_M8_46


dtrmm_kernel_L4_M8_100:
.Ldtrmm_kernel_L4_M8_100:


SAVE8x4 SAVE8x4


@@ -977,20 +977,20 @@ dtrmm_kernel_L4_M8_100:
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]


dtrmm_kernel_L4_M8_END:
.Ldtrmm_kernel_L4_M8_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne dtrmm_kernel_L4_M8_20
bne .Ldtrmm_kernel_L4_M8_20


dtrmm_kernel_L4_M4_BEGIN:
.Ldtrmm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble dtrmm_kernel_L4_M2_BEGIN
ble .Ldtrmm_kernel_L4_M2_BEGIN


dtrmm_kernel_L4_M4_20:
.Ldtrmm_kernel_L4_M4_20:


INIT4x4 INIT4x4


@@ -1013,9 +1013,9 @@ dtrmm_kernel_L4_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M4_40
ble .Ldtrmm_kernel_L4_M4_40


dtrmm_kernel_L4_M4_22:
.Ldtrmm_kernel_L4_M4_22:


KERNEL4x4_SUB KERNEL4x4_SUB
KERNEL4x4_SUB KERNEL4x4_SUB
@@ -1028,22 +1028,22 @@ dtrmm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_22
bgt .Ldtrmm_kernel_L4_M4_22




dtrmm_kernel_L4_M4_40:
.Ldtrmm_kernel_L4_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M4_100
ble .Ldtrmm_kernel_L4_M4_100


dtrmm_kernel_L4_M4_42:
.Ldtrmm_kernel_L4_M4_42:


KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M4_42
bgt .Ldtrmm_kernel_L4_M4_42


dtrmm_kernel_L4_M4_100:
.Ldtrmm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


@@ -1062,19 +1062,19 @@ dtrmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L4_M4_END:
.Ldtrmm_kernel_L4_M4_END:




dtrmm_kernel_L4_M2_BEGIN:
.Ldtrmm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L4_M1_BEGIN
ble .Ldtrmm_kernel_L4_M1_BEGIN


dtrmm_kernel_L4_M2_20:
.Ldtrmm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -1097,9 +1097,9 @@ dtrmm_kernel_L4_M2_20:
#endif #endif
asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M2_40
ble .Ldtrmm_kernel_L4_M2_40


dtrmm_kernel_L4_M2_22:
.Ldtrmm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1112,22 +1112,22 @@ dtrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_22
bgt .Ldtrmm_kernel_L4_M2_22




dtrmm_kernel_L4_M2_40:
.Ldtrmm_kernel_L4_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M2_100
ble .Ldtrmm_kernel_L4_M2_100


dtrmm_kernel_L4_M2_42:
.Ldtrmm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M2_42
bgt .Ldtrmm_kernel_L4_M2_42


dtrmm_kernel_L4_M2_100:
.Ldtrmm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


@@ -1147,15 +1147,15 @@ dtrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


dtrmm_kernel_L4_M2_END:
.Ldtrmm_kernel_L4_M2_END:




dtrmm_kernel_L4_M1_BEGIN:
.Ldtrmm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L4_END
ble .Ldtrmm_kernel_L4_END


dtrmm_kernel_L4_M1_20:
.Ldtrmm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -1179,9 +1179,9 @@ dtrmm_kernel_L4_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L4_M1_40
ble .Ldtrmm_kernel_L4_M1_40


dtrmm_kernel_L4_M1_22:
.Ldtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1193,22 +1193,22 @@ dtrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_22
bgt .Ldtrmm_kernel_L4_M1_22




dtrmm_kernel_L4_M1_40:
.Ldtrmm_kernel_L4_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L4_M1_100
ble .Ldtrmm_kernel_L4_M1_100


dtrmm_kernel_L4_M1_42:
.Ldtrmm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L4_M1_42
bgt .Ldtrmm_kernel_L4_M1_42


dtrmm_kernel_L4_M1_100:
.Ldtrmm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


@@ -1228,7 +1228,7 @@ dtrmm_kernel_L4_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


dtrmm_kernel_L4_END:
.Ldtrmm_kernel_L4_END:


lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 4 * 8 add origPB, origPB, temp // B = B + K * 4 * 8
@@ -1238,19 +1238,19 @@ dtrmm_kernel_L4_END:
#endif #endif


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt dtrmm_kernel_L4_BEGIN
bgt .Ldtrmm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Ldtrmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble dtrmm_kernel_L999 // error, N was less than 4?
ble .Ldtrmm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble dtrmm_kernel_L1_BEGIN
ble .Ldtrmm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1261,14 +1261,14 @@ dtrmm_kernel_L2_BEGIN: // less than 2 left in N direction
#endif #endif
mov pA, origPA // pA = A mov pA, origPA // pA = A


dtrmm_kernel_L2_M8_BEGIN:
.Ldtrmm_kernel_L2_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L2_M4_BEGIN
ble .Ldtrmm_kernel_L2_M4_BEGIN


dtrmm_kernel_L2_M8_20:
.Ldtrmm_kernel_L2_M8_20:


INIT8x2 INIT8x2


@@ -1292,10 +1292,10 @@ dtrmm_kernel_L2_M8_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M8_40
ble .Ldtrmm_kernel_L2_M8_40
.align 5 .align 5


dtrmm_kernel_L2_M8_22:
.Ldtrmm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
KERNEL8x2_SUB KERNEL8x2_SUB
@@ -1307,22 +1307,22 @@ dtrmm_kernel_L2_M8_22:
KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M8_22
bgt .Ldtrmm_kernel_L2_M8_22




dtrmm_kernel_L2_M8_40:
.Ldtrmm_kernel_L2_M8_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M8_100
ble .Ldtrmm_kernel_L2_M8_100


dtrmm_kernel_L2_M8_42:
.Ldtrmm_kernel_L2_M8_42:


KERNEL8x2_SUB KERNEL8x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M8_42
bgt .Ldtrmm_kernel_L2_M8_42


dtrmm_kernel_L2_M8_100:
.Ldtrmm_kernel_L2_M8_100:


SAVE8x2 SAVE8x2


@@ -1342,21 +1342,21 @@ dtrmm_kernel_L2_M8_100:
add tempOffset, tempOffset, #8 add tempOffset, tempOffset, #8
#endif #endif


dtrmm_kernel_L2_M8_END:
.Ldtrmm_kernel_L2_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L2_M8_20
bgt .Ldtrmm_kernel_L2_M8_20


dtrmm_kernel_L2_M4_BEGIN:
.Ldtrmm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble dtrmm_kernel_L2_M2_BEGIN
ble .Ldtrmm_kernel_L2_M2_BEGIN


dtrmm_kernel_L2_M4_20:
.Ldtrmm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -1380,10 +1380,10 @@ dtrmm_kernel_L2_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M4_40
ble .Ldtrmm_kernel_L2_M4_40
.align 5 .align 5


dtrmm_kernel_L2_M4_22:
.Ldtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1395,22 +1395,22 @@ dtrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_22
bgt .Ldtrmm_kernel_L2_M4_22




dtrmm_kernel_L2_M4_40:
.Ldtrmm_kernel_L2_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M4_100
ble .Ldtrmm_kernel_L2_M4_100


dtrmm_kernel_L2_M4_42:
.Ldtrmm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M4_42
bgt .Ldtrmm_kernel_L2_M4_42


dtrmm_kernel_L2_M4_100:
.Ldtrmm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


@@ -1430,19 +1430,19 @@ dtrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L2_M4_END:
.Ldtrmm_kernel_L2_M4_END:




dtrmm_kernel_L2_M2_BEGIN:
.Ldtrmm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L2_M1_BEGIN
ble .Ldtrmm_kernel_L2_M1_BEGIN


dtrmm_kernel_L2_M2_20:
.Ldtrmm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -1466,9 +1466,9 @@ dtrmm_kernel_L2_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble dtrmm_kernel_L2_M2_40
ble .Ldtrmm_kernel_L2_M2_40


dtrmm_kernel_L2_M2_22:
.Ldtrmm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1481,22 +1481,22 @@ dtrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_22
bgt .Ldtrmm_kernel_L2_M2_22




dtrmm_kernel_L2_M2_40:
.Ldtrmm_kernel_L2_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M2_100
ble .Ldtrmm_kernel_L2_M2_100


dtrmm_kernel_L2_M2_42:
.Ldtrmm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M2_42
bgt .Ldtrmm_kernel_L2_M2_42


dtrmm_kernel_L2_M2_100:
.Ldtrmm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


@@ -1516,15 +1516,15 @@ dtrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


dtrmm_kernel_L2_M2_END:
.Ldtrmm_kernel_L2_M2_END:




dtrmm_kernel_L2_M1_BEGIN:
.Ldtrmm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L2_END
ble .Ldtrmm_kernel_L2_END


dtrmm_kernel_L2_M1_20:
.Ldtrmm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1548,9 +1548,9 @@ dtrmm_kernel_L2_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble dtrmm_kernel_L2_M1_40
ble .Ldtrmm_kernel_L2_M1_40


dtrmm_kernel_L2_M1_22:
.Ldtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1562,22 +1562,22 @@ dtrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_22
bgt .Ldtrmm_kernel_L2_M1_22




dtrmm_kernel_L2_M1_40:
.Ldtrmm_kernel_L2_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L2_M1_100
ble .Ldtrmm_kernel_L2_M1_100


dtrmm_kernel_L2_M1_42:
.Ldtrmm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L2_M1_42
bgt .Ldtrmm_kernel_L2_M1_42


dtrmm_kernel_L2_M1_100:
.Ldtrmm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


@@ -1597,7 +1597,7 @@ dtrmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


dtrmm_kernel_L2_END:
.Ldtrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@@ -1605,11 +1605,11 @@ dtrmm_kernel_L2_END:


/******************************************************************************/ /******************************************************************************/


dtrmm_kernel_L1_BEGIN:
.Ldtrmm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble dtrmm_kernel_L999 // done
ble .Ldtrmm_kernel_L999 // done


mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC , pC , LDC // Update pC to point to next add pC , pC , LDC // Update pC to point to next
@@ -1619,14 +1619,14 @@ dtrmm_kernel_L1_BEGIN:
#endif #endif
mov pA, origPA // pA = A mov pA, origPA // pA = A


dtrmm_kernel_L1_M8_BEGIN:
.Ldtrmm_kernel_L1_M8_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #3 // counterI = counterI / 8 asr counterI, counterI, #3 // counterI = counterI / 8
cmp counterI, #0 cmp counterI, #0
ble dtrmm_kernel_L1_M4_BEGIN
ble .Ldtrmm_kernel_L1_M4_BEGIN


dtrmm_kernel_L1_M8_20:
.Ldtrmm_kernel_L1_M8_20:


INIT8x1 INIT8x1


@@ -1650,10 +1650,10 @@ dtrmm_kernel_L1_M8_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M8_40
ble .Ldtrmm_kernel_L1_M8_40
.align 5 .align 5


dtrmm_kernel_L1_M8_22:
.Ldtrmm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
KERNEL8x1_SUB KERNEL8x1_SUB
@@ -1665,22 +1665,22 @@ dtrmm_kernel_L1_M8_22:
KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M8_22
bgt .Ldtrmm_kernel_L1_M8_22




dtrmm_kernel_L1_M8_40:
.Ldtrmm_kernel_L1_M8_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M8_100
ble .Ldtrmm_kernel_L1_M8_100


dtrmm_kernel_L1_M8_42:
.Ldtrmm_kernel_L1_M8_42:


KERNEL8x1_SUB KERNEL8x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M8_42
bgt .Ldtrmm_kernel_L1_M8_42


dtrmm_kernel_L1_M8_100:
.Ldtrmm_kernel_L1_M8_100:


SAVE8x1 SAVE8x1


@@ -1700,21 +1700,21 @@ dtrmm_kernel_L1_M8_100:
add tempOffset, tempOffset, #8 add tempOffset, tempOffset, #8
#endif #endif


dtrmm_kernel_L1_M8_END:
.Ldtrmm_kernel_L1_M8_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt dtrmm_kernel_L1_M8_20
bgt .Ldtrmm_kernel_L1_M8_20


dtrmm_kernel_L1_M4_BEGIN:
.Ldtrmm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END


tst counterI, #4 // counterI = counterI / 2 tst counterI, #4 // counterI = counterI / 2
ble dtrmm_kernel_L1_M2_BEGIN
ble .Ldtrmm_kernel_L1_M2_BEGIN


dtrmm_kernel_L1_M4_20:
.Ldtrmm_kernel_L1_M4_20:


INIT4x1 INIT4x1


@@ -1737,10 +1737,10 @@ dtrmm_kernel_L1_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M4_40
ble .Ldtrmm_kernel_L1_M4_40
.align 5 .align 5


dtrmm_kernel_L1_M4_22:
.Ldtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1752,22 +1752,22 @@ dtrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_22
bgt .Ldtrmm_kernel_L1_M4_22




dtrmm_kernel_L1_M4_40:
.Ldtrmm_kernel_L1_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M4_100
ble .Ldtrmm_kernel_L1_M4_100


dtrmm_kernel_L1_M4_42:
.Ldtrmm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M4_42
bgt .Ldtrmm_kernel_L1_M4_42


dtrmm_kernel_L1_M4_100:
.Ldtrmm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


@@ -1787,18 +1787,18 @@ dtrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


dtrmm_kernel_L1_M4_END:
.Ldtrmm_kernel_L1_M4_END:


dtrmm_kernel_L1_M2_BEGIN:
.Ldtrmm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble dtrmm_kernel_L1_M1_BEGIN
ble .Ldtrmm_kernel_L1_M1_BEGIN


dtrmm_kernel_L1_M2_20:
.Ldtrmm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1822,9 +1822,9 @@ dtrmm_kernel_L1_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M2_40
ble .Ldtrmm_kernel_L1_M2_40


dtrmm_kernel_L1_M2_22:
.Ldtrmm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1837,22 +1837,22 @@ dtrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_22
bgt .Ldtrmm_kernel_L1_M2_22




dtrmm_kernel_L1_M2_40:
.Ldtrmm_kernel_L1_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M2_100
ble .Ldtrmm_kernel_L1_M2_100


dtrmm_kernel_L1_M2_42:
.Ldtrmm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M2_42
bgt .Ldtrmm_kernel_L1_M2_42


dtrmm_kernel_L1_M2_100:
.Ldtrmm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


@@ -1872,15 +1872,15 @@ dtrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


dtrmm_kernel_L1_M2_END:
.Ldtrmm_kernel_L1_M2_END:




dtrmm_kernel_L1_M1_BEGIN:
.Ldtrmm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble dtrmm_kernel_L1_END
ble .Ldtrmm_kernel_L1_END


dtrmm_kernel_L1_M1_20:
.Ldtrmm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1904,9 +1904,9 @@ dtrmm_kernel_L1_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble dtrmm_kernel_L1_M1_40
ble .Ldtrmm_kernel_L1_M1_40


dtrmm_kernel_L1_M1_22:
.Ldtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1918,30 +1918,30 @@ dtrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_22
bgt .Ldtrmm_kernel_L1_M1_22




dtrmm_kernel_L1_M1_40:
.Ldtrmm_kernel_L1_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble dtrmm_kernel_L1_M1_100
ble .Ldtrmm_kernel_L1_M1_100


dtrmm_kernel_L1_M1_42:
.Ldtrmm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt dtrmm_kernel_L1_M1_42
bgt .Ldtrmm_kernel_L1_M1_42


dtrmm_kernel_L1_M1_100:
.Ldtrmm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




dtrmm_kernel_L1_END:
.Ldtrmm_kernel_L1_END:




dtrmm_kernel_L999:
.Ldtrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 31
- 31
kernel/arm64/gemv_n.S View File

@@ -203,18 +203,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS SAVE_REGS


cmp N, xzr cmp N, xzr
ble gemv_n_kernel_L999
ble .Lgemv_n_kernel_L999
cmp M, xzr cmp M, xzr
ble gemv_n_kernel_L999
ble .Lgemv_n_kernel_L999


lsl LDA, LDA, #SHZ lsl LDA, LDA, #SHZ
lsl INC_X, INC_X, #SHZ lsl INC_X, INC_X, #SHZ
mov J, N mov J, N


cmp INC_Y, #1 cmp INC_Y, #1
bne gemv_n_kernel_S_BEGIN
bne .Lgemv_n_kernel_S_BEGIN


gemv_n_kernel_F_LOOP:
.Lgemv_n_kernel_F_LOOP:


ld1 TEMPV, [X], INC_X ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP fmul TEMP, ALPHA, TEMP
@@ -229,57 +229,57 @@ gemv_n_kernel_F_LOOP:
mov Y_IPTR, Y mov Y_IPTR, Y
mov Y_OPTR, Y mov Y_OPTR, Y


gemv_n_kernel_F32:
.Lgemv_n_kernel_F32:


asr I, M, #5 asr I, M, #5
cmp I, xzr cmp I, xzr
beq gemv_n_kernel_F4
beq .Lgemv_n_kernel_F4


gemv_n_kernel_F320:
.Lgemv_n_kernel_F320:


KERNEL_F16 KERNEL_F16
KERNEL_F16 KERNEL_F16


subs I, I, #1 subs I, I, #1
bne gemv_n_kernel_F320
bne .Lgemv_n_kernel_F320


gemv_n_kernel_F4:
.Lgemv_n_kernel_F4:
ands I, M, #31 ands I, M, #31
asr I, I, #2 asr I, I, #2
cmp I, xzr cmp I, xzr
beq gemv_n_kernel_F1
beq .Lgemv_n_kernel_F1


gemv_n_kernel_F40:
.Lgemv_n_kernel_F40:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne gemv_n_kernel_F40
bne .Lgemv_n_kernel_F40


gemv_n_kernel_F1:
.Lgemv_n_kernel_F1:
ands I, M, #3 ands I, M, #3
ble gemv_n_kernel_F_END
ble .Lgemv_n_kernel_F_END


gemv_n_kernel_F10:
.Lgemv_n_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne gemv_n_kernel_F10
bne .Lgemv_n_kernel_F10


gemv_n_kernel_F_END:
.Lgemv_n_kernel_F_END:


add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne gemv_n_kernel_F_LOOP
bne .Lgemv_n_kernel_F_LOOP


b gemv_n_kernel_L999
b .Lgemv_n_kernel_L999


gemv_n_kernel_S_BEGIN:
.Lgemv_n_kernel_S_BEGIN:


INIT_S INIT_S


gemv_n_kernel_S_LOOP:
.Lgemv_n_kernel_S_LOOP:


ld1 TEMPV, [X], INC_X ld1 TEMPV, [X], INC_X
fmul TEMP, ALPHA, TEMP fmul TEMP, ALPHA, TEMP
@@ -288,9 +288,9 @@ gemv_n_kernel_S_LOOP:


asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
ble gemv_n_kernel_S1
ble .Lgemv_n_kernel_S1


gemv_n_kernel_S4:
.Lgemv_n_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -298,27 +298,27 @@ gemv_n_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne gemv_n_kernel_S4
bne .Lgemv_n_kernel_S4


gemv_n_kernel_S1:
.Lgemv_n_kernel_S1:


ands I, M, #3 ands I, M, #3
ble gemv_n_kernel_S_END
ble .Lgemv_n_kernel_S_END


gemv_n_kernel_S10:
.Lgemv_n_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne gemv_n_kernel_S10
bne .Lgemv_n_kernel_S10


gemv_n_kernel_S_END:
.Lgemv_n_kernel_S_END:


add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne gemv_n_kernel_S_LOOP
bne .Lgemv_n_kernel_S_LOOP


gemv_n_kernel_L999:
.Lgemv_n_kernel_L999:


mov w0, wzr mov w0, wzr




+ 31
- 31
kernel/arm64/gemv_t.S View File

@@ -233,18 +233,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS SAVE_REGS


cmp N, xzr cmp N, xzr
ble gemv_t_kernel_L999
ble .Lgemv_t_kernel_L999
cmp M, xzr cmp M, xzr
ble gemv_t_kernel_L999
ble .Lgemv_t_kernel_L999


lsl LDA, LDA, #SHZ lsl LDA, LDA, #SHZ
lsl INC_Y, INC_Y, #SHZ lsl INC_Y, INC_Y, #SHZ
mov J, N mov J, N


cmp INC_X, #1 cmp INC_X, #1
bne gemv_t_kernel_S_BEGIN
bne .Lgemv_t_kernel_S_BEGIN


gemv_t_kernel_F_LOOP:
.Lgemv_t_kernel_F_LOOP:


fmov TEMP, REG0 fmov TEMP, REG0
fmov TEMP1, REG0 fmov TEMP1, REG0
@@ -254,64 +254,64 @@ gemv_t_kernel_F_LOOP:
mov A_PTR, A mov A_PTR, A
mov X_PTR, X mov X_PTR, X


gemv_t_kernel_F32:
.Lgemv_t_kernel_F32:


asr I, M, #5 asr I, M, #5
cmp I, xzr cmp I, xzr
beq gemv_t_kernel_F4
beq .Lgemv_t_kernel_F4


gemv_t_kernel_F320:
.Lgemv_t_kernel_F320:


KERNEL_F32 KERNEL_F32


subs I, I, #1 subs I, I, #1
bne gemv_t_kernel_F320
bne .Lgemv_t_kernel_F320


KERNEL_F32_FINALIZE KERNEL_F32_FINALIZE


gemv_t_kernel_F4:
.Lgemv_t_kernel_F4:
ands I, M, #31 ands I, M, #31
asr I, I, #2 asr I, I, #2
cmp I, xzr cmp I, xzr
beq gemv_t_kernel_F1
beq .Lgemv_t_kernel_F1


gemv_t_kernel_F40:
.Lgemv_t_kernel_F40:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne gemv_t_kernel_F40
bne .Lgemv_t_kernel_F40


gemv_t_kernel_F1:
.Lgemv_t_kernel_F1:


KERNEL_F4_FINALIZE KERNEL_F4_FINALIZE


ands I, M, #3 ands I, M, #3
ble gemv_t_kernel_F_END
ble .Lgemv_t_kernel_F_END


gemv_t_kernel_F10:
.Lgemv_t_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne gemv_t_kernel_F10
bne .Lgemv_t_kernel_F10


gemv_t_kernel_F_END:
.Lgemv_t_kernel_F_END:


ld1 TMPV1, [Y] ld1 TMPV1, [Y]
add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1 fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_F_LOOP
bne .Lgemv_t_kernel_F_LOOP


b gemv_t_kernel_L999
b .Lgemv_t_kernel_L999


gemv_t_kernel_S_BEGIN:
.Lgemv_t_kernel_S_BEGIN:


INIT_S INIT_S


gemv_t_kernel_S_LOOP:
.Lgemv_t_kernel_S_LOOP:


fmov TEMP, REG0 fmov TEMP, REG0
mov A_PTR, A mov A_PTR, A
@@ -319,9 +319,9 @@ gemv_t_kernel_S_LOOP:


asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
ble gemv_t_kernel_S1
ble .Lgemv_t_kernel_S1


gemv_t_kernel_S4:
.Lgemv_t_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -329,30 +329,30 @@ gemv_t_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne gemv_t_kernel_S4
bne .Lgemv_t_kernel_S4


gemv_t_kernel_S1:
.Lgemv_t_kernel_S1:


ands I, M, #3 ands I, M, #3
ble gemv_t_kernel_S_END
ble .Lgemv_t_kernel_S_END


gemv_t_kernel_S10:
.Lgemv_t_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne gemv_t_kernel_S10
bne .Lgemv_t_kernel_S10


gemv_t_kernel_S_END:
.Lgemv_t_kernel_S_END:


ld1 TMPV1, [Y] ld1 TMPV1, [Y]
add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
fmadd TMP1, ALPHA, TEMP, TMP1 fmadd TMP1, ALPHA, TEMP, TMP1
st1 TMPV1, [Y], INC_Y st1 TMPV1, [Y], INC_Y
bne gemv_t_kernel_S_LOOP
bne .Lgemv_t_kernel_S_LOOP


gemv_t_kernel_L999:
.Lgemv_t_kernel_L999:


RESTORE_REGS RESTORE_REGS




+ 24
- 24
kernel/arm64/iamax.S View File

@@ -230,62 +230,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble iamax_kernel_zero
ble .Liamax_kernel_zero
cmp INC_X, xzr cmp INC_X, xzr
ble iamax_kernel_zero
ble .Liamax_kernel_zero


cmp INC_X, #1 cmp INC_X, #1
bne iamax_kernel_S_BEGIN
bne .Liamax_kernel_S_BEGIN
mov x7, X mov x7, X


iamax_kernel_F_BEGIN:
.Liamax_kernel_F_BEGIN:


INIT_S INIT_S


subs N, N, #1 subs N, N, #1
ble iamax_kernel_L999
ble .Liamax_kernel_L999


asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq iamax_kernel_F1
beq .Liamax_kernel_F1


add Z, Z, #1 add Z, Z, #1
iamax_kernel_F8:
.Liamax_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne iamax_kernel_F8
bne .Liamax_kernel_F8


KERNEL_F8_FINALIZE KERNEL_F8_FINALIZE


sub Z, Z, #1 sub Z, Z, #1
iamax_kernel_F1:
.Liamax_kernel_F1:


ands I, N, #7 ands I, N, #7
ble iamax_kernel_L999
ble .Liamax_kernel_L999


iamax_kernel_F10:
.Liamax_kernel_F10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne iamax_kernel_F10
bne .Liamax_kernel_F10


b iamax_kernel_L999
b .Liamax_kernel_L999


iamax_kernel_S_BEGIN:
.Liamax_kernel_S_BEGIN:


INIT_S INIT_S


subs N, N, #1 subs N, N, #1
ble iamax_kernel_L999
ble .Liamax_kernel_L999


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble iamax_kernel_S1
ble .Liamax_kernel_S1


iamax_kernel_S4:
.Liamax_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -293,25 +293,25 @@ iamax_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne iamax_kernel_S4
bne .Liamax_kernel_S4


iamax_kernel_S1:
.Liamax_kernel_S1:


ands I, N, #3 ands I, N, #3
ble iamax_kernel_L999
ble .Liamax_kernel_L999


iamax_kernel_S10:
.Liamax_kernel_S10:


KERNEL_S1 KERNEL_S1
subs I, I, #1 subs I, I, #1
bne iamax_kernel_S10
bne .Liamax_kernel_S10


iamax_kernel_L999:
.Liamax_kernel_L999:


mov x0, INDEX mov x0, INDEX
ret ret


iamax_kernel_zero:
.Liamax_kernel_zero:


mov x0, xzr mov x0, xzr
ret ret


+ 24
- 24
kernel/arm64/izamax.S View File

@@ -276,64 +276,64 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble iamax_kernel_zero
ble .Lizamax_kernel_zero
cmp INC_X, xzr cmp INC_X, xzr
ble iamax_kernel_zero
ble .Lizamax_kernel_zero


cmp INC_X, #1 cmp INC_X, #1
bne iamax_kernel_S_BEGIN
bne .Lizamax_kernel_S_BEGIN
mov x7, X mov x7, X




iamax_kernel_F_BEGIN:
.Lizamax_kernel_F_BEGIN:


INIT_S INIT_S


subs N, N, #1 subs N, N, #1
ble iamax_kernel_L999
ble .Lizamax_kernel_L999


asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
ble iamax_kernel_F1
ble .Lizamax_kernel_F1


add Z, Z, #1 add Z, Z, #1


iamax_kernel_F8:
.Lizamax_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne iamax_kernel_F8
bne .Lizamax_kernel_F8


KERNEL_F8_FINALIZE KERNEL_F8_FINALIZE


sub Z, Z, #1 sub Z, Z, #1
iamax_kernel_F1:
.Lizamax_kernel_F1:


ands I, N, #7 ands I, N, #7
ble iamax_kernel_L999
ble .Lizamax_kernel_L999


iamax_kernel_F10:
.Lizamax_kernel_F10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne iamax_kernel_F10
bne .Lizamax_kernel_F10


b iamax_kernel_L999
b .Lizamax_kernel_L999


iamax_kernel_S_BEGIN:
.Lizamax_kernel_S_BEGIN:


INIT_S INIT_S


subs N, N, #1 subs N, N, #1
ble iamax_kernel_L999
ble .Lizamax_kernel_L999


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble iamax_kernel_S1
ble .Lizamax_kernel_S1


iamax_kernel_S4:
.Lizamax_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -341,26 +341,26 @@ iamax_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne iamax_kernel_S4
bne .Lizamax_kernel_S4


iamax_kernel_S1:
.Lizamax_kernel_S1:


ands I, N, #3 ands I, N, #3
ble iamax_kernel_L999
ble .Lizamax_kernel_L999


iamax_kernel_S10:
.Lizamax_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne iamax_kernel_S10
bne .Lizamax_kernel_S10


iamax_kernel_L999:
.Lizamax_kernel_L999:


mov x0, INDEX mov x0, INDEX
ret ret


iamax_kernel_zero:
.Lizamax_kernel_zero:


mov x0, xzr mov x0, xzr
ret ret


+ 16
- 16
kernel/arm64/nrm2.S View File

@@ -162,44 +162,44 @@ KERNEL_S1_NEXT:
INIT INIT


cmp N, #0 cmp N, #0
ble nrm2_kernel_L999
ble .Lnrm2_kernel_L999


cmp INC_X, #0 cmp INC_X, #0
beq nrm2_kernel_L999
beq .Lnrm2_kernel_L999




cmp INC_X, #1 cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
bne .Lnrm2_kernel_S_BEGIN


nrm2_kernel_F_BEGIN:
.Lnrm2_kernel_F_BEGIN:


asr I, N, #3 // I = N / 8 asr I, N, #3 // I = N / 8
cmp I, xzr cmp I, xzr
ble nrm2_kernel_F1
ble .Lnrm2_kernel_F1


nrm2_kernel_F8:
.Lnrm2_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne nrm2_kernel_F8
bne .Lnrm2_kernel_F8


nrm2_kernel_F1:
.Lnrm2_kernel_F1:


ands I, N, #7 ands I, N, #7
ble nrm2_kernel_L999
ble .Lnrm2_kernel_L999




nrm2_kernel_F10:
.Lnrm2_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne nrm2_kernel_F10
bne .Lnrm2_kernel_F10


b nrm2_kernel_L999
b .Lnrm2_kernel_L999


nrm2_kernel_S_BEGIN:
.Lnrm2_kernel_S_BEGIN:


INIT_S INIT_S


@@ -207,15 +207,15 @@ nrm2_kernel_S_BEGIN:


.align 5 .align 5


nrm2_kernel_S10:
.Lnrm2_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne nrm2_kernel_S10
bne .Lnrm2_kernel_S10




nrm2_kernel_L999:
.Lnrm2_kernel_L999:
fsqrt SSQ, SSQ fsqrt SSQ, SSQ
fmul SSQ, SCALE, SSQ fmul SSQ, SCALE, SSQ




+ 20
- 20
kernel/arm64/rot.S View File

@@ -165,48 +165,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble rot_kernel_L999
ble .Lrot_kernel_L999


INIT INIT


cmp INC_X, #1 cmp INC_X, #1
bne rot_kernel_S_BEGIN
bne .Lrot_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne rot_kernel_S_BEGIN
bne .Lrot_kernel_S_BEGIN


rot_kernel_F_BEGIN:
.Lrot_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq rot_kernel_F1
beq .Lrot_kernel_F1


KERNEL_INIT_F4 KERNEL_INIT_F4


rot_kernel_F4:
.Lrot_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne rot_kernel_F4
bne .Lrot_kernel_F4


rot_kernel_F1:
.Lrot_kernel_F1:


ands I, N, #3 ands I, N, #3
ble rot_kernel_L999
ble .Lrot_kernel_L999


INIT_F1 INIT_F1


rot_kernel_F10:
.Lrot_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne rot_kernel_F10
bne .Lrot_kernel_F10


mov w0, wzr mov w0, wzr
ret ret


rot_kernel_S_BEGIN:
.Lrot_kernel_S_BEGIN:


INIT_S INIT_S
INIT_F1 INIT_F1
@@ -214,9 +214,9 @@ rot_kernel_S_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble rot_kernel_S1
ble .Lrot_kernel_S1


rot_kernel_S4:
.Lrot_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -224,22 +224,22 @@ rot_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne rot_kernel_S4
bne .Lrot_kernel_S4


rot_kernel_S1:
.Lrot_kernel_S1:


ands I, N, #3 ands I, N, #3
ble rot_kernel_L999
ble .Lrot_kernel_L999




rot_kernel_S10:
.Lrot_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne rot_kernel_S10
bne .Lrot_kernel_S10


rot_kernel_L999:
.Lrot_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret

+ 23
- 23
kernel/arm64/scal.S View File

@@ -166,86 +166,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble scal_kernel_L999
ble .Lscal_kernel_L999


fcmp DA, #0.0 fcmp DA, #0.0
beq scal_kernel_zero
beq .Lscal_kernel_zero


cmp INC_X, #1 cmp INC_X, #1
bne scal_kernel_S_BEGIN
bne .Lscal_kernel_S_BEGIN


scal_kernel_F_BEGIN:
.Lscal_kernel_F_BEGIN:


asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq scal_kernel_F1
beq .Lscal_kernel_F1


KERNEL_INIT_F8 KERNEL_INIT_F8


scal_kernel_F8:
.Lscal_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne scal_kernel_F8
bne .Lscal_kernel_F8


scal_kernel_F1:
.Lscal_kernel_F1:


ands I, N, #7 ands I, N, #7
ble scal_kernel_L999
ble .Lscal_kernel_L999


scal_kernel_F10:
.Lscal_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne scal_kernel_F10
bne .Lscal_kernel_F10


mov w0, wzr mov w0, wzr
ret ret


scal_kernel_S_BEGIN:
.Lscal_kernel_S_BEGIN:


INIT_S INIT_S
mov X_COPY, X mov X_COPY, X


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble scal_kernel_S1
ble .Lscal_kernel_S1


scal_kernel_S4:
.Lscal_kernel_S4:


KERNEL_S4 KERNEL_S4


subs I, I, #1 subs I, I, #1
bne scal_kernel_S4
bne .Lscal_kernel_S4


scal_kernel_S1:
.Lscal_kernel_S1:


ands I, N, #3 ands I, N, #3
ble scal_kernel_L999
ble .Lscal_kernel_L999


scal_kernel_S10:
.Lscal_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne scal_kernel_S10
bne .Lscal_kernel_S10


scal_kernel_L999:
.Lscal_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret


scal_kernel_zero:
.Lscal_kernel_zero:


INIT_S INIT_S


scal_kernel_Z1:
.Lscal_kernel_Z1:


st1 DAV, [X], INC_X st1 DAV, [X], INC_X
subs N, N, #1 subs N, N, #1
bne scal_kernel_Z1
bne .Lscal_kernel_Z1


mov w0, wzr mov w0, wzr
ret ret


+ 221
- 221
kernel/arm64/sgemm_kernel_16x4.S
File diff suppressed because it is too large
View File


+ 221
- 221
kernel/arm64/sgemm_kernel_16x4_thunderx2t99.S
File diff suppressed because it is too large
View File


+ 155
- 155
kernel/arm64/sgemm_kernel_4x4.S View File

@@ -892,11 +892,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble sgemm_kernel_L2_BEGIN
ble .Lsgemm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


sgemm_kernel_L4_BEGIN:
.Lsgemm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2


@@ -906,73 +906,73 @@ sgemm_kernel_L4_BEGIN:
add pA_2, temp, pA_1 add pA_2, temp, pA_1
add pA_3, temp, pA_2 add pA_3, temp, pA_2


sgemm_kernel_L4_M16_BEGIN:
.Lsgemm_kernel_L4_M16_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #4 // counterI = counterI / 16 asr counterI, counterI, #4 // counterI = counterI / 16
cmp counterI, #0 cmp counterI, #0
ble sgemm_kernel_L4_M8_BEGIN
ble .Lsgemm_kernel_L4_M8_BEGIN


sgemm_kernel_L4_M16_20:
.Lsgemm_kernel_L4_M16_20:


mov pB, origPB mov pB, origPB
asr counterL , origK, #1 // L = K / 2 asr counterL , origK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt sgemm_kernel_L4_M16_32
blt .Lsgemm_kernel_L4_M16_32


KERNEL16x4_I // do one in the K KERNEL16x4_I // do one in the K
KERNEL16x4_M2 // do another in the K KERNEL16x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble sgemm_kernel_L4_M16_22a
ble .Lsgemm_kernel_L4_M16_22a
.align 5 .align 5


sgemm_kernel_L4_M16_22:
.Lsgemm_kernel_L4_M16_22:


KERNEL16x4_M1 KERNEL16x4_M1
KERNEL16x4_M2 KERNEL16x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M16_22
bgt .Lsgemm_kernel_L4_M16_22




sgemm_kernel_L4_M16_22a:
.Lsgemm_kernel_L4_M16_22a:


KERNEL16x4_M1 KERNEL16x4_M1
KERNEL16x4_E KERNEL16x4_E


b sgemm_kernel_L4_M16_44
b .Lsgemm_kernel_L4_M16_44


sgemm_kernel_L4_M16_32:
.Lsgemm_kernel_L4_M16_32:


tst counterL, #1 tst counterL, #1
ble sgemm_kernel_L4_M16_40
ble .Lsgemm_kernel_L4_M16_40


KERNEL16x4_I KERNEL16x4_I


KERNEL16x4_E KERNEL16x4_E


b sgemm_kernel_L4_M16_44
b .Lsgemm_kernel_L4_M16_44




sgemm_kernel_L4_M16_40:
.Lsgemm_kernel_L4_M16_40:


INIT16x4 INIT16x4


sgemm_kernel_L4_M16_44:
.Lsgemm_kernel_L4_M16_44:


ands counterL , origK, #1 ands counterL , origK, #1
ble sgemm_kernel_L4_M16_100
ble .Lsgemm_kernel_L4_M16_100


sgemm_kernel_L4_M16_46:
.Lsgemm_kernel_L4_M16_46:


KERNEL16x4_SUB KERNEL16x4_SUB


sgemm_kernel_L4_M16_100:
.Lsgemm_kernel_L4_M16_100:


SAVE16x4 SAVE16x4


sgemm_kernel_L4_M16_END:
.Lsgemm_kernel_L4_M16_END:
lsl temp, origK, #4 // k * 4 * 4 = Four rows of A lsl temp, origK, #4 // k * 4 * 4 = Four rows of A
add pA_0, pA_0, temp add pA_0, pA_0, temp
add pA_0, pA_0, temp add pA_0, pA_0, temp
@@ -981,26 +981,26 @@ sgemm_kernel_L4_M16_END:
add pA_2, pA_1, temp add pA_2, pA_1, temp
add pA_3, pA_2, temp add pA_3, pA_2, temp
subs counterI, counterI, #1 subs counterI, counterI, #1
bne sgemm_kernel_L4_M16_20
bne .Lsgemm_kernel_L4_M16_20


sgemm_kernel_L4_M8_BEGIN:
.Lsgemm_kernel_L4_M8_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #15 tst counterI , #15
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END


tst counterI, #8 tst counterI, #8
ble sgemm_kernel_L4_M4_BEGIN
ble .Lsgemm_kernel_L4_M4_BEGIN


sgemm_kernel_L4_M8_20:
.Lsgemm_kernel_L4_M8_20:


INIT8x4 INIT8x4


mov pB, origPB mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8 asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble sgemm_kernel_L4_M8_40
ble .Lsgemm_kernel_L4_M8_40


sgemm_kernel_L4_M8_22:
.Lsgemm_kernel_L4_M8_22:


KERNEL8x4_SUB KERNEL8x4_SUB
KERNEL8x4_SUB KERNEL8x4_SUB
@@ -1013,47 +1013,47 @@ sgemm_kernel_L4_M8_22:
KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M8_22
bgt .Lsgemm_kernel_L4_M8_22




sgemm_kernel_L4_M8_40:
.Lsgemm_kernel_L4_M8_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M8_100
ble .Lsgemm_kernel_L4_M8_100


sgemm_kernel_L4_M8_42:
.Lsgemm_kernel_L4_M8_42:


KERNEL8x4_SUB KERNEL8x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M8_42
bgt .Lsgemm_kernel_L4_M8_42


sgemm_kernel_L4_M8_100:
.Lsgemm_kernel_L4_M8_100:


SAVE8x4 SAVE8x4


sgemm_kernel_L4_M8_END:
.Lsgemm_kernel_L4_M8_END:
lsl temp, origK, #4 // k * 4 * 4 lsl temp, origK, #4 // k * 4 * 4
add pA_0, pA_0, temp add pA_0, pA_0, temp


sgemm_kernel_L4_M4_BEGIN:
.Lsgemm_kernel_L4_M4_BEGIN:
mov counterI, origM mov counterI, origM
tst counterI , #7 tst counterI , #7
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END


tst counterI, #4 tst counterI, #4
ble sgemm_kernel_L4_M2_BEGIN
ble .Lsgemm_kernel_L4_M2_BEGIN


sgemm_kernel_L4_M4_20:
.Lsgemm_kernel_L4_M4_20:


INIT4x4 INIT4x4


mov pB, origPB mov pB, origPB
asr counterL, origK, #3 // counterL = counterL / 8 asr counterL, origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble sgemm_kernel_L4_M4_40
ble .Lsgemm_kernel_L4_M4_40


sgemm_kernel_L4_M4_22:
.Lsgemm_kernel_L4_M4_22:


KERNEL4x4_SUB KERNEL4x4_SUB
KERNEL4x4_SUB KERNEL4x4_SUB
@@ -1066,47 +1066,47 @@ sgemm_kernel_L4_M4_22:
KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M4_22
bgt .Lsgemm_kernel_L4_M4_22




sgemm_kernel_L4_M4_40:
.Lsgemm_kernel_L4_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M4_100
ble .Lsgemm_kernel_L4_M4_100


sgemm_kernel_L4_M4_42:
.Lsgemm_kernel_L4_M4_42:


KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M4_42
bgt .Lsgemm_kernel_L4_M4_42


sgemm_kernel_L4_M4_100:
.Lsgemm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


sgemm_kernel_L4_M4_END:
.Lsgemm_kernel_L4_M4_END:




sgemm_kernel_L4_M2_BEGIN:
.Lsgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END


tst counterI, #2 // test bit 1 of counterI (counterI & 2), flags only tst counterI, #2 // test bit 1 of counterI (counterI & 2), flags only
ble sgemm_kernel_L4_M1_BEGIN
ble .Lsgemm_kernel_L4_M1_BEGIN


sgemm_kernel_L4_M2_20:
.Lsgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble sgemm_kernel_L4_M2_40
ble .Lsgemm_kernel_L4_M2_40


sgemm_kernel_L4_M2_22:
.Lsgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1119,43 +1119,43 @@ sgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M2_22
bgt .Lsgemm_kernel_L4_M2_22




sgemm_kernel_L4_M2_40:
.Lsgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M2_100
ble .Lsgemm_kernel_L4_M2_100


sgemm_kernel_L4_M2_42:
.Lsgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M2_42
bgt .Lsgemm_kernel_L4_M2_42


sgemm_kernel_L4_M2_100:
.Lsgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


sgemm_kernel_L4_M2_END:
.Lsgemm_kernel_L4_M2_END:




sgemm_kernel_L4_M1_BEGIN:
.Lsgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // test bit 0 of counterI (counterI & 1), flags only tst counterI, #1 // test bit 0 of counterI (counterI & 1), flags only
ble sgemm_kernel_L4_END
ble .Lsgemm_kernel_L4_END


sgemm_kernel_L4_M1_20:
.Lsgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble sgemm_kernel_L4_M1_40
ble .Lsgemm_kernel_L4_M1_40


sgemm_kernel_L4_M1_22:
.Lsgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1167,45 +1167,45 @@ sgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M1_22
bgt .Lsgemm_kernel_L4_M1_22




sgemm_kernel_L4_M1_40:
.Lsgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L4_M1_100
ble .Lsgemm_kernel_L4_M1_100


sgemm_kernel_L4_M1_42:
.Lsgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L4_M1_42
bgt .Lsgemm_kernel_L4_M1_42


sgemm_kernel_L4_M1_100:
.Lsgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4




sgemm_kernel_L4_END:
.Lsgemm_kernel_L4_END:


lsl temp, origK, #4 lsl temp, origK, #4
add origPB, origPB, temp // B = B + K * 4 * 4 add origPB, origPB, temp // B = B + K * 4 * 4


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt sgemm_kernel_L4_BEGIN
bgt .Lsgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


sgemm_kernel_L2_BEGIN: // less than 4 left in N direction
.Lsgemm_kernel_L2_BEGIN: // less than 4 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble sgemm_kernel_L999
ble .Lsgemm_kernel_L999


tst counterJ , #2 tst counterJ , #2
ble sgemm_kernel_L1_BEGIN
ble .Lsgemm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1215,24 +1215,24 @@ sgemm_kernel_L2_BEGIN: // less than 4 left in N direction






sgemm_kernel_L2_M4_BEGIN:
.Lsgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble sgemm_kernel_L2_M2_BEGIN
ble .Lsgemm_kernel_L2_M2_BEGIN


sgemm_kernel_L2_M4_20:
.Lsgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble sgemm_kernel_L2_M4_40
ble .Lsgemm_kernel_L2_M4_40
.align 5 .align 5


sgemm_kernel_L2_M4_22:
.Lsgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1244,50 +1244,50 @@ sgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M4_22
bgt .Lsgemm_kernel_L2_M4_22




sgemm_kernel_L2_M4_40:
.Lsgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M4_100
ble .Lsgemm_kernel_L2_M4_100


sgemm_kernel_L2_M4_42:
.Lsgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M4_42
bgt .Lsgemm_kernel_L2_M4_42


sgemm_kernel_L2_M4_100:
.Lsgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


sgemm_kernel_L2_M4_END:
.Lsgemm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt sgemm_kernel_L2_M4_20
bgt .Lsgemm_kernel_L2_M4_20




sgemm_kernel_L2_M2_BEGIN:
.Lsgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble sgemm_kernel_L2_END
ble .Lsgemm_kernel_L2_END


tst counterI, #2 // test bit 1 of counterI (counterI & 2), flags only tst counterI, #2 // test bit 1 of counterI (counterI & 2), flags only
ble sgemm_kernel_L2_M1_BEGIN
ble .Lsgemm_kernel_L2_M1_BEGIN


sgemm_kernel_L2_M2_20:
.Lsgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble sgemm_kernel_L2_M2_40
ble .Lsgemm_kernel_L2_M2_40


sgemm_kernel_L2_M2_22:
.Lsgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1300,43 +1300,43 @@ sgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M2_22
bgt .Lsgemm_kernel_L2_M2_22




sgemm_kernel_L2_M2_40:
.Lsgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M2_100
ble .Lsgemm_kernel_L2_M2_100


sgemm_kernel_L2_M2_42:
.Lsgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M2_42
bgt .Lsgemm_kernel_L2_M2_42


sgemm_kernel_L2_M2_100:
.Lsgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


sgemm_kernel_L2_M2_END:
.Lsgemm_kernel_L2_M2_END:




sgemm_kernel_L2_M1_BEGIN:
.Lsgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // test bit 0 of counterI (counterI & 1), flags only tst counterI, #1 // test bit 0 of counterI (counterI & 1), flags only
ble sgemm_kernel_L2_END
ble .Lsgemm_kernel_L2_END


sgemm_kernel_L2_M1_20:
.Lsgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble sgemm_kernel_L2_M1_40
ble .Lsgemm_kernel_L2_M1_40


sgemm_kernel_L2_M1_22:
.Lsgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1348,36 +1348,36 @@ sgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M1_22
bgt .Lsgemm_kernel_L2_M1_22




sgemm_kernel_L2_M1_40:
.Lsgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L2_M1_100
ble .Lsgemm_kernel_L2_M1_100


sgemm_kernel_L2_M1_42:
.Lsgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L2_M1_42
bgt .Lsgemm_kernel_L2_M1_42


sgemm_kernel_L2_M1_100:
.Lsgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2




sgemm_kernel_L2_END:
.Lsgemm_kernel_L2_END:
add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4


/******************************************************************************/ /******************************************************************************/


sgemm_kernel_L1_BEGIN:
.Lsgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble sgemm_kernel_L999 // done
ble .Lsgemm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1387,24 +1387,24 @@ sgemm_kernel_L1_BEGIN:






sgemm_kernel_L1_M4_BEGIN:
.Lsgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble sgemm_kernel_L1_M2_BEGIN
ble .Lsgemm_kernel_L1_M2_BEGIN


sgemm_kernel_L1_M4_20:
.Lsgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble sgemm_kernel_L1_M4_40
ble .Lsgemm_kernel_L1_M4_40
.align 5 .align 5


sgemm_kernel_L1_M4_22:
.Lsgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1416,50 +1416,50 @@ sgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M4_22
bgt .Lsgemm_kernel_L1_M4_22




sgemm_kernel_L1_M4_40:
.Lsgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M4_100
ble .Lsgemm_kernel_L1_M4_100


sgemm_kernel_L1_M4_42:
.Lsgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M4_42
bgt .Lsgemm_kernel_L1_M4_42


sgemm_kernel_L1_M4_100:
.Lsgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


sgemm_kernel_L1_M4_END:
.Lsgemm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt sgemm_kernel_L1_M4_20
bgt .Lsgemm_kernel_L1_M4_20




sgemm_kernel_L1_M2_BEGIN:
.Lsgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble sgemm_kernel_L1_END
ble .Lsgemm_kernel_L1_END


tst counterI, #2 // test bit 1 of counterI (counterI & 2), flags only tst counterI, #2 // test bit 1 of counterI (counterI & 2), flags only
ble sgemm_kernel_L1_M1_BEGIN
ble .Lsgemm_kernel_L1_M1_BEGIN


sgemm_kernel_L1_M2_20:
.Lsgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble sgemm_kernel_L1_M2_40
ble .Lsgemm_kernel_L1_M2_40


sgemm_kernel_L1_M2_22:
.Lsgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1472,43 +1472,43 @@ sgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M2_22
bgt .Lsgemm_kernel_L1_M2_22




sgemm_kernel_L1_M2_40:
.Lsgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M2_100
ble .Lsgemm_kernel_L1_M2_100


sgemm_kernel_L1_M2_42:
.Lsgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M2_42
bgt .Lsgemm_kernel_L1_M2_42


sgemm_kernel_L1_M2_100:
.Lsgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


sgemm_kernel_L1_M2_END:
.Lsgemm_kernel_L1_M2_END:




sgemm_kernel_L1_M1_BEGIN:
.Lsgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble sgemm_kernel_L1_END
ble .Lsgemm_kernel_L1_END


sgemm_kernel_L1_M1_20:
.Lsgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble sgemm_kernel_L1_M1_40
ble .Lsgemm_kernel_L1_M1_40


sgemm_kernel_L1_M1_22:
.Lsgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1520,30 +1520,30 @@ sgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M1_22
bgt .Lsgemm_kernel_L1_M1_22




sgemm_kernel_L1_M1_40:
.Lsgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble sgemm_kernel_L1_M1_100
ble .Lsgemm_kernel_L1_M1_100


sgemm_kernel_L1_M1_42:
.Lsgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt sgemm_kernel_L1_M1_42
bgt .Lsgemm_kernel_L1_M1_42


sgemm_kernel_L1_M1_100:
.Lsgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




sgemm_kernel_L1_END:
.Lsgemm_kernel_L1_END:




sgemm_kernel_L999:
.Lsgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 241
- 241
kernel/arm64/sgemm_kernel_8x8.S
File diff suppressed because it is too large
View File


+ 221
- 221
kernel/arm64/strmm_kernel_16x4.S
File diff suppressed because it is too large
View File


+ 130
- 130
kernel/arm64/strmm_kernel_4x4.S View File

@@ -507,7 +507,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


PROLOGUE PROLOGUE


strmm_kernel_begin:
.Lstrmm_kernel_begin:


.align 5 .align 5
add sp, sp, #-(11 * 16) add sp, sp, #-(11 * 16)
@@ -539,11 +539,11 @@ strmm_kernel_begin:
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble strmm_kernel_L2_BEGIN
ble .Lstrmm_kernel_L2_BEGIN


/******************************************************************************/ /******************************************************************************/


strmm_kernel_L4_BEGIN:
.Lstrmm_kernel_L4_BEGIN:
mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
add pC, pC, LDC, lsl #2 add pC, pC, LDC, lsl #2


@@ -553,14 +553,14 @@ strmm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


strmm_kernel_L4_M4_BEGIN:
.Lstrmm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble strmm_kernel_L4_M2_BEGIN
ble .Lstrmm_kernel_L4_M2_BEGIN


strmm_kernel_L4_M4_20:
.Lstrmm_kernel_L4_M4_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -581,54 +581,54 @@ strmm_kernel_L4_M4_20:


asr counterL , tempK, #1 // L = K / 2 asr counterL , tempK, #1 // L = K / 2
cmp counterL , #2 // is there at least 4 to do? cmp counterL , #2 // is there at least 4 to do?
blt strmm_kernel_L4_M4_32
blt .Lstrmm_kernel_L4_M4_32


KERNEL4x4_I // do one in the K KERNEL4x4_I // do one in the K
KERNEL4x4_M2 // do another in the K KERNEL4x4_M2 // do another in the K


subs counterL, counterL, #2 subs counterL, counterL, #2
ble strmm_kernel_L4_M4_22a
ble .Lstrmm_kernel_L4_M4_22a
.align 5 .align 5


strmm_kernel_L4_M4_22:
.Lstrmm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L4_M4_22
bgt .Lstrmm_kernel_L4_M4_22


strmm_kernel_L4_M4_22a:
.Lstrmm_kernel_L4_M4_22a:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b strmm_kernel_L4_M4_44
b .Lstrmm_kernel_L4_M4_44


strmm_kernel_L4_M4_32:
.Lstrmm_kernel_L4_M4_32:


tst counterL, #1 tst counterL, #1
ble strmm_kernel_L4_M4_40
ble .Lstrmm_kernel_L4_M4_40


KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_E KERNEL4x4_E


b strmm_kernel_L4_M4_44
b .Lstrmm_kernel_L4_M4_44


strmm_kernel_L4_M4_40:
.Lstrmm_kernel_L4_M4_40:


INIT4x4 INIT4x4


strmm_kernel_L4_M4_44:
.Lstrmm_kernel_L4_M4_44:


ands counterL , tempK, #1 ands counterL , tempK, #1
ble strmm_kernel_L4_M4_100
ble .Lstrmm_kernel_L4_M4_100


strmm_kernel_L4_M4_46:
.Lstrmm_kernel_L4_M4_46:


KERNEL4x4_SUB KERNEL4x4_SUB


strmm_kernel_L4_M4_100:
.Lstrmm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


@@ -647,20 +647,20 @@ strmm_kernel_L4_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


strmm_kernel_L4_M4_END:
.Lstrmm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne strmm_kernel_L4_M4_20
bne .Lstrmm_kernel_L4_M4_20


strmm_kernel_L4_M2_BEGIN:
.Lstrmm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble strmm_kernel_L4_END
ble .Lstrmm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble strmm_kernel_L4_M1_BEGIN
ble .Lstrmm_kernel_L4_M1_BEGIN


strmm_kernel_L4_M2_20:
.Lstrmm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -684,9 +684,9 @@ strmm_kernel_L4_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble strmm_kernel_L4_M2_40
ble .Lstrmm_kernel_L4_M2_40


strmm_kernel_L4_M2_22:
.Lstrmm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -699,22 +699,22 @@ strmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L4_M2_22
bgt .Lstrmm_kernel_L4_M2_22




strmm_kernel_L4_M2_40:
.Lstrmm_kernel_L4_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L4_M2_100
ble .Lstrmm_kernel_L4_M2_100


strmm_kernel_L4_M2_42:
.Lstrmm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L4_M2_42
bgt .Lstrmm_kernel_L4_M2_42


strmm_kernel_L4_M2_100:
.Lstrmm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


@@ -735,15 +735,15 @@ strmm_kernel_L4_M2_100:
#endif #endif




strmm_kernel_L4_M2_END:
.Lstrmm_kernel_L4_M2_END:




strmm_kernel_L4_M1_BEGIN:
.Lstrmm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble strmm_kernel_L4_END
ble .Lstrmm_kernel_L4_END


strmm_kernel_L4_M1_20:
.Lstrmm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -767,9 +767,9 @@ strmm_kernel_L4_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble strmm_kernel_L4_M1_40
ble .Lstrmm_kernel_L4_M1_40


strmm_kernel_L4_M1_22:
.Lstrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -781,22 +781,22 @@ strmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L4_M1_22
bgt .Lstrmm_kernel_L4_M1_22




strmm_kernel_L4_M1_40:
.Lstrmm_kernel_L4_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L4_M1_100
ble .Lstrmm_kernel_L4_M1_100


strmm_kernel_L4_M1_42:
.Lstrmm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L4_M1_42
bgt .Lstrmm_kernel_L4_M1_42


strmm_kernel_L4_M1_100:
.Lstrmm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


@@ -817,7 +817,7 @@ strmm_kernel_L4_M1_100:
#endif #endif




strmm_kernel_L4_END:
.Lstrmm_kernel_L4_END:
add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4 add origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4


#if !defined(LEFT) #if !defined(LEFT)
@@ -825,19 +825,19 @@ strmm_kernel_L4_END:
#endif #endif


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt strmm_kernel_L4_BEGIN
bgt .Lstrmm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


strmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lstrmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble strmm_kernel_L999
ble .Lstrmm_kernel_L999


tst counterJ , #2 tst counterJ , #2
ble strmm_kernel_L1_BEGIN
ble .Lstrmm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -849,14 +849,14 @@ strmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov pA, origPA // pA = A mov pA, origPA // pA = A


strmm_kernel_L2_M4_BEGIN:
.Lstrmm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble strmm_kernel_L2_M2_BEGIN
ble .Lstrmm_kernel_L2_M2_BEGIN


strmm_kernel_L2_M4_20:
.Lstrmm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -880,10 +880,10 @@ strmm_kernel_L2_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble strmm_kernel_L2_M4_40
ble .Lstrmm_kernel_L2_M4_40
.align 5 .align 5


strmm_kernel_L2_M4_22:
.Lstrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -895,22 +895,22 @@ strmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M4_22
bgt .Lstrmm_kernel_L2_M4_22




strmm_kernel_L2_M4_40:
.Lstrmm_kernel_L2_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M4_100
ble .Lstrmm_kernel_L2_M4_100


strmm_kernel_L2_M4_42:
.Lstrmm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M4_42
bgt .Lstrmm_kernel_L2_M4_42


strmm_kernel_L2_M4_100:
.Lstrmm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


@@ -930,22 +930,22 @@ strmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


strmm_kernel_L2_M4_END:
.Lstrmm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt strmm_kernel_L2_M4_20
bgt .Lstrmm_kernel_L2_M4_20




strmm_kernel_L2_M2_BEGIN:
.Lstrmm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble strmm_kernel_L2_END
ble .Lstrmm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble strmm_kernel_L2_M1_BEGIN
ble .Lstrmm_kernel_L2_M1_BEGIN


strmm_kernel_L2_M2_20:
.Lstrmm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -969,9 +969,9 @@ strmm_kernel_L2_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble strmm_kernel_L2_M2_40
ble .Lstrmm_kernel_L2_M2_40


strmm_kernel_L2_M2_22:
.Lstrmm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -984,22 +984,22 @@ strmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M2_22
bgt .Lstrmm_kernel_L2_M2_22




strmm_kernel_L2_M2_40:
.Lstrmm_kernel_L2_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M2_100
ble .Lstrmm_kernel_L2_M2_100


strmm_kernel_L2_M2_42:
.Lstrmm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M2_42
bgt .Lstrmm_kernel_L2_M2_42


strmm_kernel_L2_M2_100:
.Lstrmm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
@@ -1018,15 +1018,15 @@ strmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


strmm_kernel_L2_M2_END:
.Lstrmm_kernel_L2_M2_END:




strmm_kernel_L2_M1_BEGIN:
.Lstrmm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble strmm_kernel_L2_END
ble .Lstrmm_kernel_L2_END


strmm_kernel_L2_M1_20:
.Lstrmm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1050,9 +1050,9 @@ strmm_kernel_L2_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble strmm_kernel_L2_M1_40
ble .Lstrmm_kernel_L2_M1_40


strmm_kernel_L2_M1_22:
.Lstrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1064,22 +1064,22 @@ strmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M1_22
bgt .Lstrmm_kernel_L2_M1_22




strmm_kernel_L2_M1_40:
.Lstrmm_kernel_L2_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L2_M1_100
ble .Lstrmm_kernel_L2_M1_100


strmm_kernel_L2_M1_42:
.Lstrmm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L2_M1_42
bgt .Lstrmm_kernel_L2_M1_42


strmm_kernel_L2_M1_100:
.Lstrmm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


@@ -1099,7 +1099,7 @@ strmm_kernel_L2_M1_100:
add tempOffset, tempOffset, #1 add tempOffset, tempOffset, #1
#endif #endif


strmm_kernel_L2_END:
.Lstrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@@ -1107,11 +1107,11 @@ strmm_kernel_L2_END:


/******************************************************************************/ /******************************************************************************/


strmm_kernel_L1_BEGIN:
.Lstrmm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble strmm_kernel_L999 // done
ble .Lstrmm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1123,14 +1123,14 @@ strmm_kernel_L1_BEGIN:


mov pA, origPA // pA = A mov pA, origPA // pA = A


strmm_kernel_L1_M4_BEGIN:
.Lstrmm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble strmm_kernel_L1_M2_BEGIN
ble .Lstrmm_kernel_L1_M2_BEGIN


strmm_kernel_L1_M4_20:
.Lstrmm_kernel_L1_M4_20:


INIT4x1 INIT4x1


@@ -1154,10 +1154,10 @@ strmm_kernel_L1_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble strmm_kernel_L1_M4_40
ble .Lstrmm_kernel_L1_M4_40
.align 5 .align 5


strmm_kernel_L1_M4_22:
.Lstrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1169,22 +1169,22 @@ strmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M4_22
bgt .Lstrmm_kernel_L1_M4_22




strmm_kernel_L1_M4_40:
.Lstrmm_kernel_L1_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M4_100
ble .Lstrmm_kernel_L1_M4_100


strmm_kernel_L1_M4_42:
.Lstrmm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M4_42
bgt .Lstrmm_kernel_L1_M4_42


strmm_kernel_L1_M4_100:
.Lstrmm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


@@ -1204,22 +1204,22 @@ strmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


strmm_kernel_L1_M4_END:
.Lstrmm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt strmm_kernel_L1_M4_20
bgt .Lstrmm_kernel_L1_M4_20




strmm_kernel_L1_M2_BEGIN:
.Lstrmm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble strmm_kernel_L1_END
ble .Lstrmm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble strmm_kernel_L1_M1_BEGIN
ble .Lstrmm_kernel_L1_M1_BEGIN


strmm_kernel_L1_M2_20:
.Lstrmm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1243,9 +1243,9 @@ strmm_kernel_L1_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble strmm_kernel_L1_M2_40
ble .Lstrmm_kernel_L1_M2_40


strmm_kernel_L1_M2_22:
.Lstrmm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1258,22 +1258,22 @@ strmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M2_22
bgt .Lstrmm_kernel_L1_M2_22




strmm_kernel_L1_M2_40:
.Lstrmm_kernel_L1_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M2_100
ble .Lstrmm_kernel_L1_M2_100


strmm_kernel_L1_M2_42:
.Lstrmm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M2_42
bgt .Lstrmm_kernel_L1_M2_42


strmm_kernel_L1_M2_100:
.Lstrmm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


@@ -1294,15 +1294,15 @@ strmm_kernel_L1_M2_100:
#endif #endif




strmm_kernel_L1_M2_END:
.Lstrmm_kernel_L1_M2_END:




strmm_kernel_L1_M1_BEGIN:
.Lstrmm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble strmm_kernel_L1_END
ble .Lstrmm_kernel_L1_END


strmm_kernel_L1_M1_20:
.Lstrmm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1326,9 +1326,9 @@ strmm_kernel_L1_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble strmm_kernel_L1_M1_40
ble .Lstrmm_kernel_L1_M1_40


strmm_kernel_L1_M1_22:
.Lstrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1340,22 +1340,22 @@ strmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M1_22
bgt .Lstrmm_kernel_L1_M1_22




strmm_kernel_L1_M1_40:
.Lstrmm_kernel_L1_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble strmm_kernel_L1_M1_100
ble .Lstrmm_kernel_L1_M1_100


strmm_kernel_L1_M1_42:
.Lstrmm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt strmm_kernel_L1_M1_42
bgt .Lstrmm_kernel_L1_M1_42


strmm_kernel_L1_M1_100:
.Lstrmm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1


@@ -1377,7 +1377,7 @@ strmm_kernel_L1_M1_100:
#endif #endif
#endif #endif


strmm_kernel_L1_END:
.Lstrmm_kernel_L1_END:


#if 0 #if 0
#if !defined(LEFT) #if !defined(LEFT)
@@ -1385,7 +1385,7 @@ strmm_kernel_L1_END:
#endif #endif
#endif #endif


strmm_kernel_L999:
.Lstrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 241
- 241
kernel/arm64/strmm_kernel_8x8.S
File diff suppressed because it is too large
View File


+ 21
- 21
kernel/arm64/swap.S View File

@@ -193,50 +193,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble swap_kernel_L999
ble .Lswap_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne swap_kernel_S_BEGIN
bne .Lswap_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne swap_kernel_S_BEGIN
bne .Lswap_kernel_S_BEGIN


swap_kernel_F_BEGIN:
.Lswap_kernel_F_BEGIN:


asr I, N, #3 asr I, N, #3
cmp I, xzr cmp I, xzr
beq swap_kernel_F1
beq .Lswap_kernel_F1


swap_kernel_F8:
.Lswap_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne swap_kernel_F8
bne .Lswap_kernel_F8


swap_kernel_F1:
.Lswap_kernel_F1:


ands I, N, #7 ands I, N, #7
ble swap_kernel_L999
ble .Lswap_kernel_L999


swap_kernel_F10:
.Lswap_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne swap_kernel_F10
bne .Lswap_kernel_F10


b swap_kernel_L999
b .Lswap_kernel_L999




swap_kernel_S_BEGIN:
.Lswap_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble swap_kernel_S1
ble .Lswap_kernel_S1


swap_kernel_S4:
.Lswap_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -244,21 +244,21 @@ swap_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne swap_kernel_S4
bne .Lswap_kernel_S4


swap_kernel_S1:
.Lswap_kernel_S1:


ands I, N, #3 ands I, N, #3
ble swap_kernel_L999
ble .Lswap_kernel_L999


swap_kernel_S10:
.Lswap_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne swap_kernel_S10
bne .Lswap_kernel_S10


swap_kernel_L999:
.Lswap_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret


+ 25
- 25
kernel/arm64/zamax.S View File

@@ -184,62 +184,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble amax_kernel_zero
ble .Lzamax_kernel_zero
cmp INC_X, xzr cmp INC_X, xzr
ble amax_kernel_zero
ble .Lzamax_kernel_zero


cmp INC_X, #1 cmp INC_X, #1
bne amax_kernel_S_BEGIN
bne .Lzamax_kernel_S_BEGIN


amax_kernel_F_BEGIN:
.Lzamax_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq amax_kernel_F1_INIT
beq .Lzamax_kernel_F1_INIT


INIT_F4 INIT_F4
subs I, I, #1 subs I, I, #1
beq amax_kernel_F1
beq .Lzamax_kernel_F1


amax_kernel_F4:
.Lzamax_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne amax_kernel_F4
bne .Lzamax_kernel_F4


amax_kernel_F1:
.Lzamax_kernel_F1:


ands I, N, #3 ands I, N, #3
ble amax_kernel_L999
ble .Lzamax_kernel_L999


amax_kernel_F10:
.Lzamax_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne amax_kernel_F10
bne .Lzamax_kernel_F10


ret ret


amax_kernel_F1_INIT:
.Lzamax_kernel_F1_INIT:


INIT_F1 INIT_F1
subs N, N, #1 subs N, N, #1
b amax_kernel_F1
b .Lzamax_kernel_F1


amax_kernel_S_BEGIN:
.Lzamax_kernel_S_BEGIN:


INIT_S INIT_S


subs N, N, #1 subs N, N, #1
ble amax_kernel_L999
ble .Lzamax_kernel_L999


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble amax_kernel_S1
ble .Lzamax_kernel_S1


amax_kernel_S4:
.Lzamax_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -247,25 +247,25 @@ amax_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne amax_kernel_S4
bne .Lzamax_kernel_S4


amax_kernel_S1:
.Lzamax_kernel_S1:


ands I, N, #3 ands I, N, #3
ble amax_kernel_L999
ble .Lzamax_kernel_L999


amax_kernel_S10:
.Lzamax_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne amax_kernel_S10
bne .Lzamax_kernel_S10


amax_kernel_L999:
.Lzamax_kernel_L999:


ret ret


amax_kernel_zero:
.Lzamax_kernel_zero:


fmov MAXF, REG0 fmov MAXF, REG0
ret ret


+ 20
- 20
kernel/arm64/zasum.S View File

@@ -92,52 +92,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
fmov SUMF, REG0 fmov SUMF, REG0


cmp N, xzr cmp N, xzr
ble asum_kernel_L999
ble .Lzasum_kernel_L999
cmp INC_X, xzr cmp INC_X, xzr
ble asum_kernel_L999
ble .Lzasum_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne asum_kernel_S_BEGIN
bne .Lzasum_kernel_S_BEGIN


asum_kernel_F_BEGIN:
.Lzasum_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq asum_kernel_F1
beq .Lzasum_kernel_F1


asum_kernel_F4:
.Lzasum_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne asum_kernel_F4
bne .Lzasum_kernel_F4


KERNEL_F4_FINALIZE KERNEL_F4_FINALIZE


asum_kernel_F1:
.Lzasum_kernel_F1:


ands I, N, #3 ands I, N, #3
ble asum_kernel_L999
ble .Lzasum_kernel_L999


asum_kernel_F10:
.Lzasum_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne asum_kernel_F10
bne .Lzasum_kernel_F10


asum_kernel_L999:
.Lzasum_kernel_L999:
ret ret


asum_kernel_S_BEGIN:
.Lzasum_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble asum_kernel_S1
ble .Lzasum_kernel_S1


asum_kernel_S4:
.Lzasum_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -145,19 +145,19 @@ asum_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne asum_kernel_S4
bne .Lzasum_kernel_S4


asum_kernel_S1:
.Lzasum_kernel_S1:


ands I, N, #3 ands I, N, #3
ble asum_kernel_L999
ble .Lzasum_kernel_L999


asum_kernel_S10:
.Lzasum_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne asum_kernel_S10
bne .Lzasum_kernel_S10


ret ret




+ 21
- 21
kernel/arm64/zaxpy.S View File

@@ -241,62 +241,62 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble zaxpy_kernel_L999
ble .Lzaxpy_kernel_L999


mov Y_COPY, Y mov Y_COPY, Y


fcmp DA_R, #0.0 fcmp DA_R, #0.0
bne .L1 bne .L1
fcmp DA_I, #0.0 fcmp DA_I, #0.0
beq zaxpy_kernel_L999
beq .Lzaxpy_kernel_L999


.L1: .L1:
INIT INIT


cmp INC_X, #1 cmp INC_X, #1
bne zaxpy_kernel_S_BEGIN
bne .Lzaxpy_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne zaxpy_kernel_S_BEGIN
bne .Lzaxpy_kernel_S_BEGIN


zaxpy_kernel_F_BEGIN:
.Lzaxpy_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq zaxpy_kernel_F1
beq .Lzaxpy_kernel_F1


KERNEL_INIT_F4 KERNEL_INIT_F4


zaxpy_kernel_F4:
.Lzaxpy_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne zaxpy_kernel_F4
bne .Lzaxpy_kernel_F4


zaxpy_kernel_F1:
.Lzaxpy_kernel_F1:


ands I, N, #3 ands I, N, #3
ble zaxpy_kernel_L999
ble .Lzaxpy_kernel_L999


zaxpy_kernel_F10:
.Lzaxpy_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne zaxpy_kernel_F10
bne .Lzaxpy_kernel_F10


mov w0, wzr mov w0, wzr
ret ret


zaxpy_kernel_S_BEGIN:
.Lzaxpy_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble zaxpy_kernel_S1
ble .Lzaxpy_kernel_S1


zaxpy_kernel_S4:
.Lzaxpy_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -304,21 +304,21 @@ zaxpy_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne zaxpy_kernel_S4
bne .Lzaxpy_kernel_S4


zaxpy_kernel_S1:
.Lzaxpy_kernel_S1:


ands I, N, #3 ands I, N, #3
ble zaxpy_kernel_L999
ble .Lzaxpy_kernel_L999


zaxpy_kernel_S10:
.Lzaxpy_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne zaxpy_kernel_S10
bne .Lzaxpy_kernel_S10


zaxpy_kernel_L999:
.Lzaxpy_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret

+ 20
- 20
kernel/arm64/zdot.S View File

@@ -229,51 +229,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif


cmp N, xzr cmp N, xzr
ble dot_kernel_L999
ble .Lzdot_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne dot_kernel_S_BEGIN
bne .Lzdot_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne dot_kernel_S_BEGIN
bne .Lzdot_kernel_S_BEGIN


dot_kernel_F_BEGIN:
.Lzdot_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq dot_kernel_F1
beq .Lzdot_kernel_F1


dot_kernel_F4:
.Lzdot_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne dot_kernel_F4
bne .Lzdot_kernel_F4


KERNEL_F4_FINALIZE KERNEL_F4_FINALIZE


dot_kernel_F1:
.Lzdot_kernel_F1:


ands I, N, #3 ands I, N, #3
ble dot_kernel_L999
ble .Lzdot_kernel_L999


dot_kernel_F10:
.Lzdot_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne dot_kernel_F10
bne .Lzdot_kernel_F10


ret ret


dot_kernel_S_BEGIN:
.Lzdot_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble dot_kernel_S1
ble .Lzdot_kernel_S1


dot_kernel_S4:
.Lzdot_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -281,21 +281,21 @@ dot_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne dot_kernel_S4
bne .Lzdot_kernel_S4


dot_kernel_S1:
.Lzdot_kernel_S1:


ands I, N, #3 ands I, N, #3
ble dot_kernel_L999
ble .Lzdot_kernel_L999


dot_kernel_S10:
.Lzdot_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne dot_kernel_S10
bne .Lzdot_kernel_S10


dot_kernel_L999:
.Lzdot_kernel_L999:


ret ret




+ 130
- 130
kernel/arm64/zgemm_kernel_4x4.S View File

@@ -1099,9 +1099,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble zgemm_kernel_L2_BEGIN
ble .Lzgemm_kernel_L2_BEGIN


zgemm_kernel_L4_BEGIN:
.Lzgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -1111,20 +1111,20 @@ zgemm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


zgemm_kernel_L4_M4_BEGIN:
.Lzgemm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble zgemm_kernel_L4_M2_BEGIN
ble .Lzgemm_kernel_L4_M2_BEGIN


.align 5 .align 5
zgemm_kernel_L4_M4_20:
.Lzgemm_kernel_L4_M4_20:


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 asr counterL , origK, #3
cmp counterL , #2 cmp counterL , #2
blt zgemm_kernel_L4_M4_32
blt .Lzgemm_kernel_L4_M4_32


KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1136,10 +1136,10 @@ zgemm_kernel_L4_M4_20:
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble zgemm_kernel_L4_M4_22a
ble .Lzgemm_kernel_L4_M4_22a


.align 5 .align 5
zgemm_kernel_L4_M4_22:
.Lzgemm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1151,10 +1151,10 @@ zgemm_kernel_L4_M4_22:
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M4_22
bgt .Lzgemm_kernel_L4_M4_22


.align 5 .align 5
zgemm_kernel_L4_M4_22a:
.Lzgemm_kernel_L4_M4_22a:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1165,13 +1165,13 @@ zgemm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44


.align 5 .align 5
zgemm_kernel_L4_M4_32:
.Lzgemm_kernel_L4_M4_32:


tst counterL, #1 tst counterL, #1
ble zgemm_kernel_L4_M4_40
ble .Lzgemm_kernel_L4_M4_40


KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1182,55 +1182,55 @@ zgemm_kernel_L4_M4_32:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44




zgemm_kernel_L4_M4_40:
.Lzgemm_kernel_L4_M4_40:


INIT4x4 INIT4x4


zgemm_kernel_L4_M4_44:
.Lzgemm_kernel_L4_M4_44:


ands counterL , origK, #7 ands counterL , origK, #7
ble zgemm_kernel_L4_M4_100
ble .Lzgemm_kernel_L4_M4_100


.align 5 .align 5
zgemm_kernel_L4_M4_46:
.Lzgemm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne zgemm_kernel_L4_M4_46
bne .Lzgemm_kernel_L4_M4_46


zgemm_kernel_L4_M4_100:
.Lzgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]


SAVE4x4 SAVE4x4


zgemm_kernel_L4_M4_END:
.Lzgemm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne zgemm_kernel_L4_M4_20
bne .Lzgemm_kernel_L4_M4_20


zgemm_kernel_L4_M2_BEGIN:
.Lzgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L4_M1_BEGIN
ble .Lzgemm_kernel_L4_M1_BEGIN


zgemm_kernel_L4_M2_20:
.Lzgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L4_M2_40
ble .Lzgemm_kernel_L4_M2_40


zgemm_kernel_L4_M2_22:
.Lzgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1243,43 +1243,43 @@ zgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_22
bgt .Lzgemm_kernel_L4_M2_22




zgemm_kernel_L4_M2_40:
.Lzgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M2_100
ble .Lzgemm_kernel_L4_M2_100


zgemm_kernel_L4_M2_42:
.Lzgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_42
bgt .Lzgemm_kernel_L4_M2_42


zgemm_kernel_L4_M2_100:
.Lzgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


zgemm_kernel_L4_M2_END:
.Lzgemm_kernel_L4_M2_END:




zgemm_kernel_L4_M1_BEGIN:
.Lzgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END


zgemm_kernel_L4_M1_20:
.Lzgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L4_M1_40
ble .Lzgemm_kernel_L4_M1_40


zgemm_kernel_L4_M1_22:
.Lzgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1291,45 +1291,45 @@ zgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_22
bgt .Lzgemm_kernel_L4_M1_22




zgemm_kernel_L4_M1_40:
.Lzgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M1_100
ble .Lzgemm_kernel_L4_M1_100


zgemm_kernel_L4_M1_42:
.Lzgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_42
bgt .Lzgemm_kernel_L4_M1_42


zgemm_kernel_L4_M1_100:
.Lzgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4




zgemm_kernel_L4_END:
.Lzgemm_kernel_L4_END:


lsl temp, origK, #6 lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2 add origPB, origPB, temp // B = B + K * 4 * 8 * 2


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt zgemm_kernel_L4_BEGIN
bgt .Lzgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble zgemm_kernel_L999
ble .Lzgemm_kernel_L999


tst counterJ , #2 tst counterJ , #2
ble zgemm_kernel_L1_BEGIN
ble .Lzgemm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1339,24 +1339,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction






zgemm_kernel_L2_M4_BEGIN:
.Lzgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble zgemm_kernel_L2_M2_BEGIN
ble .Lzgemm_kernel_L2_M2_BEGIN


zgemm_kernel_L2_M4_20:
.Lzgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble zgemm_kernel_L2_M4_40
ble .Lzgemm_kernel_L2_M4_40
.align 5 .align 5


zgemm_kernel_L2_M4_22:
.Lzgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1368,50 +1368,50 @@ zgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_22
bgt .Lzgemm_kernel_L2_M4_22




zgemm_kernel_L2_M4_40:
.Lzgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M4_100
ble .Lzgemm_kernel_L2_M4_100


zgemm_kernel_L2_M4_42:
.Lzgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_42
bgt .Lzgemm_kernel_L2_M4_42


zgemm_kernel_L2_M4_100:
.Lzgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


zgemm_kernel_L2_M4_END:
.Lzgemm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt zgemm_kernel_L2_M4_20
bgt .Lzgemm_kernel_L2_M4_20




zgemm_kernel_L2_M2_BEGIN:
.Lzgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L2_M1_BEGIN
ble .Lzgemm_kernel_L2_M1_BEGIN


zgemm_kernel_L2_M2_20:
.Lzgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble zgemm_kernel_L2_M2_40
ble .Lzgemm_kernel_L2_M2_40


zgemm_kernel_L2_M2_22:
.Lzgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1424,43 +1424,43 @@ zgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_22
bgt .Lzgemm_kernel_L2_M2_22




zgemm_kernel_L2_M2_40:
.Lzgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M2_100
ble .Lzgemm_kernel_L2_M2_100


zgemm_kernel_L2_M2_42:
.Lzgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_42
bgt .Lzgemm_kernel_L2_M2_42


zgemm_kernel_L2_M2_100:
.Lzgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


zgemm_kernel_L2_M2_END:
.Lzgemm_kernel_L2_M2_END:




zgemm_kernel_L2_M1_BEGIN:
.Lzgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END


zgemm_kernel_L2_M1_20:
.Lzgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble zgemm_kernel_L2_M1_40
ble .Lzgemm_kernel_L2_M1_40


zgemm_kernel_L2_M1_22:
.Lzgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1472,37 +1472,37 @@ zgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_22
bgt .Lzgemm_kernel_L2_M1_22




zgemm_kernel_L2_M1_40:
.Lzgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M1_100
ble .Lzgemm_kernel_L2_M1_100


zgemm_kernel_L2_M1_42:
.Lzgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_42
bgt .Lzgemm_kernel_L2_M1_42


zgemm_kernel_L2_M1_100:
.Lzgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2




zgemm_kernel_L2_END:
.Lzgemm_kernel_L2_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 2 * 8 * 2 add origPB, origPB, temp // B = B + K * 2 * 8 * 2


/******************************************************************************/ /******************************************************************************/


zgemm_kernel_L1_BEGIN:
.Lzgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble zgemm_kernel_L999 // done
ble .Lzgemm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1512,24 +1512,24 @@ zgemm_kernel_L1_BEGIN:






zgemm_kernel_L1_M4_BEGIN:
.Lzgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble zgemm_kernel_L1_M2_BEGIN
ble .Lzgemm_kernel_L1_M2_BEGIN


zgemm_kernel_L1_M4_20:
.Lzgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M4_40
ble .Lzgemm_kernel_L1_M4_40
.align 5 .align 5


zgemm_kernel_L1_M4_22:
.Lzgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1541,50 +1541,50 @@ zgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_22
bgt .Lzgemm_kernel_L1_M4_22




zgemm_kernel_L1_M4_40:
.Lzgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M4_100
ble .Lzgemm_kernel_L1_M4_100


zgemm_kernel_L1_M4_42:
.Lzgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_42
bgt .Lzgemm_kernel_L1_M4_42


zgemm_kernel_L1_M4_100:
.Lzgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


zgemm_kernel_L1_M4_END:
.Lzgemm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt zgemm_kernel_L1_M4_20
bgt .Lzgemm_kernel_L1_M4_20




zgemm_kernel_L1_M2_BEGIN:
.Lzgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L1_M1_BEGIN
ble .Lzgemm_kernel_L1_M1_BEGIN


zgemm_kernel_L1_M2_20:
.Lzgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M2_40
ble .Lzgemm_kernel_L1_M2_40


zgemm_kernel_L1_M2_22:
.Lzgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1597,43 +1597,43 @@ zgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_22
bgt .Lzgemm_kernel_L1_M2_22




zgemm_kernel_L1_M2_40:
.Lzgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M2_100
ble .Lzgemm_kernel_L1_M2_100


zgemm_kernel_L1_M2_42:
.Lzgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_42
bgt .Lzgemm_kernel_L1_M2_42


zgemm_kernel_L1_M2_100:
.Lzgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


zgemm_kernel_L1_M2_END:
.Lzgemm_kernel_L1_M2_END:




zgemm_kernel_L1_M1_BEGIN:
.Lzgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END


zgemm_kernel_L1_M1_20:
.Lzgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M1_40
ble .Lzgemm_kernel_L1_M1_40


zgemm_kernel_L1_M1_22:
.Lzgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1645,30 +1645,30 @@ zgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_22
bgt .Lzgemm_kernel_L1_M1_22




zgemm_kernel_L1_M1_40:
.Lzgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M1_100
ble .Lzgemm_kernel_L1_M1_100


zgemm_kernel_L1_M1_42:
.Lzgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_42
bgt .Lzgemm_kernel_L1_M1_42


zgemm_kernel_L1_M1_100:
.Lzgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




zgemm_kernel_L1_END:
.Lzgemm_kernel_L1_END:




zgemm_kernel_L999:
.Lzgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 130
- 130
kernel/arm64/zgemm_kernel_4x4_thunderx2t99.S View File

@@ -1109,9 +1109,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble zgemm_kernel_L2_BEGIN
ble .Lzgemm_kernel_L2_BEGIN


zgemm_kernel_L4_BEGIN:
.Lzgemm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -1121,20 +1121,20 @@ zgemm_kernel_L4_BEGIN:


mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


zgemm_kernel_L4_M4_BEGIN:
.Lzgemm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble zgemm_kernel_L4_M2_BEGIN
ble .Lzgemm_kernel_L4_M2_BEGIN


.align 5 .align 5
zgemm_kernel_L4_M4_20:
.Lzgemm_kernel_L4_M4_20:


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 asr counterL , origK, #3
cmp counterL , #2 cmp counterL , #2
blt zgemm_kernel_L4_M4_32
blt .Lzgemm_kernel_L4_M4_32


KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1146,10 +1146,10 @@ zgemm_kernel_L4_M4_20:
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #2 // subtract 2 subs counterL, counterL, #2 // subtract 2
ble zgemm_kernel_L4_M4_22a
ble .Lzgemm_kernel_L4_M4_22a


.align 5 .align 5
zgemm_kernel_L4_M4_22:
.Lzgemm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1161,10 +1161,10 @@ zgemm_kernel_L4_M4_22:
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M4_22
bgt .Lzgemm_kernel_L4_M4_22


.align 5 .align 5
zgemm_kernel_L4_M4_22a:
.Lzgemm_kernel_L4_M4_22a:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1175,13 +1175,13 @@ zgemm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44


.align 5 .align 5
zgemm_kernel_L4_M4_32:
.Lzgemm_kernel_L4_M4_32:


tst counterL, #1 tst counterL, #1
ble zgemm_kernel_L4_M4_40
ble .Lzgemm_kernel_L4_M4_40


KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1192,55 +1192,55 @@ zgemm_kernel_L4_M4_32:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b zgemm_kernel_L4_M4_44
b .Lzgemm_kernel_L4_M4_44




zgemm_kernel_L4_M4_40:
.Lzgemm_kernel_L4_M4_40:


INIT4x4 INIT4x4


zgemm_kernel_L4_M4_44:
.Lzgemm_kernel_L4_M4_44:


ands counterL , origK, #7 ands counterL , origK, #7
ble zgemm_kernel_L4_M4_100
ble .Lzgemm_kernel_L4_M4_100


.align 5 .align 5
zgemm_kernel_L4_M4_46:
.Lzgemm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne zgemm_kernel_L4_M4_46
bne .Lzgemm_kernel_L4_M4_46


zgemm_kernel_L4_M4_100:
.Lzgemm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA] prfm PLDL1KEEP, [pA]
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]


SAVE4x4 SAVE4x4


zgemm_kernel_L4_M4_END:
.Lzgemm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne zgemm_kernel_L4_M4_20
bne .Lzgemm_kernel_L4_M4_20


zgemm_kernel_L4_M2_BEGIN:
.Lzgemm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L4_M1_BEGIN
ble .Lzgemm_kernel_L4_M1_BEGIN


zgemm_kernel_L4_M2_20:
.Lzgemm_kernel_L4_M2_20:


INIT2x4 INIT2x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L4_M2_40
ble .Lzgemm_kernel_L4_M2_40


zgemm_kernel_L4_M2_22:
.Lzgemm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1253,43 +1253,43 @@ zgemm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_22
bgt .Lzgemm_kernel_L4_M2_22




zgemm_kernel_L4_M2_40:
.Lzgemm_kernel_L4_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M2_100
ble .Lzgemm_kernel_L4_M2_100


zgemm_kernel_L4_M2_42:
.Lzgemm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M2_42
bgt .Lzgemm_kernel_L4_M2_42


zgemm_kernel_L4_M2_100:
.Lzgemm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


zgemm_kernel_L4_M2_END:
.Lzgemm_kernel_L4_M2_END:




zgemm_kernel_L4_M1_BEGIN:
.Lzgemm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L4_END
ble .Lzgemm_kernel_L4_END


zgemm_kernel_L4_M1_20:
.Lzgemm_kernel_L4_M1_20:


INIT1x4 INIT1x4


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L4_M1_40
ble .Lzgemm_kernel_L4_M1_40


zgemm_kernel_L4_M1_22:
.Lzgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1301,45 +1301,45 @@ zgemm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_22
bgt .Lzgemm_kernel_L4_M1_22




zgemm_kernel_L4_M1_40:
.Lzgemm_kernel_L4_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L4_M1_100
ble .Lzgemm_kernel_L4_M1_100


zgemm_kernel_L4_M1_42:
.Lzgemm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L4_M1_42
bgt .Lzgemm_kernel_L4_M1_42


zgemm_kernel_L4_M1_100:
.Lzgemm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4




zgemm_kernel_L4_END:
.Lzgemm_kernel_L4_END:


lsl temp, origK, #6 lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2 add origPB, origPB, temp // B = B + K * 4 * 8 * 2


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt zgemm_kernel_L4_BEGIN
bgt .Lzgemm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


zgemm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lzgemm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble zgemm_kernel_L999
ble .Lzgemm_kernel_L999


tst counterJ , #2 tst counterJ , #2
ble zgemm_kernel_L1_BEGIN
ble .Lzgemm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1349,24 +1349,24 @@ zgemm_kernel_L2_BEGIN: // less than 2 left in N direction






zgemm_kernel_L2_M4_BEGIN:
.Lzgemm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble zgemm_kernel_L2_M2_BEGIN
ble .Lzgemm_kernel_L2_M2_BEGIN


zgemm_kernel_L2_M4_20:
.Lzgemm_kernel_L2_M4_20:


INIT4x2 INIT4x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble zgemm_kernel_L2_M4_40
ble .Lzgemm_kernel_L2_M4_40
.align 5 .align 5


zgemm_kernel_L2_M4_22:
.Lzgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1378,50 +1378,50 @@ zgemm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_22
bgt .Lzgemm_kernel_L2_M4_22




zgemm_kernel_L2_M4_40:
.Lzgemm_kernel_L2_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M4_100
ble .Lzgemm_kernel_L2_M4_100


zgemm_kernel_L2_M4_42:
.Lzgemm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M4_42
bgt .Lzgemm_kernel_L2_M4_42


zgemm_kernel_L2_M4_100:
.Lzgemm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


zgemm_kernel_L2_M4_END:
.Lzgemm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt zgemm_kernel_L2_M4_20
bgt .Lzgemm_kernel_L2_M4_20




zgemm_kernel_L2_M2_BEGIN:
.Lzgemm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L2_M1_BEGIN
ble .Lzgemm_kernel_L2_M1_BEGIN


zgemm_kernel_L2_M2_20:
.Lzgemm_kernel_L2_M2_20:


INIT2x2 INIT2x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble zgemm_kernel_L2_M2_40
ble .Lzgemm_kernel_L2_M2_40


zgemm_kernel_L2_M2_22:
.Lzgemm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1434,43 +1434,43 @@ zgemm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_22
bgt .Lzgemm_kernel_L2_M2_22




zgemm_kernel_L2_M2_40:
.Lzgemm_kernel_L2_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M2_100
ble .Lzgemm_kernel_L2_M2_100


zgemm_kernel_L2_M2_42:
.Lzgemm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M2_42
bgt .Lzgemm_kernel_L2_M2_42


zgemm_kernel_L2_M2_100:
.Lzgemm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


zgemm_kernel_L2_M2_END:
.Lzgemm_kernel_L2_M2_END:




zgemm_kernel_L2_M1_BEGIN:
.Lzgemm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L2_END
ble .Lzgemm_kernel_L2_END


zgemm_kernel_L2_M1_20:
.Lzgemm_kernel_L2_M1_20:


INIT1x2 INIT1x2


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble zgemm_kernel_L2_M1_40
ble .Lzgemm_kernel_L2_M1_40


zgemm_kernel_L2_M1_22:
.Lzgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1482,37 +1482,37 @@ zgemm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_22
bgt .Lzgemm_kernel_L2_M1_22




zgemm_kernel_L2_M1_40:
.Lzgemm_kernel_L2_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L2_M1_100
ble .Lzgemm_kernel_L2_M1_100


zgemm_kernel_L2_M1_42:
.Lzgemm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L2_M1_42
bgt .Lzgemm_kernel_L2_M1_42


zgemm_kernel_L2_M1_100:
.Lzgemm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2




zgemm_kernel_L2_END:
.Lzgemm_kernel_L2_END:
lsl temp, origK, #5 lsl temp, origK, #5
add origPB, origPB, temp // B = B + K * 2 * 8 * 2 add origPB, origPB, temp // B = B + K * 2 * 8 * 2


/******************************************************************************/ /******************************************************************************/


zgemm_kernel_L1_BEGIN:
.Lzgemm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble zgemm_kernel_L999 // done
ble .Lzgemm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1522,24 +1522,24 @@ zgemm_kernel_L1_BEGIN:






zgemm_kernel_L1_M4_BEGIN:
.Lzgemm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble zgemm_kernel_L1_M2_BEGIN
ble .Lzgemm_kernel_L1_M2_BEGIN


zgemm_kernel_L1_M4_20:
.Lzgemm_kernel_L1_M4_20:


INIT4x1 INIT4x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M4_40
ble .Lzgemm_kernel_L1_M4_40
.align 5 .align 5


zgemm_kernel_L1_M4_22:
.Lzgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1551,50 +1551,50 @@ zgemm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_22
bgt .Lzgemm_kernel_L1_M4_22




zgemm_kernel_L1_M4_40:
.Lzgemm_kernel_L1_M4_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M4_100
ble .Lzgemm_kernel_L1_M4_100


zgemm_kernel_L1_M4_42:
.Lzgemm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M4_42
bgt .Lzgemm_kernel_L1_M4_42


zgemm_kernel_L1_M4_100:
.Lzgemm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


zgemm_kernel_L1_M4_END:
.Lzgemm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt zgemm_kernel_L1_M4_20
bgt .Lzgemm_kernel_L1_M4_20




zgemm_kernel_L1_M2_BEGIN:
.Lzgemm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble zgemm_kernel_L1_M1_BEGIN
ble .Lzgemm_kernel_L1_M1_BEGIN


zgemm_kernel_L1_M2_20:
.Lzgemm_kernel_L1_M2_20:


INIT2x1 INIT2x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M2_40
ble .Lzgemm_kernel_L1_M2_40


zgemm_kernel_L1_M2_22:
.Lzgemm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1607,43 +1607,43 @@ zgemm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_22
bgt .Lzgemm_kernel_L1_M2_22




zgemm_kernel_L1_M2_40:
.Lzgemm_kernel_L1_M2_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M2_100
ble .Lzgemm_kernel_L1_M2_100


zgemm_kernel_L1_M2_42:
.Lzgemm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M2_42
bgt .Lzgemm_kernel_L1_M2_42


zgemm_kernel_L1_M2_100:
.Lzgemm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


zgemm_kernel_L1_M2_END:
.Lzgemm_kernel_L1_M2_END:




zgemm_kernel_L1_M1_BEGIN:
.Lzgemm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble zgemm_kernel_L1_END
ble .Lzgemm_kernel_L1_END


zgemm_kernel_L1_M1_20:
.Lzgemm_kernel_L1_M1_20:


INIT1x1 INIT1x1


mov pB, origPB mov pB, origPB
asr counterL , origK, #3 // counterL = counterL / 8 asr counterL , origK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble zgemm_kernel_L1_M1_40
ble .Lzgemm_kernel_L1_M1_40


zgemm_kernel_L1_M1_22:
.Lzgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1655,30 +1655,30 @@ zgemm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_22
bgt .Lzgemm_kernel_L1_M1_22




zgemm_kernel_L1_M1_40:
.Lzgemm_kernel_L1_M1_40:


ands counterL , origK, #7 // counterL = counterL % 8 ands counterL , origK, #7 // counterL = counterL % 8
ble zgemm_kernel_L1_M1_100
ble .Lzgemm_kernel_L1_M1_100


zgemm_kernel_L1_M1_42:
.Lzgemm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt zgemm_kernel_L1_M1_42
bgt .Lzgemm_kernel_L1_M1_42


zgemm_kernel_L1_M1_100:
.Lzgemm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




zgemm_kernel_L1_END:
.Lzgemm_kernel_L1_END:




zgemm_kernel_L999:
.Lzgemm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


+ 26
- 26
kernel/arm64/zgemv_n.S View File

@@ -364,9 +364,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS SAVE_REGS


cmp N, xzr cmp N, xzr
ble zgemv_n_kernel_L999
ble .Lzgemv_n_kernel_L999
cmp M, xzr cmp M, xzr
ble zgemv_n_kernel_L999
ble .Lzgemv_n_kernel_L999


lsl LDA, LDA, #SHZ lsl LDA, LDA, #SHZ
lsl INC_X, INC_X, #SHZ lsl INC_X, INC_X, #SHZ
@@ -375,9 +375,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
INIT INIT


cmp INC_Y, #1 cmp INC_Y, #1
bne zgemv_n_kernel_S_BEGIN
bne .Lzgemv_n_kernel_S_BEGIN


zgemv_n_kernel_F_LOOP:
.Lzgemv_n_kernel_F_LOOP:
mov A_PTR, A mov A_PTR, A
mov Y_IPTR, Y mov Y_IPTR, Y
mov Y_OPTR, Y mov Y_OPTR, Y
@@ -387,40 +387,40 @@ zgemv_n_kernel_F_LOOP:


asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
beq zgemv_n_kernel_F1
beq .Lzgemv_n_kernel_F1


zgemv_n_kernel_F4:
.Lzgemv_n_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne zgemv_n_kernel_F4
bne .Lzgemv_n_kernel_F4


zgemv_n_kernel_F1:
.Lzgemv_n_kernel_F1:


ands I, M, #3 ands I, M, #3
ble zgemv_n_kernel_F_END
ble .Lzgemv_n_kernel_F_END


zgemv_n_kernel_F10:
.Lzgemv_n_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne zgemv_n_kernel_F10
bne .Lzgemv_n_kernel_F10


zgemv_n_kernel_F_END:
.Lzgemv_n_kernel_F_END:


add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne zgemv_n_kernel_F_LOOP
bne .Lzgemv_n_kernel_F_LOOP


b zgemv_n_kernel_L999
b .Lzgemv_n_kernel_L999


zgemv_n_kernel_S_BEGIN:
.Lzgemv_n_kernel_S_BEGIN:


INIT_S INIT_S


zgemv_n_kernel_S_LOOP:
.Lzgemv_n_kernel_S_LOOP:
mov A_PTR, A mov A_PTR, A
mov Y_IPTR, Y mov Y_IPTR, Y
mov Y_OPTR, Y mov Y_OPTR, Y
@@ -430,9 +430,9 @@ zgemv_n_kernel_S_LOOP:


asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
ble zgemv_n_kernel_S1
ble .Lzgemv_n_kernel_S1


zgemv_n_kernel_S4:
.Lzgemv_n_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -440,27 +440,27 @@ zgemv_n_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne zgemv_n_kernel_S4
bne .Lzgemv_n_kernel_S4


zgemv_n_kernel_S1:
.Lzgemv_n_kernel_S1:


ands I, M, #3 ands I, M, #3
ble zgemv_n_kernel_S_END
ble .Lzgemv_n_kernel_S_END


zgemv_n_kernel_S10:
.Lzgemv_n_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne zgemv_n_kernel_S10
bne .Lzgemv_n_kernel_S10


zgemv_n_kernel_S_END:
.Lzgemv_n_kernel_S_END:


add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne zgemv_n_kernel_S_LOOP
bne .Lzgemv_n_kernel_S_LOOP


zgemv_n_kernel_L999:
.Lzgemv_n_kernel_L999:
RESTORE_REGS RESTORE_REGS


mov w0, wzr mov w0, wzr


+ 26
- 26
kernel/arm64/zgemv_t.S View File

@@ -292,9 +292,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
SAVE_REGS SAVE_REGS


cmp N, xzr cmp N, xzr
ble zgemv_t_kernel_L999
ble .Lzgemv_t_kernel_L999
cmp M, xzr cmp M, xzr
ble zgemv_t_kernel_L999
ble .Lzgemv_t_kernel_L999


lsl LDA, LDA, #SHZ lsl LDA, LDA, #SHZ
lsl INC_Y, INC_Y, #SHZ lsl INC_Y, INC_Y, #SHZ
@@ -303,9 +303,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
INIT INIT


cmp INC_X, #1 cmp INC_X, #1
bne zgemv_t_kernel_S_BEGIN
bne .Lzgemv_t_kernel_S_BEGIN


zgemv_t_kernel_F_LOOP:
.Lzgemv_t_kernel_F_LOOP:


mov A_PTR, A mov A_PTR, A
mov X_PTR, X mov X_PTR, X
@@ -314,30 +314,30 @@ zgemv_t_kernel_F_LOOP:


asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
beq zgemv_t_kernel_F1
beq .Lzgemv_t_kernel_F1


zgemv_t_kernel_F4:
.Lzgemv_t_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne zgemv_t_kernel_F4
bne .Lzgemv_t_kernel_F4


KERNEL_F4_FINALIZE KERNEL_F4_FINALIZE


zgemv_t_kernel_F1:
.Lzgemv_t_kernel_F1:


ands I, M, #3 ands I, M, #3
ble zgemv_t_kernel_F_END
ble .Lzgemv_t_kernel_F_END


zgemv_t_kernel_F10:
.Lzgemv_t_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne zgemv_t_kernel_F10
bne .Lzgemv_t_kernel_F10


zgemv_t_kernel_F_END:
.Lzgemv_t_kernel_F_END:


#if !defined(DOUBLE) #if !defined(DOUBLE)
ld1 {v4.2s}, [Y] ld1 {v4.2s}, [Y]
@@ -355,15 +355,15 @@ zgemv_t_kernel_F_END:


add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne zgemv_t_kernel_F_LOOP
bne .Lzgemv_t_kernel_F_LOOP


b zgemv_t_kernel_L999
b .Lzgemv_t_kernel_L999


zgemv_t_kernel_S_BEGIN:
.Lzgemv_t_kernel_S_BEGIN:


INIT_S INIT_S


zgemv_t_kernel_S_LOOP:
.Lzgemv_t_kernel_S_LOOP:


mov A_PTR, A mov A_PTR, A
mov X_PTR, X mov X_PTR, X
@@ -371,9 +371,9 @@ zgemv_t_kernel_S_LOOP:


asr I, M, #2 asr I, M, #2
cmp I, xzr cmp I, xzr
ble zgemv_t_kernel_S1
ble .Lzgemv_t_kernel_S1


zgemv_t_kernel_S4:
.Lzgemv_t_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -381,21 +381,21 @@ zgemv_t_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne zgemv_t_kernel_S4
bne .Lzgemv_t_kernel_S4


zgemv_t_kernel_S1:
.Lzgemv_t_kernel_S1:


ands I, M, #3 ands I, M, #3
ble zgemv_t_kernel_S_END
ble .Lzgemv_t_kernel_S_END


zgemv_t_kernel_S10:
.Lzgemv_t_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne zgemv_t_kernel_S10
bne .Lzgemv_t_kernel_S10


zgemv_t_kernel_S_END:
.Lzgemv_t_kernel_S_END:


#if !defined(DOUBLE) #if !defined(DOUBLE)
ld1 {v4.2s}, [Y] ld1 {v4.2s}, [Y]
@@ -413,9 +413,9 @@ zgemv_t_kernel_S_END:


add A, A, LDA add A, A, LDA
subs J, J, #1 subs J, J, #1
bne zgemv_t_kernel_S_LOOP
bne .Lzgemv_t_kernel_S_LOOP


zgemv_t_kernel_L999:
.Lzgemv_t_kernel_L999:
RESTORE_REGS RESTORE_REGS
mov w0, wzr mov w0, wzr
ret ret


+ 16
- 16
kernel/arm64/znrm2.S View File

@@ -226,43 +226,43 @@ KERNEL_S1_END_\@:
INIT INIT


cmp N, #0 cmp N, #0
ble nrm2_kernel_L999
ble .Lznrm2_kernel_L999


cmp INC_X, #0 cmp INC_X, #0
beq nrm2_kernel_L999
beq .Lznrm2_kernel_L999


cmp INC_X, #1 cmp INC_X, #1
bne nrm2_kernel_S_BEGIN
bne .Lznrm2_kernel_S_BEGIN


nrm2_kernel_F_BEGIN:
.Lznrm2_kernel_F_BEGIN:


asr I, N, #3 // I = N / 8 asr I, N, #3 // I = N / 8
cmp I, xzr cmp I, xzr
ble nrm2_kernel_F1
ble .Lznrm2_kernel_F1


nrm2_kernel_F8:
.Lznrm2_kernel_F8:


KERNEL_F8 KERNEL_F8


subs I, I, #1 subs I, I, #1
bne nrm2_kernel_F8
bne .Lznrm2_kernel_F8


nrm2_kernel_F1:
.Lznrm2_kernel_F1:


ands I, N, #7 ands I, N, #7
ble nrm2_kernel_L999
ble .Lznrm2_kernel_L999




nrm2_kernel_F10:
.Lznrm2_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne nrm2_kernel_F10
bne .Lznrm2_kernel_F10


b nrm2_kernel_L999
b .Lznrm2_kernel_L999


nrm2_kernel_S_BEGIN:
.Lznrm2_kernel_S_BEGIN:


INIT_S INIT_S


@@ -270,15 +270,15 @@ nrm2_kernel_S_BEGIN:


.align 5 .align 5


nrm2_kernel_S10:
.Lznrm2_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne nrm2_kernel_S10
bne .Lznrm2_kernel_S10




nrm2_kernel_L999:
.Lznrm2_kernel_L999:
fsqrt SSQ, SSQ fsqrt SSQ, SSQ
fmul SSQ, SCALE, SSQ fmul SSQ, SCALE, SSQ




+ 20
- 20
kernel/arm64/zrot.S View File

@@ -181,54 +181,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
PROLOGUE PROLOGUE


cmp N, xzr cmp N, xzr
ble rot_kernel_L999
ble .Lzrot_kernel_L999


INIT INIT


cmp INC_X, #1 cmp INC_X, #1
bne rot_kernel_S_BEGIN
bne .Lzrot_kernel_S_BEGIN
cmp INC_Y, #1 cmp INC_Y, #1
bne rot_kernel_S_BEGIN
bne .Lzrot_kernel_S_BEGIN


rot_kernel_F_BEGIN:
.Lzrot_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq rot_kernel_F1
beq .Lzrot_kernel_F1


KERNEL_INIT_F4 KERNEL_INIT_F4


rot_kernel_F4:
.Lzrot_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne rot_kernel_F4
bne .Lzrot_kernel_F4


rot_kernel_F1:
.Lzrot_kernel_F1:


ands I, N, #3 ands I, N, #3
ble rot_kernel_L999
ble .Lzrot_kernel_L999


rot_kernel_F10:
.Lzrot_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne rot_kernel_F10
bne .Lzrot_kernel_F10


mov w0, wzr mov w0, wzr
ret ret


rot_kernel_S_BEGIN:
.Lzrot_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble rot_kernel_S1
ble .Lzrot_kernel_S1


rot_kernel_S4:
.Lzrot_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -236,21 +236,21 @@ rot_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne rot_kernel_S4
bne .Lzrot_kernel_S4


rot_kernel_S1:
.Lzrot_kernel_S1:


ands I, N, #3 ands I, N, #3
ble rot_kernel_L999
ble .Lzrot_kernel_L999


rot_kernel_S10:
.Lzrot_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne rot_kernel_S10
bne .Lzrot_kernel_S10


rot_kernel_L999:
.Lzrot_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret

+ 34
- 34
kernel/arm64/zscal.S View File

@@ -215,71 +215,71 @@ zscal_begin:
mov X_COPY, X mov X_COPY, X


cmp N, xzr cmp N, xzr
ble zscal_kernel_L999
ble .Lzscal_kernel_L999


fcmp DA_R, #0.0 fcmp DA_R, #0.0
bne zscal_kernel_R_non_zero
bne .Lzscal_kernel_R_non_zero


fcmp DA_I, #0.0 fcmp DA_I, #0.0
beq zscal_kernel_RI_zero
beq .Lzscal_kernel_RI_zero


b zscal_kernel_R_zero
b .Lzscal_kernel_R_zero


zscal_kernel_R_non_zero:
.Lzscal_kernel_R_non_zero:


fcmp DA_I, #0.0 fcmp DA_I, #0.0
beq zscal_kernel_I_zero
beq .Lzscal_kernel_I_zero


/******************************************************************************* /*******************************************************************************
* A_R != 0 && A_I != 0 * A_R != 0 && A_I != 0
*******************************************************************************/ *******************************************************************************/


zscal_kernel_RI_non_zero:
.Lzscal_kernel_RI_non_zero:


INIT INIT


cmp INC_X, #1 cmp INC_X, #1
bne zscal_kernel_S_BEGIN
bne .Lzscal_kernel_S_BEGIN


zscal_kernel_F_BEGIN:
.Lzscal_kernel_F_BEGIN:


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
beq zscal_kernel_F1
beq .Lzscal_kernel_F1


KERNEL_INIT_F4 KERNEL_INIT_F4


zscal_kernel_F4:
.Lzscal_kernel_F4:


KERNEL_F4 KERNEL_F4


subs I, I, #1 subs I, I, #1
bne zscal_kernel_F4
bne .Lzscal_kernel_F4


zscal_kernel_F1:
.Lzscal_kernel_F1:


ands I, N, #3 ands I, N, #3
ble zscal_kernel_L999
ble .Lzscal_kernel_L999


zscal_kernel_F10:
.Lzscal_kernel_F10:


KERNEL_F1 KERNEL_F1


subs I, I, #1 subs I, I, #1
bne zscal_kernel_F10
bne .Lzscal_kernel_F10


mov w0, wzr mov w0, wzr
ret ret


zscal_kernel_S_BEGIN:
.Lzscal_kernel_S_BEGIN:


INIT_S INIT_S


asr I, N, #2 asr I, N, #2
cmp I, xzr cmp I, xzr
ble zscal_kernel_S1
ble .Lzscal_kernel_S1


zscal_kernel_S4:
.Lzscal_kernel_S4:


KERNEL_S1 KERNEL_S1
KERNEL_S1 KERNEL_S1
@@ -287,21 +287,21 @@ zscal_kernel_S4:
KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne zscal_kernel_S4
bne .Lzscal_kernel_S4


zscal_kernel_S1:
.Lzscal_kernel_S1:


ands I, N, #3 ands I, N, #3
ble zscal_kernel_L999
ble .Lzscal_kernel_L999


zscal_kernel_S10:
.Lzscal_kernel_S10:


KERNEL_S1 KERNEL_S1


subs I, I, #1 subs I, I, #1
bne zscal_kernel_S10
bne .Lzscal_kernel_S10


zscal_kernel_L999:
.Lzscal_kernel_L999:


mov w0, wzr mov w0, wzr
ret ret
@@ -310,7 +310,7 @@ zscal_kernel_L999:
* A_R == 0 && A_I != 0 * A_R == 0 && A_I != 0
*******************************************************************************/ *******************************************************************************/


zscal_kernel_R_zero:
.Lzscal_kernel_R_zero:
INIT_S INIT_S


#if !defined(DOUBLE) #if !defined(DOUBLE)
@@ -323,7 +323,7 @@ zscal_kernel_R_zero:
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I
#endif #endif


zscal_kernel_R_zero_1:
.Lzscal_kernel_R_zero_1:
#if !defined(DOUBLE) #if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0 ld1 {v2.2s}, [X] // X1, X0
fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0 fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0
@@ -337,7 +337,7 @@ zscal_kernel_R_zero_1:
#endif #endif
add X, X, INC_X add X, X, INC_X
subs N, N, #1 subs N, N, #1
bne zscal_kernel_R_zero_1
bne .Lzscal_kernel_R_zero_1


mov w0, wzr mov w0, wzr
ret ret
@@ -346,7 +346,7 @@ zscal_kernel_R_zero_1:
* A_R != 0 && A_I == 0 * A_R != 0 && A_I == 0
*******************************************************************************/ *******************************************************************************/


zscal_kernel_I_zero:
.Lzscal_kernel_I_zero:
INIT_S INIT_S
#if !defined(DOUBLE) #if !defined(DOUBLE)
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R
@@ -354,7 +354,7 @@ zscal_kernel_I_zero:
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R
#endif #endif


zscal_kernel_I_zero_1:
.Lzscal_kernel_I_zero_1:
#if !defined(DOUBLE) #if !defined(DOUBLE)
ld1 {v2.2s}, [X] // X1, X0 ld1 {v2.2s}, [X] // X1, X0
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0
@@ -366,7 +366,7 @@ zscal_kernel_I_zero_1:
#endif #endif
add X, X, INC_X add X, X, INC_X
subs N, N, #1 subs N, N, #1
bne zscal_kernel_I_zero_1
bne .Lzscal_kernel_I_zero_1


mov w0, wzr mov w0, wzr
ret ret
@@ -375,16 +375,16 @@ zscal_kernel_I_zero_1:
* A_R == 0 && A_I == 0 * A_R == 0 && A_I == 0
*******************************************************************************/ *******************************************************************************/


zscal_kernel_RI_zero:
.Lzscal_kernel_RI_zero:


INIT_S INIT_S


zscal_kernel_RI_zero_1:
.Lzscal_kernel_RI_zero_1:


stp DA_R, DA_I, [X] stp DA_R, DA_I, [X]
add X, X, INC_X add X, X, INC_X
subs N, N, #1 subs N, N, #1
bne zscal_kernel_RI_zero_1
bne .Lzscal_kernel_RI_zero_1


mov w0, wzr mov w0, wzr
ret ret


+ 130
- 130
kernel/arm64/ztrmm_kernel_4x4.S View File

@@ -1078,9 +1078,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
mov counterJ, origN mov counterJ, origN
asr counterJ, counterJ, #2 // J = J / 4 asr counterJ, counterJ, #2 // J = J / 4
cmp counterJ, #0 cmp counterJ, #0
ble ztrmm_kernel_L2_BEGIN
ble .Lztrmm_kernel_L2_BEGIN


ztrmm_kernel_L4_BEGIN:
.Lztrmm_kernel_L4_BEGIN:
mov pCRow0, pC mov pCRow0, pC
add pCRow1, pCRow0, LDC add pCRow1, pCRow0, LDC
add pCRow2, pCRow1, LDC add pCRow2, pCRow1, LDC
@@ -1094,15 +1094,15 @@ ztrmm_kernel_L4_BEGIN:
#endif #endif
mov pA, origPA // pA = start of A array mov pA, origPA // pA = start of A array


ztrmm_kernel_L4_M4_BEGIN:
.Lztrmm_kernel_L4_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble ztrmm_kernel_L4_M2_BEGIN
ble .Lztrmm_kernel_L4_M2_BEGIN


.align 5 .align 5
ztrmm_kernel_L4_M4_20:
.Lztrmm_kernel_L4_M4_20:


#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
mov pB, origPB mov pB, origPB
@@ -1123,7 +1123,7 @@ ztrmm_kernel_L4_M4_20:


asr counterL , tempK, #3 asr counterL , tempK, #3
cmp counterL , #2 cmp counterL , #2
blt ztrmm_kernel_L4_M4_32
blt .Lztrmm_kernel_L4_M4_32


KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1135,10 +1135,10 @@ ztrmm_kernel_L4_M4_20:
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #2 subs counterL, counterL, #2
ble ztrmm_kernel_L4_M4_22a
ble .Lztrmm_kernel_L4_M4_22a


.align 5 .align 5
ztrmm_kernel_L4_M4_22:
.Lztrmm_kernel_L4_M4_22:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1150,10 +1150,10 @@ ztrmm_kernel_L4_M4_22:
KERNEL4x4_M2 KERNEL4x4_M2


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M4_22
bgt .Lztrmm_kernel_L4_M4_22


.align 5 .align 5
ztrmm_kernel_L4_M4_22a:
.Lztrmm_kernel_L4_M4_22a:


KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1164,13 +1164,13 @@ ztrmm_kernel_L4_M4_22a:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b ztrmm_kernel_L4_M4_44
b .Lztrmm_kernel_L4_M4_44


.align 5 .align 5
ztrmm_kernel_L4_M4_32:
.Lztrmm_kernel_L4_M4_32:


tst counterL, #1 tst counterL, #1
ble ztrmm_kernel_L4_M4_40
ble .Lztrmm_kernel_L4_M4_40


KERNEL4x4_I KERNEL4x4_I
KERNEL4x4_M2 KERNEL4x4_M2
@@ -1181,26 +1181,26 @@ ztrmm_kernel_L4_M4_32:
KERNEL4x4_M1 KERNEL4x4_M1
KERNEL4x4_E KERNEL4x4_E


b ztrmm_kernel_L4_M4_44
b .Lztrmm_kernel_L4_M4_44




ztrmm_kernel_L4_M4_40:
.Lztrmm_kernel_L4_M4_40:


INIT4x4 INIT4x4


ztrmm_kernel_L4_M4_44:
.Lztrmm_kernel_L4_M4_44:


ands counterL , tempK, #7 ands counterL , tempK, #7
ble ztrmm_kernel_L4_M4_100
ble .Lztrmm_kernel_L4_M4_100


.align 5 .align 5
ztrmm_kernel_L4_M4_46:
.Lztrmm_kernel_L4_M4_46:
KERNEL4x4_SUB KERNEL4x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bne ztrmm_kernel_L4_M4_46
bne .Lztrmm_kernel_L4_M4_46


ztrmm_kernel_L4_M4_100:
.Lztrmm_kernel_L4_M4_100:


SAVE4x4 SAVE4x4


@@ -1223,20 +1223,20 @@ ztrmm_kernel_L4_M4_100:
prfm PLDL1KEEP, [pA, #64] prfm PLDL1KEEP, [pA, #64]
prfm PLDL1KEEP, [origPB] prfm PLDL1KEEP, [origPB]


ztrmm_kernel_L4_M4_END:
.Lztrmm_kernel_L4_M4_END:
subs counterI, counterI, #1 subs counterI, counterI, #1
bne ztrmm_kernel_L4_M4_20
bne .Lztrmm_kernel_L4_M4_20


ztrmm_kernel_L4_M2_BEGIN:
.Lztrmm_kernel_L4_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ztrmm_kernel_L4_END
ble .Lztrmm_kernel_L4_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ztrmm_kernel_L4_M1_BEGIN
ble .Lztrmm_kernel_L4_M1_BEGIN


ztrmm_kernel_L4_M2_20:
.Lztrmm_kernel_L4_M2_20:


INIT2x4 INIT2x4


@@ -1260,9 +1260,9 @@ ztrmm_kernel_L4_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ztrmm_kernel_L4_M2_40
ble .Lztrmm_kernel_L4_M2_40


ztrmm_kernel_L4_M2_22:
.Lztrmm_kernel_L4_M2_22:


KERNEL2x4_SUB KERNEL2x4_SUB
KERNEL2x4_SUB KERNEL2x4_SUB
@@ -1275,22 +1275,22 @@ ztrmm_kernel_L4_M2_22:
KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M2_22
bgt .Lztrmm_kernel_L4_M2_22




ztrmm_kernel_L4_M2_40:
.Lztrmm_kernel_L4_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L4_M2_100
ble .Lztrmm_kernel_L4_M2_100


ztrmm_kernel_L4_M2_42:
.Lztrmm_kernel_L4_M2_42:


KERNEL2x4_SUB KERNEL2x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M2_42
bgt .Lztrmm_kernel_L4_M2_42


ztrmm_kernel_L4_M2_100:
.Lztrmm_kernel_L4_M2_100:


SAVE2x4 SAVE2x4


@@ -1310,15 +1310,15 @@ ztrmm_kernel_L4_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ztrmm_kernel_L4_M2_END:
.Lztrmm_kernel_L4_M2_END:




ztrmm_kernel_L4_M1_BEGIN:
.Lztrmm_kernel_L4_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ztrmm_kernel_L4_END
ble .Lztrmm_kernel_L4_END


ztrmm_kernel_L4_M1_20:
.Lztrmm_kernel_L4_M1_20:


INIT1x4 INIT1x4


@@ -1342,9 +1342,9 @@ ztrmm_kernel_L4_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ztrmm_kernel_L4_M1_40
ble .Lztrmm_kernel_L4_M1_40


ztrmm_kernel_L4_M1_22:
.Lztrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
KERNEL1x4_SUB KERNEL1x4_SUB
@@ -1356,22 +1356,22 @@ ztrmm_kernel_L4_M1_22:
KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M1_22
bgt .Lztrmm_kernel_L4_M1_22




ztrmm_kernel_L4_M1_40:
.Lztrmm_kernel_L4_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L4_M1_100
ble .Lztrmm_kernel_L4_M1_100


ztrmm_kernel_L4_M1_42:
.Lztrmm_kernel_L4_M1_42:


KERNEL1x4_SUB KERNEL1x4_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L4_M1_42
bgt .Lztrmm_kernel_L4_M1_42


ztrmm_kernel_L4_M1_100:
.Lztrmm_kernel_L4_M1_100:


SAVE1x4 SAVE1x4


@@ -1392,7 +1392,7 @@ ztrmm_kernel_L4_M1_100:
#endif #endif




ztrmm_kernel_L4_END:
.Lztrmm_kernel_L4_END:


lsl temp, origK, #6 lsl temp, origK, #6
add origPB, origPB, temp // B = B + K * 4 * 8 * 2 add origPB, origPB, temp // B = B + K * 4 * 8 * 2
@@ -1402,19 +1402,19 @@ ztrmm_kernel_L4_END:
#endif #endif


subs counterJ, counterJ , #1 // j-- subs counterJ, counterJ , #1 // j--
bgt ztrmm_kernel_L4_BEGIN
bgt .Lztrmm_kernel_L4_BEGIN




/******************************************************************************/ /******************************************************************************/


ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction
.Lztrmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov counterJ , origN mov counterJ , origN
tst counterJ , #3 tst counterJ , #3
ble ztrmm_kernel_L999 // error, N was less than 4?
ble .Lztrmm_kernel_L999 // error, N was less than 4?


tst counterJ , #2 tst counterJ , #2
ble ztrmm_kernel_L1_BEGIN
ble .Lztrmm_kernel_L1_BEGIN


mov pCRow0, pC // pCRow0 = pC mov pCRow0, pC // pCRow0 = pC


@@ -1426,14 +1426,14 @@ ztrmm_kernel_L2_BEGIN: // less than 2 left in N direction


mov pA, origPA // pA = A mov pA, origPA // pA = A


ztrmm_kernel_L2_M4_BEGIN:
.Lztrmm_kernel_L2_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI,#0 cmp counterI,#0
ble ztrmm_kernel_L2_M2_BEGIN
ble .Lztrmm_kernel_L2_M2_BEGIN


ztrmm_kernel_L2_M4_20:
.Lztrmm_kernel_L2_M4_20:


INIT4x2 INIT4x2


@@ -1457,10 +1457,10 @@ ztrmm_kernel_L2_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ztrmm_kernel_L2_M4_40
ble .Lztrmm_kernel_L2_M4_40
.align 5 .align 5


ztrmm_kernel_L2_M4_22:
.Lztrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
KERNEL4x2_SUB KERNEL4x2_SUB
@@ -1472,22 +1472,22 @@ ztrmm_kernel_L2_M4_22:
KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M4_22
bgt .Lztrmm_kernel_L2_M4_22




ztrmm_kernel_L2_M4_40:
.Lztrmm_kernel_L2_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L2_M4_100
ble .Lztrmm_kernel_L2_M4_100


ztrmm_kernel_L2_M4_42:
.Lztrmm_kernel_L2_M4_42:


KERNEL4x2_SUB KERNEL4x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M4_42
bgt .Lztrmm_kernel_L2_M4_42


ztrmm_kernel_L2_M4_100:
.Lztrmm_kernel_L2_M4_100:


SAVE4x2 SAVE4x2


@@ -1507,22 +1507,22 @@ ztrmm_kernel_L2_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


ztrmm_kernel_L2_M4_END:
.Lztrmm_kernel_L2_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ztrmm_kernel_L2_M4_20
bgt .Lztrmm_kernel_L2_M4_20




ztrmm_kernel_L2_M2_BEGIN:
.Lztrmm_kernel_L2_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ztrmm_kernel_L2_END
ble .Lztrmm_kernel_L2_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ztrmm_kernel_L2_M1_BEGIN
ble .Lztrmm_kernel_L2_M1_BEGIN


ztrmm_kernel_L2_M2_20:
.Lztrmm_kernel_L2_M2_20:


INIT2x2 INIT2x2


@@ -1546,9 +1546,9 @@ ztrmm_kernel_L2_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL,#0 cmp counterL,#0
ble ztrmm_kernel_L2_M2_40
ble .Lztrmm_kernel_L2_M2_40


ztrmm_kernel_L2_M2_22:
.Lztrmm_kernel_L2_M2_22:


KERNEL2x2_SUB KERNEL2x2_SUB
KERNEL2x2_SUB KERNEL2x2_SUB
@@ -1561,22 +1561,22 @@ ztrmm_kernel_L2_M2_22:
KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M2_22
bgt .Lztrmm_kernel_L2_M2_22




ztrmm_kernel_L2_M2_40:
.Lztrmm_kernel_L2_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L2_M2_100
ble .Lztrmm_kernel_L2_M2_100


ztrmm_kernel_L2_M2_42:
.Lztrmm_kernel_L2_M2_42:


KERNEL2x2_SUB KERNEL2x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M2_42
bgt .Lztrmm_kernel_L2_M2_42


ztrmm_kernel_L2_M2_100:
.Lztrmm_kernel_L2_M2_100:


SAVE2x2 SAVE2x2


@@ -1596,15 +1596,15 @@ ztrmm_kernel_L2_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ztrmm_kernel_L2_M2_END:
.Lztrmm_kernel_L2_M2_END:




ztrmm_kernel_L2_M1_BEGIN:
.Lztrmm_kernel_L2_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ztrmm_kernel_L2_END
ble .Lztrmm_kernel_L2_END


ztrmm_kernel_L2_M1_20:
.Lztrmm_kernel_L2_M1_20:


INIT1x2 INIT1x2


@@ -1628,9 +1628,9 @@ ztrmm_kernel_L2_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL, #0 cmp counterL, #0
ble ztrmm_kernel_L2_M1_40
ble .Lztrmm_kernel_L2_M1_40


ztrmm_kernel_L2_M1_22:
.Lztrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
KERNEL1x2_SUB KERNEL1x2_SUB
@@ -1642,22 +1642,22 @@ ztrmm_kernel_L2_M1_22:
KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M1_22
bgt .Lztrmm_kernel_L2_M1_22




ztrmm_kernel_L2_M1_40:
.Lztrmm_kernel_L2_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L2_M1_100
ble .Lztrmm_kernel_L2_M1_100


ztrmm_kernel_L2_M1_42:
.Lztrmm_kernel_L2_M1_42:


KERNEL1x2_SUB KERNEL1x2_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L2_M1_42
bgt .Lztrmm_kernel_L2_M1_42


ztrmm_kernel_L2_M1_100:
.Lztrmm_kernel_L2_M1_100:


SAVE1x2 SAVE1x2


@@ -1678,7 +1678,7 @@ ztrmm_kernel_L2_M1_100:
#endif #endif




ztrmm_kernel_L2_END:
.Lztrmm_kernel_L2_END:
#if !defined(LEFT) #if !defined(LEFT)
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif
@@ -1688,11 +1688,11 @@ ztrmm_kernel_L2_END:


/******************************************************************************/ /******************************************************************************/


ztrmm_kernel_L1_BEGIN:
.Lztrmm_kernel_L1_BEGIN:


mov counterJ , origN mov counterJ , origN
tst counterJ , #1 tst counterJ , #1
ble ztrmm_kernel_L999 // done
ble .Lztrmm_kernel_L999 // done




mov pCRow0, pC // pCRow0 = C mov pCRow0, pC // pCRow0 = C
@@ -1706,14 +1706,14 @@ ztrmm_kernel_L1_BEGIN:






ztrmm_kernel_L1_M4_BEGIN:
.Lztrmm_kernel_L1_M4_BEGIN:


mov counterI, origM mov counterI, origM
asr counterI, counterI, #2 // counterI = counterI / 4 asr counterI, counterI, #2 // counterI = counterI / 4
cmp counterI, #0 cmp counterI, #0
ble ztrmm_kernel_L1_M2_BEGIN
ble .Lztrmm_kernel_L1_M2_BEGIN


ztrmm_kernel_L1_M4_20:
.Lztrmm_kernel_L1_M4_20:


INIT4x1 INIT4x1


@@ -1737,10 +1737,10 @@ ztrmm_kernel_L1_M4_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ztrmm_kernel_L1_M4_40
ble .Lztrmm_kernel_L1_M4_40
.align 5 .align 5


ztrmm_kernel_L1_M4_22:
.Lztrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
KERNEL4x1_SUB KERNEL4x1_SUB
@@ -1752,22 +1752,22 @@ ztrmm_kernel_L1_M4_22:
KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M4_22
bgt .Lztrmm_kernel_L1_M4_22




ztrmm_kernel_L1_M4_40:
.Lztrmm_kernel_L1_M4_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L1_M4_100
ble .Lztrmm_kernel_L1_M4_100


ztrmm_kernel_L1_M4_42:
.Lztrmm_kernel_L1_M4_42:


KERNEL4x1_SUB KERNEL4x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M4_42
bgt .Lztrmm_kernel_L1_M4_42


ztrmm_kernel_L1_M4_100:
.Lztrmm_kernel_L1_M4_100:


SAVE4x1 SAVE4x1


@@ -1787,22 +1787,22 @@ ztrmm_kernel_L1_M4_100:
add tempOffset, tempOffset, #4 add tempOffset, tempOffset, #4
#endif #endif


ztrmm_kernel_L1_M4_END:
.Lztrmm_kernel_L1_M4_END:


subs counterI, counterI, #1 subs counterI, counterI, #1
bgt ztrmm_kernel_L1_M4_20
bgt .Lztrmm_kernel_L1_M4_20




ztrmm_kernel_L1_M2_BEGIN:
.Lztrmm_kernel_L1_M2_BEGIN:


mov counterI, origM mov counterI, origM
tst counterI , #3 tst counterI , #3
ble ztrmm_kernel_L1_END
ble .Lztrmm_kernel_L1_END


tst counterI, #2 // counterI = counterI / 2 tst counterI, #2 // counterI = counterI / 2
ble ztrmm_kernel_L1_M1_BEGIN
ble .Lztrmm_kernel_L1_M1_BEGIN


ztrmm_kernel_L1_M2_20:
.Lztrmm_kernel_L1_M2_20:


INIT2x1 INIT2x1


@@ -1826,9 +1826,9 @@ ztrmm_kernel_L1_M2_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ztrmm_kernel_L1_M2_40
ble .Lztrmm_kernel_L1_M2_40


ztrmm_kernel_L1_M2_22:
.Lztrmm_kernel_L1_M2_22:


KERNEL2x1_SUB KERNEL2x1_SUB
KERNEL2x1_SUB KERNEL2x1_SUB
@@ -1841,22 +1841,22 @@ ztrmm_kernel_L1_M2_22:
KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M2_22
bgt .Lztrmm_kernel_L1_M2_22




ztrmm_kernel_L1_M2_40:
.Lztrmm_kernel_L1_M2_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L1_M2_100
ble .Lztrmm_kernel_L1_M2_100


ztrmm_kernel_L1_M2_42:
.Lztrmm_kernel_L1_M2_42:


KERNEL2x1_SUB KERNEL2x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M2_42
bgt .Lztrmm_kernel_L1_M2_42


ztrmm_kernel_L1_M2_100:
.Lztrmm_kernel_L1_M2_100:


SAVE2x1 SAVE2x1


@@ -1876,15 +1876,15 @@ ztrmm_kernel_L1_M2_100:
add tempOffset, tempOffset, #2 add tempOffset, tempOffset, #2
#endif #endif


ztrmm_kernel_L1_M2_END:
.Lztrmm_kernel_L1_M2_END:




ztrmm_kernel_L1_M1_BEGIN:
.Lztrmm_kernel_L1_M1_BEGIN:


tst counterI, #1 // counterI = counterI % 2 tst counterI, #1 // counterI = counterI % 2
ble ztrmm_kernel_L1_END
ble .Lztrmm_kernel_L1_END


ztrmm_kernel_L1_M1_20:
.Lztrmm_kernel_L1_M1_20:


INIT1x1 INIT1x1


@@ -1908,9 +1908,9 @@ ztrmm_kernel_L1_M1_20:


asr counterL , tempK, #3 // counterL = counterL / 8 asr counterL , tempK, #3 // counterL = counterL / 8
cmp counterL , #0 cmp counterL , #0
ble ztrmm_kernel_L1_M1_40
ble .Lztrmm_kernel_L1_M1_40


ztrmm_kernel_L1_M1_22:
.Lztrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
KERNEL1x1_SUB KERNEL1x1_SUB
@@ -1922,30 +1922,30 @@ ztrmm_kernel_L1_M1_22:
KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M1_22
bgt .Lztrmm_kernel_L1_M1_22




ztrmm_kernel_L1_M1_40:
.Lztrmm_kernel_L1_M1_40:


ands counterL , tempK, #7 // counterL = counterL % 8 ands counterL , tempK, #7 // counterL = counterL % 8
ble ztrmm_kernel_L1_M1_100
ble .Lztrmm_kernel_L1_M1_100


ztrmm_kernel_L1_M1_42:
.Lztrmm_kernel_L1_M1_42:


KERNEL1x1_SUB KERNEL1x1_SUB


subs counterL, counterL, #1 subs counterL, counterL, #1
bgt ztrmm_kernel_L1_M1_42
bgt .Lztrmm_kernel_L1_M1_42


ztrmm_kernel_L1_M1_100:
.Lztrmm_kernel_L1_M1_100:


SAVE1x1 SAVE1x1




ztrmm_kernel_L1_END:
.Lztrmm_kernel_L1_END:




ztrmm_kernel_L999:
.Lztrmm_kernel_L999:
mov x0, #0 // set return value mov x0, #0 // set return value
ldp d8, d9, [sp, #(0 * 16)] ldp d8, d9, [sp, #(0 * 16)]
ldp d10, d11, [sp, #(1 * 16)] ldp d10, d11, [sp, #(1 * 16)]


Loading…
Cancel
Save