@@ -58,43 +58,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
str TMPF, [Y], #SZ | |||
#else | |||
#if !defined(DOUBLE) | |||
ld1 {v0.2s}, [X], #8 | |||
st1 {v0.2s}, [Y], #8 | |||
ldr d0, [X], #8 | |||
str d0, [Y], #8 | |||
#else | |||
ld1 {v0.2d}, [X], #16 | |||
st1 {v0.2d}, [Y], #16 | |||
ldr q0, [X], #16 | |||
str q0, [Y], #16 | |||
#endif | |||
#endif | |||
.endm | |||
.macro KERNEL_F4 | |||
#if !defined(COMPLEX) | |||
#if !defined(DOUBLE) | |||
ld1 {v0.4s}, [X], #16 | |||
st1 {v0.4s}, [Y], #16 | |||
ldr q0, [X], #16 | |||
str q0, [Y], #16 | |||
#else // DOUBLE | |||
ld1 {v0.4s}, [X], #16 | |||
ld1 {v1.4s}, [X], #16 | |||
st1 {v0.4s}, [Y], #16 | |||
st1 {v1.4s}, [Y], #16 | |||
ldr q0, [X], #16 | |||
str q0, [Y], #16 | |||
ldr q1, [X], #16 | |||
str q1, [Y], #16 | |||
#endif | |||
#else // COMPLEX | |||
#if !defined(DOUBLE) | |||
ld1 {v0.4s}, [X], #16 | |||
ld1 {v1.4s}, [X], #16 | |||
st1 {v0.4s}, [Y], #16 | |||
st1 {v1.4s}, [Y], #16 | |||
ldr q0, [X], #16 | |||
str q0, [Y], #16 | |||
ldr q1, [X], #16 | |||
str q1, [Y], #16 | |||
#else // DOUBLE | |||
ld1 {v0.4s}, [X], #16 | |||
ld1 {v1.4s}, [X], #16 | |||
ld1 {v2.4s}, [X], #16 | |||
ld1 {v3.4s}, [X], #16 | |||
st1 {v0.4s}, [Y], #16 | |||
st1 {v1.4s}, [Y], #16 | |||
st1 {v2.4s}, [Y], #16 | |||
st1 {v3.4s}, [Y], #16 | |||
ldr q0, [X], #16 | |||
str q0, [Y], #16 | |||
ldr q1, [X], #16 | |||
str q1, [Y], #16 | |||
ldr q2, [X], #16 | |||
str q2, [Y], #16 | |||
ldr q3, [X], #16 | |||
str q3, [Y], #16 | |||
#endif | |||
#endif | |||
@@ -72,6 +72,148 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
fabs MAXF, MAXF | |||
.endm | |||
// KERNEL_F8: process one block of 8 contiguous elements starting at X.
// Loads the block (post-incrementing X), takes absolute values, reduces
// them to a single block maximum in TMPF and compares that against the
// running maximum MAXF. Depending on COND, MAXF/INDEX are either kept or
// replaced by TMPF/Z. Z is then advanced by 8.
// Note: INDEX receives the block counter Z, not the position of the
// winning lane inside the block; KERNEL_F8_FINALIZE re-reads the block
// to resolve the exact element.
// X, Z, INDEX, MAXF, TMPF and COND are defined outside this view.
// Clobbers: v2, v3 (plus v4, v5 when DOUBLE is defined), TMPF, flags.
.macro KERNEL_F8 | |||
#if !defined(DOUBLE) | |||
// 32 bytes = 8 lanes of .4s across q2/q3.
ldp q2, q3, [X], #32 | |||
fabs v2.4s, v2.4s | |||
fabs v3.4s, v3.4s | |||
// Lane-wise max of both vectors, then across-lane max into TMPF.
fmax v2.4s, v2.4s, v3.4s | |||
fmaxv TMPF, v2.4s | |||
fcmp MAXF, TMPF | |||
fcsel MAXF, MAXF, TMPF, COND | |||
csel INDEX, INDEX, Z, COND | |||
add Z, Z, #8 | |||
#else | |||
// 64 bytes = 8 lanes of .2d across q2..q5.
ldp q2, q3, [X], #32 | |||
ldp q4, q5, [X], #32 | |||
fabs v2.2d, v2.2d | |||
fabs v3.2d, v3.2d | |||
fabs v4.2d, v4.2d | |||
fabs v5.2d, v5.2d | |||
// Tree reduction of the four vectors down to v2, then pairwise max
// of the two remaining lanes into TMPF.
fmax v2.2d, v2.2d, v3.2d | |||
fmax v4.2d, v4.2d, v5.2d | |||
fmax v2.2d, v2.2d, v4.2d | |||
fmaxp TMPF, v2.2d | |||
fcmp MAXF, TMPF | |||
fcsel MAXF, MAXF, TMPF, COND | |||
csel INDEX, INDEX, Z, COND | |||
add Z, Z, #8 | |||
#endif | |||
// Prefetch hint for data 1024 bytes past the current X.
PRFM PLDL1KEEP, [X, #1024] | |||
.endm | |||
// KERNEL_F8_FINALIZE: refine INDEX from "start of the winning 8-element
// block" to the exact element within that block.
// Expects x7 to hold the base address the scan started from (the visible
// driver does "mov x7, X" before the loop). Computes the address of
// element INDEX as x7 + (INDEX - 1) * element_size, reloads 8 elements,
// takes absolute values and compares each one for equality with MAXF.
// Candidates are tested from INDEX+7 down to INDEX+0, and each match
// overwrites INDEX, so the lowest matching position is the one that
// remains. If no lane compares equal, INDEX is left unchanged.
// Clobbers: x6, x7, v2-v7, flags.
.macro KERNEL_F8_FINALIZE | |||
sub x6, INDEX, #1 | |||
#if !defined(DOUBLE) | |||
// Byte offset with 4-byte elements.
lsl x6, x6, #2 | |||
add x7, x7, x6 | |||
ldp q2, q3, [x7] | |||
fabs v2.4s, v2.4s | |||
fabs v3.4s, v3.4s | |||
// Upper half of the block (lanes of v3 = offsets +4..+7) moved into
// scalar-addressable s4..s7.
ins v4.s[0], v3.s[0] | |||
ins v5.s[0], v3.s[1] | |||
ins v6.s[0], v3.s[2] | |||
ins v7.s[0], v3.s[3] | |||
// x6 is the candidate index, counting down from INDEX+7.
add x6, INDEX, #7 | |||
fcmp MAXF, s7 | |||
csel INDEX, x6, INDEX, eq | |||
sub x6, x6, #1 | |||
fcmp MAXF, s6 | |||
csel INDEX, x6, INDEX, eq | |||
sub x6, x6, #1 | |||
fcmp MAXF, s5 | |||
csel INDEX, x6, INDEX, eq | |||
sub x6, x6, #1 | |||
fcmp MAXF, s4 | |||
csel INDEX, x6, INDEX, eq | |||
// Lower half of the block (lanes of v2 = offsets +0..+3).
ins v4.s[0], v2.s[0] | |||
ins v5.s[0], v2.s[1] | |||
ins v6.s[0], v2.s[2] | |||
ins v7.s[0], v2.s[3] | |||
sub x6, x6, #1 | |||
fcmp MAXF, s7 | |||
csel INDEX, x6, INDEX, eq | |||
sub x6, x6, #1 | |||
fcmp MAXF, s6 | |||
csel INDEX, x6, INDEX, eq | |||
sub x6, x6, #1 | |||
fcmp MAXF, s5 | |||
csel INDEX, x6, INDEX, eq | |||
sub x6, x6, #1 | |||
fcmp MAXF, s4 | |||
csel INDEX, x6, INDEX, eq | |||
#else | |||
// Start with the upper half: skip 4 elements, then scale by the
// 8-byte element size.
add x6, x6, #4 | |||
lsl x6, x6, #3 | |||
add x7, x7, x6 | |||
ldp q2, q3, [x7] | |||
fabs v2.2d, v2.2d | |||
fabs v3.2d, v3.2d | |||
// d4..d7 = offsets +4..+7 of the block.
ins v4.d[0], v2.d[0] | |||
ins v5.d[0], v2.d[1] | |||
ins v6.d[0], v3.d[0] | |||
ins v7.d[0], v3.d[1] | |||
// x6 is the candidate index, counting down from INDEX+7.
add x6, INDEX, #7 | |||
fcmp MAXF, d7 | |||
csel INDEX, x6, INDEX, eq | |||
sub x6, x6, #1 | |||
fcmp MAXF, d6 | |||
csel INDEX, x6, INDEX, eq | |||
sub x6, x6, #1 | |||
fcmp MAXF, d5 | |||
csel INDEX, x6, INDEX, eq | |||
sub x6, x6, #1 | |||
fcmp MAXF, d4 | |||
csel INDEX, x6, INDEX, eq | |||
// Step back 32 bytes to the lower half (offsets +0..+3).
sub x7, x7, #32 | |||
ldp q2, q3, [x7] | |||
fabs v2.2d, v2.2d | |||
fabs v3.2d, v3.2d | |||
ins v4.d[0], v2.d[0] | |||
ins v5.d[0], v2.d[1] | |||
ins v6.d[0], v3.d[0] | |||
ins v7.d[0], v3.d[1] | |||
sub x6, x6, #1 | |||
fcmp MAXF, d7 | |||
csel INDEX, x6, INDEX, eq | |||
sub x6, x6, #1 | |||
fcmp MAXF, d6 | |||
csel INDEX, x6, INDEX, eq | |||
sub x6, x6, #1 | |||
fcmp MAXF, d5 | |||
csel INDEX, x6, INDEX, eq | |||
sub x6, x6, #1 | |||
fcmp MAXF, d4 | |||
csel INDEX, x6, INDEX, eq | |||
#endif | |||
.endm | |||
.macro KERNEL_S1 | |||
ld1 TMPVF, [X], INC_X | |||
add Z, Z, #1 | |||
@@ -92,6 +234,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
cmp INC_X, xzr | |||
ble iamax_kernel_zero | |||
cmp INC_X, #1 | |||
bne iamax_kernel_S_BEGIN | |||
mov x7, X | |||
iamax_kernel_F_BEGIN: | |||
INIT_S | |||
subs N, N, #1 | |||
ble iamax_kernel_L999 | |||
asr I, N, #3 | |||
cmp I, xzr | |||
beq iamax_kernel_F1 | |||
add Z, Z, #1 | |||
iamax_kernel_F8: | |||
KERNEL_F8 | |||
subs I, I, #1 | |||
bne iamax_kernel_F8 | |||
KERNEL_F8_FINALIZE | |||
sub Z, Z, #1 | |||
iamax_kernel_F1: | |||
ands I, N, #7 | |||
ble iamax_kernel_L999 | |||
iamax_kernel_F10: | |||
KERNEL_S1 | |||
subs I, I, #1 | |||
bne iamax_kernel_F10 | |||
b iamax_kernel_L999 | |||
iamax_kernel_S_BEGIN: | |||
INIT_S | |||
subs N, N, #1 | |||
@@ -78,6 +78,179 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#endif | |||
.endm | |||
// KERNEL_F8: process one block of 8 two-component elements starting at X.
// Loads 16 scalars (post-incrementing X), takes absolute values and adds
// adjacent lanes pairwise, giving one value per element (presumably
// |re| + |im| of interleaved complex data - confirm against the caller).
// The 8 sums are reduced to a block maximum in TMPF and compared against
// the running maximum MAXF. Depending on COND, MAXF/INDEX are either kept
// or replaced by TMPF/Z. Z is then advanced by 8.
// Note: INDEX receives the block counter Z, not the position of the
// winning element inside the block; KERNEL_F8_FINALIZE resolves that.
// X, Z, INDEX, MAXF, TMPF and COND are defined outside this view.
// Clobbers: v2-v5 (plus v16-v19 when DOUBLE is defined), TMPF, flags.
.macro KERNEL_F8 | |||
#if !defined(DOUBLE) | |||
// 64 bytes = 16 lanes of .4s across q2..q5.
ldp q2, q3, [X], #32 | |||
ldp q4, q5, [X], #32 | |||
fabs v2.4s, v2.4s | |||
fabs v3.4s, v3.4s | |||
fabs v4.4s, v4.4s | |||
fabs v5.4s, v5.4s | |||
// Pairwise sums: v2 = elements 0..3, v3 = elements 4..7.
faddp v2.4s, v2.4s, v3.4s | |||
faddp v3.4s, v4.4s, v5.4s | |||
fmax v2.4s, v2.4s, v3.4s | |||
fmaxv TMPF, v2.4s | |||
fcmp MAXF, TMPF | |||
fcsel MAXF, MAXF, TMPF, COND | |||
csel INDEX, INDEX, Z, COND | |||
add Z, Z, #8 | |||
#else | |||
// 128 bytes = 16 lanes of .2d across q2..q5 and q16..q19.
ldp q2, q3, [X], #32 | |||
ldp q4, q5, [X], #32 | |||
ldp q16, q17, [X], #32 | |||
ldp q18, q19, [X], #32 | |||
fabs v2.2d, v2.2d | |||
fabs v3.2d, v3.2d | |||
fabs v4.2d, v4.2d | |||
fabs v5.2d, v5.2d | |||
fabs v16.2d, v16.2d | |||
fabs v17.2d, v17.2d | |||
fabs v18.2d, v18.2d | |||
fabs v19.2d, v19.2d | |||
// Pairwise sums: v2 = elements 0,1; v3 = 2,3; v4 = 4,5; v5 = 6,7.
faddp v2.2d, v2.2d, v3.2d | |||
faddp v3.2d, v4.2d, v5.2d | |||
faddp v4.2d, v16.2d, v17.2d | |||
faddp v5.2d, v18.2d, v19.2d | |||
// Tree reduction down to v2, then pairwise max into TMPF.
fmax v2.2d, v2.2d, v3.2d | |||
fmax v4.2d, v4.2d, v5.2d | |||
fmax v2.2d, v2.2d, v4.2d | |||
fmaxp TMPF, v2.2d | |||
fcmp MAXF, TMPF | |||
fcsel MAXF, MAXF, TMPF, COND | |||
csel INDEX, INDEX, Z, COND | |||
add Z, Z, #8 | |||
#endif | |||
// Prefetch hint for data 1024 bytes past the current X.
PRFM PLDL1KEEP, [X, #1024] | |||
.endm | |||
// KERNEL_F8_FINALIZE: refine INDEX from "start of the winning 8-element
// block" to the exact element within that block, for two-component
// elements.
// Expects x7 to hold the base address the scan started from (the visible
// driver does "mov x7, X" before the loop). Computes the address of
// element INDEX as x7 + (INDEX - 1) * element_size, reloads the 8
// elements, recomputes the same per-element values as KERNEL_F8 (fabs
// followed by pairwise add) and compares each for equality with MAXF.
// Candidates are tested from INDEX+7 down to INDEX+0, and each match
// overwrites INDEX, so the lowest matching position is the one that
// remains. If no value compares equal, INDEX is left unchanged.
// Clobbers: x6, x7, v2-v5, flags; additionally v7 and v16-v19 when
// DOUBLE is defined.
.macro KERNEL_F8_FINALIZE | |||
sub x6, INDEX, #1 | |||
#if !defined(DOUBLE) | |||
// Byte offset with 8 bytes per element (two 4-byte lanes).
lsl x6, x6, #3 | |||
add x7, x7, x6 | |||
ldp q2, q3, [x7] | |||
ldp q4, q5, [x7, #32] | |||
fabs v2.4s, v2.4s | |||
fabs v3.4s, v3.4s | |||
fabs v4.4s, v4.4s | |||
fabs v5.4s, v5.4s | |||
// Pairwise sums: v2 = elements +0..+3, v3 = elements +4..+7.
faddp v2.4s, v2.4s, v3.4s | |||
faddp v3.4s, v4.4s, v5.4s | |||
// Each candidate lane is copied into s4 for a scalar compare; x6 is
// the candidate index, counting down from INDEX+7.
ins v4.s[0], v3.s[3] | |||
add x6, INDEX, #7 | |||
fcmp MAXF, s4 | |||
csel INDEX, x6, INDEX, eq | |||
ins v4.s[0], v3.s[2] | |||
sub x6, x6, #1 | |||
fcmp MAXF, s4 | |||
csel INDEX, x6, INDEX, eq | |||
ins v4.s[0], v3.s[1] | |||
sub x6, x6, #1 | |||
fcmp MAXF, s4 | |||
csel INDEX, x6, INDEX, eq | |||
ins v4.s[0], v3.s[0] | |||
sub x6, x6, #1 | |||
fcmp MAXF, s4 | |||
csel INDEX, x6, INDEX, eq | |||
ins v4.s[0], v2.s[3] | |||
sub x6, x6, #1 | |||
fcmp MAXF, s4 | |||
csel INDEX, x6, INDEX, eq | |||
ins v4.s[0], v2.s[2] | |||
sub x6, x6, #1 | |||
fcmp MAXF, s4 | |||
csel INDEX, x6, INDEX, eq | |||
ins v4.s[0], v2.s[1] | |||
sub x6, x6, #1 | |||
fcmp MAXF, s4 | |||
csel INDEX, x6, INDEX, eq | |||
ins v4.s[0], v2.s[0] | |||
sub x6, x6, #1 | |||
fcmp MAXF, s4 | |||
csel INDEX, x6, INDEX, eq | |||
#else | |||
// Byte offset with 16 bytes per element (two 8-byte lanes).
lsl x6, x6, #4 | |||
add x7, x7, x6 | |||
ldp q2, q3, [x7] | |||
ldp q4, q5, [x7, #32] | |||
ldp q16, q17, [x7, #64] | |||
ldp q18, q19, [x7, #96] | |||
fabs v2.2d, v2.2d | |||
fabs v3.2d, v3.2d | |||
fabs v4.2d, v4.2d | |||
fabs v5.2d, v5.2d | |||
fabs v16.2d, v16.2d | |||
fabs v17.2d, v17.2d | |||
fabs v18.2d, v18.2d | |||
fabs v19.2d, v19.2d | |||
// Pairwise sums: v2 = elements +0,+1; v3 = +2,+3; v4 = +4,+5;
// v5 = +6,+7.
faddp v2.2d, v2.2d, v3.2d | |||
faddp v3.2d, v4.2d, v5.2d | |||
faddp v4.2d, v16.2d, v17.2d | |||
faddp v5.2d, v18.2d, v19.2d | |||
// Each candidate lane is copied into d7 for a scalar compare; x6 is
// the candidate index, counting down from INDEX+7.
ins v7.d[0], v5.d[1] | |||
add x6, INDEX, #7 | |||
fcmp MAXF, d7 | |||
csel INDEX, x6, INDEX, eq | |||
ins v7.d[0], v5.d[0] | |||
sub x6, x6, #1 | |||
fcmp MAXF, d7 | |||
csel INDEX, x6, INDEX, eq | |||
ins v7.d[0], v4.d[1] | |||
sub x6, x6, #1 | |||
fcmp MAXF, d7 | |||
csel INDEX, x6, INDEX, eq | |||
ins v7.d[0], v4.d[0] | |||
sub x6, x6, #1 | |||
fcmp MAXF, d7 | |||
csel INDEX, x6, INDEX, eq | |||
ins v7.d[0], v3.d[1] | |||
sub x6, x6, #1 | |||
fcmp MAXF, d7 | |||
csel INDEX, x6, INDEX, eq | |||
ins v7.d[0], v3.d[0] | |||
sub x6, x6, #1 | |||
fcmp MAXF, d7 | |||
csel INDEX, x6, INDEX, eq | |||
ins v7.d[0], v2.d[1] | |||
sub x6, x6, #1 | |||
fcmp MAXF, d7 | |||
csel INDEX, x6, INDEX, eq | |||
ins v7.d[0], v2.d[0] | |||
sub x6, x6, #1 | |||
fcmp MAXF, d7 | |||
csel INDEX, x6, INDEX, eq | |||
#endif | |||
.endm | |||
.macro KERNEL_S1 | |||
#if !defined(DOUBLE) | |||
ld1 {v1.2s}, [X], INC_X | |||
@@ -107,6 +280,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
cmp INC_X, xzr | |||
ble iamax_kernel_zero | |||
cmp INC_X, #1 | |||
bne iamax_kernel_S_BEGIN | |||
mov x7, X | |||
iamax_kernel_F_BEGIN: | |||
INIT_S | |||
subs N, N, #1 | |||
ble iamax_kernel_L999 | |||
asr I, N, #3 | |||
cmp I, xzr | |||
ble iamax_kernel_F1 | |||
add Z, Z, #1 | |||
iamax_kernel_F8: | |||
KERNEL_F8 | |||
subs I, I, #1 | |||
bne iamax_kernel_F8 | |||
KERNEL_F8_FINALIZE | |||
sub Z, Z, #1 | |||
iamax_kernel_F1: | |||
ands I, N, #7 | |||
ble iamax_kernel_L999 | |||
iamax_kernel_F10: | |||
KERNEL_S1 | |||
subs I, I, #1 | |||
bne iamax_kernel_F10 | |||
b iamax_kernel_L999 | |||
iamax_kernel_S_BEGIN: | |||
INIT_S | |||
subs N, N, #1 | |||