@@ -5,8 +5,8 @@ DAMAXKERNEL = amax.S | |||
CAMAXKERNEL = zamax.S | |||
ZAMAXKERNEL = zamax.S | |||
ISAMAXKERNEL = isamax.S | |||
IDAMAXKERNEL = idamax.S | |||
ISAMAXKERNEL = iamax.S | |||
IDAMAXKERNEL = iamax.S | |||
ICAMAXKERNEL = izamax.S | |||
IZAMAXKERNEL = izamax.S | |||
@@ -25,22 +25,22 @@ DCOPYKERNEL = copy.S | |||
CCOPYKERNEL = copy.S | |||
ZCOPYKERNEL = copy.S | |||
DOTKERNEL = dot.S | |||
SDOTKERNEL = dot.S | |||
DDOTKERNEL = dot.S | |||
CDOTKERNEL = zdot.S | |||
ZDOTKERNEL = zdot.S | |||
SNRM2KERNEL = snrm2.S | |||
DNRM2KERNEL = dnrm2.S | |||
CNRM2KERNEL = znrm2.S | |||
ZNRM2KERNEL = znrm2.S | |||
#SNRM2KERNEL = snrm2.S | |||
#DNRM2KERNEL = dnrm2.S | |||
#CNRM2KERNEL = znrm2.S | |||
#ZNRM2KERNEL = znrm2.S | |||
SROTKERNEL = rot.S | |||
DROTKERNEL = rot.S | |||
CROTKERNEL = zrot.S | |||
ZROTKERNEL = zrot.S | |||
SCALKERNEL = scal.S | |||
SSCALKERNEL = scal.S | |||
DSCALKERNEL = scal.S | |||
CSCALKERNEL = zscal.S | |||
ZSCALKERNEL = zscal.S | |||
@@ -181,73 +181,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
fmul v16.4s, v0.4s, v8.4s[0] | |||
OP_ii v16.4s, v1.4s, v9.4s[0] | |||
fmul v17.4s, v0.4s, v9.4s[0] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v17.4s, v17.4s | |||
eor v17.16b, v17.16b, v17.16b | |||
fmls v17.4s, v0.4s, v9.4s[0] | |||
#else | |||
fmul v17.4s, v0.4s, v9.4s[0] | |||
#endif | |||
OP_ir v17.4s, v1.4s, v8.4s[0] | |||
fmul v20.4s, v0.4s, v8.4s[1] | |||
OP_ii v20.4s, v1.4s, v9.4s[1] | |||
fmul v21.4s, v0.4s, v9.4s[1] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v21.4s, v21.4s | |||
eor v21.16b, v21.16b, v21.16b | |||
fmls v21.4s, v0.4s, v9.4s[1] | |||
#else | |||
fmul v21.4s, v0.4s, v9.4s[1] | |||
#endif | |||
OP_ir v21.4s, v1.4s, v8.4s[1] | |||
fmul v24.4s, v0.4s, v8.4s[2] | |||
OP_ii v24.4s, v1.4s, v9.4s[2] | |||
fmul v25.4s, v0.4s, v9.4s[2] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v25.4s, v25.4s | |||
eor v25.16b, v25.16b, v25.16b | |||
fmls v25.4s, v0.4s, v9.4s[2] | |||
#else | |||
fmul v25.4s, v0.4s, v9.4s[2] | |||
#endif | |||
OP_ir v25.4s, v1.4s, v8.4s[2] | |||
fmul v28.4s, v0.4s, v8.4s[3] | |||
OP_ii v28.4s, v1.4s, v9.4s[3] | |||
fmul v29.4s, v0.4s, v9.4s[3] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v29.4s, v29.4s | |||
eor v29.16b, v29.16b, v29.16b | |||
fmls v29.4s, v0.4s, v9.4s[3] | |||
#else | |||
fmul v29.4s, v0.4s, v9.4s[3] | |||
#endif | |||
OP_ir v29.4s, v1.4s, v8.4s[3] | |||
fmul v18.4s, v2.4s, v8.4s[0] | |||
OP_ii v18.4s, v3.4s, v9.4s[0] | |||
fmul v19.4s, v2.4s, v9.4s[0] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v19.4s, v19.4s | |||
eor v19.16b, v19.16b, v19.16b | |||
fmls v19.4s, v2.4s, v9.4s[0] | |||
#else | |||
fmul v19.4s, v2.4s, v9.4s[0] | |||
#endif | |||
OP_ir v19.4s, v3.4s, v8.4s[0] | |||
fmul v22.4s, v2.4s, v8.4s[1] | |||
OP_ii v22.4s, v3.4s, v9.4s[1] | |||
fmul v23.4s, v2.4s, v9.4s[1] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v23.4s, v23.4s | |||
eor v23.16b, v23.16b, v23.16b | |||
fmls v23.4s, v2.4s, v9.4s[1] | |||
#else | |||
fmul v23.4s, v2.4s, v9.4s[1] | |||
#endif | |||
OP_ir v23.4s, v3.4s, v8.4s[1] | |||
fmul v26.4s, v2.4s, v8.4s[2] | |||
OP_ii v26.4s, v3.4s, v9.4s[2] | |||
fmul v27.4s, v2.4s, v9.4s[2] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v27.4s, v27.4s | |||
eor v27.16b, v27.16b, v27.16b | |||
fmls v27.4s, v2.4s, v9.4s[2] | |||
#else | |||
fmul v27.4s, v2.4s, v9.4s[2] | |||
#endif | |||
OP_ir v27.4s, v3.4s, v8.4s[2] | |||
fmul v30.4s, v2.4s, v8.4s[3] | |||
OP_ii v30.4s, v3.4s, v9.4s[3] | |||
fmul v31.4s, v2.4s, v9.4s[3] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v31.4s, v31.4s | |||
eor v31.16b, v31.16b, v31.16b | |||
fmls v31.4s, v2.4s, v9.4s[3] | |||
#else | |||
fmul v31.4s, v2.4s, v9.4s[3] | |||
#endif | |||
OP_ir v31.4s, v3.4s, v8.4s[3] | |||
@@ -172,37 +172,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
fmul v16.4s, v0.4s, v8.4s[0] | |||
OP_ii v16.4s, v1.4s, v9.4s[0] | |||
fmul v17.4s, v0.4s, v9.4s[0] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v17.4s, v17.4s | |||
eor v17.16b, v17.16b, v17.16b | |||
fmls v17.4s, v0.4s, v9.4s[0] | |||
#else | |||
fmul v17.4s, v0.4s, v9.4s[0] | |||
#endif | |||
OP_ir v17.4s, v1.4s, v8.4s[0] | |||
fmul v20.4s, v0.4s, v8.4s[1] | |||
OP_ii v20.4s, v1.4s, v9.4s[1] | |||
fmul v21.4s, v0.4s, v9.4s[1] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v21.4s, v21.4s | |||
eor v21.16b, v21.16b, v21.16b | |||
fmls v21.4s, v0.4s, v9.4s[1] | |||
#else | |||
fmul v21.4s, v0.4s, v9.4s[1] | |||
#endif | |||
OP_ir v21.4s, v1.4s, v8.4s[1] | |||
fmul v24.4s, v0.4s, v8.4s[2] | |||
OP_ii v24.4s, v1.4s, v9.4s[2] | |||
fmul v25.4s, v0.4s, v9.4s[2] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v25.4s, v25.4s | |||
eor v25.16b, v25.16b, v25.16b | |||
fmls v25.4s, v0.4s, v9.4s[2] | |||
#else | |||
fmul v25.4s, v0.4s, v9.4s[2] | |||
#endif | |||
OP_ir v25.4s, v1.4s, v8.4s[2] | |||
fmul v28.4s, v0.4s, v8.4s[3] | |||
OP_ii v28.4s, v1.4s, v9.4s[3] | |||
fmul v29.4s, v0.4s, v9.4s[3] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v29.4s, v29.4s | |||
eor v29.16b, v29.16b, v29.16b | |||
fmls v29.4s, v0.4s, v9.4s[3] | |||
#else | |||
fmul v29.4s, v0.4s, v9.4s[3] | |||
#endif | |||
OP_ir v29.4s, v1.4s, v8.4s[3] | |||
@@ -45,16 +45,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define COND ge | |||
#endif | |||
#if !defined(DOUBLE) | |||
#define MAXF s0 | |||
#define TMPF s1 | |||
#define TMPVF {v1.s}[0] | |||
#define SZ 4 | |||
#else | |||
#define MAXF d0 | |||
#define TMPF d1 | |||
#define TMPVF {v1.d}[0] | |||
#define SZ 8 | |||
#endif | |||
/******************************************************************************/ | |||
.macro INIT_S | |||
#if !defined(DOUBLE) | |||
lsl INC_X, INC_X, #2 | |||
ld1 {v0.s}[0], [X], INC_X | |||
#else | |||
lsl INC_X, INC_X, #3 | |||
ld1 {v0.d}[0], [X], INC_X | |||
#endif | |||
mov Z, #1 | |||
mov INDEX, Z | |||
fabs MAXF, MAXF | |||
@@ -107,9 +119,8 @@ iamax_kernel_S1: | |||
iamax_kernel_S10: | |||
KERNEL_S1 | |||
subs I, I, #1 | |||
bne iamax_kernel_S10 | |||
subs I, I, #1 | |||
bne iamax_kernel_S10 | |||
iamax_kernel_L999: | |||
@@ -1,213 +0,0 @@ | |||
/******************************************************************************* | |||
Copyright (c) 2015, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
#define ASSEMBLER | |||
#include "common.h" | |||
#define N x0 /* vector length */ | |||
#define X x1 /* X vector address */ | |||
#define INC_X x2 /* X stride */ | |||
#define INDEX x3 /* index of max/min value */ | |||
#define Z x4 /* vector index */ | |||
#define I x5 /* loop variable */ | |||
#define X_COPY x6 /* copy of X address */ | |||
#define MAXF_Z x7 | |||
/******************************************************************************* | |||
* Macro definitions | |||
*******************************************************************************/ | |||
#define MAXF s5 | |||
#define TMPF s6 | |||
#define TMPVF {v6.s}[0] | |||
#define SZ 4 | |||
/******************************************************************************/ | |||
/* INIT_F1: seed the running maximum from one element.
 * Loads one SZ-byte value from [X] into MAXF (post-incrementing X by SZ),
 * replaces it with its absolute value, and sets both the running element
 * counter Z and the result INDEX to 1. */
.macro INIT_F1 | |||
ldr MAXF, [X], #SZ | |||
mov Z, #1 | |||
mov INDEX, Z | |||
fabs MAXF, MAXF | |||
.endm | |||
/* KERNEL_F1: consume one contiguous element.
 * Z is incremented BEFORE it is used as the candidate index, so on entry
 * Z must hold the index of the element consumed previously.
 * MAXF/INDEX are kept when |x| <= MAXF (condition "le"); otherwise they are
 * replaced by |x| and Z, so on ties the earlier index is retained. */
.macro KERNEL_F1 | |||
ldr TMPF, [X], #SZ | |||
add Z, Z, #1 | |||
fabs TMPF, TMPF | |||
fcmp TMPF, MAXF | |||
fcsel MAXF, MAXF, TMPF, le | |||
csel INDEX, INDEX, Z, le | |||
.endm | |||
/* INIT_F4: seed the running maximum from the first group of four floats.
 * Loads 16 bytes from [X] (post-increment), takes lane-wise absolute values
 * and reduces them to MAXF with fmaxv.
 * Z is set to 5 and MAXF_Z to 1: KERNEL_F4 stores Z into MAXF_Z as the
 * index of the first element of a group, so after this macro Z names the
 * first element of the NEXT group and MAXF_Z the group holding the maximum. */
.macro INIT_F4 | |||
ld1 {v0.4s}, [X], #16 | |||
fabs v0.4s, v0.4s | |||
fmaxv MAXF, v0.4s | |||
mov Z, #5 | |||
mov MAXF_Z, #1 | |||
.endm | |||
/* KERNEL_F4: consume one group of four contiguous floats.
 * TMPF receives the largest absolute value of the group. If it is strictly
 * greater than MAXF, MAXF takes that value and MAXF_Z takes Z (the index of
 * the group's first element); on "le" both are left unchanged.
 * Z is advanced by 4 afterwards, so on exit it again names the first element
 * of the next, not yet loaded, group. Also prefetches [X + 512]. */
.macro KERNEL_F4 | |||
ld1 {v0.4s}, [X], #16 | |||
fabs v0.4s, v0.4s | |||
fmaxv TMPF, v0.4s | |||
PRFM PLDL1KEEP, [X, #512] | |||
fcmp TMPF, MAXF | |||
fcsel MAXF, MAXF, TMPF, le | |||
csel MAXF_Z, MAXF_Z, Z, le | |||
add Z, Z, #4 | |||
.endm | |||
/* KERNEL_F4_FINALIZE: turn the group index in MAXF_Z into an element index.
 * INDEX starts at MAXF_Z (first element of the winning group). The group is
 * then re-read through X_COPY and each of its first three elements is
 * compared against MAXF; INDEX is bumped once per mismatch, so if none of
 * the three matches INDEX ends at MAXF_Z + 3.
 * Modifies INDEX, MAXF_Z, X_COPY, TMPF and the flags; Z is not touched.
 * NOTE: KERNEL_F4_FINALIZE_DONE is an ordinary (non macro-local) label, so
 * expanding this macro more than once in a file would define it twice. */
.macro KERNEL_F4_FINALIZE | |||
mov INDEX, MAXF_Z | |||
/* Byte offset of the winning group: (MAXF_Z - 1) * 4. The shift by 2
 * hard-codes a 4-byte element rather than using SZ. */
sub MAXF_Z, MAXF_Z, #1 | |||
lsl MAXF_Z, MAXF_Z, #2 | |||
add X_COPY, X_COPY, MAXF_Z | |||
/* Element 0 of the group. */
ldr TMPF, [X_COPY], #SZ | |||
fabs TMPF, TMPF | |||
fcmp TMPF, MAXF | |||
beq KERNEL_F4_FINALIZE_DONE | |||
add INDEX, INDEX, #1 | |||
/* Element 1 of the group. */
ldr TMPF, [X_COPY], #SZ | |||
fabs TMPF, TMPF | |||
fcmp TMPF, MAXF | |||
beq KERNEL_F4_FINALIZE_DONE | |||
add INDEX, INDEX, #1 | |||
/* Element 2 of the group; if it does not match either, element 3 is
 * selected without being loaded. */
ldr TMPF, [X_COPY], #SZ | |||
fabs TMPF, TMPF | |||
fcmp TMPF, MAXF | |||
beq KERNEL_F4_FINALIZE_DONE | |||
add INDEX, INDEX, #1 | |||
KERNEL_F4_FINALIZE_DONE: | |||
.endm | |||
/* INIT_S: seed the running maximum for the strided path.
 * INC_X is first converted from elements to bytes (shift left by 2, i.e.
 * 4-byte elements). One element is loaded into lane 0 of v6 (TMPVF, the
 * same register as TMPF = s6) with X post-incremented by INC_X, and MAXF
 * receives its absolute value. Z and INDEX are both set to 1. */
.macro INIT_S | |||
lsl INC_X, INC_X, #2 | |||
ld1 TMPVF, [X], INC_X | |||
mov Z, #1 | |||
mov INDEX, Z | |||
fabs MAXF, TMPF | |||
.endm | |||
/* KERNEL_S1: consume one element on the strided path.
 * Loads lane 0 of v6 from [X] and advances X by INC_X (expected to be in
 * bytes already, as INIT_S rescales it). Z is incremented before being used
 * as the candidate index. MAXF/INDEX are kept on "le" and replaced by
 * |x| / Z otherwise, so ties keep the earlier index. */
.macro KERNEL_S1 | |||
ld1 TMPVF, [X], INC_X | |||
add Z, Z, #1 | |||
fabs TMPF, TMPF | |||
fcmp TMPF, MAXF | |||
fcsel MAXF, MAXF, TMPF, le | |||
csel INDEX, INDEX, Z, le | |||
.endm | |||
/******************************************************************************* | |||
* End of macro definitions | |||
*******************************************************************************/ | |||
/*
 * Index-of-maximum-absolute-value kernel (single precision: s registers,
 * SZ = 4).
 *
 * In:   N     (x0)  number of elements
 *       X     (x1)  address of the first element
 *       INC_X (x2)  stride between elements, in elements
 * Out:  x0 = 1-based index of the first element with the largest absolute
 *            value, or 0 when N <= 0 or INC_X <= 0.
 *
 * Two paths:
 *   - INC_X == 1: groups of four via INIT_F4/KERNEL_F4, then the exact
 *     position inside the winning group via KERNEL_F4_FINALIZE, then the
 *     N & 3 leftover elements via KERNEL_F1.
 *   - otherwise: INIT_S followed by KERNEL_S1, unrolled four times.
 */
	PROLOGUE

	/* Nothing to scan, or a non-positive stride: report index 0. */
	cmp	N, xzr
	ble	iamax_kernel_zero
	cmp	INC_X, xzr
	ble	iamax_kernel_zero

	PRFM	PLDL1KEEP, [X]
	mov	X_COPY, X		/* KERNEL_F4_FINALIZE re-reads from here */

	cmp	INC_X, #1
	bne	iamax_kernel_S_BEGIN

iamax_kernel_F_BEGIN:

	asr	I, N, #2		/* I = number of complete groups of 4 */
	cmp	I, xzr
	beq	iamax_kernel_F1_INIT	/* fewer than 4 elements in total */

	INIT_F4
	subs	I, I, #1
	beq	iamax_kernel_F4_FINALIZE

iamax_kernel_F4:

	KERNEL_F4
	subs	I, I, #1
	bne	iamax_kernel_F4

iamax_kernel_F4_FINALIZE:

	KERNEL_F4_FINALIZE

	/*
	 * The 4-wide macros leave Z at the index of the NEXT unread element,
	 * whereas KERNEL_F1 increments Z before using it and therefore needs
	 * the index of the LAST element consumed. Rebase Z here; without this
	 * every tail element would be reported one position too high.
	 */
	sub	Z, Z, #1

iamax_kernel_F1:

	/* ands clears V, so "le" is taken exactly when the result is zero. */
	ands	I, N, #3
	ble	iamax_kernel_L999

iamax_kernel_F10:

	KERNEL_F1
	subs	I, I, #1
	bne	iamax_kernel_F10

	b	iamax_kernel_L999

iamax_kernel_F1_INIT:

	/* N is 1..3 here: consume the first element, then scan the rest. */
	INIT_F1
	subs	N, N, #1
	b	iamax_kernel_F1

iamax_kernel_S_BEGIN:

	INIT_S

	subs	N, N, #1		/* first element already consumed */
	ble	iamax_kernel_L999

	asr	I, N, #2
	cmp	I, xzr
	ble	iamax_kernel_S1

iamax_kernel_S4:

	KERNEL_S1
	KERNEL_S1
	KERNEL_S1
	KERNEL_S1

	subs	I, I, #1
	bne	iamax_kernel_S4

iamax_kernel_S1:

	ands	I, N, #3
	ble	iamax_kernel_L999

iamax_kernel_S10:

	KERNEL_S1
	subs	I, I, #1
	bne	iamax_kernel_S10

iamax_kernel_L999:

	mov	x0, INDEX
	ret

iamax_kernel_zero:

	mov	x0, xzr
	ret

	EPILOGUE
@@ -59,10 +59,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
// INIT_F1: place the negation of S into lane 1 of v1.
// In each precision branch the value is produced in s2/d2 and then inserted
// into the upper lane of v1 (v1.s[1] or v1.d[1]).
// NOTE(review): each branch contains both an fneg into s2/d2 and an
// eor + fsub sequence. The eor zeroes all of v2, discarding the fneg result,
// and the fsub then computes 0 - S. This looks like the before and after
// lines of a patch shown together - confirm which single sequence is intended.
.macro INIT_F1 | |||
#if !defined(DOUBLE) | |||
fneg s2, S | |||
eor v2.16b, v2.16b, v2.16b | |||
fsub s2, s2, S | |||
ins v1.s[1], v2.s[0] // [-S, S] | |||
#else | |||
fneg d2, S | |||
eor v2.16b, v2.16b, v2.16b | |||
fsub d2, d2, S | |||
ins v1.d[1], v2.d[0] // [-S, S] | |||
#endif | |||
.endm | |||
@@ -43,14 +43,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#if !defined(DOUBLE) | |||
#define DA_R s0 /* scale input value */ | |||
#define DA_I s1 /* scale input value */ | |||
#define TMPX v2.2s | |||
#define TMPY v3.2s | |||
#define SZ 4 | |||
#else | |||
#define DA_R d0 /* scale input value */ | |||
#define DA_I d1 /* scale input value */ | |||
#define TMPX v2.2d | |||
#define TMPY v3.2d | |||
#define SZ 8 | |||
#endif | |||
@@ -61,22 +57,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#if !defined(CONJ) | |||
#if !defined(DOUBLE) | |||
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R | |||
fneg s2, DA_I | |||
eor v2.16b, v2.16b, v2.16b | |||
fsub s2, s2, DA_I | |||
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I | |||
ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I | |||
#else | |||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R | |||
fneg d2, DA_I | |||
eor v2.16b, v2.16b, v2.16b | |||
fsub d2, d2, DA_I | |||
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I | |||
ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I | |||
#endif | |||
#else | |||
#if !defined(DOUBLE) | |||
fneg s2, DA_R | |||
eor v2.16b, v2.16b, v2.16b | |||
fsub s2, s2, DA_R | |||
ins v0.s[1], v2.s[0] // v0 = -DA_R, DA_R | |||
ins v1.s[1], v1.s[0] // v1 = DA_I, DA_I | |||
#else | |||
fneg d2, DA_R | |||
eor v2.16b, v2.16b, v2.16b | |||
fsub d2, d2, DA_R | |||
ins v0.d[1], v2.d[0] // v0 = -DA_R, DA_R | |||
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I | |||
#endif | |||
@@ -111,9 +111,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
// KERNEL_INIT_F4: build the broadcast multiplier registers v16 and v17.
//   single precision: every lane of v16 = v0.s[0];
//                     every lane of v17 = v1.s[1] (no CONJ) or v1.s[0] (CONJ)
//   double precision: both lanes of v16 = v0.d[0];
//                     both lanes of v17 = v1.d[1] (no CONJ) or v1.d[0] (CONJ)
// NOTE(review): the single-precision branch also duplicates the low 64 bits
// of v0 and v1 into their own upper halves before v16/v17 are built. That may
// be a leftover from an earlier version of the macro shown alongside the new
// lines - confirm whether anything still depends on it.
.macro KERNEL_INIT_F4 | |||
#if !defined(DOUBLE) | |||
// Replicate the lower 2 floats into the upper 2 slots | |||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R | |||
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I | |||
// Splat v0.s[0] across all four lanes of v16.
ins v16.s[0], v0.s[0] | |||
ins v16.s[1], v16.s[0] | |||
ins v16.d[1], v16.d[0] | |||
// Pick the source lane of v1 by CONJ, then splat it across v17.
#if !defined(CONJ) | |||
ins v17.s[0], v1.s[1] | |||
#else | |||
ins v17.s[0], v1.s[0] | |||
#endif | |||
ins v17.s[1], v17.s[0] | |||
ins v17.d[1], v17.d[0] | |||
#else //DOUBLE | |||
// Splat v0.d[0] across both lanes of v16.
ins v16.d[0], v0.d[0] | |||
ins v16.d[1], v16.d[0] | |||
// Pick the source lane of v1 by CONJ, then splat it across v17.
#if !defined(CONJ) | |||
ins v17.d[0], v1.d[1] | |||
#else | |||
ins v17.d[0], v1.d[0] | |||
#endif | |||
ins v17.d[1], v17.d[0] | |||
#endif | |||
.endm | |||
@@ -121,55 +137,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.macro KERNEL_F4 | |||
#if !defined(DOUBLE) | |||
ld1 {v2.4s,v3.4s}, [X], #32 // V2 = X[3], X[2], X[1], X[0] | |||
// V3 = X[7], X[6], X[5], X[4] | |||
ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1] | |||
ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1] | |||
ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1] | |||
ld2 {v2.4s, v3.4s}, [X], #32 | |||
ld2 {v4.4s, v5.4s}, [Y_COPY], #32 | |||
ld1 {v4.4s,v5.4s}, [Y] // V4 = Y[3], Y[2], Y[1], Y[0] | |||
// V5 = Y[7], Y[6], Y[5], Y[4] | |||
ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5] | |||
ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5] | |||
ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5] | |||
fmla v4.4s, v2.4s, v16.4s | |||
#if !defined(CONJ) | |||
fmls v4.4s, v3.4s, v17.4s | |||
#else | |||
fmla v4.4s, v3.4s, v17.4s | |||
#endif | |||
fmla v4.4s, v0.4s, v2.4s // Y[iy] += DA_R * X[ix] | |||
// Y[iy+1] += +-DA_R * X[ix+1] | |||
fmla v4.4s, v1.4s, v6.4s // Y[iy] += +-DA_I * X[ix+1] | |||
// Y[iy+1] += DA_I * X[ix] | |||
st1 {v4.4s}, [Y], #16 | |||
#if !defined(CONJ) | |||
fmla v5.4s, v2.4s, v17.4s | |||
#else | |||
fmls v5.4s, v2.4s, v17.4s | |||
#endif | |||
fmla v5.4s, v3.4s, v16.4s | |||
fmla v5.4s, v0.4s, v3.4s // Y[iy] += DA_R * X[ix] | |||
fmla v5.4s, v1.4s, v7.4s // Y[iy] += +-DA_I * X[ix+1] | |||
// Y[iy+1] += +-DA_R * X[ix+1] | |||
// Y[iy+1] += DA_I * X[ix] | |||
st1 {v5.4s}, [Y], #16 | |||
st2 {v4.4s, v5.4s}, [Y], #32 | |||
#else // DOUBLE | |||
ld1 {v2.2d,v3.2d}, [X], #32 // CX0, CX1, CX2, CX3 | |||
ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1] | |||
ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1] | |||
ld2 {v2.2d, v3.2d}, [X], #32 | |||
ld2 {v4.2d, v5.2d}, [Y_COPY], #32 | |||
ld1 {v4.2d,v5.2d}, [X], #32 // CX0, CX1, CX2, CX3 | |||
ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1] | |||
ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1] | |||
ld1 {v16.2d,v17.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3 | |||
fmla v4.2d, v2.2d, v16.2d | |||
#if !defined(CONJ) | |||
fmls v4.2d, v3.2d, v17.2d | |||
#else | |||
fmla v4.2d, v3.2d, v17.2d | |||
#endif | |||
#if !defined(CONJ) | |||
fmla v5.2d, v2.2d, v17.2d | |||
#else | |||
fmls v5.2d, v2.2d, v17.2d | |||
#endif | |||
fmla v5.2d, v3.2d, v16.2d | |||
fmla v16.2d, v0.2d, v2.2d | |||
fmla v17.2d, v0.2d, v3.2d | |||
st2 {v4.2d, v5.2d}, [Y], #32 | |||
ld1 {v18.2d,v19.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3 | |||
ld2 {v18.2d, v19.2d}, [X], #32 | |||
ld2 {v20.2d, v21.2d}, [Y_COPY], #32 | |||
fmla v16.2d, v1.2d, v20.2d | |||
fmla v17.2d, v1.2d, v21.2d | |||
st1 {v16.2d,v17.2d}, [Y], #32 | |||
fmla v20.2d, v18.2d, v16.2d | |||
#if !defined(CONJ) | |||
fmls v20.2d, v19.2d, v17.2d | |||
#else | |||
fmla v20.2d, v19.2d, v17.2d | |||
#endif | |||
#if !defined(CONJ) | |||
fmla v21.2d, v18.2d, v17.2d | |||
#else | |||
fmls v21.2d, v18.2d, v17.2d | |||
#endif | |||
fmla v21.2d, v19.2d, v16.2d | |||
fmla v18.2d, v0.2d, v4.2d | |||
fmla v19.2d, v0.2d, v5.2d | |||
fmla v18.2d, v1.2d, v22.2d | |||
fmla v19.2d, v1.2d, v23.2d | |||
st1 {v18.2d,v19.2d}, [Y], #32 | |||
st2 {v20.2d, v21.2d}, [Y], #32 | |||
#endif | |||
PRFM PLDL1KEEP, [X, #512] | |||
PRFM PLDL1KEEP, [Y, #512] | |||
@@ -184,73 +184,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
fmul v16.2d, v0.2d, v8.2d[0] | |||
OP_ii v16.2d, v1.2d, v9.2d[0] | |||
fmul v17.2d, v0.2d, v9.2d[0] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v17.2d, v17.2d | |||
eor v17.16b, v17.16b, v17.16b | |||
fmls v17.2d, v0.2d, v9.2d[0] | |||
#else | |||
fmul v17.2d, v0.2d, v9.2d[0] | |||
#endif | |||
OP_ir v17.2d, v1.2d, v8.2d[0] | |||
fmul v18.2d, v2.2d, v8.2d[0] | |||
OP_ii v18.2d, v3.2d, v9.2d[0] | |||
fmul v19.2d, v2.2d, v9.2d[0] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v19.2d, v19.2d | |||
eor v19.16b, v19.16b, v19.16b | |||
fmls v19.2d, v2.2d, v9.2d[0] | |||
#else | |||
fmul v19.2d, v2.2d, v9.2d[0] | |||
#endif | |||
OP_ir v19.2d, v3.2d, v8.2d[0] | |||
fmul v20.2d, v0.2d, v8.2d[1] | |||
OP_ii v20.2d, v1.2d, v9.2d[1] | |||
fmul v21.2d, v0.2d, v9.2d[1] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v21.2d, v21.2d | |||
eor v21.16b, v21.16b, v21.16b | |||
fmls v21.2d, v0.2d, v9.2d[1] | |||
#else | |||
fmul v21.2d, v0.2d, v9.2d[1] | |||
#endif | |||
OP_ir v21.2d, v1.2d, v8.2d[1] | |||
fmul v22.2d, v2.2d, v8.2d[1] | |||
OP_ii v22.2d, v3.2d, v9.2d[1] | |||
fmul v23.2d, v2.2d, v9.2d[1] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v23.2d, v23.2d | |||
eor v23.16b, v23.16b, v23.16b | |||
fmls v23.2d, v2.2d, v9.2d[1] | |||
#else | |||
fmul v23.2d, v2.2d, v9.2d[1] | |||
#endif | |||
OP_ir v23.2d, v3.2d, v8.2d[1] | |||
fmul v24.2d, v0.2d, v10.2d[0] | |||
OP_ii v24.2d, v1.2d, v11.2d[0] | |||
fmul v25.2d, v0.2d, v11.2d[0] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v25.2d, v25.2d | |||
eor v25.16b, v25.16b, v25.16b | |||
fmls v25.2d, v0.2d, v11.2d[0] | |||
#else | |||
fmul v25.2d, v0.2d, v11.2d[0] | |||
#endif | |||
OP_ir v25.2d, v1.2d, v10.2d[0] | |||
fmul v26.2d, v2.2d, v10.2d[0] | |||
OP_ii v26.2d, v3.2d, v11.2d[0] | |||
fmul v27.2d, v2.2d, v11.2d[0] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v27.2d, v27.2d | |||
eor v27.16b, v27.16b, v27.16b | |||
fmls v27.2d, v2.2d, v11.2d[0] | |||
#else | |||
fmul v27.2d, v2.2d, v11.2d[0] | |||
#endif | |||
OP_ir v27.2d, v3.2d, v10.2d[0] | |||
fmul v28.2d, v0.2d, v10.2d[1] | |||
OP_ii v28.2d, v1.2d, v11.2d[1] | |||
fmul v29.2d, v0.2d, v11.2d[1] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v29.2d, v29.2d | |||
eor v29.16b, v29.16b, v29.16b | |||
fmls v29.2d, v0.2d, v11.2d[1] | |||
#else | |||
fmul v29.2d, v0.2d, v11.2d[1] | |||
#endif | |||
OP_ir v29.2d, v1.2d, v10.2d[1] | |||
fmul v30.2d, v2.2d, v10.2d[1] | |||
OP_ii v30.2d, v3.2d, v11.2d[1] | |||
fmul v31.2d, v2.2d, v11.2d[1] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v31.2d, v31.2d | |||
eor v31.16b, v31.16b, v31.16b | |||
fmls v31.2d, v2.2d, v11.2d[1] | |||
#else | |||
fmul v31.2d, v2.2d, v11.2d[1] | |||
#endif | |||
OP_ir v31.2d, v3.2d, v10.2d[1] | |||
@@ -110,15 +110,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
/******* INIT FOR F1 AND S1 LOOP ******/ | |||
#if !defined(DOUBLE) | |||
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) | |||
fneg s2, ALPHA_I | |||
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) | |||
eor v2.16b, v2.16b, v2.16b | |||
fsub s2, s2, ALPHA_I | |||
ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA) | |||
#if !defined(XCONJ) | |||
ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA) | |||
#endif | |||
#else | |||
ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA) | |||
fneg d2, ALPHA_I | |||
ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA) | |||
eor v2.16b, v2.16b, v2.16b | |||
fsub d2, d2, ALPHA_I | |||
ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA) | |||
#if !defined(XCONJ) | |||
ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA) | |||
@@ -156,8 +158,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#else | |||
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | |||
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] | |||
fmul v12.4s, v9.4s, v8.4s // [R(X) * I(ALPHA)] | |||
fneg v12.4s, v12.4s // [- R(X) * I(ALPHA)] | |||
eor v12.16b, v12.16b, v12.16b | |||
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] | |||
fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] | |||
#endif | |||
#endif // CONJ | |||
@@ -170,24 +172,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
ins v3.s[0], v2.s[1] | |||
#if !defined(CONJ) | |||
#if !defined(XCONJ) | |||
fneg s4, s3 | |||
eor v4.16b, v4.16b, v4.16b | |||
fsub s4, s4, s3 | |||
ins v3.s[1], v4.s[0] | |||
ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)] | |||
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] | |||
#else | |||
fneg s4, s3 | |||
eor v4.16b, v4.16b, v4.16b | |||
fsub s4, s4, s3 | |||
ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)] | |||
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] | |||
#endif | |||
#else // CONJ | |||
#if !defined(XCONJ) | |||
ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)] | |||
fneg s4, s2 | |||
eor v4.16b, v4.16b, v4.16b | |||
fsub s4, s4, s2 | |||
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] | |||
#else | |||
fneg s3, s3 | |||
eor v4.16b, v4.16b, v4.16b | |||
fsub s3, s4, s3 | |||
ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)] | |||
fneg s4, s2 | |||
eor v4.16b, v4.16b, v4.16b | |||
fsub s4, s4, s2 | |||
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] | |||
#endif | |||
#endif // CONJ | |||
@@ -220,8 +227,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#else | |||
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | |||
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] | |||
fmul v12.2d, v9.2d, v8.2d // [R(X) * I(ALPHA)] | |||
fneg v12.2d, v12.2d // [- R(X) * I(ALPHA)] | |||
eor v12.16b, v12.16b, v12.16b | |||
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] | |||
fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] | |||
#endif | |||
#endif // CONJ | |||
@@ -234,24 +241,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
ins v3.d[0], v2.d[1] // I(TEMP) | |||
#if !defined(CONJ) | |||
#if !defined(XCONJ) | |||
fneg d4, d3 // -I(TEMP) | |||
eor v4.16b, v4.16b, v4.16b | |||
fsub d4, d4, d3 | |||
ins v3.d[1], v4.d[0] | |||
ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)] | |||
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] | |||
#else | |||
fneg d4, d3 // -I(TEMP) | |||
eor v4.16b, v4.16b, v4.16b | |||
fsub d4, d4, d3 | |||
ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)] | |||
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] | |||
#endif | |||
#else // CONJ | |||
#if !defined(XCONJ) | |||
ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)] | |||
fneg d4, d2 // -R(TEMP) | |||
eor v4.16b, v4.16b, v4.16b | |||
fsub d4, d4, d2 | |||
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] | |||
#else | |||
fneg d3, d3 // -I(TEMP) | |||
eor v4.16b, v4.16b, v4.16b | |||
fsub d3, d4, d3 | |||
ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)] | |||
fneg d4, d2 // -R(TEMP) | |||
eor v4.16b, v4.16b, v4.16b | |||
fsub d4, d4, d2 | |||
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] | |||
#endif | |||
#endif // CONJ | |||
@@ -96,22 +96,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#if !defined(XCONJ) | |||
#if !defined(DOUBLE) | |||
ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R | |||
fneg s2, ALPHA_I | |||
eor v2.16b, v2.16b, v2.16b | |||
fsub s2, s2, ALPHA_I | |||
ins v1.s[1], v2.s[0] | |||
ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I | |||
#else | |||
ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R | |||
fneg d2, ALPHA_I | |||
eor v2.16b, v2.16b, v2.16b | |||
fsub d2, d2, ALPHA_I | |||
ins v1.d[1], v2.d[0] | |||
ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I | |||
#endif | |||
#else // XCONJ | |||
#if !defined(DOUBLE) | |||
fneg s2, ALPHA_R | |||
eor v2.16b, v2.16b, v2.16b | |||
fsub s2, s2, ALPHA_R | |||
ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R | |||
ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I | |||
#else | |||
fneg d2, ALPHA_R | |||
eor v2.16b, v2.16b, v2.16b | |||
fsub d2, d2, ALPHA_R | |||
ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R | |||
ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I | |||
#endif | |||
@@ -136,89 +140,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
ld2 {v11.4s, v12.4s}, [X_PTR], #32 | |||
ld2 {v13.4s, v14.4s}, [A_PTR], #32 | |||
#if !defined(CONJ) | |||
#if !defined(XCONJ) | |||
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | |||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||
fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] | |||
fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] | |||
fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] | |||
#else | |||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||
fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] | |||
fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] | |||
fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R] | |||
#endif | |||
#else // CONJ | |||
#if !defined(XCONJ) | |||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||
fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] | |||
fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] | |||
fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] | |||
#else | |||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||
fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] | |||
fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] | |||
fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R] | |||
#endif | |||
#endif // CONJ | |||
#else // DOUBLE | |||
ld2 {v11.2d, v12.2d}, [X_PTR], #32 | |||
ld2 {v13.2d, v14.2d}, [A_PTR], #32 | |||
prfm PLDL1STRM, [X_PTR, #512] | |||
#if !defined(CONJ) | |||
#if !defined(XCONJ) | |||
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | |||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||
fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] | |||
fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] | |||
fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] | |||
#else | |||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||
fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] | |||
fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] | |||
fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R] | |||
#endif | |||
#else // CONJ | |||
#if !defined(XCONJ) | |||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||
fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] | |||
fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] | |||
fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] | |||
#else | |||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||
fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] | |||
fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] | |||
fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R] | |||
#endif | |||
#endif // CONJ | |||
ld2 {v17.2d, v18.2d}, [X_PTR], #32 | |||
ld2 {v19.2d, v20.2d}, [A_PTR], #32 | |||
prfm PLDL1STRM, [A_PTR, #512] | |||
#if !defined(CONJ) | |||
#if !defined(XCONJ) | |||
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | |||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||
fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] | |||
fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] | |||
fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | |||
#else | |||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||
fmla v15.2d, v18.2d, v20.2d // [- I(X) * A_I] | |||
fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] | |||
fmls v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | |||
#endif | |||
#else // CONJ | |||
#if !defined(XCONJ) | |||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||
fmla v15.2d, v18.2d, v20.2d // [- I(X) * A_I] | |||
fmls v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] | |||
fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | |||
#else | |||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||
fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] | |||
fmls v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] | |||
fmls v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | |||
#endif | |||
#endif // CONJ | |||
#endif //DOUBLE | |||
.endm | |||
@@ -252,7 +218,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] | |||
ld1 {v5.s}[0], [A_PTR], #4 // A1 | |||
ld1 {v6.2s}, [X_PTR], #8 // [X1, X0] | |||
fneg s16, s5 | |||
eor v16.16b, v16.16b, v16.16b | |||
fsub s16, s16, s5 | |||
ins v5.s[1], v16.s[0] // [-A1, A1] | |||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] | |||
@@ -264,7 +231,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] | |||
ld1 {v5.d}[0], [A_PTR], #8 // A1 | |||
ld1 {v6.2d}, [X_PTR], #16 // [X1, X0] | |||
fneg d16, d5 | |||
eor v16.16b, v16.16b, v16.16b | |||
fsub d16, d16, d5 | |||
ins v5.d[1], v16.d[0] // [-A1, A1] | |||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] | |||
@@ -284,7 +252,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] | |||
ld1 {v5.s}[0], [A_PTR], #4 // A1 | |||
ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0] | |||
fneg s16, s5 | |||
eor v16.16b, v16.16b, v16.16b | |||
fsub s16, s16, s5 | |||
ins v5.s[1], v16.s[0] // [-A1, A1] | |||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] | |||
@@ -296,7 +265,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] | |||
ld1 {v5.d}[0], [A_PTR], #8 // A1 | |||
ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0] | |||
fneg d16, d5 | |||
eor v16.16b, v16.16b, v16.16b | |||
fsub d16, d16, d5 | |||
ins v5.d[1], v16.d[0] // [-A1, A1] | |||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] | |||
@@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
#define X x3 /* X vector address */ | |||
#define INC_X x4 /* X stride */ | |||
#define I x5 /* loop variable */ | |||
#define X_COPY x6 /* Copy of X */ | |||
/******************************************************************************* | |||
* Macro definitions | |||
@@ -50,43 +51,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.macro INIT | |||
#if !defined(DOUBLE) | |||
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R | |||
fneg s2, DA_I | |||
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I | |||
ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I | |||
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R | |||
#else | |||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R | |||
fneg d2, DA_I | |||
ins v1.d[1], v2.d[0] // v1 = DA_I, DA_I | |||
ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I | |||
#endif | |||
.endm | |||
.macro KERNEL_F1 | |||
#if !defined(DOUBLE) | |||
ld1 {v2.2s}, [X] // X1, X0 | |||
ext v3.8b, v2.8b, v2.8b, #4 // X0, X1 | |||
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 | |||
fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||
st1 {v2.2s}, [X], #8 | |||
fmul s3, DA_R, v2.s[0] // DA_R*X0 | |||
fmul s5, DA_I, v2.s[1] // DA_I*X1 | |||
fsub s3, s3, s5 // DA_R*X0-DA_I*X1 | |||
fmul s4, DA_I, v2.s[0] // DA_I*X0 | |||
fmul s5, DA_R, v2.s[1] // DA_R*X1 | |||
fadd s4, s4, s5 // DA_I*X0+DA_R*X1 | |||
ins v3.s[1], v4.s[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||
st1 {v3.2s}, [X], #8 | |||
#else | |||
ld1 {v2.2d}, [X] // X1, X0 | |||
ext v3.16b, v2.16b, v2.16b, #8 // X0, X1 | |||
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 | |||
fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||
st1 {v2.2d}, [X], #16 | |||
#endif | |||
fmul d3, DA_R, v2.d[0] // DA_R*X0 | |||
fmul d5, DA_I, v2.d[1] // DA_I*X1 | |||
fsub d3, d3, d5 // DA_R*X0-DA_I*X1 | |||
fmul d4, DA_I, v2.d[0] // DA_I*X0 | |||
fmul d5, DA_R, v2.d[1] // DA_R*X1 | |||
fadd d4, d4, d5 // DA_I*X0+DA_R*X1 | |||
ins v3.d[1], v4.d[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||
st1 {v3.2d}, [X], #16 | |||
#endif | |||
.endm | |||
.macro KERNEL_INIT_F4 | |||
#if !defined(DOUBLE) | |||
// Replicate the lower 2 floats into the upper 2 slots | |||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R | |||
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I | |||
ins v16.s[0], v0.s[0] | |||
ins v16.s[1], v16.s[0] | |||
ins v16.d[1], v16.d[0] | |||
ins v17.s[0], v1.s[0] | |||
ins v17.s[1], v17.s[0] | |||
ins v17.d[1], v17.d[0] | |||
#else //DOUBLE | |||
ins v16.d[0], v0.d[0] | |||
ins v16.d[1], v16.d[0] | |||
ins v17.d[0], v1.d[0] | |||
ins v17.d[1], v17.d[0] | |||
#endif | |||
.endm | |||
@@ -94,46 +107,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.macro KERNEL_F4 | |||
#if !defined(DOUBLE) | |||
ld1 {v2.4s,v3.4s}, [X] // V2 = X[3], X[2], X[1], X[0] | |||
// V3 = X[7], X[6], X[5], X[4] | |||
ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1] | |||
ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1] | |||
ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1] | |||
fmul v2.4s, v0.4s, v2.4s // X'[ix] += DA_R * X[ix] | |||
// X'[ix+1] += DA_R * X[ix+1] | |||
fmla v2.4s, v1.4s, v6.4s // X'[ix] += -DA_I * X[ix+1] | |||
// X'[ix+1] += DA_I * X[ix] | |||
ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5] | |||
ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5] | |||
ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5] | |||
fmul v3.4s, v0.4s, v3.4s // X'[ix] += DA_R * X[ix] | |||
// X'[ix+1] += DA_R * X[ix+1] | |||
fmla v3.4s, v1.4s, v7.4s // X'[ix] += -DA_I * X[ix+1] | |||
// X'[ix+1] += DA_I * X[ix] | |||
st1 {v2.4s,v3.4s}, [X], #32 | |||
ld2 {v2.4s, v3.4s}, [X], #32 | |||
fmul v4.4s, v2.4s, v16.4s | |||
fmul v6.4s, v3.4s, v17.4s | |||
fsub v4.4s, v4.4s, v6.4s | |||
fmul v5.4s, v2.4s, v17.4s | |||
fmul v6.4s, v3.4s, v16.4s | |||
fadd v5.4s, v5.4s, v6.4s | |||
st2 {v4.4s, v5.4s}, [X_COPY], #32 | |||
#else // DOUBLE | |||
ld1 {v2.2d,v3.2d,v4.2d,v5.2d}, [X] // CX0, CX1, CX2, CX3 | |||
ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1] | |||
ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1] | |||
ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1] | |||
ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1] | |||
ld2 {v2.2d, v3.2d}, [X], #32 | |||
fmul v2.2d, v0.2d, v2.2d | |||
fmla v2.2d, v1.2d, v20.2d | |||
fmul v4.2d, v2.2d, v16.2d | |||
fmul v6.2d, v3.2d, v17.2d | |||
fsub v4.2d, v4.2d, v6.2d | |||
fmul v5.2d, v2.2d, v17.2d | |||
fmul v6.2d, v3.2d, v16.2d | |||
fadd v5.2d, v5.2d, v6.2d | |||
fmul v3.2d, v0.2d, v3.2d | |||
fmla v3.2d, v1.2d, v21.2d | |||
st1 {v2.2d,v3.2d}, [X], #32 | |||
st2 {v4.2d, v5.2d}, [X_COPY], #32 | |||
fmul v4.2d, v0.2d, v4.2d | |||
fmla v4.2d, v1.2d, v22.2d | |||
ld2 {v18.2d, v19.2d}, [X], #32 | |||
fmul v5.2d, v0.2d, v5.2d | |||
fmla v5.2d, v1.2d, v23.2d | |||
st1 {v4.2d,v5.2d}, [X], #32 | |||
fmul v20.2d, v18.2d, v16.2d | |||
fmul v6.2d, v19.2d, v17.2d | |||
fsub v20.2d, v20.2d, v6.2d | |||
fmul v21.2d, v18.2d, v17.2d | |||
fmul v6.2d, v19.2d, v16.2d | |||
fadd v21.2d, v21.2d, v6.2d | |||
st2 {v20.2d, v21.2d}, [X_COPY], #32 | |||
#endif | |||
PRFM PLDL1KEEP, [X, #1024] | |||
.endm | |||
@@ -149,21 +155,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
.endm | |||
.macro KERNEL_S1 | |||
#if !defined(DOUBLE) | |||
ld1 {v2.2s}, [X] // X1, X0 | |||
ext v3.8b, v2.8b, v2.8b, #4 // X0, X1 | |||
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 | |||
fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||
st1 {v2.2s}, [X], INC_X | |||
fmul s3, DA_R, v2.s[0] // DA_R*X0 | |||
fmul s5, DA_I, v2.s[1] // DA_I*X1 | |||
fsub s3, s3, s5 // DA_R*X0-DA_I*X1 | |||
fmul s4, DA_I, v2.s[0] // DA_I*X0 | |||
fmul s5, DA_R, v2.s[1] // DA_R*X1 | |||
fadd s4, s4, s5 // DA_I*X0+DA_R*X1 | |||
ins v3.s[1], v4.s[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||
st1 {v3.2s}, [X], INC_X | |||
#else | |||
ld1 {v2.2d}, [X] // X1, X0 | |||
ext v3.16b, v2.16b, v2.16b, #8 // X0, X1 | |||
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 | |||
fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||
st1 {v2.2d}, [X], INC_X | |||
#endif | |||
fmul d3, DA_R, v2.d[0] // DA_R*X0 | |||
fmul d5, DA_I, v2.d[1] // DA_I*X1 | |||
fsub d3, d3, d5 // DA_R*X0-DA_I*X1 | |||
fmul d4, DA_I, v2.d[0] // DA_I*X0 | |||
fmul d5, DA_R, v2.d[1] // DA_R*X1 | |||
fadd d4, d4, d5 // DA_I*X0+DA_R*X1 | |||
ins v3.d[1], v4.d[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||
st1 {v3.2d}, [X], INC_X | |||
#endif | |||
.endm | |||
/******************************************************************************* | |||
@@ -171,21 +187,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*******************************************************************************/ | |||
PROLOGUE | |||
b zscal_begin | |||
data_ar: | |||
.word 0x3e44fae6 | |||
data_ai: | |||
.word 0x3d320fa2 | |||
data_xr: | |||
.word 0x3f4baff1 | |||
data_xi: | |||
.word 0xbe8ef0bd | |||
zscal_begin: | |||
ldr s20, data_ar | |||
ldr s21, data_ai | |||
ldr s22, data_xr | |||
ldr s23, data_xi | |||
fmul s24, s22, s21 | |||
fmla s24, s23, v20.s[0] | |||
fmul s25, s22, s21 | |||
fmul s26, s23, s20 | |||
fadd s25, s25, s26 | |||
mov X_COPY, X | |||
cmp N, xzr | |||
ble zscal_kernel_L999 | |||
fcmp DA_R, #0.0 | |||
bne zscal_kernel_1 | |||
bne zscal_kernel_R_non_zero | |||
fcmp DA_I, #0.0 | |||
beq zscal_kernel_zero | |||
beq zscal_kernel_RI_zero | |||
// TODO: special case DA_R == 0 && DA_I != 0 | |||
b zscal_kernel_R_zero | |||
zscal_kernel_1: | |||
zscal_kernel_R_non_zero: | |||
// TODO: special case DA_R != 0 && DA_I == 0 | |||
fcmp DA_I, #0.0 | |||
beq zscal_kernel_I_zero | |||
/******************************************************************************* | |||
* A_R != 0 && A_I != 0 | |||
*******************************************************************************/ | |||
zscal_kernel_RI_non_zero: | |||
INIT | |||
@@ -257,16 +306,85 @@ zscal_kernel_L999: | |||
mov w0, wzr | |||
ret | |||
zscal_kernel_zero: | |||
/******************************************************************************* | |||
* A_R == 0 && A_I != 0 | |||
*******************************************************************************/ | |||
zscal_kernel_R_zero: | |||
INIT_S | |||
#if !defined(DOUBLE) | |||
eor v2.16b, v2.16b, v2.16b | |||
fsub s2, s2, DA_I | |||
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I | |||
#else | |||
eor v2.16b, v2.16b, v2.16b | |||
fsub d2, d2, DA_I | |||
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I | |||
#endif | |||
zscal_kernel_R_zero_1: | |||
#if !defined(DOUBLE) | |||
ld1 {v2.2s}, [X] // X1, X0 | |||
fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0 | |||
ext v2.8b, v2.8b, v2.8b, #4 // DA_I*X0, -DA_I*X1 | |||
st1 {v2.2s}, [X] | |||
#else | |||
ld1 {v2.2d}, [X] // X1, X0 | |||
fmul v2.2d, v2.2d, v1.2d // -DA_I*X1, DA_I*X0 | |||
ext v2.16b, v2.16b, v2.16b, #8 // DA_I*X0, -DA_I*X1 | |||
st1 {v2.2d}, [X] | |||
#endif | |||
add X, X, INC_X | |||
subs N, N, #1 | |||
bne zscal_kernel_R_zero_1 | |||
mov w0, wzr | |||
ret | |||
/******************************************************************************* | |||
* A_R != 0 && A_I == 0 | |||
*******************************************************************************/ | |||
zscal_kernel_I_zero: | |||
INIT_S | |||
#if !defined(DOUBLE) | |||
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R | |||
#else | |||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R | |||
#endif | |||
zscal_kernel_I_zero_1: | |||
#if !defined(DOUBLE) | |||
ld1 {v2.2s}, [X] // X1, X0 | |||
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 | |||
st1 {v2.2s}, [X] | |||
#else | |||
ld1 {v2.2d}, [X] // X1, X0 | |||
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 | |||
st1 {v2.2d}, [X] | |||
#endif | |||
add X, X, INC_X | |||
subs N, N, #1 | |||
bne zscal_kernel_I_zero_1 | |||
mov w0, wzr | |||
ret | |||
/******************************************************************************* | |||
* A_R == 0 && A_I == 0 | |||
*******************************************************************************/ | |||
zscal_kernel_RI_zero: | |||
INIT_S | |||
zscal_kernel_Z1: | |||
zscal_kernel_RI_zero_1: | |||
stp DA_R, DA_I, [X] | |||
add X, X, INC_X | |||
subs N, N, #1 | |||
bne zscal_kernel_Z1 | |||
subs N, N, #1 | |||
bne zscal_kernel_RI_zero_1 | |||
mov w0, wzr | |||
ret | |||
@@ -187,73 +187,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
fmul v16.2d, v0.2d, v8.2d[0] | |||
OP_ii v16.2d, v1.2d, v9.2d[0] | |||
fmul v17.2d, v0.2d, v9.2d[0] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v17.2d, v17.2d | |||
eor v17.16b, v17.16b, v17.16b | |||
fmls v17.2d, v0.2d, v9.2d[0] | |||
#else | |||
fmul v17.2d, v0.2d, v9.2d[0] | |||
#endif | |||
OP_ir v17.2d, v1.2d, v8.2d[0] | |||
fmul v18.2d, v2.2d, v8.2d[0] | |||
OP_ii v18.2d, v3.2d, v9.2d[0] | |||
fmul v19.2d, v2.2d, v9.2d[0] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v19.2d, v19.2d | |||
eor v19.16b, v19.16b, v19.16b | |||
fmls v19.2d, v2.2d, v9.2d[0] | |||
#else | |||
fmul v19.2d, v2.2d, v9.2d[0] | |||
#endif | |||
OP_ir v19.2d, v3.2d, v8.2d[0] | |||
fmul v20.2d, v0.2d, v8.2d[1] | |||
OP_ii v20.2d, v1.2d, v9.2d[1] | |||
fmul v21.2d, v0.2d, v9.2d[1] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v21.2d, v21.2d | |||
eor v21.16b, v21.16b, v21.16b | |||
fmls v21.2d, v0.2d, v9.2d[1] | |||
#else | |||
fmul v21.2d, v0.2d, v9.2d[1] | |||
#endif | |||
OP_ir v21.2d, v1.2d, v8.2d[1] | |||
fmul v22.2d, v2.2d, v8.2d[1] | |||
OP_ii v22.2d, v3.2d, v9.2d[1] | |||
fmul v23.2d, v2.2d, v9.2d[1] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v23.2d, v23.2d | |||
eor v23.16b, v23.16b, v23.16b | |||
fmls v23.2d, v2.2d, v9.2d[1] | |||
#else | |||
fmul v23.2d, v2.2d, v9.2d[1] | |||
#endif | |||
OP_ir v23.2d, v3.2d, v8.2d[1] | |||
fmul v24.2d, v0.2d, v10.2d[0] | |||
OP_ii v24.2d, v1.2d, v11.2d[0] | |||
fmul v25.2d, v0.2d, v11.2d[0] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v25.2d, v25.2d | |||
eor v25.16b, v25.16b, v25.16b | |||
fmls v25.2d, v0.2d, v11.2d[0] | |||
#else | |||
fmul v25.2d, v0.2d, v11.2d[0] | |||
#endif | |||
OP_ir v25.2d, v1.2d, v10.2d[0] | |||
fmul v26.2d, v2.2d, v10.2d[0] | |||
OP_ii v26.2d, v3.2d, v11.2d[0] | |||
fmul v27.2d, v2.2d, v11.2d[0] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v27.2d, v27.2d | |||
eor v27.16b, v27.16b, v27.16b | |||
fmls v27.2d, v2.2d, v11.2d[0] | |||
#else | |||
fmul v27.2d, v2.2d, v11.2d[0] | |||
#endif | |||
OP_ir v27.2d, v3.2d, v10.2d[0] | |||
fmul v28.2d, v0.2d, v10.2d[1] | |||
OP_ii v28.2d, v1.2d, v11.2d[1] | |||
fmul v29.2d, v0.2d, v11.2d[1] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v29.2d, v29.2d | |||
eor v29.16b, v29.16b, v29.16b | |||
fmls v29.2d, v0.2d, v11.2d[1] | |||
#else | |||
fmul v29.2d, v0.2d, v11.2d[1] | |||
#endif | |||
OP_ir v29.2d, v1.2d, v10.2d[1] | |||
fmul v30.2d, v2.2d, v10.2d[1] | |||
OP_ii v30.2d, v3.2d, v11.2d[1] | |||
fmul v31.2d, v2.2d, v11.2d[1] | |||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
fneg v31.2d, v31.2d | |||
eor v31.16b, v31.16b, v31.16b | |||
fmls v31.2d, v2.2d, v11.2d[1] | |||
#else | |||
fmul v31.2d, v2.2d, v11.2d[1] | |||
#endif | |||
OP_ir v31.2d, v3.2d, v10.2d[1] | |||