@@ -5,8 +5,8 @@ DAMAXKERNEL = amax.S | |||||
CAMAXKERNEL = zamax.S | CAMAXKERNEL = zamax.S | ||||
ZAMAXKERNEL = zamax.S | ZAMAXKERNEL = zamax.S | ||||
ISAMAXKERNEL = isamax.S | |||||
IDAMAXKERNEL = idamax.S | |||||
ISAMAXKERNEL = iamax.S | |||||
IDAMAXKERNEL = iamax.S | |||||
ICAMAXKERNEL = izamax.S | ICAMAXKERNEL = izamax.S | ||||
IZAMAXKERNEL = izamax.S | IZAMAXKERNEL = izamax.S | ||||
@@ -25,22 +25,22 @@ DCOPYKERNEL = copy.S | |||||
CCOPYKERNEL = copy.S | CCOPYKERNEL = copy.S | ||||
ZCOPYKERNEL = copy.S | ZCOPYKERNEL = copy.S | ||||
DOTKERNEL = dot.S | |||||
SDOTKERNEL = dot.S | |||||
DDOTKERNEL = dot.S | DDOTKERNEL = dot.S | ||||
CDOTKERNEL = zdot.S | CDOTKERNEL = zdot.S | ||||
ZDOTKERNEL = zdot.S | ZDOTKERNEL = zdot.S | ||||
SNRM2KERNEL = snrm2.S | |||||
DNRM2KERNEL = dnrm2.S | |||||
CNRM2KERNEL = znrm2.S | |||||
ZNRM2KERNEL = znrm2.S | |||||
#SNRM2KERNEL = snrm2.S | |||||
#DNRM2KERNEL = dnrm2.S | |||||
#CNRM2KERNEL = znrm2.S | |||||
#ZNRM2KERNEL = znrm2.S | |||||
SROTKERNEL = rot.S | SROTKERNEL = rot.S | ||||
DROTKERNEL = rot.S | DROTKERNEL = rot.S | ||||
CROTKERNEL = zrot.S | CROTKERNEL = zrot.S | ||||
ZROTKERNEL = zrot.S | ZROTKERNEL = zrot.S | ||||
SCALKERNEL = scal.S | |||||
SSCALKERNEL = scal.S | |||||
DSCALKERNEL = scal.S | DSCALKERNEL = scal.S | ||||
CSCALKERNEL = zscal.S | CSCALKERNEL = zscal.S | ||||
ZSCALKERNEL = zscal.S | ZSCALKERNEL = zscal.S | ||||
@@ -181,73 +181,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
fmul v16.4s, v0.4s, v8.4s[0] | fmul v16.4s, v0.4s, v8.4s[0] | ||||
OP_ii v16.4s, v1.4s, v9.4s[0] | OP_ii v16.4s, v1.4s, v9.4s[0] | ||||
fmul v17.4s, v0.4s, v9.4s[0] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v17.4s, v17.4s | |||||
eor v17.16b, v17.16b, v17.16b | |||||
fmls v17.4s, v0.4s, v9.4s[0] | |||||
#else | |||||
fmul v17.4s, v0.4s, v9.4s[0] | |||||
#endif | #endif | ||||
OP_ir v17.4s, v1.4s, v8.4s[0] | OP_ir v17.4s, v1.4s, v8.4s[0] | ||||
fmul v20.4s, v0.4s, v8.4s[1] | fmul v20.4s, v0.4s, v8.4s[1] | ||||
OP_ii v20.4s, v1.4s, v9.4s[1] | OP_ii v20.4s, v1.4s, v9.4s[1] | ||||
fmul v21.4s, v0.4s, v9.4s[1] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v21.4s, v21.4s | |||||
eor v21.16b, v21.16b, v21.16b | |||||
fmls v21.4s, v0.4s, v9.4s[1] | |||||
#else | |||||
fmul v21.4s, v0.4s, v9.4s[1] | |||||
#endif | #endif | ||||
OP_ir v21.4s, v1.4s, v8.4s[1] | OP_ir v21.4s, v1.4s, v8.4s[1] | ||||
fmul v24.4s, v0.4s, v8.4s[2] | fmul v24.4s, v0.4s, v8.4s[2] | ||||
OP_ii v24.4s, v1.4s, v9.4s[2] | OP_ii v24.4s, v1.4s, v9.4s[2] | ||||
fmul v25.4s, v0.4s, v9.4s[2] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v25.4s, v25.4s | |||||
eor v25.16b, v25.16b, v25.16b | |||||
fmls v25.4s, v0.4s, v9.4s[2] | |||||
#else | |||||
fmul v25.4s, v0.4s, v9.4s[2] | |||||
#endif | #endif | ||||
OP_ir v25.4s, v1.4s, v8.4s[2] | OP_ir v25.4s, v1.4s, v8.4s[2] | ||||
fmul v28.4s, v0.4s, v8.4s[3] | fmul v28.4s, v0.4s, v8.4s[3] | ||||
OP_ii v28.4s, v1.4s, v9.4s[3] | OP_ii v28.4s, v1.4s, v9.4s[3] | ||||
fmul v29.4s, v0.4s, v9.4s[3] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v29.4s, v29.4s | |||||
eor v29.16b, v29.16b, v29.16b | |||||
fmls v29.4s, v0.4s, v9.4s[3] | |||||
#else | |||||
fmul v29.4s, v0.4s, v9.4s[3] | |||||
#endif | #endif | ||||
OP_ir v29.4s, v1.4s, v8.4s[3] | OP_ir v29.4s, v1.4s, v8.4s[3] | ||||
fmul v18.4s, v2.4s, v8.4s[0] | fmul v18.4s, v2.4s, v8.4s[0] | ||||
OP_ii v18.4s, v3.4s, v9.4s[0] | OP_ii v18.4s, v3.4s, v9.4s[0] | ||||
fmul v19.4s, v2.4s, v9.4s[0] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v19.4s, v19.4s | |||||
eor v19.16b, v19.16b, v19.16b | |||||
fmls v19.4s, v2.4s, v9.4s[0] | |||||
#else | |||||
fmul v19.4s, v2.4s, v9.4s[0] | |||||
#endif | #endif | ||||
OP_ir v19.4s, v3.4s, v8.4s[0] | OP_ir v19.4s, v3.4s, v8.4s[0] | ||||
fmul v22.4s, v2.4s, v8.4s[1] | fmul v22.4s, v2.4s, v8.4s[1] | ||||
OP_ii v22.4s, v3.4s, v9.4s[1] | OP_ii v22.4s, v3.4s, v9.4s[1] | ||||
fmul v23.4s, v2.4s, v9.4s[1] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v23.4s, v23.4s | |||||
eor v23.16b, v23.16b, v23.16b | |||||
fmls v23.4s, v2.4s, v9.4s[1] | |||||
#else | |||||
fmul v23.4s, v2.4s, v9.4s[1] | |||||
#endif | #endif | ||||
OP_ir v23.4s, v3.4s, v8.4s[1] | OP_ir v23.4s, v3.4s, v8.4s[1] | ||||
fmul v26.4s, v2.4s, v8.4s[2] | fmul v26.4s, v2.4s, v8.4s[2] | ||||
OP_ii v26.4s, v3.4s, v9.4s[2] | OP_ii v26.4s, v3.4s, v9.4s[2] | ||||
fmul v27.4s, v2.4s, v9.4s[2] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v27.4s, v27.4s | |||||
eor v27.16b, v27.16b, v27.16b | |||||
fmls v27.4s, v2.4s, v9.4s[2] | |||||
#else | |||||
fmul v27.4s, v2.4s, v9.4s[2] | |||||
#endif | #endif | ||||
OP_ir v27.4s, v3.4s, v8.4s[2] | OP_ir v27.4s, v3.4s, v8.4s[2] | ||||
fmul v30.4s, v2.4s, v8.4s[3] | fmul v30.4s, v2.4s, v8.4s[3] | ||||
OP_ii v30.4s, v3.4s, v9.4s[3] | OP_ii v30.4s, v3.4s, v9.4s[3] | ||||
fmul v31.4s, v2.4s, v9.4s[3] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v31.4s, v31.4s | |||||
eor v31.16b, v31.16b, v31.16b | |||||
fmls v31.4s, v2.4s, v9.4s[3] | |||||
#else | |||||
fmul v31.4s, v2.4s, v9.4s[3] | |||||
#endif | #endif | ||||
OP_ir v31.4s, v3.4s, v8.4s[3] | OP_ir v31.4s, v3.4s, v8.4s[3] | ||||
@@ -172,37 +172,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
fmul v16.4s, v0.4s, v8.4s[0] | fmul v16.4s, v0.4s, v8.4s[0] | ||||
OP_ii v16.4s, v1.4s, v9.4s[0] | OP_ii v16.4s, v1.4s, v9.4s[0] | ||||
fmul v17.4s, v0.4s, v9.4s[0] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v17.4s, v17.4s | |||||
eor v17.16b, v17.16b, v17.16b | |||||
fmls v17.4s, v0.4s, v9.4s[0] | |||||
#else | |||||
fmul v17.4s, v0.4s, v9.4s[0] | |||||
#endif | #endif | ||||
OP_ir v17.4s, v1.4s, v8.4s[0] | OP_ir v17.4s, v1.4s, v8.4s[0] | ||||
fmul v20.4s, v0.4s, v8.4s[1] | fmul v20.4s, v0.4s, v8.4s[1] | ||||
OP_ii v20.4s, v1.4s, v9.4s[1] | OP_ii v20.4s, v1.4s, v9.4s[1] | ||||
fmul v21.4s, v0.4s, v9.4s[1] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v21.4s, v21.4s | |||||
eor v21.16b, v21.16b, v21.16b | |||||
fmls v21.4s, v0.4s, v9.4s[1] | |||||
#else | |||||
fmul v21.4s, v0.4s, v9.4s[1] | |||||
#endif | #endif | ||||
OP_ir v21.4s, v1.4s, v8.4s[1] | OP_ir v21.4s, v1.4s, v8.4s[1] | ||||
fmul v24.4s, v0.4s, v8.4s[2] | fmul v24.4s, v0.4s, v8.4s[2] | ||||
OP_ii v24.4s, v1.4s, v9.4s[2] | OP_ii v24.4s, v1.4s, v9.4s[2] | ||||
fmul v25.4s, v0.4s, v9.4s[2] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v25.4s, v25.4s | |||||
eor v25.16b, v25.16b, v25.16b | |||||
fmls v25.4s, v0.4s, v9.4s[2] | |||||
#else | |||||
fmul v25.4s, v0.4s, v9.4s[2] | |||||
#endif | #endif | ||||
OP_ir v25.4s, v1.4s, v8.4s[2] | OP_ir v25.4s, v1.4s, v8.4s[2] | ||||
fmul v28.4s, v0.4s, v8.4s[3] | fmul v28.4s, v0.4s, v8.4s[3] | ||||
OP_ii v28.4s, v1.4s, v9.4s[3] | OP_ii v28.4s, v1.4s, v9.4s[3] | ||||
fmul v29.4s, v0.4s, v9.4s[3] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v29.4s, v29.4s | |||||
eor v29.16b, v29.16b, v29.16b | |||||
fmls v29.4s, v0.4s, v9.4s[3] | |||||
#else | |||||
fmul v29.4s, v0.4s, v9.4s[3] | |||||
#endif | #endif | ||||
OP_ir v29.4s, v1.4s, v8.4s[3] | OP_ir v29.4s, v1.4s, v8.4s[3] | ||||
@@ -45,16 +45,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define COND ge | #define COND ge | ||||
#endif | #endif | ||||
#if !defined(DOUBLE) | |||||
#define MAXF s0 | |||||
#define TMPF s1 | |||||
#define TMPVF {v1.s}[0] | |||||
#define SZ 4 | |||||
#else | |||||
#define MAXF d0 | #define MAXF d0 | ||||
#define TMPF d1 | #define TMPF d1 | ||||
#define TMPVF {v1.d}[0] | #define TMPVF {v1.d}[0] | ||||
#define SZ 8 | #define SZ 8 | ||||
#endif | |||||
/******************************************************************************/ | /******************************************************************************/ | ||||
.macro INIT_S | .macro INIT_S | ||||
#if !defined(DOUBLE) | |||||
lsl INC_X, INC_X, #2 | |||||
ld1 {v0.s}[0], [X], INC_X | |||||
#else | |||||
lsl INC_X, INC_X, #3 | lsl INC_X, INC_X, #3 | ||||
ld1 {v0.d}[0], [X], INC_X | ld1 {v0.d}[0], [X], INC_X | ||||
#endif | |||||
mov Z, #1 | mov Z, #1 | ||||
mov INDEX, Z | mov INDEX, Z | ||||
fabs MAXF, MAXF | fabs MAXF, MAXF | ||||
@@ -107,9 +119,8 @@ iamax_kernel_S1: | |||||
iamax_kernel_S10: | iamax_kernel_S10: | ||||
KERNEL_S1 | KERNEL_S1 | ||||
subs I, I, #1 | |||||
bne iamax_kernel_S10 | |||||
subs I, I, #1 | |||||
bne iamax_kernel_S10 | |||||
iamax_kernel_L999: | iamax_kernel_L999: | ||||
@@ -1,213 +0,0 @@ | |||||
/******************************************************************************* | |||||
Copyright (c) 2015, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*******************************************************************************/ | |||||
#define ASSEMBLER | |||||
#include "common.h" | |||||
#define N x0 /* vector length */ | |||||
#define X x1 /* X vector address */ | |||||
#define INC_X x2 /* X stride */ | |||||
#define INDEX x3 /* index of max/min value */ | |||||
#define Z x4 /* vector index */ | |||||
#define I x5 /* loop variable */ | |||||
#define X_COPY x6 /* copy of X address */ | |||||
#define MAXF_Z x7 | |||||
/******************************************************************************* | |||||
* Macro definitions | |||||
*******************************************************************************/ | |||||
#define MAXF s5 | |||||
#define TMPF s6 | |||||
#define TMPVF {v6.s}[0] | |||||
#define SZ 4 | |||||
/******************************************************************************/ | |||||
.macro INIT_F1 | |||||
ldr MAXF, [X], #SZ | |||||
mov Z, #1 | |||||
mov INDEX, Z | |||||
fabs MAXF, MAXF | |||||
.endm | |||||
.macro KERNEL_F1 | |||||
ldr TMPF, [X], #SZ | |||||
add Z, Z, #1 | |||||
fabs TMPF, TMPF | |||||
fcmp TMPF, MAXF | |||||
fcsel MAXF, MAXF, TMPF, le | |||||
csel INDEX, INDEX, Z, le | |||||
.endm | |||||
.macro INIT_F4 | |||||
ld1 {v0.4s}, [X], #16 | |||||
fabs v0.4s, v0.4s | |||||
fmaxv MAXF, v0.4s | |||||
mov Z, #5 | |||||
mov MAXF_Z, #1 | |||||
.endm | |||||
.macro KERNEL_F4 | |||||
ld1 {v0.4s}, [X], #16 | |||||
fabs v0.4s, v0.4s | |||||
fmaxv TMPF, v0.4s | |||||
PRFM PLDL1KEEP, [X, #512] | |||||
fcmp TMPF, MAXF | |||||
fcsel MAXF, MAXF, TMPF, le | |||||
csel MAXF_Z, MAXF_Z, Z, le | |||||
add Z, Z, #4 | |||||
.endm | |||||
.macro KERNEL_F4_FINALIZE | |||||
mov INDEX, MAXF_Z | |||||
sub MAXF_Z, MAXF_Z, #1 | |||||
lsl MAXF_Z, MAXF_Z, #2 | |||||
add X_COPY, X_COPY, MAXF_Z | |||||
ldr TMPF, [X_COPY], #SZ | |||||
fabs TMPF, TMPF | |||||
fcmp TMPF, MAXF | |||||
beq KERNEL_F4_FINALIZE_DONE | |||||
add INDEX, INDEX, #1 | |||||
ldr TMPF, [X_COPY], #SZ | |||||
fabs TMPF, TMPF | |||||
fcmp TMPF, MAXF | |||||
beq KERNEL_F4_FINALIZE_DONE | |||||
add INDEX, INDEX, #1 | |||||
ldr TMPF, [X_COPY], #SZ | |||||
fabs TMPF, TMPF | |||||
fcmp TMPF, MAXF | |||||
beq KERNEL_F4_FINALIZE_DONE | |||||
add INDEX, INDEX, #1 | |||||
KERNEL_F4_FINALIZE_DONE: | |||||
.endm | |||||
.macro INIT_S | |||||
lsl INC_X, INC_X, #2 | |||||
ld1 TMPVF, [X], INC_X | |||||
mov Z, #1 | |||||
mov INDEX, Z | |||||
fabs MAXF, TMPF | |||||
.endm | |||||
.macro KERNEL_S1 | |||||
ld1 TMPVF, [X], INC_X | |||||
add Z, Z, #1 | |||||
fabs TMPF, TMPF | |||||
fcmp TMPF, MAXF | |||||
fcsel MAXF, MAXF, TMPF, le | |||||
csel INDEX, INDEX, Z, le | |||||
.endm | |||||
/******************************************************************************* | |||||
* End of macro definitions | |||||
*******************************************************************************/ | |||||
PROLOGUE | |||||
cmp N, xzr | |||||
ble iamax_kernel_zero | |||||
cmp INC_X, xzr | |||||
ble iamax_kernel_zero | |||||
PRFM PLDL1KEEP, [X] | |||||
mov X_COPY, X | |||||
cmp INC_X, #1 | |||||
bne iamax_kernel_S_BEGIN | |||||
iamax_kernel_F_BEGIN: | |||||
asr I, N, #2 | |||||
cmp I, xzr | |||||
beq iamax_kernel_F1_INIT | |||||
INIT_F4 | |||||
subs I, I, #1 | |||||
beq iamax_kernel_F4_FINALIZE | |||||
iamax_kernel_F4: | |||||
KERNEL_F4 | |||||
subs I, I, #1 | |||||
bne iamax_kernel_F4 | |||||
iamax_kernel_F4_FINALIZE: | |||||
KERNEL_F4_FINALIZE | |||||
iamax_kernel_F1: | |||||
ands I, N, #3 | |||||
ble iamax_kernel_L999 | |||||
iamax_kernel_F10: | |||||
KERNEL_F1 | |||||
subs I, I, #1 | |||||
bne iamax_kernel_F10 | |||||
b iamax_kernel_L999 | |||||
iamax_kernel_F1_INIT: | |||||
INIT_F1 | |||||
subs N, N, #1 | |||||
b iamax_kernel_F1 | |||||
iamax_kernel_S_BEGIN: | |||||
INIT_S | |||||
subs N, N, #1 | |||||
ble iamax_kernel_L999 | |||||
asr I, N, #2 | |||||
cmp I, xzr | |||||
ble iamax_kernel_S1 | |||||
iamax_kernel_S4: | |||||
KERNEL_S1 | |||||
KERNEL_S1 | |||||
KERNEL_S1 | |||||
KERNEL_S1 | |||||
subs I, I, #1 | |||||
bne iamax_kernel_S4 | |||||
iamax_kernel_S1: | |||||
ands I, N, #3 | |||||
ble iamax_kernel_L999 | |||||
iamax_kernel_S10: | |||||
KERNEL_S1 | |||||
subs I, I, #1 | |||||
bne iamax_kernel_S10 | |||||
iamax_kernel_L999: | |||||
mov x0, INDEX | |||||
ret | |||||
iamax_kernel_zero: | |||||
mov x0, xzr | |||||
ret | |||||
EPILOGUE |
@@ -59,10 +59,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
.macro INIT_F1 | .macro INIT_F1 | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
fneg s2, S | |||||
eor v2.16b, v2.16b, v2.16b | |||||
fsub s2, s2, S | |||||
ins v1.s[1], v2.s[0] // [-S, S] | ins v1.s[1], v2.s[0] // [-S, S] | ||||
#else | #else | ||||
fneg d2, S | |||||
eor v2.16b, v2.16b, v2.16b | |||||
fsub d2, d2, S | |||||
ins v1.d[1], v2.d[0] // [-S, S] | ins v1.d[1], v2.d[0] // [-S, S] | ||||
#endif | #endif | ||||
.endm | .endm | ||||
@@ -43,14 +43,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
#define DA_R s0 /* scale input value */ | #define DA_R s0 /* scale input value */ | ||||
#define DA_I s1 /* scale input value */ | #define DA_I s1 /* scale input value */ | ||||
#define TMPX v2.2s | |||||
#define TMPY v3.2s | |||||
#define SZ 4 | #define SZ 4 | ||||
#else | #else | ||||
#define DA_R d0 /* scale input value */ | #define DA_R d0 /* scale input value */ | ||||
#define DA_I d1 /* scale input value */ | #define DA_I d1 /* scale input value */ | ||||
#define TMPX v2.2d | |||||
#define TMPY v3.2d | |||||
#define SZ 8 | #define SZ 8 | ||||
#endif | #endif | ||||
@@ -61,22 +57,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#if !defined(CONJ) | #if !defined(CONJ) | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R | ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R | ||||
fneg s2, DA_I | |||||
eor v2.16b, v2.16b, v2.16b | |||||
fsub s2, s2, DA_I | |||||
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I | ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I | ||||
ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I | ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I | ||||
#else | #else | ||||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R | ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R | ||||
fneg d2, DA_I | |||||
eor v2.16b, v2.16b, v2.16b | |||||
fsub d2, d2, DA_I | |||||
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I | ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I | ||||
ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I | ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I | ||||
#endif | #endif | ||||
#else | #else | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
fneg s2, DA_R | |||||
eor v2.16b, v2.16b, v2.16b | |||||
fsub s2, s2, DA_R | |||||
ins v0.s[1], v2.s[0] // v0 = -DA_R, DA_R | ins v0.s[1], v2.s[0] // v0 = -DA_R, DA_R | ||||
ins v1.s[1], v1.s[0] // v1 = DA_I, DA_I | ins v1.s[1], v1.s[0] // v1 = DA_I, DA_I | ||||
#else | #else | ||||
fneg d2, DA_R | |||||
eor v2.16b, v2.16b, v2.16b | |||||
fsub d2, d2, DA_R | |||||
ins v0.d[1], v2.d[0] // v0 = -DA_R, DA_R | ins v0.d[1], v2.d[0] // v0 = -DA_R, DA_R | ||||
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I | ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I | ||||
#endif | #endif | ||||
@@ -111,9 +111,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
.macro KERNEL_INIT_F4 | .macro KERNEL_INIT_F4 | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
// Replicate the lower 2 floats into the upper 2 slots | |||||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R | |||||
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I | |||||
ins v16.s[0], v0.s[0] | |||||
ins v16.s[1], v16.s[0] | |||||
ins v16.d[1], v16.d[0] | |||||
#if !defined(CONJ) | |||||
ins v17.s[0], v1.s[1] | |||||
#else | |||||
ins v17.s[0], v1.s[0] | |||||
#endif | |||||
ins v17.s[1], v17.s[0] | |||||
ins v17.d[1], v17.d[0] | |||||
#else //DOUBLE | |||||
ins v16.d[0], v0.d[0] | |||||
ins v16.d[1], v16.d[0] | |||||
#if !defined(CONJ) | |||||
ins v17.d[0], v1.d[1] | |||||
#else | |||||
ins v17.d[0], v1.d[0] | |||||
#endif | |||||
ins v17.d[1], v17.d[0] | |||||
#endif | #endif | ||||
.endm | .endm | ||||
@@ -121,55 +137,60 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
.macro KERNEL_F4 | .macro KERNEL_F4 | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
ld1 {v2.4s,v3.4s}, [X], #32 // V2 = X[3], X[2], X[1], X[0] | |||||
// V3 = X[7], X[6], X[5], X[4] | |||||
ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1] | |||||
ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1] | |||||
ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1] | |||||
ld2 {v2.4s, v3.4s}, [X], #32 | |||||
ld2 {v4.4s, v5.4s}, [Y_COPY], #32 | |||||
ld1 {v4.4s,v5.4s}, [Y] // V4 = Y[3], Y[2], Y[1], Y[0] | |||||
// V5 = Y[7], Y[6], Y[5], Y[4] | |||||
ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5] | |||||
ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5] | |||||
ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5] | |||||
fmla v4.4s, v2.4s, v16.4s | |||||
#if !defined(CONJ) | |||||
fmls v4.4s, v3.4s, v17.4s | |||||
#else | |||||
fmla v4.4s, v3.4s, v17.4s | |||||
#endif | |||||
fmla v4.4s, v0.4s, v2.4s // Y[iy] += DA_R * X[ix] | |||||
// Y[iy+1] += +-DA_R * X[ix+1] | |||||
fmla v4.4s, v1.4s, v6.4s // Y[iy] += +-DA_I * X[ix+1] | |||||
// Y[iy+1] += DA_I * X[ix] | |||||
st1 {v4.4s}, [Y], #16 | |||||
#if !defined(CONJ) | |||||
fmla v5.4s, v2.4s, v17.4s | |||||
#else | |||||
fmls v5.4s, v2.4s, v17.4s | |||||
#endif | |||||
fmla v5.4s, v3.4s, v16.4s | |||||
fmla v5.4s, v0.4s, v3.4s // Y[iy] += DA_R * X[ix] | |||||
fmla v5.4s, v1.4s, v7.4s // Y[iy] += +-DA_I * X[ix+1] | |||||
// Y[iy+1] += +-DA_R * X[ix+1] | |||||
// Y[iy+1] += DA_I * X[ix] | |||||
st1 {v5.4s}, [Y], #16 | |||||
st2 {v4.4s, v5.4s}, [Y], #32 | |||||
#else // DOUBLE | #else // DOUBLE | ||||
ld1 {v2.2d,v3.2d}, [X], #32 // CX0, CX1, CX2, CX3 | |||||
ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1] | |||||
ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1] | |||||
ld2 {v2.2d, v3.2d}, [X], #32 | |||||
ld2 {v4.2d, v5.2d}, [Y_COPY], #32 | |||||
ld1 {v4.2d,v5.2d}, [X], #32 // CX0, CX1, CX2, CX3 | |||||
ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1] | |||||
ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1] | |||||
ld1 {v16.2d,v17.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3 | |||||
fmla v4.2d, v2.2d, v16.2d | |||||
#if !defined(CONJ) | |||||
fmls v4.2d, v3.2d, v17.2d | |||||
#else | |||||
fmla v4.2d, v3.2d, v17.2d | |||||
#endif | |||||
#if !defined(CONJ) | |||||
fmla v5.2d, v2.2d, v17.2d | |||||
#else | |||||
fmls v5.2d, v2.2d, v17.2d | |||||
#endif | |||||
fmla v5.2d, v3.2d, v16.2d | |||||
fmla v16.2d, v0.2d, v2.2d | |||||
fmla v17.2d, v0.2d, v3.2d | |||||
st2 {v4.2d, v5.2d}, [Y], #32 | |||||
ld1 {v18.2d,v19.2d}, [Y_COPY], #32 // CY0, CY1, CY2, CY3 | |||||
ld2 {v18.2d, v19.2d}, [X], #32 | |||||
ld2 {v20.2d, v21.2d}, [Y_COPY], #32 | |||||
fmla v16.2d, v1.2d, v20.2d | |||||
fmla v17.2d, v1.2d, v21.2d | |||||
st1 {v16.2d,v17.2d}, [Y], #32 | |||||
fmla v20.2d, v18.2d, v16.2d | |||||
#if !defined(CONJ) | |||||
fmls v20.2d, v19.2d, v17.2d | |||||
#else | |||||
fmla v20.2d, v19.2d, v17.2d | |||||
#endif | |||||
#if !defined(CONJ) | |||||
fmla v21.2d, v18.2d, v17.2d | |||||
#else | |||||
fmls v21.2d, v18.2d, v17.2d | |||||
#endif | |||||
fmla v21.2d, v19.2d, v16.2d | |||||
fmla v18.2d, v0.2d, v4.2d | |||||
fmla v19.2d, v0.2d, v5.2d | |||||
fmla v18.2d, v1.2d, v22.2d | |||||
fmla v19.2d, v1.2d, v23.2d | |||||
st1 {v18.2d,v19.2d}, [Y], #32 | |||||
st2 {v20.2d, v21.2d}, [Y], #32 | |||||
#endif | #endif | ||||
PRFM PLDL1KEEP, [X, #512] | PRFM PLDL1KEEP, [X, #512] | ||||
PRFM PLDL1KEEP, [Y, #512] | PRFM PLDL1KEEP, [Y, #512] | ||||
@@ -184,73 +184,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
fmul v16.2d, v0.2d, v8.2d[0] | fmul v16.2d, v0.2d, v8.2d[0] | ||||
OP_ii v16.2d, v1.2d, v9.2d[0] | OP_ii v16.2d, v1.2d, v9.2d[0] | ||||
fmul v17.2d, v0.2d, v9.2d[0] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v17.2d, v17.2d | |||||
eor v17.16b, v17.16b, v17.16b | |||||
fmls v17.2d, v0.2d, v9.2d[0] | |||||
#else | |||||
fmul v17.2d, v0.2d, v9.2d[0] | |||||
#endif | #endif | ||||
OP_ir v17.2d, v1.2d, v8.2d[0] | OP_ir v17.2d, v1.2d, v8.2d[0] | ||||
fmul v18.2d, v2.2d, v8.2d[0] | fmul v18.2d, v2.2d, v8.2d[0] | ||||
OP_ii v18.2d, v3.2d, v9.2d[0] | OP_ii v18.2d, v3.2d, v9.2d[0] | ||||
fmul v19.2d, v2.2d, v9.2d[0] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v19.2d, v19.2d | |||||
eor v19.16b, v19.16b, v19.16b | |||||
fmls v19.2d, v2.2d, v9.2d[0] | |||||
#else | |||||
fmul v19.2d, v2.2d, v9.2d[0] | |||||
#endif | #endif | ||||
OP_ir v19.2d, v3.2d, v8.2d[0] | OP_ir v19.2d, v3.2d, v8.2d[0] | ||||
fmul v20.2d, v0.2d, v8.2d[1] | fmul v20.2d, v0.2d, v8.2d[1] | ||||
OP_ii v20.2d, v1.2d, v9.2d[1] | OP_ii v20.2d, v1.2d, v9.2d[1] | ||||
fmul v21.2d, v0.2d, v9.2d[1] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v21.2d, v21.2d | |||||
eor v21.16b, v21.16b, v21.16b | |||||
fmls v21.2d, v0.2d, v9.2d[1] | |||||
#else | |||||
fmul v21.2d, v0.2d, v9.2d[1] | |||||
#endif | #endif | ||||
OP_ir v21.2d, v1.2d, v8.2d[1] | OP_ir v21.2d, v1.2d, v8.2d[1] | ||||
fmul v22.2d, v2.2d, v8.2d[1] | fmul v22.2d, v2.2d, v8.2d[1] | ||||
OP_ii v22.2d, v3.2d, v9.2d[1] | OP_ii v22.2d, v3.2d, v9.2d[1] | ||||
fmul v23.2d, v2.2d, v9.2d[1] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v23.2d, v23.2d | |||||
eor v23.16b, v23.16b, v23.16b | |||||
fmls v23.2d, v2.2d, v9.2d[1] | |||||
#else | |||||
fmul v23.2d, v2.2d, v9.2d[1] | |||||
#endif | #endif | ||||
OP_ir v23.2d, v3.2d, v8.2d[1] | OP_ir v23.2d, v3.2d, v8.2d[1] | ||||
fmul v24.2d, v0.2d, v10.2d[0] | fmul v24.2d, v0.2d, v10.2d[0] | ||||
OP_ii v24.2d, v1.2d, v11.2d[0] | OP_ii v24.2d, v1.2d, v11.2d[0] | ||||
fmul v25.2d, v0.2d, v11.2d[0] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v25.2d, v25.2d | |||||
eor v25.16b, v25.16b, v25.16b | |||||
fmls v25.2d, v0.2d, v11.2d[0] | |||||
#else | |||||
fmul v25.2d, v0.2d, v11.2d[0] | |||||
#endif | #endif | ||||
OP_ir v25.2d, v1.2d, v10.2d[0] | OP_ir v25.2d, v1.2d, v10.2d[0] | ||||
fmul v26.2d, v2.2d, v10.2d[0] | fmul v26.2d, v2.2d, v10.2d[0] | ||||
OP_ii v26.2d, v3.2d, v11.2d[0] | OP_ii v26.2d, v3.2d, v11.2d[0] | ||||
fmul v27.2d, v2.2d, v11.2d[0] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v27.2d, v27.2d | |||||
eor v27.16b, v27.16b, v27.16b | |||||
fmls v27.2d, v2.2d, v11.2d[0] | |||||
#else | |||||
fmul v27.2d, v2.2d, v11.2d[0] | |||||
#endif | #endif | ||||
OP_ir v27.2d, v3.2d, v10.2d[0] | OP_ir v27.2d, v3.2d, v10.2d[0] | ||||
fmul v28.2d, v0.2d, v10.2d[1] | fmul v28.2d, v0.2d, v10.2d[1] | ||||
OP_ii v28.2d, v1.2d, v11.2d[1] | OP_ii v28.2d, v1.2d, v11.2d[1] | ||||
fmul v29.2d, v0.2d, v11.2d[1] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v29.2d, v29.2d | |||||
eor v29.16b, v29.16b, v29.16b | |||||
fmls v29.2d, v0.2d, v11.2d[1] | |||||
#else | |||||
fmul v29.2d, v0.2d, v11.2d[1] | |||||
#endif | #endif | ||||
OP_ir v29.2d, v1.2d, v10.2d[1] | OP_ir v29.2d, v1.2d, v10.2d[1] | ||||
fmul v30.2d, v2.2d, v10.2d[1] | fmul v30.2d, v2.2d, v10.2d[1] | ||||
OP_ii v30.2d, v3.2d, v11.2d[1] | OP_ii v30.2d, v3.2d, v11.2d[1] | ||||
fmul v31.2d, v2.2d, v11.2d[1] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v31.2d, v31.2d | |||||
eor v31.16b, v31.16b, v31.16b | |||||
fmls v31.2d, v2.2d, v11.2d[1] | |||||
#else | |||||
fmul v31.2d, v2.2d, v11.2d[1] | |||||
#endif | #endif | ||||
OP_ir v31.2d, v3.2d, v10.2d[1] | OP_ir v31.2d, v3.2d, v10.2d[1] | ||||
@@ -110,15 +110,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
/******* INIT FOR F1 AND S1 LOOP ******/ | /******* INIT FOR F1 AND S1 LOOP ******/ | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) | |||||
fneg s2, ALPHA_I | |||||
ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) | |||||
eor v2.16b, v2.16b, v2.16b | |||||
fsub s2, s2, ALPHA_I | |||||
ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA) | ins v1.s[1], v2.s[0] // -I(ALPHA), I(ALPHA) | ||||
#if !defined(XCONJ) | #if !defined(XCONJ) | ||||
ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA) | ext v1.8b, v1.8b, v1.8b, #4 // I(ALPHA), -I(ALPHA) | ||||
#endif | #endif | ||||
#else | #else | ||||
ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA) | |||||
fneg d2, ALPHA_I | |||||
ins v0.d[1], v0.d[0] // R(ALPHA), R(ALPHA) | |||||
eor v2.16b, v2.16b, v2.16b | |||||
fsub d2, d2, ALPHA_I | |||||
ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA) | ins v1.d[1], v2.d[0] // -I(ALPHA), I(ALPHA) | ||||
#if !defined(XCONJ) | #if !defined(XCONJ) | ||||
ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA) | ext v1.16b, v1.16b, v1.16b, #8 // I(ALPHA), -I(ALPHA) | ||||
@@ -156,8 +158,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#else | #else | ||||
fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | ||||
fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] | fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] | ||||
fmul v12.4s, v9.4s, v8.4s // [R(X) * I(ALPHA)] | |||||
fneg v12.4s, v12.4s // [- R(X) * I(ALPHA)] | |||||
eor v12.16b, v12.16b, v12.16b | |||||
fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] | |||||
fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] | fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] | ||||
#endif | #endif | ||||
#endif // CONJ | #endif // CONJ | ||||
@@ -170,24 +172,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
ins v3.s[0], v2.s[1] | ins v3.s[0], v2.s[1] | ||||
#if !defined(CONJ) | #if !defined(CONJ) | ||||
#if !defined(XCONJ) | #if !defined(XCONJ) | ||||
fneg s4, s3 | |||||
eor v4.16b, v4.16b, v4.16b | |||||
fsub s4, s4, s3 | |||||
ins v3.s[1], v4.s[0] | ins v3.s[1], v4.s[0] | ||||
ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)] | ext v3.8b, v3.8b, v3.8b, #4 // [I(TEMP), -I(TEMP)] | ||||
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] | ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] | ||||
#else | #else | ||||
fneg s4, s3 | |||||
eor v4.16b, v4.16b, v4.16b | |||||
fsub s4, s4, s3 | |||||
ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)] | ins v3.s[1], v4.s[0] // [-I(TEMP), I(TEMP)] | ||||
ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] | ins v2.s[1], v2.s[0] // [R(TEMP), R(TEMP)] | ||||
#endif | #endif | ||||
#else // CONJ | #else // CONJ | ||||
#if !defined(XCONJ) | #if !defined(XCONJ) | ||||
ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)] | ins v3.s[1], v3.s[0] // [I(TEMP), I(TEMP)] | ||||
fneg s4, s2 | |||||
eor v4.16b, v4.16b, v4.16b | |||||
fsub s4, s4, s2 | |||||
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] | ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] | ||||
#else | #else | ||||
fneg s3, s3 | |||||
eor v4.16b, v4.16b, v4.16b | |||||
fsub s3, s4, s3 | |||||
ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)] | ins v3.s[1], v3.s[0] // [-I(TEMP), -I(TEMP)] | ||||
fneg s4, s2 | |||||
eor v4.16b, v4.16b, v4.16b | |||||
fsub s4, s4, s2 | |||||
ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] | ins v2.s[1], v4.s[0] // [-R(TEMP), R(TEMP)] | ||||
#endif | #endif | ||||
#endif // CONJ | #endif // CONJ | ||||
@@ -220,8 +227,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#else | #else | ||||
fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | ||||
fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] | fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] | ||||
fmul v12.2d, v9.2d, v8.2d // [R(X) * I(ALPHA)] | |||||
fneg v12.2d, v12.2d // [- R(X) * I(ALPHA)] | |||||
eor v12.16b, v12.16b, v12.16b | |||||
fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] | |||||
fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] | fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] | ||||
#endif | #endif | ||||
#endif // CONJ | #endif // CONJ | ||||
@@ -234,24 +241,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
ins v3.d[0], v2.d[1] // I(TEMP) | ins v3.d[0], v2.d[1] // I(TEMP) | ||||
#if !defined(CONJ) | #if !defined(CONJ) | ||||
#if !defined(XCONJ) | #if !defined(XCONJ) | ||||
fneg d4, d3 // -I(TEMP) | |||||
eor v4.16b, v4.16b, v4.16b | |||||
fsub d4, d4, d3 | |||||
ins v3.d[1], v4.d[0] | ins v3.d[1], v4.d[0] | ||||
ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)] | ext v3.16b, v3.16b, v3.16b, #8 // [I(TEMP), -I(TEMP)] | ||||
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] | ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] | ||||
#else | #else | ||||
fneg d4, d3 // -I(TEMP) | |||||
eor v4.16b, v4.16b, v4.16b | |||||
fsub d4, d4, d3 | |||||
ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)] | ins v3.d[1], v4.d[0] // [-I(TEMP), I(TEMP)] | ||||
ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] | ins v2.d[1], v2.d[0] // [R(TEMP), R(TEMP)] | ||||
#endif | #endif | ||||
#else // CONJ | #else // CONJ | ||||
#if !defined(XCONJ) | #if !defined(XCONJ) | ||||
ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)] | ins v3.d[1], v3.d[0] // [I(TEMP), I(TEMP)] | ||||
fneg d4, d2 // -R(TEMP) | |||||
eor v4.16b, v4.16b, v4.16b | |||||
fsub d4, d4, d2 | |||||
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] | ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] | ||||
#else | #else | ||||
fneg d3, d3 // -I(TEMP) | |||||
eor v4.16b, v4.16b, v4.16b | |||||
fsub d3, d4, d3 | |||||
ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)] | ins v3.d[1], v3.d[0] // [-I(TEMP), -I(TEMP)] | ||||
fneg d4, d2 // -R(TEMP) | |||||
eor v4.16b, v4.16b, v4.16b | |||||
fsub d4, d4, d2 | |||||
ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] | ins v2.d[1], v4.d[0] // [-R(TEMP), R(TEMP)] | ||||
#endif | #endif | ||||
#endif // CONJ | #endif // CONJ | ||||
@@ -96,22 +96,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#if !defined(XCONJ) | #if !defined(XCONJ) | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R | ins v0.s[1], v0.s[0] // v0 = ALPHA_R, ALPHA_R | ||||
fneg s2, ALPHA_I | |||||
eor v2.16b, v2.16b, v2.16b | |||||
fsub s2, s2, ALPHA_I | |||||
ins v1.s[1], v2.s[0] | ins v1.s[1], v2.s[0] | ||||
ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I | ext v1.8b, v1.8b, v1.8b, #4 // v1 = ALPHA_I, -ALPHA_I | ||||
#else | #else | ||||
ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R | ins v0.d[1], v0.d[0] // v0 = ALPHA_R, ALPHA_R | ||||
fneg d2, ALPHA_I | |||||
eor v2.16b, v2.16b, v2.16b | |||||
fsub d2, d2, ALPHA_I | |||||
ins v1.d[1], v2.d[0] | ins v1.d[1], v2.d[0] | ||||
ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I | ext v1.16b, v1.16b, v1.16b, #8 // v1 = ALPHA_I, -ALPHA_I | ||||
#endif | #endif | ||||
#else // XCONJ | #else // XCONJ | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
fneg s2, ALPHA_R | |||||
eor v2.16b, v2.16b, v2.16b | |||||
fsub s2, s2, ALPHA_R | |||||
ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R | ins v0.s[1], v2.s[0] // v0 = -ALPHA_R, ALPHA_R | ||||
ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I | ins v1.s[1], v1.s[0] // v1 = ALPHA_I, ALPHA_I | ||||
#else | #else | ||||
fneg d2, ALPHA_R | |||||
eor v2.16b, v2.16b, v2.16b | |||||
fsub d2, d2, ALPHA_R | |||||
ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R | ins v0.d[1], v2.d[0] // v0 = -ALPHA_R, ALPHA_R | ||||
ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I | ins v1.d[1], v1.d[0] // v1 = ALPHA_I, ALPHA_I | ||||
#endif | #endif | ||||
@@ -136,89 +140,51 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
ld2 {v11.4s, v12.4s}, [X_PTR], #32 | ld2 {v11.4s, v12.4s}, [X_PTR], #32 | ||||
ld2 {v13.4s, v14.4s}, [A_PTR], #32 | ld2 {v13.4s, v14.4s}, [A_PTR], #32 | ||||
#if !defined(CONJ) | |||||
#if !defined(XCONJ) | |||||
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | |||||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | ||||
fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] | fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] | ||||
fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] | fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] | ||||
fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] | fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] | ||||
#else | #else | ||||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||||
fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] | |||||
fmla v10.4s, v11.4s, v14.4s // [+ R(X) * A_I] | |||||
fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R] | |||||
#endif | |||||
#else // CONJ | |||||
#if !defined(XCONJ) | |||||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | ||||
fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] | fmla v9.4s, v12.4s, v14.4s // [+ I(X) * A_I] | ||||
fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] | fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] | ||||
fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] | fmla v10.4s, v12.4s, v13.4s // [+ I(X) * A_R] | ||||
#else | |||||
fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||||
fmls v9.4s, v12.4s, v14.4s // [- I(X) * A_I] | |||||
fmls v10.4s, v11.4s, v14.4s // [- R(X) * A_I] | |||||
fmls v10.4s, v12.4s, v13.4s // [- I(X) * A_R] | |||||
#endif | #endif | ||||
#endif // CONJ | |||||
#else // DOUBLE | #else // DOUBLE | ||||
ld2 {v11.2d, v12.2d}, [X_PTR], #32 | ld2 {v11.2d, v12.2d}, [X_PTR], #32 | ||||
ld2 {v13.2d, v14.2d}, [A_PTR], #32 | ld2 {v13.2d, v14.2d}, [A_PTR], #32 | ||||
prfm PLDL1STRM, [X_PTR, #512] | prfm PLDL1STRM, [X_PTR, #512] | ||||
#if !defined(CONJ) | |||||
#if !defined(XCONJ) | |||||
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | |||||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | ||||
fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] | fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] | ||||
fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] | fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] | ||||
fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] | fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] | ||||
#else | #else | ||||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||||
fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] | |||||
fmla v10.2d, v11.2d, v14.2d // [+ R(X) * A_I] | |||||
fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R] | |||||
#endif | |||||
#else // CONJ | |||||
#if !defined(XCONJ) | |||||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | ||||
fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] | fmla v9.2d, v12.2d, v14.2d // [+ I(X) * A_I] | ||||
fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] | fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] | ||||
fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] | fmla v10.2d, v12.2d, v13.2d // [+ I(X) * A_R] | ||||
#else | |||||
fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||||
fmls v9.2d, v12.2d, v14.2d // [- I(X) * A_I] | |||||
fmls v10.2d, v11.2d, v14.2d // [- R(X) * A_I] | |||||
fmls v10.2d, v12.2d, v13.2d // [- I(X) * A_R] | |||||
#endif | #endif | ||||
#endif // CONJ | |||||
ld2 {v17.2d, v18.2d}, [X_PTR], #32 | ld2 {v17.2d, v18.2d}, [X_PTR], #32 | ||||
ld2 {v19.2d, v20.2d}, [A_PTR], #32 | ld2 {v19.2d, v20.2d}, [A_PTR], #32 | ||||
prfm PLDL1STRM, [A_PTR, #512] | prfm PLDL1STRM, [A_PTR, #512] | ||||
#if !defined(CONJ) | |||||
#if !defined(XCONJ) | |||||
#if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | |||||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | ||||
fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] | fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] | ||||
fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] | fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] | ||||
fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | ||||
#else | #else | ||||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||||
fmla v15.2d, v18.2d, v20.2d // [- I(X) * A_I] | |||||
fmla v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] | |||||
fmls v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | |||||
#endif | |||||
#else // CONJ | |||||
#if !defined(XCONJ) | |||||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | ||||
fmla v15.2d, v18.2d, v20.2d // [- I(X) * A_I] | fmla v15.2d, v18.2d, v20.2d // [- I(X) * A_I] | ||||
fmls v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] | fmls v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] | ||||
fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | fmla v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | ||||
#else | |||||
fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||||
fmls v15.2d, v18.2d, v20.2d // [- I(X) * A_I] | |||||
fmls v16.2d, v17.2d, v20.2d // [+ R(X) * A_I] | |||||
fmls v16.2d, v18.2d, v19.2d // [+ I(X) * A_R] | |||||
#endif | #endif | ||||
#endif // CONJ | |||||
#endif //DOUBLE | #endif //DOUBLE | ||||
.endm | .endm | ||||
@@ -252,7 +218,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] | ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] | ||||
ld1 {v5.s}[0], [A_PTR], #4 // A1 | ld1 {v5.s}[0], [A_PTR], #4 // A1 | ||||
ld1 {v6.2s}, [X_PTR], #8 // [X1, X0] | ld1 {v6.2s}, [X_PTR], #8 // [X1, X0] | ||||
fneg s16, s5 | |||||
eor v16.16b, v16.16b, v16.16b | |||||
fsub s16, s16, s5 | |||||
ins v5.s[1], v16.s[0] // [-A1, A1] | ins v5.s[1], v16.s[0] // [-A1, A1] | ||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | ||||
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] | ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] | ||||
@@ -264,7 +231,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] | ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] | ||||
ld1 {v5.d}[0], [A_PTR], #8 // A1 | ld1 {v5.d}[0], [A_PTR], #8 // A1 | ||||
ld1 {v6.2d}, [X_PTR], #16 // [X1, X0] | ld1 {v6.2d}, [X_PTR], #16 // [X1, X0] | ||||
fneg d16, d5 | |||||
eor v16.16b, v16.16b, v16.16b | |||||
fsub d16, d16, d5 | |||||
ins v5.d[1], v16.d[0] // [-A1, A1] | ins v5.d[1], v16.d[0] // [-A1, A1] | ||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | ||||
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] | ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] | ||||
@@ -284,7 +252,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] | ld1r {v4.2s}, [A_PTR], #4 // [A0, A0] | ||||
ld1 {v5.s}[0], [A_PTR], #4 // A1 | ld1 {v5.s}[0], [A_PTR], #4 // A1 | ||||
ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0] | ld1 {v6.2s}, [X_PTR], INC_X // [X1, X0] | ||||
fneg s16, s5 | |||||
eor v16.16b, v16.16b, v16.16b | |||||
fsub s16, s16, s5 | |||||
ins v5.s[1], v16.s[0] // [-A1, A1] | ins v5.s[1], v16.s[0] // [-A1, A1] | ||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | ||||
ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] | ext v5.8b, v5.8b, v5.8b, #4 // [A1, -A1] | ||||
@@ -296,7 +265,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] | ld1r {v4.2d}, [A_PTR], #8 // [A0, A0] | ||||
ld1 {v5.d}[0], [A_PTR], #8 // A1 | ld1 {v5.d}[0], [A_PTR], #8 // A1 | ||||
ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0] | ld1 {v6.2d}, [X_PTR], INC_X // [X1, X0] | ||||
fneg d16, d5 | |||||
eor v16.16b, v16.16b, v16.16b | |||||
fsub d16, d16, d5 | |||||
ins v5.d[1], v16.d[0] // [-A1, A1] | ins v5.d[1], v16.d[0] // [-A1, A1] | ||||
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | ||||
ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] | ext v5.16b, v5.16b, v5.16b, #8 // [A1, -A1] | ||||
@@ -32,6 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
#define X x3 /* X vector address */ | #define X x3 /* X vector address */ | ||||
#define INC_X x4 /* X stride */ | #define INC_X x4 /* X stride */ | ||||
#define I x5 /* loop variable */ | #define I x5 /* loop variable */ | ||||
#define X_COPY x6 /* Copy of X */ | |||||
/******************************************************************************* | /******************************************************************************* | ||||
* Macro definitions | * Macro definitions | ||||
@@ -50,43 +51,55 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
.macro INIT | .macro INIT | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R | |||||
fneg s2, DA_I | |||||
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I | |||||
ext v1.8b, v1.8b, v1.8b, #4 // v1 = DA_I, -DA_I | |||||
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R | |||||
#else | #else | ||||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R | ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R | ||||
fneg d2, DA_I | |||||
ins v1.d[1], v2.d[0] // v1 = DA_I, DA_I | |||||
ext v1.16b, v1.16b, v1.16b, #8 // v1 = DA_I, -DA_I | |||||
#endif | #endif | ||||
.endm | .endm | ||||
.macro KERNEL_F1 | .macro KERNEL_F1 | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
ld1 {v2.2s}, [X] // X1, X0 | ld1 {v2.2s}, [X] // X1, X0 | ||||
ext v3.8b, v2.8b, v2.8b, #4 // X0, X1 | |||||
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 | |||||
fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
st1 {v2.2s}, [X], #8 | |||||
fmul s3, DA_R, v2.s[0] // DA_R*X0 | |||||
fmul s5, DA_I, v2.s[1] // DA_I*X1 | |||||
fsub s3, s3, s5 // DA_R*X0-DA_I*X1 | |||||
fmul s4, DA_I, v2.s[0] // DA_I*X0 | |||||
fmul s5, DA_R, v2.s[1] // DA_R*X1 | |||||
fadd s4, s4, s5 // DA_I*X0+DA_R*X1 | |||||
ins v3.s[1], v4.s[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
st1 {v3.2s}, [X], #8 | |||||
#else | #else | ||||
ld1 {v2.2d}, [X] // X1, X0 | ld1 {v2.2d}, [X] // X1, X0 | ||||
ext v3.16b, v2.16b, v2.16b, #8 // X0, X1 | |||||
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 | |||||
fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
st1 {v2.2d}, [X], #16 | |||||
#endif | |||||
fmul d3, DA_R, v2.d[0] // DA_R*X0 | |||||
fmul d5, DA_I, v2.d[1] // DA_I*X1 | |||||
fsub d3, d3, d5 // DA_R*X0-DA_I*X1 | |||||
fmul d4, DA_I, v2.d[0] // DA_I*X0 | |||||
fmul d5, DA_R, v2.d[1] // DA_R*X1 | |||||
fadd d4, d4, d5 // DA_I*X0+DA_R*X1 | |||||
ins v3.d[1], v4.d[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
st1 {v3.2d}, [X], #16 | |||||
#endif | |||||
.endm | .endm | ||||
.macro KERNEL_INIT_F4 | .macro KERNEL_INIT_F4 | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
// Replicate the lower 2 floats into the upper 2 slots | |||||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R, DA_R, DA_R | |||||
ins v1.d[1], v1.d[0] // v1 = DA_I, DA_I, DA_I, DA_I | |||||
ins v16.s[0], v0.s[0] | |||||
ins v16.s[1], v16.s[0] | |||||
ins v16.d[1], v16.d[0] | |||||
ins v17.s[0], v1.s[0] | |||||
ins v17.s[1], v17.s[0] | |||||
ins v17.d[1], v17.d[0] | |||||
#else //DOUBLE | |||||
ins v16.d[0], v0.d[0] | |||||
ins v16.d[1], v16.d[0] | |||||
ins v17.d[0], v1.d[0] | |||||
ins v17.d[1], v17.d[0] | |||||
#endif | #endif | ||||
.endm | .endm | ||||
@@ -94,46 +107,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
.macro KERNEL_F4 | .macro KERNEL_F4 | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
ld1 {v2.4s,v3.4s}, [X] // V2 = X[3], X[2], X[1], X[0] | |||||
// V3 = X[7], X[6], X[5], X[4] | |||||
ext v6.8b, v2.8b, v2.8b, #4 // V6 = - , - , X[0], X[1] | |||||
ins v6.s[2], v2.s[3] // V6 = - , X[3], X[0], X[1] | |||||
ins v6.s[3], v2.s[2] // V6 = X[2], X[3], X[0], X[1] | |||||
fmul v2.4s, v0.4s, v2.4s // X'[ix] += DA_R * X[ix] | |||||
// X'[ix+1] += DA_R * X[ix+1] | |||||
fmla v2.4s, v1.4s, v6.4s // X'[ix] += -DA_I * X[ix+1] | |||||
// X'[ix+1] += DA_I * X[ix] | |||||
ext v7.8b, v3.8b, v3.8b, #4 // V7 = - , - , X[4], X[5] | |||||
ins v7.s[2], v3.s[3] // V7 = - , X[7], X[4], X[5] | |||||
ins v7.s[3], v3.s[2] // V7 = X[6], X[7], X[4], X[5] | |||||
fmul v3.4s, v0.4s, v3.4s // X'[ix] += DA_R * X[ix] | |||||
// X'[ix+1] += DA_R * X[ix+1] | |||||
fmla v3.4s, v1.4s, v7.4s // X'[ix] += -DA_I * X[ix+1] | |||||
// X'[ix+1] += DA_I * X[ix] | |||||
st1 {v2.4s,v3.4s}, [X], #32 | |||||
ld2 {v2.4s, v3.4s}, [X], #32 | |||||
fmul v4.4s, v2.4s, v16.4s | |||||
fmul v6.4s, v3.4s, v17.4s | |||||
fsub v4.4s, v4.4s, v6.4s | |||||
fmul v5.4s, v2.4s, v17.4s | |||||
fmul v6.4s, v3.4s, v16.4s | |||||
fadd v5.4s, v5.4s, v6.4s | |||||
st2 {v4.4s, v5.4s}, [X_COPY], #32 | |||||
#else // DOUBLE | #else // DOUBLE | ||||
ld1 {v2.2d,v3.2d,v4.2d,v5.2d}, [X] // CX0, CX1, CX2, CX3 | |||||
ext v20.16b, v2.16b, v2.16b, #8 // X[ix], X[ix+1] | |||||
ext v21.16b, v3.16b, v3.16b, #8 // X[ix], X[ix+1] | |||||
ext v22.16b, v4.16b, v4.16b, #8 // X[ix], X[ix+1] | |||||
ext v23.16b, v5.16b, v5.16b, #8 // X[ix], X[ix+1] | |||||
ld2 {v2.2d, v3.2d}, [X], #32 | |||||
fmul v2.2d, v0.2d, v2.2d | |||||
fmla v2.2d, v1.2d, v20.2d | |||||
fmul v4.2d, v2.2d, v16.2d | |||||
fmul v6.2d, v3.2d, v17.2d | |||||
fsub v4.2d, v4.2d, v6.2d | |||||
fmul v5.2d, v2.2d, v17.2d | |||||
fmul v6.2d, v3.2d, v16.2d | |||||
fadd v5.2d, v5.2d, v6.2d | |||||
fmul v3.2d, v0.2d, v3.2d | |||||
fmla v3.2d, v1.2d, v21.2d | |||||
st1 {v2.2d,v3.2d}, [X], #32 | |||||
st2 {v4.2d, v5.2d}, [X_COPY], #32 | |||||
fmul v4.2d, v0.2d, v4.2d | |||||
fmla v4.2d, v1.2d, v22.2d | |||||
ld2 {v18.2d, v19.2d}, [X], #32 | |||||
fmul v5.2d, v0.2d, v5.2d | |||||
fmla v5.2d, v1.2d, v23.2d | |||||
st1 {v4.2d,v5.2d}, [X], #32 | |||||
fmul v20.2d, v18.2d, v16.2d | |||||
fmul v6.2d, v19.2d, v17.2d | |||||
fsub v20.2d, v20.2d, v6.2d | |||||
fmul v21.2d, v18.2d, v17.2d | |||||
fmul v6.2d, v19.2d, v16.2d | |||||
fadd v21.2d, v21.2d, v6.2d | |||||
st2 {v20.2d, v21.2d}, [X_COPY], #32 | |||||
#endif | #endif | ||||
PRFM PLDL1KEEP, [X, #1024] | PRFM PLDL1KEEP, [X, #1024] | ||||
.endm | .endm | ||||
@@ -149,21 +155,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
.endm | .endm | ||||
.macro KERNEL_S1 | .macro KERNEL_S1 | ||||
#if !defined(DOUBLE) | #if !defined(DOUBLE) | ||||
ld1 {v2.2s}, [X] // X1, X0 | ld1 {v2.2s}, [X] // X1, X0 | ||||
ext v3.8b, v2.8b, v2.8b, #4 // X0, X1 | |||||
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 | |||||
fmla v2.2s, v3.2s, v1.2s // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
st1 {v2.2s}, [X], INC_X | |||||
fmul s3, DA_R, v2.s[0] // DA_R*X0 | |||||
fmul s5, DA_I, v2.s[1] // DA_I*X1 | |||||
fsub s3, s3, s5 // DA_R*X0-DA_I*X1 | |||||
fmul s4, DA_I, v2.s[0] // DA_I*X0 | |||||
fmul s5, DA_R, v2.s[1] // DA_R*X1 | |||||
fadd s4, s4, s5 // DA_I*X0+DA_R*X1 | |||||
ins v3.s[1], v4.s[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
st1 {v3.2s}, [X], INC_X | |||||
#else | #else | ||||
ld1 {v2.2d}, [X] // X1, X0 | ld1 {v2.2d}, [X] // X1, X0 | ||||
ext v3.16b, v2.16b, v2.16b, #8 // X0, X1 | |||||
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 | |||||
fmla v2.2d, v3.2d, v1.2d // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
st1 {v2.2d}, [X], INC_X | |||||
#endif | |||||
fmul d3, DA_R, v2.d[0] // DA_R*X0 | |||||
fmul d5, DA_I, v2.d[1] // DA_I*X1 | |||||
fsub d3, d3, d5 // DA_R*X0-DA_I*X1 | |||||
fmul d4, DA_I, v2.d[0] // DA_I*X0 | |||||
fmul d5, DA_R, v2.d[1] // DA_R*X1 | |||||
fadd d4, d4, d5 // DA_I*X0+DA_R*X1 | |||||
ins v3.d[1], v4.d[0] // DA_R*X1+DA_I*X0, DA_R*X0-DA_I*X1 | |||||
st1 {v3.2d}, [X], INC_X | |||||
#endif | |||||
.endm | .endm | ||||
/******************************************************************************* | /******************************************************************************* | ||||
@@ -171,21 +187,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*******************************************************************************/ | *******************************************************************************/ | ||||
PROLOGUE | PROLOGUE | ||||
b zscal_begin | |||||
data_ar: | |||||
.word 0x3e44fae6 | |||||
data_ai: | |||||
.word 0x3d320fa2 | |||||
data_xr: | |||||
.word 0x3f4baff1 | |||||
data_xi: | |||||
.word 0xbe8ef0bd | |||||
zscal_begin: | |||||
ldr s20, data_ar | |||||
ldr s21, data_ai | |||||
ldr s22, data_xr | |||||
ldr s23, data_xi | |||||
fmul s24, s22, s21 | |||||
fmla s24, s23, v20.s[0] | |||||
fmul s25, s22, s21 | |||||
fmul s26, s23, s20 | |||||
fadd s25, s25, s26 | |||||
mov X_COPY, X | |||||
cmp N, xzr | cmp N, xzr | ||||
ble zscal_kernel_L999 | ble zscal_kernel_L999 | ||||
fcmp DA_R, #0.0 | fcmp DA_R, #0.0 | ||||
bne zscal_kernel_1 | |||||
bne zscal_kernel_R_non_zero | |||||
fcmp DA_I, #0.0 | fcmp DA_I, #0.0 | ||||
beq zscal_kernel_zero | |||||
beq zscal_kernel_RI_zero | |||||
// TODO: special case DA_R == 0 && DA_I != 0 | |||||
b zscal_kernel_R_zero | |||||
zscal_kernel_1: | |||||
zscal_kernel_R_non_zero: | |||||
// TODO: special case DA_R != 0 && DA_I == 0 | |||||
fcmp DA_I, #0.0 | |||||
beq zscal_kernel_I_zero | |||||
/******************************************************************************* | |||||
* A_R != 0 && A_I != 0 | |||||
*******************************************************************************/ | |||||
zscal_kernel_RI_non_zero: | |||||
INIT | INIT | ||||
@@ -257,16 +306,85 @@ zscal_kernel_L999: | |||||
mov w0, wzr | mov w0, wzr | ||||
ret | ret | ||||
zscal_kernel_zero: | |||||
/******************************************************************************* | |||||
* A_R == 0 && A_I != 0 | |||||
*******************************************************************************/ | |||||
zscal_kernel_R_zero: | |||||
INIT_S | |||||
#if !defined(DOUBLE) | |||||
eor v2.16b, v2.16b, v2.16b | |||||
fsub s2, s2, DA_I | |||||
ins v1.s[1], v2.s[0] // v1 = -DA_I, DA_I | |||||
#else | |||||
eor v2.16b, v2.16b, v2.16b | |||||
fsub d2, d2, DA_I | |||||
ins v1.d[1], v2.d[0] // v1 = -DA_I, DA_I | |||||
#endif | |||||
zscal_kernel_R_zero_1: | |||||
#if !defined(DOUBLE) | |||||
ld1 {v2.2s}, [X] // X1, X0 | |||||
fmul v2.2s, v2.2s, v1.2s // -DA_I*X1, DA_I*X0 | |||||
ext v2.8b, v2.8b, v2.8b, #4 // DA_I*X0, -DA_I*X1 | |||||
st1 {v2.2s}, [X] | |||||
#else | |||||
ld1 {v2.2d}, [X] // X1, X0 | |||||
fmul v2.2d, v2.2d, v1.2d // -DA_I*X1, DA_I*X0 | |||||
ext v2.16b, v2.16b, v2.16b, #8 // DA_I*X0, -DA_I*X1 | |||||
st1 {v2.2d}, [X] | |||||
#endif | |||||
add X, X, INC_X | |||||
subs N, N, #1 | |||||
bne zscal_kernel_R_zero_1 | |||||
mov w0, wzr | |||||
ret | |||||
/******************************************************************************* | |||||
* A_R != 0 && A_I == 0 | |||||
*******************************************************************************/ | |||||
zscal_kernel_I_zero: | |||||
INIT_S | |||||
#if !defined(DOUBLE) | |||||
ins v0.s[1], v0.s[0] // v0 = DA_R, DA_R | |||||
#else | |||||
ins v0.d[1], v0.d[0] // v0 = DA_R, DA_R | |||||
#endif | |||||
zscal_kernel_I_zero_1: | |||||
#if !defined(DOUBLE) | |||||
ld1 {v2.2s}, [X] // X1, X0 | |||||
fmul v2.2s, v2.2s, v0.2s // DA_R*X1, DA_R*X0 | |||||
st1 {v2.2s}, [X] | |||||
#else | |||||
ld1 {v2.2d}, [X] // X1, X0 | |||||
fmul v2.2d, v2.2d, v0.2d // DA_R*X1, DA_R*X0 | |||||
st1 {v2.2d}, [X] | |||||
#endif | |||||
add X, X, INC_X | |||||
subs N, N, #1 | |||||
bne zscal_kernel_I_zero_1 | |||||
mov w0, wzr | |||||
ret | |||||
/******************************************************************************* | |||||
* A_R == 0 && A_I == 0 | |||||
*******************************************************************************/ | |||||
zscal_kernel_RI_zero: | |||||
INIT_S | INIT_S | ||||
zscal_kernel_Z1: | |||||
zscal_kernel_RI_zero_1: | |||||
stp DA_R, DA_I, [X] | stp DA_R, DA_I, [X] | ||||
add X, X, INC_X | add X, X, INC_X | ||||
subs N, N, #1 | |||||
bne zscal_kernel_Z1 | |||||
subs N, N, #1 | |||||
bne zscal_kernel_RI_zero_1 | |||||
mov w0, wzr | mov w0, wzr | ||||
ret | ret | ||||
@@ -187,73 +187,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
fmul v16.2d, v0.2d, v8.2d[0] | fmul v16.2d, v0.2d, v8.2d[0] | ||||
OP_ii v16.2d, v1.2d, v9.2d[0] | OP_ii v16.2d, v1.2d, v9.2d[0] | ||||
fmul v17.2d, v0.2d, v9.2d[0] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v17.2d, v17.2d | |||||
eor v17.16b, v17.16b, v17.16b | |||||
fmls v17.2d, v0.2d, v9.2d[0] | |||||
#else | |||||
fmul v17.2d, v0.2d, v9.2d[0] | |||||
#endif | #endif | ||||
OP_ir v17.2d, v1.2d, v8.2d[0] | OP_ir v17.2d, v1.2d, v8.2d[0] | ||||
fmul v18.2d, v2.2d, v8.2d[0] | fmul v18.2d, v2.2d, v8.2d[0] | ||||
OP_ii v18.2d, v3.2d, v9.2d[0] | OP_ii v18.2d, v3.2d, v9.2d[0] | ||||
fmul v19.2d, v2.2d, v9.2d[0] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v19.2d, v19.2d | |||||
eor v19.16b, v19.16b, v19.16b | |||||
fmls v19.2d, v2.2d, v9.2d[0] | |||||
#else | |||||
fmul v19.2d, v2.2d, v9.2d[0] | |||||
#endif | #endif | ||||
OP_ir v19.2d, v3.2d, v8.2d[0] | OP_ir v19.2d, v3.2d, v8.2d[0] | ||||
fmul v20.2d, v0.2d, v8.2d[1] | fmul v20.2d, v0.2d, v8.2d[1] | ||||
OP_ii v20.2d, v1.2d, v9.2d[1] | OP_ii v20.2d, v1.2d, v9.2d[1] | ||||
fmul v21.2d, v0.2d, v9.2d[1] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v21.2d, v21.2d | |||||
eor v21.16b, v21.16b, v21.16b | |||||
fmls v21.2d, v0.2d, v9.2d[1] | |||||
#else | |||||
fmul v21.2d, v0.2d, v9.2d[1] | |||||
#endif | #endif | ||||
OP_ir v21.2d, v1.2d, v8.2d[1] | OP_ir v21.2d, v1.2d, v8.2d[1] | ||||
fmul v22.2d, v2.2d, v8.2d[1] | fmul v22.2d, v2.2d, v8.2d[1] | ||||
OP_ii v22.2d, v3.2d, v9.2d[1] | OP_ii v22.2d, v3.2d, v9.2d[1] | ||||
fmul v23.2d, v2.2d, v9.2d[1] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v23.2d, v23.2d | |||||
eor v23.16b, v23.16b, v23.16b | |||||
fmls v23.2d, v2.2d, v9.2d[1] | |||||
#else | |||||
fmul v23.2d, v2.2d, v9.2d[1] | |||||
#endif | #endif | ||||
OP_ir v23.2d, v3.2d, v8.2d[1] | OP_ir v23.2d, v3.2d, v8.2d[1] | ||||
fmul v24.2d, v0.2d, v10.2d[0] | fmul v24.2d, v0.2d, v10.2d[0] | ||||
OP_ii v24.2d, v1.2d, v11.2d[0] | OP_ii v24.2d, v1.2d, v11.2d[0] | ||||
fmul v25.2d, v0.2d, v11.2d[0] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v25.2d, v25.2d | |||||
eor v25.16b, v25.16b, v25.16b | |||||
fmls v25.2d, v0.2d, v11.2d[0] | |||||
#else | |||||
fmul v25.2d, v0.2d, v11.2d[0] | |||||
#endif | #endif | ||||
OP_ir v25.2d, v1.2d, v10.2d[0] | OP_ir v25.2d, v1.2d, v10.2d[0] | ||||
fmul v26.2d, v2.2d, v10.2d[0] | fmul v26.2d, v2.2d, v10.2d[0] | ||||
OP_ii v26.2d, v3.2d, v11.2d[0] | OP_ii v26.2d, v3.2d, v11.2d[0] | ||||
fmul v27.2d, v2.2d, v11.2d[0] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v27.2d, v27.2d | |||||
eor v27.16b, v27.16b, v27.16b | |||||
fmls v27.2d, v2.2d, v11.2d[0] | |||||
#else | |||||
fmul v27.2d, v2.2d, v11.2d[0] | |||||
#endif | #endif | ||||
OP_ir v27.2d, v3.2d, v10.2d[0] | OP_ir v27.2d, v3.2d, v10.2d[0] | ||||
fmul v28.2d, v0.2d, v10.2d[1] | fmul v28.2d, v0.2d, v10.2d[1] | ||||
OP_ii v28.2d, v1.2d, v11.2d[1] | OP_ii v28.2d, v1.2d, v11.2d[1] | ||||
fmul v29.2d, v0.2d, v11.2d[1] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v29.2d, v29.2d | |||||
eor v29.16b, v29.16b, v29.16b | |||||
fmls v29.2d, v0.2d, v11.2d[1] | |||||
#else | |||||
fmul v29.2d, v0.2d, v11.2d[1] | |||||
#endif | #endif | ||||
OP_ir v29.2d, v1.2d, v10.2d[1] | OP_ir v29.2d, v1.2d, v10.2d[1] | ||||
fmul v30.2d, v2.2d, v10.2d[1] | fmul v30.2d, v2.2d, v10.2d[1] | ||||
OP_ii v30.2d, v3.2d, v11.2d[1] | OP_ii v30.2d, v3.2d, v11.2d[1] | ||||
fmul v31.2d, v2.2d, v11.2d[1] | |||||
#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | ||||
defined(RR) || defined(RC) || defined(CR) || defined(CC) | defined(RR) || defined(RC) || defined(CR) || defined(CC) | ||||
fneg v31.2d, v31.2d | |||||
eor v31.16b, v31.16b, v31.16b | |||||
fmls v31.2d, v2.2d, v11.2d[1] | |||||
#else | |||||
fmul v31.2d, v2.2d, v11.2d[1] | |||||
#endif | #endif | ||||
OP_ir v31.2d, v3.2d, v10.2d[1] | OP_ir v31.2d, v3.2d, v10.2d[1] | ||||