|
|
@@ -1,4 +1,5 @@ |
|
|
|
/*************************************************************************** |
|
|
|
Copyright (c) 2023, The OpenBLAS Project |
|
|
|
Copyright (c) 2022, Arm Ltd |
|
|
|
All rights reserved. |
|
|
|
Redistribution and use in source and binary forms, with or without |
|
|
@@ -30,37 +31,84 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
|
|
#include <arm_sve.h> |
|
|
|
|
|
|
|
#ifdef DOUBLE |
|
|
|
#define SVE_TYPE svfloat64_t |
|
|
|
#define SVE_ZERO svdup_f64(0.0) |
|
|
|
#define SVE_WHILELT svwhilelt_b64 |
|
|
|
#define SVE_ALL svptrue_b64() |
|
|
|
#define SVE_WIDTH svcntd() |
|
|
|
#define DTYPE "d" |
|
|
|
#define WIDTH "d" |
|
|
|
#define SHIFT "3" |
|
|
|
#else |
|
|
|
#define SVE_TYPE svfloat32_t |
|
|
|
#define SVE_ZERO svdup_f32(0.0) |
|
|
|
#define SVE_WHILELT svwhilelt_b32 |
|
|
|
#define SVE_ALL svptrue_b32() |
|
|
|
#define SVE_WIDTH svcntw() |
|
|
|
#define DTYPE "s" |
|
|
|
#define WIDTH "w" |
|
|
|
#define SHIFT "2" |
|
|
|
#endif |
|
|
|
|
|
|
|
static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) { |
|
|
|
SVE_TYPE acc_a = SVE_ZERO; |
|
|
|
SVE_TYPE acc_b = SVE_ZERO; |
|
|
|
#define COUNT \ |
|
|
|
" cnt"WIDTH" x9 \n" |
|
|
|
#define SETUP_TRUE \ |
|
|
|
" ptrue p0."DTYPE" \n" |
|
|
|
#define OFFSET_INPUTS \ |
|
|
|
" add x12, %[X_], x9, lsl #"SHIFT" \n" \ |
|
|
|
" add x13, %[Y_], x9, lsl #"SHIFT" \n" |
|
|
|
#define TAIL_WHILE \ |
|
|
|
" whilelo p1."DTYPE", x8, x0 \n" |
|
|
|
#define UPDATE(pg, x,y,out) \ |
|
|
|
" ld1"WIDTH" { z2."DTYPE" }, "pg"/z, ["x", x8, lsl #"SHIFT"] \n" \ |
|
|
|
" ld1"WIDTH" { z3."DTYPE" }, "pg"/z, ["y", x8, lsl #"SHIFT"] \n" \ |
|
|
|
" fmla "out"."DTYPE", "pg"/m, z2."DTYPE", z3."DTYPE" \n" |
|
|
|
#define SUM_VECTOR(v) \ |
|
|
|
" faddv "DTYPE""v", p0, z"v"."DTYPE" \n" |
|
|
|
#define RET \ |
|
|
|
" fadd %"DTYPE"[RET_], "DTYPE"1, "DTYPE"0 \n" |
|
|
|
|
|
|
|
BLASLONG sve_width = SVE_WIDTH; |
|
|
|
#define DOT_KERNEL \ |
|
|
|
COUNT \ |
|
|
|
" mov z1.d, #0 \n" \ |
|
|
|
" mov z0.d, #0 \n" \ |
|
|
|
" mov x8, #0 \n" \ |
|
|
|
" movi d1, #0x0 \n" \ |
|
|
|
SETUP_TRUE \ |
|
|
|
" neg x10, x9, lsl #1 \n" \ |
|
|
|
" ands x11, x10, x0 \n" \ |
|
|
|
" b.eq 2f // skip_2x \n" \ |
|
|
|
OFFSET_INPUTS \ |
|
|
|
"1: // vector_2x \n" \ |
|
|
|
UPDATE("p0", "%[X_]", "%[Y_]", "z1") \ |
|
|
|
UPDATE("p0", "x12", "x13", "z0") \ |
|
|
|
" sub x8, x8, x10 \n" \ |
|
|
|
" cmp x8, x11 \n" \ |
|
|
|
" b.lo 1b // vector_2x \n" \ |
|
|
|
SUM_VECTOR("1") \ |
|
|
|
"2: // skip_2x \n" \ |
|
|
|
" neg x10, x9 \n" \ |
|
|
|
" and x10, x10, x0 \n" \ |
|
|
|
" cmp x8, x10 \n" \ |
|
|
|
" b.hs 4f // tail \n" \ |
|
|
|
"3: // vector_1x \n" \ |
|
|
|
UPDATE("p0", "%[X_]", "%[Y_]", "z0") \ |
|
|
|
" add x8, x8, x9 \n" \ |
|
|
|
" cmp x8, x10 \n" \ |
|
|
|
" b.lo 3b // vector_1x \n" \ |
|
|
|
"4: // tail \n" \ |
|
|
|
" cmp x10, x0 \n" \ |
|
|
|
" b.eq 5f // end \n" \ |
|
|
|
TAIL_WHILE \ |
|
|
|
UPDATE("p1", "%[X_]", "%[Y_]", "z0") \ |
|
|
|
"5: // end \n" \ |
|
|
|
SUM_VECTOR("0") \ |
|
|
|
RET |
|
|
|
|
|
|
|
for (BLASLONG i = 0; i < n; i += sve_width * 2) { |
|
|
|
svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n); |
|
|
|
svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n); |
|
|
|
static |
|
|
|
FLOAT |
|
|
|
dot_kernel_sve(BLASLONG n, FLOAT* x, FLOAT* y) |
|
|
|
{ |
|
|
|
FLOAT ret; |
|
|
|
|
|
|
|
SVE_TYPE x_vec_a = svld1(pg_a, &x[i]); |
|
|
|
SVE_TYPE y_vec_a = svld1(pg_a, &y[i]); |
|
|
|
SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]); |
|
|
|
SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]); |
|
|
|
asm(DOT_KERNEL |
|
|
|
: |
|
|
|
[RET_] "=&w" (ret) |
|
|
|
: |
|
|
|
[N_] "r" (n), |
|
|
|
[X_] "r" (x), |
|
|
|
[Y_] "r" (y) |
|
|
|
:); |
|
|
|
|
|
|
|
acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a); |
|
|
|
acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b); |
|
|
|
} |
|
|
|
|
|
|
|
return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b); |
|
|
|
return ret; |
|
|
|
} |