Browse Source

Hoist the broadcast of the vector x values in gemv_n_sve.c out of the inner loop over rows and into the enclosing loop over columns, so that they are not reloaded on every inner iteration.

1. Verify correctness using BLAS-Tester.
    2. Verify performance using the built-in benchmark: performance for the float and double types improved by about 60% and about 40%, respectively. The test commands are:
     export OMP_NUM_THREADS=1;numactl -C 10 -l ./sgemv.goto 3000 4000 100
     export OMP_NUM_THREADS=1;numactl -C 10 -l ./dgemv.goto 3000 4000 100
pull/5420/head
yuanjia 1 month ago
parent
commit
803e8d4838
1 changed files with 6 additions and 12 deletions
  1. +6
    -12
      kernel/arm64/gemv_n_sve.c

+ 6
- 12
kernel/arm64/gemv_n_sve.c View File

@@ -69,13 +69,12 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
FLOAT *a2_ptr = a + lda * width * 2; FLOAT *a2_ptr = a + lda * width * 2;


for (j = 0; j < width; j++) { for (j = 0; j < width; j++) {
for (i = 0; (i + sve_size - 1) < m; i += sve_size) {
ix = j * inc_x;

SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]);
SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]);
SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]);
ix = j * inc_x;


SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]);
SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]);
SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]);
for (i = 0; (i + sve_size - 1) < m; i += sve_size) {
SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i); SV_TYPE a00_vec = svld1(pg_true, a0_ptr + i);
SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i); SV_TYPE a01_vec = svld1(pg_true, a1_ptr + i);
SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i); SV_TYPE a02_vec = svld1(pg_true, a2_ptr + i);
@@ -89,10 +88,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
} }


if (i < m) { if (i < m) {
SV_TYPE x0_vec = SV_DUP(alpha * x[ix + (inc_x * width * 0)]);
SV_TYPE x1_vec = SV_DUP(alpha * x[ix + (inc_x * width * 1)]);
SV_TYPE x2_vec = SV_DUP(alpha * x[ix + (inc_x * width * 2)]);

SV_TYPE a00_vec = svld1(pg, a0_ptr + i); SV_TYPE a00_vec = svld1(pg, a0_ptr + i);
SV_TYPE a01_vec = svld1(pg, a1_ptr + i); SV_TYPE a01_vec = svld1(pg, a1_ptr + i);
SV_TYPE a02_vec = svld1(pg, a2_ptr + i); SV_TYPE a02_vec = svld1(pg, a2_ptr + i);
@@ -115,9 +110,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
a_ptr = a2_ptr; a_ptr = a2_ptr;
for (j = width * 3; j < n; j++) { for (j = width * 3; j < n; j++) {
ix = j * inc_x; ix = j * inc_x;
SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]);
for (i = 0; (i + sve_size - 1) < m; i += sve_size) { for (i = 0; (i + sve_size - 1) < m; i += sve_size) {
SV_TYPE y_vec = svld1(pg_true, y + i); SV_TYPE y_vec = svld1(pg_true, y + i);
SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]);
SV_TYPE a_vec = svld1(pg_true, a_ptr + i); SV_TYPE a_vec = svld1(pg_true, a_ptr + i);
y_vec = svmla_x(pg_true, y_vec, a_vec, x_vec); y_vec = svmla_x(pg_true, y_vec, a_vec, x_vec);
svst1(pg_true, y + i, y_vec); svst1(pg_true, y + i, y_vec);
@@ -125,7 +120,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO


if (i < m) { if (i < m) {
SV_TYPE y_vec = svld1(pg, y + i); SV_TYPE y_vec = svld1(pg, y + i);
SV_TYPE x_vec = SV_DUP(alpha * x[(ix)]);
SV_TYPE a_vec = svld1(pg, a_ptr + i); SV_TYPE a_vec = svld1(pg, a_ptr + i);
y_vec = svmla_m(pg, y_vec, a_vec, x_vec); y_vec = svmla_m(pg, y_vec, a_vec, x_vec);
svst1(pg, y + i, y_vec); svst1(pg, y + i, y_vec);


Loading…
Cancel
Save