Browse Source

kernel/riscv64: Fix openblas_utest_ext failures in the c/zgemv tests and some c/zgbmv tests

tags/v0.3.30
guoyuanplct 4 months ago
parent
commit
9a7e3f102b
1 changed file with 63 additions and 20 deletions
  1. +63
    -20
      kernel/riscv64/zgemv_n_vector.c

+ 63
- 20
kernel/riscv64/zgemv_n_vector.c View File

@@ -66,7 +66,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
BLASLONG lda2 = lda * 2;
vy0_new = VLSEV_FLOAT(&y[iy], stride_y, gvl);
vy1_new = VLSEV_FLOAT(&y[iy + 1], stride_y, gvl);
for (k = 0, j = 0; k < m / gvl; k++)
for (k = 0, j = 0; k < m / gvl; k ++)
{
a_ptr = a;
ix = 0;
@@ -121,30 +121,73 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
#endif
a_ptr += lda2;
ix += inc_x2;
}

for (; i < n; i += 4)
for (i = n % 4 ; i < n; i += 4)
{
#if !defined(XCONJ)

x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4);
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4);
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4);
temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 4);
temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 4);
VSEV_FLOAT(&temp_rr[0], temp_rv, 4);
VSEV_FLOAT(&temp_ii[0], temp_iv, 4);
// temp_rr[0] = alpha_r * x[ix] - alpha_i * x[ix + 1];
// temp_rr[1] = alpha_r * x[ix + inc_x2] - alpha_i * x[ix + inc_x2 + 1];
x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 2);
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 2);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2);
temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 2);

// temp_ii[0] = alpha_r * x[ix + 1] + alpha_i * x[ix];
// temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] + alpha_i * x[ix + inc_x2];
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 2);
temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 2);
VSEV_FLOAT(&temp_rr[0], temp_rv, 2);
VSEV_FLOAT(&temp_ii[0], temp_iv, 2);
// temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] - alpha_i * x[ix + inc_x2 * 2 + 1];
// temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] - alpha_i * x[ix + inc_x2 * 3 + 1];
x_v0 = VLSEV_FLOAT(&x[ix + inc_x2 * 2], inc_x2 * sizeof(FLOAT), 2);
x_v1 = VLSEV_FLOAT(&x[ix + inc_x2 * 2 + 1], inc_x2 * sizeof(FLOAT), 2);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2);
temp_rv = VFNMSACVF_FLOAT(temp_rv, alpha_i, x_v1, 2);

// temp_ii[2] = alpha_r * x[ix + inc_x2 * 2 + 1] + alpha_i * x[ix + inc_x2 * 2];
// temp_ii[3] = alpha_r * x[ix + inc_x2 * 3 + 1] + alpha_i * x[ix + inc_x2 * 3];
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 2);
temp_iv = VFMACCVF_FLOAT(temp_iv, alpha_r, x_v1, 2);
VSEV_FLOAT(&temp_rr[2], temp_rv, 2);
VSEV_FLOAT(&temp_ii[2], temp_iv, 2);

#else
x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 4);
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 4);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 4);
temp_iv = VFMUL_VF_FLOAT(x_v0, alpha_i, 4);
temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 4);
temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_r, x_v1, 4);
VSEV_FLOAT(&temp_rr[0], temp_rv, 4);
VSEV_FLOAT(&temp_ii[0], temp_iv, 4);
// temp_rr[0] = alpha_r * x[ix] + alpha_i * x[ix + 1];
// temp_rr[1] = alpha_r * x[ix + inc_x2] + alpha_i * x[ix + inc_x2 + 1];
x_v0 = VLSEV_FLOAT(&x[ix], inc_x2 * sizeof(FLOAT), 2);
x_v1 = VLSEV_FLOAT(&x[ix + 1], inc_x2 * sizeof(FLOAT), 2);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2);
temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 2);


// temp_ii[0] = alpha_r * x[ix + 1] - alpha_i * x[ix];
// temp_ii[1] = alpha_r * x[ix + inc_x2 + 1] - alpha_i * x[ix + inc_x2];
temp_iv = VFMUL_VF_FLOAT(x_v1, alpha_r, 2);
temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_i, x_v0, 2);
VSEV_FLOAT(&temp_rr[0], temp_rv, 2);
VSEV_FLOAT(&temp_ii[0], temp_iv, 2);

// temp_rr[2] = alpha_r * x[ix + inc_x2 * 2] + alpha_i * x[ix + inc_x2 * 2 + 1];
// temp_rr[3] = alpha_r * x[ix + inc_x2 * 3] + alpha_i * x[ix + inc_x2 * 3 + 1];
x_v0 = VLSEV_FLOAT(&x[ix + inc_x2 * 2], inc_x2 * sizeof(FLOAT), 2);
x_v1 = VLSEV_FLOAT(&x[ix + inc_x2 * 2 + 1], inc_x2 * sizeof(FLOAT), 2);
temp_rv = VFMUL_VF_FLOAT(x_v0, alpha_r, 2);
temp_rv = VFMACCVF_FLOAT(temp_rv, alpha_i, x_v1, 2);


temp_ii[2] = alpha_r * x[ix + inc_x2 * 2 + 1] - alpha_i * x[ix + inc_x2 * 2];
temp_ii[3] = alpha_r * x[ix + inc_x2 * 3 + 1] - alpha_i * x[ix + inc_x2 * 3];
temp_iv = VFMUL_VF_FLOAT(x_v1, alpha_r, 2);
temp_iv = VFNMSACVF_FLOAT(temp_iv, alpha_i, x_v0, 2);
VSEV_FLOAT(&temp_rr[2], temp_rv, 2);
VSEV_FLOAT(&temp_ii[2], temp_iv, 2);



#endif

@@ -257,7 +300,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl);
VSSEV_FLOAT(&y[iy + 1], stride_y, vy1, gvl);
j += gvl * 2;
iy += inc_yv;
iy += inc_yv ;
}
// tail
if (j / 2 < m)


Loading…
Cancel
Save