Browse Source

Restore riscv64 fixes from the develop branch: accumulate the DSDOT dot product in double precision, and fix NaN handling in zscal

tags/v0.3.27
Sergei Lewis 1 year ago
parent
commit
a3b0ef6596
4 changed files with 26 additions and 154 deletions
  1. +1
    -0
      Makefile.prebuild
  2. +10
    -0
      kernel/riscv64/dot.c
  3. +14
    -76
      kernel/riscv64/zscal_rvv.c
  4. +1
    -78
      kernel/riscv64/zscal_vector.c

+ 1
- 0
Makefile.prebuild View File

@@ -57,6 +57,7 @@ endif

ifeq ($(TARGET), CK860FV)
TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
endif

ifeq ($(TARGET), x280)
TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d


+ 10
- 0
kernel/riscv64/dot.c View File

@@ -44,14 +44,24 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;

#if defined(DSDOT)
double dot = 0.0 ;
#else
FLOAT dot = 0.0 ;
#endif

if ( n < 1 ) return(dot);

while(i < n)
{

#if defined(DSDOT)
dot += (double) y[iy] * (double) x[ix] ;
#else
dot += y[iy] * x[ix] ;
#endif

ix += inc_x ;
iy += inc_y ;
i++ ;


+ 14
- 76
kernel/riscv64/zscal_rvv.c View File

@@ -69,49 +69,26 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
size_t vlmax = VSETVL_MAX;
FLOAT_VX2_T vx2;

if(da_r == 0.0 && da_i == 0.0) {
if(inc_x == 1) {

vr = VFMVVF_FLOAT(0.0, vlmax);
vi = VFMVVF_FLOAT(0.0, vlmax);

if(inc_x == 1) {

for (size_t vl; n > 0; n -= vl, x += vl*2) {
vl = VSETVL(n);
vx2 = VSET_VX2(vx2, 0, vr);
vx2 = VSET_VX2(vx2, 1, vi);
VSSEG_FLOAT(x, vx2, vl);
}

} else {

for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
vl = VSETVL(n);
vx2 = VSET_VX2(vx2, 0, vr);
vx2 = VSET_VX2(vx2, 1, vi);
VSSSEG_FLOAT(x, stride_x, vx2, vl);
}
}

} else if(da_r == 0.0) {

for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
for (size_t vl; n > 0; n -= vl, x += vl*2) {
vl = VSETVL(n);
vx2 = VLSSEG_FLOAT(x, stride_x, vl);

vx2 = VLSEG_FLOAT(x, vl);
vr = VGET_VX2(vx2, 0);
vi = VGET_VX2(vx2, 1);

vt = VFMULVF_FLOAT(vi, -da_i, vl);
vi = VFMULVF_FLOAT(vr, da_i, vl);
vt = VFMULVF_FLOAT(vr, da_r, vl);
vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
vi = VFMULVF_FLOAT(vi, da_r, vl);
vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);

vx2 = VSET_VX2(vx2, 0, vt);
vx2 = VSET_VX2(vx2, 1, vi);

VSSSEG_FLOAT(x, stride_x, vx2, vl);
VSSEG_FLOAT(x, vx2, vl);
}

} else if(da_i == 0.0) {
} else {

for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
vl = VSETVL(n);
@@ -120,54 +97,15 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
vr = VGET_VX2(vx2, 0);
vi = VGET_VX2(vx2, 1);

vr = VFMULVF_FLOAT(vr, da_r, vl);
vt = VFMULVF_FLOAT(vr, da_r, vl);
vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
vi = VFMULVF_FLOAT(vi, da_r, vl);
vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);

vx2 = VSET_VX2(vx2, 0, vr);
vx2 = VSET_VX2(vx2, 0, vt);
vx2 = VSET_VX2(vx2, 1, vi);
VSSSEG_FLOAT(x, stride_x, vx2, vl);
}

} else {

if(inc_x == 1) {

for (size_t vl; n > 0; n -= vl, x += vl*2) {
vl = VSETVL(n);

vx2 = VLSEG_FLOAT(x, vl);
vr = VGET_VX2(vx2, 0);
vi = VGET_VX2(vx2, 1);

vt = VFMULVF_FLOAT(vr, da_r, vl);
vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
vi = VFMULVF_FLOAT(vi, da_r, vl);
vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);

vx2 = VSET_VX2(vx2, 0, vt);
vx2 = VSET_VX2(vx2, 1, vi);
VSSEG_FLOAT(x, vx2, vl);
}

} else {

for (size_t vl; n > 0; n -= vl, x += vl*inc_x*2) {
vl = VSETVL(n);

vx2 = VLSSEG_FLOAT(x, stride_x, vl);
vr = VGET_VX2(vx2, 0);
vi = VGET_VX2(vx2, 1);

vt = VFMULVF_FLOAT(vr, da_r, vl);
vt = VFNMSACVF_FLOAT(vt, da_i, vi, vl);
vi = VFMULVF_FLOAT(vi, da_r, vl);
vi = VFMACCVF_FLOAT(vi, da_i, vr, vl);

vx2 = VSET_VX2(vx2, 0, vt);
vx2 = VSET_VX2(vx2, 1, vi);
VSSSEG_FLOAT(x, stride_x, vx2, vl);
}
}
}

return(0);


+ 1
- 78
kernel/riscv64/zscal_vector.c View File

@@ -59,84 +59,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F

unsigned int gvl = 0;
FLOAT_V_T vt, v0, v1;
if(da_r == 0.0 && da_i == 0.0){
gvl = VSETVL(n);
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
BLASLONG inc_xv = inc_x * 2 * gvl;
vt = VFMVVF_FLOAT(0.0, gvl);
for(i=0,j=0; i < n/(gvl*2); i++){
VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl);
VSSEV_FLOAT(&x[ix+inc_xv], stride_x, vt, gvl);
VSSEV_FLOAT(&x[ix+inc_xv+1], stride_x, vt, gvl);

j += gvl*2;
ix += inc_xv*2;
}
for(; j < n; ){
gvl = VSETVL(n-j);
VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
VSSEV_FLOAT(&x[ix+1], stride_x, vt, gvl);
j += gvl;
ix += inc_x * 2 * gvl;
}
}else if(da_r == 0.0){
gvl = VSETVL(n);
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
BLASLONG inc_xv = inc_x * 2 * gvl;
for(i=0,j=0; i < n/gvl; i++){
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);

vt = VFMULVF_FLOAT(v1, -da_i, gvl);
v1 = VFMULVF_FLOAT(v0, da_i, gvl);

VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);

j += gvl;
ix += inc_xv;
}
if(j < n){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);

vt = VFMULVF_FLOAT(v1, -da_i, gvl);
v1 = VFMULVF_FLOAT(v0, da_i, gvl);

VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
}
}else if(da_i == 0.0){
gvl = VSETVL(n);
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
BLASLONG inc_xv = inc_x * 2 * gvl;
for(i=0,j=0; i < n/gvl; i++){
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);

vt = VFMULVF_FLOAT(v0, da_r, gvl);
v1 = VFMULVF_FLOAT(v1, da_r, gvl);

VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);

j += gvl;
ix += inc_xv;
}
if(j < n){
gvl = VSETVL(n-j);
v0 = VLSEV_FLOAT(&x[ix], stride_x, gvl);
v1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl);

vt = VFMULVF_FLOAT(v0, da_r, gvl);
v1 = VFMULVF_FLOAT(v1, da_r, gvl);

VSSEV_FLOAT(&x[ix], stride_x, vt, gvl);
VSSEV_FLOAT(&x[ix+1], stride_x, v1, gvl);
}
}else{
{
gvl = VSETVL(n);
BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT);
BLASLONG inc_xv = inc_x * 2 * gvl;


Loading…
Cancel
Save