Further rearranged the rotm kernel for the different architectures. (tags/v0.3.30)
@@ -79,6 +79,9 @@ macro(SetDefaultL1) | |||
SetFallback(CROTKERNEL zrot.S) | |||
SetFallback(ZROTKERNEL zrot.S) | |||
SetFallback(XROTKERNEL zrot.S) | |||
SetFallback(SROTMKERNEL rotm.S) | |||
SetFallback(DROTMKERNEL rotm.S) | |||
SetFallback(QROTMKERNEL rotm.S) | |||
SetFallback(SSCALKERNEL scal.S) | |||
SetFallback(DSCALKERNEL scal.S) | |||
SetFallback(CSCALKERNEL zscal.S) | |||
@@ -22,6 +22,7 @@ | |||
#define DSUM_K dsum_k | |||
#define DSWAP_K dswap_k | |||
#define DROT_K drot_k | |||
#define DROTM_K drotm_k | |||
#define DGEMV_N dgemv_n | |||
#define DGEMV_T dgemv_t | |||
@@ -180,6 +181,7 @@ | |||
#define DSUM_K gotoblas -> dsum_k | |||
#define DSWAP_K gotoblas -> dswap_k | |||
#define DROT_K gotoblas -> drot_k | |||
#define DROTM_K gotoblas -> drotm_k | |||
#define DGEMV_N gotoblas -> dgemv_n | |||
#define DGEMV_T gotoblas -> dgemv_t | |||
@@ -213,9 +213,9 @@ int srotmg_k(float *, float *, float *, float *, float *); | |||
int drotmg_k(double *, double *, double *, double *, double *); | |||
int qrotmg_k(xdouble *, xdouble *, xdouble *, xdouble *, xdouble *); | |||
int srotm_k (BLASLONG, float, BLASLONG, float, BLASLONG, float); | |||
int drotm_k (BLASLONG, double, BLASLONG, double, BLASLONG, double); | |||
int qrotm_k (BLASLONG, xdouble, BLASLONG, xdouble, BLASLONG, xdouble); | |||
int srotm_k (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
int drotm_k (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); | |||
int qrotm_k (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); | |||
int saxpby_k (BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); | |||
@@ -70,6 +70,7 @@ | |||
#define SUM_K QSUM_K | |||
#define SWAP_K QSWAP_K | |||
#define ROT_K QROT_K | |||
#define ROTM_K QROTM_K | |||
#define GEMV_N QGEMV_N | |||
#define GEMV_T QGEMV_T | |||
@@ -361,6 +362,7 @@ | |||
#define SUM_K DSUM_K | |||
#define SWAP_K DSWAP_K | |||
#define ROT_K DROT_K | |||
#define ROTM_K DROTM_K | |||
#define GEMV_N DGEMV_N | |||
#define GEMV_T DGEMV_T | |||
@@ -977,6 +979,7 @@ | |||
#define SUM_K SSUM_K | |||
#define SWAP_K SSWAP_K | |||
#define ROT_K SROT_K | |||
#define ROTM_K SROTM_K | |||
#define GEMV_N SGEMV_N | |||
#define GEMV_T SGEMV_T | |||
@@ -197,6 +197,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
//double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | |||
int (*srotm_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
#endif | |||
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) | |||
int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
@@ -330,6 +331,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); | |||
#endif | |||
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) | |||
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | |||
int (*drotm_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); | |||
int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
@@ -439,6 +441,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); | |||
int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); | |||
int (*qrotm_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); | |||
int (*qaxpy_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
int (*qscal_k) (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
@@ -22,6 +22,7 @@ | |||
#define QSUM_K qsum_k | |||
#define QSWAP_K qswap_k | |||
#define QROT_K qrot_k | |||
#define QROTM_K qrotm_k | |||
#define QGEMV_N qgemv_n | |||
#define QGEMV_T qgemv_t | |||
@@ -165,6 +166,7 @@ | |||
#define QSUM_K gotoblas -> qsum_k | |||
#define QSWAP_K gotoblas -> qswap_k | |||
#define QROT_K gotoblas -> qrot_k | |||
#define QROTM_K gotoblas -> qrotm_k | |||
#define QGEMV_N gotoblas -> qgemv_n | |||
#define QGEMV_T gotoblas -> qgemv_t | |||
@@ -24,6 +24,7 @@ | |||
#define SSCAL_K sscal_k | |||
#define SSWAP_K sswap_k | |||
#define SROT_K srot_k | |||
#define SROTM_K srotm_k | |||
#define SGEMV_N sgemv_n | |||
#define SGEMV_T sgemv_t | |||
@@ -189,6 +190,7 @@ | |||
#define SSCAL_K gotoblas -> sscal_k | |||
#define SSWAP_K gotoblas -> sswap_k | |||
#define SROT_K gotoblas -> srot_k | |||
#define SROTM_K gotoblas -> srotm_k | |||
#define SGEMV_N gotoblas -> sgemv_n | |||
#define SGEMV_T gotoblas -> sgemv_t | |||
@@ -7,149 +7,21 @@ | |||
void NAME(blasint *N, FLOAT *dx, blasint *INCX, FLOAT *dy, blasint *INCY, FLOAT *dparam){ | |||
blasint n = *N; | |||
blasint incx = *INCX; | |||
blasint incy = *INCY; | |||
blasint n = *N; | |||
blasint incx = *INCX; | |||
blasint incy = *INCY; | |||
PRINT_DEBUG_NAME | |||
#else | |||
void CNAME(blasint n, FLOAT *dx, blasint incx, FLOAT *dy, blasint incy, FLOAT *dparam){ | |||
#endif | |||
blasint i__1, i__2; | |||
PRINT_DEBUG_CNAME; | |||
blasint i__; | |||
FLOAT w, z__; | |||
blasint kx, ky; | |||
FLOAT dh11, dh12, dh22, dh21, dflag; | |||
blasint nsteps; | |||
#ifndef CBLAS | |||
PRINT_DEBUG_CNAME; | |||
#else | |||
PRINT_DEBUG_CNAME; | |||
#endif | |||
--dparam; | |||
--dy; | |||
--dx; | |||
dflag = dparam[1]; | |||
if (n <= 0 || dflag == - 2.0) goto L140; | |||
if (! (incx == incy && incx > 0)) goto L70; | |||
nsteps = n * incx; | |||
if (dflag < 0.) { | |||
goto L50; | |||
} else if (dflag == 0) { | |||
goto L10; | |||
} else { | |||
goto L30; | |||
} | |||
L10: | |||
dh12 = dparam[4]; | |||
dh21 = dparam[3]; | |||
i__1 = nsteps; | |||
i__2 = incx; | |||
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { | |||
w = dx[i__]; | |||
z__ = dy[i__]; | |||
dx[i__] = w + z__ * dh12; | |||
dy[i__] = w * dh21 + z__; | |||
/* L20: */ | |||
} | |||
goto L140; | |||
L30: | |||
dh11 = dparam[2]; | |||
dh22 = dparam[5]; | |||
i__2 = nsteps; | |||
i__1 = incx; | |||
for (i__ = 1; i__1 < 0 ? i__ >= i__2 : i__ <= i__2; i__ += i__1) { | |||
w = dx[i__]; | |||
z__ = dy[i__]; | |||
dx[i__] = w * dh11 + z__; | |||
dy[i__] = -w + dh22 * z__; | |||
/* L40: */ | |||
} | |||
goto L140; | |||
L50: | |||
dh11 = dparam[2]; | |||
dh12 = dparam[4]; | |||
dh21 = dparam[3]; | |||
dh22 = dparam[5]; | |||
i__1 = nsteps; | |||
i__2 = incx; | |||
for (i__ = 1; i__2 < 0 ? i__ >= i__1 : i__ <= i__1; i__ += i__2) { | |||
w = dx[i__]; | |||
z__ = dy[i__]; | |||
dx[i__] = w * dh11 + z__ * dh12; | |||
dy[i__] = w * dh21 + z__ * dh22; | |||
/* L60: */ | |||
} | |||
goto L140; | |||
L70: | |||
kx = 1; | |||
ky = 1; | |||
if (incx < 0) { | |||
kx = (1 - n) * incx + 1; | |||
} | |||
if (incy < 0) { | |||
ky = (1 - n) * incy + 1; | |||
} | |||
ROTM_K(n, dx, incx, dy, incy, dparam); | |||
if (dflag < 0.) { | |||
goto L120; | |||
} else if (dflag == 0) { | |||
goto L80; | |||
} else { | |||
goto L100; | |||
} | |||
L80: | |||
dh12 = dparam[4]; | |||
dh21 = dparam[3]; | |||
i__2 = n; | |||
for (i__ = 1; i__ <= i__2; ++i__) { | |||
w = dx[kx]; | |||
z__ = dy[ky]; | |||
dx[kx] = w + z__ * dh12; | |||
dy[ky] = w * dh21 + z__; | |||
kx += incx; | |||
ky += incy; | |||
/* L90: */ | |||
} | |||
goto L140; | |||
L100: | |||
dh11 = dparam[2]; | |||
dh22 = dparam[5]; | |||
i__2 = n; | |||
for (i__ = 1; i__ <= i__2; ++i__) { | |||
w = dx[kx]; | |||
z__ = dy[ky]; | |||
dx[kx] = w * dh11 + z__; | |||
dy[ky] = -w + dh22 * z__; | |||
kx += incx; | |||
ky += incy; | |||
/* L110: */ | |||
} | |||
goto L140; | |||
L120: | |||
dh11 = dparam[2]; | |||
dh12 = dparam[4]; | |||
dh21 = dparam[3]; | |||
dh22 = dparam[5]; | |||
i__2 = n; | |||
for (i__ = 1; i__ <= i__2; ++i__) { | |||
w = dx[kx]; | |||
z__ = dy[ky]; | |||
dx[kx] = w * dh11 + z__ * dh12; | |||
dy[ky] = w * dh21 + z__ * dh22; | |||
kx += incx; | |||
ky += incy; | |||
/* L130: */ | |||
} | |||
L140: | |||
return; | |||
} | |||
@@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}COPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}NRM2KERNEL}" "" "nrm2_k" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTKERNEL}" "" "rot_k" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}ROTMKERNEL}" "" "rotm_k" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) | |||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) | |||
@@ -125,6 +126,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
GenerateNamedObjects("${KERNELDIR}/${SNRM2KERNEL}" "" "nrm2_k" false "" "" false "SINGLE") | |||
GenerateNamedObjects("${KERNELDIR}/${SDOTKERNEL}" "" "dot_k" false "" "" false "SINGLE") | |||
GenerateNamedObjects("${KERNELDIR}/${SROTKERNEL}" "" "rot_k" false "" "" false "SINGLE") | |||
GenerateNamedObjects("${KERNELDIR}/${SROTMKERNEL}" "" "rotm_k" false "" "" false "SINGLE") | |||
endif () | |||
if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | |||
GenerateNamedObjects("${KERNELDIR}/${DAMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false "DOUBLE") | |||
@@ -148,6 +150,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") | |||
@@ -1105,6 +1108,7 @@ endif () | |||
GenerateNamedObjects("${KERNELDIR}/${DCOPYKERNEL}" "C_INTERFACE" "copy_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DNRM2KERNEL}" "" "nrm2_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DROTKERNEL}" "" "rot_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DROTMKERNEL}" "" "rotm_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DDOTKERNEL}" "" "dot_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DSWAPKERNEL}" "" "swap_k" false "" "" false "DOUBLE") | |||
GenerateNamedObjects("${KERNELDIR}/${DAXPYKERNEL}" "" "axpy_k" false "" "" false "DOUBLE") | |||
@@ -336,6 +336,18 @@ ifndef XROTKERNEL | |||
XROTKERNEL = zrot.S | |||
endif | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = rotm.S | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = rotm.S | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = rotm.S | |||
endif | |||
### SCAL ### | |||
ifndef SSCALKERNEL | |||
@@ -504,21 +516,21 @@ SBLASOBJS += \ | |||
sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | |||
sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ | |||
snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ | |||
saxpby_k$(TSUFFIX).$(SUFFIX) | |||
saxpby_k$(TSUFFIX).$(SUFFIX) srotm_k$(TSUFFIX).$(SUFFIX) | |||
DBLASOBJS += \ | |||
damax_k$(TSUFFIX).$(SUFFIX) damin_k$(TSUFFIX).$(SUFFIX) dmax_k$(TSUFFIX).$(SUFFIX) dmin_k$(TSUFFIX).$(SUFFIX) \ | |||
idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ | |||
dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ | |||
dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ | |||
daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) | |||
daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) drotm_k$(TSUFFIX).$(SUFFIX) | |||
QBLASOBJS += \ | |||
qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ | |||
iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ | |||
qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ | |||
qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ | |||
qsum_k$(TSUFFIX).$(SUFFIX) | |||
qsum_k$(TSUFFIX).$(SUFFIX) qrotm_k$(TSUFFIX).$(SUFFIX) | |||
CBLASOBJS += \ | |||
camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ | |||
@@ -842,7 +854,16 @@ $(KDIR)drot_k$(TSUFFIX).$(SUFFIX) $(KDIR)drot_k$(TPSUFFIX).$(PSUFFIX) : $(KERN | |||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | |||
$(KDIR)qrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTKERNEL) | |||
$(CC) -c $(CFLAGS) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||
$(KDIR)srotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)srotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SROTMKERNEL) | |||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -UDOUBLE $< -o $@ | |||
$(KDIR)drotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)drotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DROTMKERNEL) | |||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DDOUBLE $< -o $@ | |||
$(KDIR)qrotm_k$(TSUFFIX).$(SUFFIX) $(KDIR)qrotm_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QROTMKERNEL) | |||
$(CC) -c $(CFLAGS) $(FMAFLAG) -UCOMPLEX -UCOMPLEX -DXDOUBLE $< -o $@ | |||
$(KDIR)csrot_k$(TSUFFIX).$(SUFFIX) $(KDIR)csrot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CROTKERNEL) | |||
$(CC) -c $(CFLAGS) -DCOMPLEX -DCOMPLEX -UDOUBLE $< -o $@ | |||
@@ -122,3 +122,15 @@ ZTRSMKERNEL_LN = ztrsm_kernel_2x2_LN.S | |||
ZTRSMKERNEL_LT = ztrsm_kernel_2x2_LT.S | |||
ZTRSMKERNEL_RN = ztrsm_kernel_2x2_LT.S | |||
ZTRSMKERNEL_RT = ztrsm_kernel_2x2_RT.S | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -43,4 +43,14 @@ ifndef ZGEMM_BETA | |||
ZGEMM_BETA = ../generic/zgemm_beta.c | |||
endif | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -45,4 +45,14 @@ ifndef ZGEMM_BETA | |||
ZGEMM_BETA = ../generic/zgemm_beta.c | |||
endif | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -171,3 +171,15 @@ QCABS_KERNEL = ../generic/cabs.c | |||
#Dump kernel | |||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -146,4 +146,14 @@ DGEMM_BETA = ../generic/gemm_beta.c | |||
CGEMM_BETA = ../generic/zgemm_beta.c | |||
ZGEMM_BETA = ../generic/zgemm_beta.c | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -146,4 +146,14 @@ DGEMM_BETA = ../generic/gemm_beta.c | |||
CGEMM_BETA = ../generic/zgemm_beta.c | |||
ZGEMM_BETA = ../generic/zgemm_beta.c | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -0,0 +1,159 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
/*
 * Generic C kernel for ROTM: applies a 2x2 matrix H, in place, to every
 * pair (dx[i], dy[i]) taken from two strided vectors.
 *
 *   n        number of element pairs; nothing is done when n <= 0
 *   dx, incx first vector and its element increment
 *   dy, incy second vector and its element increment
 *   dparam   dparam[0] is a flag selecting the shape of H and
 *            dparam[1..4] hold h11, h21, h12, h22:
 *              flag == -2 : nothing is done
 *              flag <   0 : H = [ h11 h12 ; h21 h22 ]
 *              flag ==  0 : H = [  1  h12 ; h21  1  ]
 *              otherwise  : H = [ h11  1  ; -1  h22 ]
 *            Only the entries needed by the selected shape are read.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *dx, BLASLONG incx, FLOAT *dy, BLASLONG incy, FLOAT *dparam)
{
    const FLOAT flag = dparam[0];
    FLOAT h11, h12, h21, h22;
    FLOAT xv, yv;
    BLASLONG ix, iy, k;

    if (n <= 0 || flag == - 2.0) {
        return(0);
    }

    if (incx == incy && incx > 0) {
        /* Equal positive increments: one shared index walks both vectors. */
        const BLASLONG end = n * incx;

        if (flag < 0.) {
            h11 = dparam[1];
            h12 = dparam[3];
            h21 = dparam[2];
            h22 = dparam[4];
            for (ix = 0; ix < end; ix += incx) {
                xv = dx[ix];
                yv = dy[ix];
                dx[ix] = xv * h11 + yv * h12;
                dy[ix] = xv * h21 + yv * h22;
            }
        } else if (flag == 0) {
            h12 = dparam[3];
            h21 = dparam[2];
            for (ix = 0; ix < end; ix += incx) {
                xv = dx[ix];
                yv = dy[ix];
                dx[ix] = xv + yv * h12;
                dy[ix] = xv * h21 + yv;
            }
        } else {
            h11 = dparam[1];
            h22 = dparam[4];
            for (ix = 0; ix < end; ix += incx) {
                xv = dx[ix];
                yv = dy[ix];
                dx[ix] = xv * h11 + yv;
                dy[ix] = -xv + h22 * yv;
            }
        }
        return(0);
    }

    /*
     * General increments: a vector with a negative increment is traversed
     * starting from its highest-addressed element.
     */
    ix = (incx < 0) ? (1 - n) * incx : 0;
    iy = (incy < 0) ? (1 - n) * incy : 0;

    if (flag < 0.) {
        h11 = dparam[1];
        h12 = dparam[3];
        h21 = dparam[2];
        h22 = dparam[4];
        for (k = 0; k < n; k++) {
            xv = dx[ix];
            yv = dy[iy];
            dx[ix] = xv * h11 + yv * h12;
            dy[iy] = xv * h21 + yv * h22;
            ix += incx;
            iy += incy;
        }
    } else if (flag == 0) {
        h12 = dparam[3];
        h21 = dparam[2];
        for (k = 0; k < n; k++) {
            xv = dx[ix];
            yv = dy[iy];
            dx[ix] = xv + yv * h12;
            dy[iy] = xv * h21 + yv;
            ix += incx;
            iy += incy;
        }
    } else {
        h11 = dparam[1];
        h22 = dparam[4];
        for (k = 0; k < n; k++) {
            xv = dx[ix];
            yv = dy[iy];
            dx[ix] = xv * h11 + yv;
            dy[iy] = -xv + h22 * yv;
            ix += incx;
            iy += incy;
        }
    }

    return(0);
}
@@ -142,3 +142,15 @@ ZTRSMKERNEL_RT = ztrsm_kernel_RT.S | |||
CGEMM3MKERNEL = zgemm3m_kernel.S | |||
ZGEMM3MKERNEL = zgemm3m_kernel.S | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -236,3 +236,15 @@ ZGEMM3MKERNEL = zgemm3m_kernel.S | |||
endif | |||
DSDOTKERNEL = dot.S | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -169,3 +169,15 @@ QCABS_KERNEL = ../generic/cabs.c | |||
#Dump kernel | |||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -43,4 +43,14 @@ ifndef ZGEMM_BETA | |||
ZGEMM_BETA = ../generic/zgemm_beta.c | |||
endif | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -158,3 +158,15 @@ ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -199,3 +199,15 @@ endif | |||
ifndef IQMAXKERNEL | |||
IQMAXKERNEL = imax.S | |||
endif | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -158,3 +158,15 @@ ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -73,3 +73,15 @@ endif | |||
ifndef IQMAXKERNEL | |||
IQMAXKERNEL = imax.S | |||
endif | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -27,4 +27,14 @@ ifndef ZGEMM_BETA | |||
ZGEMM_BETA = ../generic/zgemm_beta.c | |||
endif | |||
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -71,6 +71,10 @@ DROTKERNEL = rot_vector.c | |||
CROTKERNEL = zrot_vector.c | |||
ZROTKERNEL = zrot_vector.c | |||
SROTMKERNEL = ../generic/rotm.c | |||
DROTMKERNEL = ../generic/rotm.c | |||
QROTMKERNEL = ../generic/rotm.c | |||
SSCALKERNEL = scal_vector.c | |||
DSCALKERNEL = scal_vector.c | |||
CSCALKERNEL = zscal_vector.c | |||
@@ -71,6 +71,10 @@ DROTKERNEL = ../riscv64/rot.c | |||
CROTKERNEL = ../riscv64/zrot.c | |||
ZROTKERNEL = ../riscv64/zrot.c | |||
SROTMKERNEL = ../generic/rotm.c | |||
DROTMKERNEL = ../generic/rotm.c | |||
QROTMKERNEL = ../generic/rotm.c | |||
SSCALKERNEL = ../riscv64/scal.c | |||
DSCALKERNEL = ../riscv64/scal.c | |||
CSCALKERNEL = ../riscv64/zscal.c | |||
@@ -71,6 +71,10 @@ DROTKERNEL = rot_rvv.c | |||
CROTKERNEL = zrot_rvv.c | |||
ZROTKERNEL = zrot_rvv.c | |||
SROTMKERNEL = ../generic/rotm.c | |||
DROTMKERNEL = ../generic/rotm.c | |||
QROTMKERNEL = ../generic/rotm.c | |||
SSCALKERNEL = scal_rvv.c | |||
DSCALKERNEL = scal_rvv.c | |||
CSCALKERNEL = zscal_rvv.c | |||
@@ -66,6 +66,10 @@ DROTKERNEL = rot_vector.c | |||
CROTKERNEL = zrot_vector.c | |||
ZROTKERNEL = zrot_vector.c | |||
SROTMKERNEL = ../generic/rotm.c | |||
DROTMKERNEL = ../generic/rotm.c | |||
QROTMKERNEL = ../generic/rotm.c | |||
SSCALKERNEL = scal_vector.c | |||
DSCALKERNEL = scal_vector.c | |||
CSCALKERNEL = zscal_vector.c | |||
@@ -98,6 +98,10 @@ DROTKERNEL = rot_rvv.c | |||
CROTKERNEL = zrot_rvv.c | |||
ZROTKERNEL = zrot_rvv.c | |||
SROTMKERNEL = rotm_rvv.c | |||
DROTMKERNEL = rotm_rvv.c | |||
QROTMKERNEL = ../generic/rotm.c | |||
SSCALKERNEL = scal_rvv.c | |||
DSCALKERNEL = scal_rvv.c | |||
CSCALKERNEL = zscal_rvv.c | |||
@@ -0,0 +1,260 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2013, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include "common.h" | |||
/*
 * Precision-dependent aliases for the RVV intrinsics used by this kernel:
 * the e32/f32 m8 forms when DOUBLE is not defined, the e64/f64 m8 forms
 * otherwise. VLSEV/VSSEV are the strided load/store, VFMACCVF/VFMSACVF the
 * vector-scalar fused multiply-add/-subtract, VFMULVF the vector-scalar
 * multiply.
 */
#if !defined(DOUBLE) | |||
#define VSETVL(n) __riscv_vsetvl_e32m8(n) | |||
#define FLOAT_V_T vfloat32m8_t | |||
#define VLSEV_FLOAT __riscv_vlse32_v_f32m8 | |||
#define VSSEV_FLOAT __riscv_vsse32_v_f32m8 | |||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m8 | |||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f32m8 | |||
#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f32m8 | |||
#else | |||
#define VSETVL(n) __riscv_vsetvl_e64m8(n) | |||
#define FLOAT_V_T vfloat64m8_t | |||
#define VLSEV_FLOAT __riscv_vlse64_v_f64m8 | |||
#define VSSEV_FLOAT __riscv_vsse64_v_f64m8 | |||
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m8 | |||
#define VFMULVF_FLOAT __riscv_vfmul_vf_f64m8 | |||
#define VFMSACVF_FLOAT __riscv_vfmsac_vf_f64m8 | |||
#endif | |||
/*
 * Scalar fallback for the RVV ROTM kernel, used when an increment is zero.
 *
 * With a zero increment the same element is updated n times in sequence,
 * each update depending on the previous one. A strided vector load/store
 * would instead compute every lane from the same initial value, so that
 * case is handled element by element, using the same expressions as the
 * generic C kernel.
 *
 * px/py must already point at the logical first element of each vector;
 * dparam is zero-based (dparam[0] is the flag, already read into dflag).
 */
static void rotm_rvv_scalar_fallback(BLASLONG n, FLOAT *px, BLASLONG incx,
                                     FLOAT *py, BLASLONG incy,
                                     FLOAT dflag, const FLOAT *dparam)
{
    FLOAT dh11, dh12, dh21, dh22;
    FLOAT w, z;
    BLASLONG i;

    if (dflag < 0.) {
        dh11 = dparam[1];
        dh12 = dparam[3];
        dh21 = dparam[2];
        dh22 = dparam[4];
        for (i = 0; i < n; i++) {
            w = *px;
            z = *py;
            *px = w * dh11 + z * dh12;
            *py = w * dh21 + z * dh22;
            px += incx;
            py += incy;
        }
    } else if (dflag == 0) {
        dh12 = dparam[3];
        dh21 = dparam[2];
        for (i = 0; i < n; i++) {
            w = *px;
            z = *py;
            *px = w + z * dh12;
            *py = w * dh21 + z;
            px += incx;
            py += incy;
        }
    } else {
        dh11 = dparam[1];
        dh22 = dparam[4];
        for (i = 0; i < n; i++) {
            w = *px;
            z = *py;
            *px = w * dh11 + z;
            *py = -w + dh22 * z;
            px += incx;
            py += incy;
        }
    }
}

/*
 * RVV kernel for ROTM: applies a 2x2 matrix H, in place, to every pair
 * (dx[i], dy[i]) taken from two strided vectors.
 *
 *   n        number of element pairs; nothing is done when n <= 0
 *   dx, incx first vector and its element increment (may be negative)
 *   dy, incy second vector and its element increment (may be negative)
 *   dparam   dparam[0] is a flag selecting the shape of H and
 *            dparam[1..4] hold h11, h21, h12, h22:
 *              flag == -2 : nothing is done
 *              flag <   0 : H = [ h11 h12 ; h21 h22 ]
 *              flag ==  0 : H = [  1  h12 ; h21  1  ]
 *              otherwise  : H = [ h11  1  ; -1  h22 ]
 *
 * Always returns 0.
 *
 * Negative increments are handled by starting at the logical first element
 * (the highest-addressed one) and passing the signed byte stride straight
 * to the strided load/store. The earlier version negated the increment and
 * moved the base pointer back by n*|inc|, which made the first access land
 * |inc| elements before the start of the vector and, for mixed-sign
 * increments, paired the wrong x and y elements.
 */
int CNAME(BLASLONG n, FLOAT *dx, BLASLONG incx, FLOAT *dy, BLASLONG incy, FLOAT *dparam)
{
    FLOAT dflag = dparam[0];
    FLOAT dh11, dh12, dh21, dh22;
    FLOAT_V_T v_w, v_z, v_dx, v_dy;
    BLASLONG stride_x, stride_y;
    size_t vl;

    if (n <= 0 || dflag == - 2.0) {
        return(0);
    }

    /* Move each pointer to the logical first element of its vector. */
    if (incx < 0) {
        dx += (1 - n) * incx;
    }
    if (incy < 0) {
        dy += (1 - n) * incy;
    }

    /* Zero increments carry a sequential dependency; do them in scalar code. */
    if (incx == 0 || incy == 0) {
        rotm_rvv_scalar_fallback(n, dx, incx, dy, incy, dflag, dparam);
        return(0);
    }

    /* Signed byte strides: negative values walk the vector backwards. */
    stride_x = incx * (BLASLONG)sizeof(FLOAT);
    stride_y = incy * (BLASLONG)sizeof(FLOAT);

    /*
     * Intrinsic operand order used below:
     *   VFMACCVF(vd, f, vs) = vd + f * vs
     *   VFMSACVF(vd, f, vs) = f * vs - vd
     *   VFMULVF(vs, f)      = vs * f
     */
    if (dflag < 0.) {
        dh11 = dparam[1];
        dh12 = dparam[3];
        dh21 = dparam[2];
        dh22 = dparam[4];
        while (n > 0) {
            vl = VSETVL(n);
            v_w = VLSEV_FLOAT(dx, stride_x, vl);
            v_z = VLSEV_FLOAT(dy, stride_y, vl);
            v_dx = VFMULVF_FLOAT(v_w, dh11, vl);
            v_dx = VFMACCVF_FLOAT(v_dx, dh12, v_z, vl);   /* w*h11 + h12*z */
            v_dy = VFMULVF_FLOAT(v_w, dh21, vl);
            v_dy = VFMACCVF_FLOAT(v_dy, dh22, v_z, vl);   /* w*h21 + h22*z */
            VSSEV_FLOAT(dx, stride_x, v_dx, vl);
            VSSEV_FLOAT(dy, stride_y, v_dy, vl);
            n  -= (BLASLONG)vl;
            dx += (BLASLONG)vl * incx;
            dy += (BLASLONG)vl * incy;
        }
    } else if (dflag == 0) {
        dh12 = dparam[3];
        dh21 = dparam[2];
        while (n > 0) {
            vl = VSETVL(n);
            v_w = VLSEV_FLOAT(dx, stride_x, vl);
            v_z = VLSEV_FLOAT(dy, stride_y, vl);
            v_dx = VFMACCVF_FLOAT(v_w, dh12, v_z, vl);    /* w + h12*z */
            v_dy = VFMACCVF_FLOAT(v_z, dh21, v_w, vl);    /* z + h21*w */
            VSSEV_FLOAT(dx, stride_x, v_dx, vl);
            VSSEV_FLOAT(dy, stride_y, v_dy, vl);
            n  -= (BLASLONG)vl;
            dx += (BLASLONG)vl * incx;
            dy += (BLASLONG)vl * incy;
        }
    } else {
        dh11 = dparam[1];
        dh22 = dparam[4];
        while (n > 0) {
            vl = VSETVL(n);
            v_w = VLSEV_FLOAT(dx, stride_x, vl);
            v_z = VLSEV_FLOAT(dy, stride_y, vl);
            v_dx = VFMACCVF_FLOAT(v_z, dh11, v_w, vl);    /* z + h11*w */
            v_dy = VFMSACVF_FLOAT(v_w, dh22, v_z, vl);    /* h22*z - w */
            VSSEV_FLOAT(dx, stride_x, v_dx, vl);
            VSSEV_FLOAT(dy, stride_y, v_dy, vl);
            n  -= (BLASLONG)vl;
            dx += (BLASLONG)vl * incx;
            dy += (BLASLONG)vl * incy;
        }
    }

    return(0);
}
@@ -72,9 +72,9 @@ gotoblas_t TABLE_NAME = { | |||
samax_kTS, samin_kTS, smax_kTS, smin_kTS, | |||
isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, | |||
snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS, | |||
snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS, | |||
dsdot_kTS, | |||
srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, | |||
srot_kTS, srotm_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, | |||
sbgemv_nTS, sbgemv_tTS, sger_kTS, | |||
ssymv_LTS, ssymv_UTS, | |||
@@ -158,7 +158,7 @@ gotoblas_t TABLE_NAME = { | |||
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) | |||
scopy_kTS, sdot_kTS, | |||
// dsdot_kTS, | |||
srot_kTS, saxpy_kTS, | |||
srot_kTS, srotm_kTS, saxpy_kTS, | |||
#endif | |||
#if (BUILD_SINGLE==1) || (BUILD_DOUBLE==1) || (BUILD_COMPLEX==1) || (BUILD_COMPLEX16==1) | |||
sscal_kTS, | |||
@@ -260,6 +260,7 @@ gotoblas_t TABLE_NAME = { | |||
#endif | |||
#if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) | |||
drot_kTS, | |||
drotm_kTS, | |||
daxpy_kTS, | |||
dscal_kTS, | |||
dswap_kTS, | |||
@@ -331,10 +332,9 @@ gotoblas_t TABLE_NAME = { | |||
qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS, | |||
iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS, | |||
qnrm2_kTS, qasum_kTS, qsum_kTS, qcopy_kTS, qdot_kTS, | |||
qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, | |||
qrot_kTS, qrotm_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, | |||
qgemv_nTS, qgemv_tTS, qger_kTS, | |||
qsymv_LTS, qsymv_UTS, | |||
qgemm_kernelTS, qgemm_betaTS, | |||
#if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N | |||
qgemm_incopyTS, qgemm_itcopyTS, | |||
@@ -75,3 +75,14 @@ DGEMM_BETA = ../generic/gemm_beta.c | |||
CGEMM_BETA = ../generic/zgemm_beta.c | |||
ZGEMM_BETA = ../generic/zgemm_beta.c | |||
# Default the S/D/Q rotm kernel variables to the generic C source
# (../generic/rotm.c) whenever they have not been set already.
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -189,3 +189,14 @@ ZGEMM_BETA = ../generic/zgemm_beta.c | |||
QGEMM_BETA = ../generic/gemm_beta.c | |||
XGEMM_BETA = ../generic/zgemm_beta.c | |||
# Default the S/D/Q rotm kernel variables to the generic C source
# (../generic/rotm.c) whenever they have not been set already.
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -162,3 +162,15 @@ ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
# Default the S/D/Q rotm kernel variables to the generic C source
# (../generic/rotm.c) whenever they have not been set already.
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -290,6 +290,18 @@ ifndef QROTKERNEL | |||
QROTKERNEL = rot.S | |||
endif | |||
# Default the S/D/Q rotm kernel variables to the generic C source
# (../generic/rotm.c) whenever they have not been set already.
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef CROTKERNEL | |||
CROTKERNEL = zrot_sse.S | |||
endif | |||
@@ -168,3 +168,15 @@ QCABS_KERNEL = ../generic/cabs.c | |||
#Dump kernel | |||
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
# Default the S/D/Q rotm kernel variables to the generic C source
# (../generic/rotm.c) whenever they have not been set already.
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -27,4 +27,14 @@ ifndef ZGEMM_BETA | |||
ZGEMM_BETA = ../generic/zgemm_beta.c | |||
endif | |||
# Default the S/D/Q rotm kernel variables to the generic C source
# (../generic/rotm.c) whenever they have not been set already.
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -135,5 +135,14 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
# Default the S/D/Q rotm kernel variables to the generic C source
# (../generic/rotm.c) whenever they have not been set already.
ifndef SROTMKERNEL | |||
SROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef DROTMKERNEL | |||
DROTMKERNEL = ../generic/rotm.c | |||
endif | |||
ifndef QROTMKERNEL | |||
QROTMKERNEL = ../generic/rotm.c | |||
endif |
@@ -70,6 +70,24 @@ CTEST(rot,drot_inc_1) | |||
ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); | |||
} | |||
} | |||
CTEST(rot,drotm_inc_1) | |||
{ | |||
blasint i = 0; | |||
blasint N = 12, incX = 1, incY = 1; | |||
double param[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; | |||
double x_actual[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; | |||
double y_actual[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; | |||
double x_referece[] = {3.0, 6.0, 9.0, 12.0, 15.0, 18.0, 21.0, 24.0, 27.0, 30.0, 33.0, 36.0}; | |||
double y_referece[] = {4.0, 8.0, 12.0, 16.0, 20.0, 24.0, 28.0, 32.0, 36.0, 40.0, 44.0, 48.0}; | |||
//OpenBLAS | |||
BLASFUNC(drotm)(&N, x_actual, &incX, y_actual, &incY, param); | |||
for(i = 0; i < N; i++){ | |||
ASSERT_DBL_NEAR_TOL(x_referece[i], x_actual[i], DOUBLE_EPS); | |||
ASSERT_DBL_NEAR_TOL(y_referece[i], y_actual[i], DOUBLE_EPS); | |||
} | |||
} | |||
#endif | |||
#ifdef BUILD_COMPLEX16 | |||
@@ -130,6 +148,24 @@ CTEST(rot,srot_inc_1) | |||
ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); | |||
} | |||
} | |||
CTEST(rot,srotm_inc_1) | |||
{ | |||
blasint i = 0; | |||
blasint N = 12, incX = 1, incY = 1; | |||
float param[5] = {1.0, 2.0, 3.0, 4.0, 5.0}; | |||
float x_actual[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; | |||
float y_actual[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0}; | |||
float x_referece[] = {3.0, 6.0, 9.0, 12.0, 15.0, 18.0, 21.0, 24.0, 27.0, 30.0, 33.0, 36.0}; | |||
float y_referece[] = {4.0, 8.0, 12.0, 16.0, 20.0, 24.0, 28.0, 32.0, 36.0, 40.0, 44.0, 48.0}; | |||
//OpenBLAS | |||
BLASFUNC(srotm)(&N, x_actual, &incX, y_actual, &incY, param); | |||
for(i = 0; i < N; i++){ | |||
ASSERT_DBL_NEAR_TOL(x_referece[i], x_actual[i], SINGLE_EPS); | |||
ASSERT_DBL_NEAR_TOL(y_referece[i], y_actual[i], SINGLE_EPS); | |||
} | |||
} | |||
#endif | |||
#ifdef BUILD_COMPLEX | |||