diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 6a74fb764..a8d1c601c 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -11,7 +11,7 @@ set(SCLAUX la_constants.f90 sbdsdc.f sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f - slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f + slaed0.f slaed1.f slaed2.f slaed4.f slaed5.f slaed6.f slaed7.f slaed8.f slaed9.f slaeda.f slaev2.f slagtf.f slagts.f slamrg.f slanst.f slapy2.f slapy3.f slarnv.f @@ -31,7 +31,7 @@ set(DZLAUX dbdsdc.f dbdsvdx.f dstevx.f dstein.f dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f - dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f + dlaed0.f dlaed1.f dlaed2.f dlaed4.f dlaed5.f dlaed6.f dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f dlagts.f dlamrg.f dlanst.f dlapy2.f dlapy3.f dlarnv.f @@ -517,7 +517,7 @@ set(SCLAUX scombssq.c sbdsvdx.c sstevx.c sstein.c sbdsdc.c sbdsqr.c sdisna.c slabad.c slacpy.c sladiv.c slae2.c slaebz.c - slaed0.c slaed1.c slaed2.c slaed3.c slaed4.c slaed5.c slaed6.c + slaed0.c slaed1.c slaed2.c slaed4.c slaed5.c slaed6.c slaed7.c slaed8.c slaed9.c slaeda.c slaev2.c slagtf.c slagts.c slamrg.c slanst.c slapy2.c slapy3.c slarnv.c @@ -536,7 +536,7 @@ set(DZLAUX dbdsdc.c dbdsvdx.c dstevx.c dstein.c dbdsqr.c ddisna.c dlabad.c dlacpy.c dladiv.c dlae2.c dlaebz.c - dlaed0.c dlaed1.c dlaed2.c dlaed3.c dlaed4.c dlaed5.c dlaed6.c + dlaed0.c dlaed1.c dlaed2.c dlaed4.c dlaed5.c dlaed6.c dlaed7.c dlaed8.c dlaed9.c dlaeda.c dlaev2.c dlagtf.c dlagts.c dlamrg.c dlanst.c dlapy2.c dlapy3.c dlarnv.c diff --git a/common_lapack.h b/common_lapack.h index f9c36646a..2151b88ac 100644 --- a/common_lapack.h +++ b/common_lapack.h @@ -439,4 +439,9 @@ blasint xtrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdo blasint xtrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); blasint xtrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); +blasint slaed3_single(blasint *, blasint *, blasint *, float *, float *, blasint *, float *, float *, float *, blasint *, blasint *, float *, float *, blasint *); +blasint dlaed3_single(blasint *, blasint *, blasint *, double *, double *, blasint *, double *, double *, double *, blasint *, blasint *, double *, double *, blasint *); +blasint slaed3_parallel(blasint *, blasint *, blasint *, float *, float *, blasint *, float *, float *, float *, blasint *, blasint *, float *, float *, blasint *); +blasint dlaed3_parallel(blasint *, blasint *, blasint *, double *, double *, blasint *, double *, double *, double *, blasint *, blasint *, double *, double *, blasint *); + #endif diff --git a/common_macro.h b/common_macro.h index b967c2e60..b29a9c08d 100644 --- a/common_macro.h +++ b/common_macro.h @@ -3035,6 +3035,8 @@ typedef struct { #define NEG_TCOPY DNEG_TCOPY #define LARF_L DLARF_L #define LARF_R DLARF_R +#define LAED3_SINGLE dlaed3_single +#define LAED3_PARALLEL dlaed3_parallel #else #define GETF2 SGETF2 #define GETRF SGETRF @@ -3056,6 +3058,8 @@ typedef struct { #define NEG_TCOPY SNEG_TCOPY #define LARF_L SLARF_L #define LARF_R SLARF_R +#define LAED3_SINGLE slaed3_single +#define LAED3_PARALLEL slaed3_parallel #endif #else #ifdef XDOUBLE diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index a8fb5196a..c4306d8ee 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -221,6 +221,7 @@ if (NOT NO_LAPACK) GenerateNamedObjects("lapack/lauu2.c" "" "" 0 "" "" 0 3) GenerateNamedObjects("lapack/trti2.c" "" "" 0 "" "" 0 3) endif() + GenerateNamedObjects("lapack/laed3.c" "" "" 0 "" "" 0 1) endif () if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) diff --git a/interface/Makefile b/interface/Makefile index 3ac54628f..dc154a3c4 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -429,8 +429,8 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) SLAPACKOBJS = \ sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ - slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX) - + slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX) \ + slaed3.$(SUFFIX) #DLAPACKOBJS = \ # dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ @@ -440,8 +440,8 @@ SLAPACKOBJS = \ DLAPACKOBJS = \ dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ - dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX) - + dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX) \ + dlaed3.$(SUFFIX) QLAPACKOBJS = \ qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \ @@ -2365,6 +2365,11 @@ zlarf.$(SUFFIX) zlarf.$(PSUFFIX) : larf.c xlarf.$(SUFFIX) xlarf.$(PSUFFIX) : larf.c $(CC) -c $(CFLAGS) $< -o $(@F) +slaed3.$(SUFFIX) slaed3.$(PSUFFIX) : lapack/laed3.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dlaed3.$(SUFFIX) dlaed3.$(PSUFFIX) : lapack/laed3.c + $(CC) -c $(CFLAGS) $< -o $(@F) ############# BLAS EXTENSIONS ##################################### diff --git a/interface/lapack/laed3.c b/interface/lapack/laed3.c new file mode 100644 index 000000000..4e5215fcf --- /dev/null +++ b/interface/lapack/laed3.c @@ -0,0 +1,87 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#if defined(DOUBLE) +#define ERROR_NAME "DLAED3" +#else +#define ERROR_NAME "SLAED3" +#endif + +/* ===================================================================== */ +int NAME(blasint *k, blasint *n, blasint *n1, FLOAT *d, + FLOAT *q, blasint *ldq, FLOAT *rho, FLOAT *dlamda, + FLOAT *q2, blasint *indx, blasint *ctot, FLOAT *w, + FLOAT *s, blasint *Info) +{ + blasint kval, nval, qdim, info; + + qdim = *ldq; + kval = *k; + nval = *n; + +/* Test the input parameters. */ + info = 0; + if (kval < 0) { + info = 1; + } else if (nval < kval) { + info = 2; + } else if (qdim < nval || qdim < 1) { + info = 6; + } + if (info) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); + *Info = - info; + return 0; + } + +/* Quick return if possible */ + + *Info = 0; + if (kval == 0) return 0; + +#ifdef SMP + int nthreads = num_cpu_avail(4); + + if (nthreads == 1) { +#endif + LAED3_SINGLE(k, n, n1, d, q, ldq, rho, dlamda, q2, indx, ctot, w, s, Info); +#ifdef SMP + } else { + LAED3_PARALLEL(k, n, n1, d, q, ldq, rho, dlamda, q2, indx, ctot, w, s, Info); + } +#endif + + return 0; +} diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile index 2a20fcdab..0f547dd0c 100644 --- a/lapack-netlib/SRC/Makefile +++ b/lapack-netlib/SRC/Makefile @@ -85,7 +85,7 @@ ALLAUX_O = ilaenv.o ilaenv2stage.o ieeeck.o lsamen.o xerbla.o xerbla_array.o \ ../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o ifneq "$(or $(BUILD_SINGLE),$(BUILD_COMPLEX))" "" -SCLAUX = la_constants.o \ +SCLAUX_O = la_constants.o \ sbdsvdx.o sstevx.o sstein.o \ sbdsdc.o \ sbdsqr.o sdisna.o slabad.o slacpy.o sladiv.o slae2.o slaebz.o \ @@ -106,7 +106,7 @@ SCLAUX = la_constants.o \ endif ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" -DZLAUX = la_constants.o\ +DZLAUX_O = la_constants.o\ dcombssq.o \ dbdsvdx.o dstevx.o dstein.o \ dbdsdc.o \ @@ -572,6 +572,8 @@ endif # filter out optimized codes from OpenBLAS ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o +SCL_AUX_OBJS = slaed3.o +DZL_AUX_OBJS = dlaed3.o SLAPACKOBJS = \ sgetrf.o sgetrs.o spotrf.o sgetf2.o \ @@ -598,6 +600,8 @@ ZLAPACKOBJS = \ zsymv.o zsyr.o zspmv.o zspr.o ALLAUX = $(filter-out $(ALL_AUX_OBJS),$(ALLAUX_O)) +SCLAUX = $(filter-out $(SCL_AUX_OBJS),$(SCLAUX_O)) +DZLAUX = $(filter-out $(DZL_AUX_OBJS),$(DZLAUX_O)) SLASRC = $(filter-out $(SLAPACKOBJS),$(SLASRC_O)) DLASRC = $(filter-out $(DLAPACKOBJS),$(DLASRC_O)) CLASRC = $(filter-out $(CLAPACKOBJS),$(CLASRC_O)) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 880fb01b2..71df986af 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -10,6 +10,7 @@ set(LAPACK_SOURCES potrf/potrf_L_single.c lauum/lauum_U_single.c lauum/lauum_L_single.c + laed3/laed3_single.c ) # add a 'z' to filename for complex version @@ -79,6 +80,7 @@ if (USE_THREAD) lauum/lauum_L_parallel.c potrf/potrf_U_parallel.c potrf/potrf_L_parallel.c + laed3/laed3_parallel.c ) # this has a z version diff --git a/lapack/Makefile b/lapack/Makefile index 2bbb4603f..607981288 100644 --- a/lapack/Makefile +++ b/lapack/Makefile @@ -2,7 +2,7 @@ TOPDIR = .. include ../Makefile.system #SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs -SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri trtrs +SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri trtrs laed3 FLAMEDIRS = laswp getf2 potf2 lauu2 trti2 diff --git a/lapack/laed3/Makefile b/lapack/laed3/Makefile new file mode 100644 index 000000000..9fa75613a --- /dev/null +++ b/lapack/laed3/Makefile @@ -0,0 +1,36 @@ +TOPDIR = ../.. +include ../../Makefile.system + +SBLASOBJS = slaed3_single.$(SUFFIX) +DBLASOBJS = dlaed3_single.$(SUFFIX) + +ifdef SMP +SBLASOBJS += slaed3_parallel.$(SUFFIX) +DBLASOBJS += dlaed3_parallel.$(SUFFIX) +endif + +ifeq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" +SBLASOBJS= +endif +ifneq ($(BUILD_DOUBLE),1) +DBLASOBJS= +endif + +slaed3_single.$(SUFFIX) : laed3_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) +dlaed3_single.$(SUFFIX) : laed3_single.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) +slaed3_parallel.$(SUFFIX) : laed3_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) +dlaed3_parallel.$(SUFFIX) : laed3_parallel.c + $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) +slaed3_single.$(PSUFFIX) : laed3_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) +dlaed3_single.$(PSUFFIX) : laed3_single.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) +slaed3_parallel.$(PSUFFIX) : laed3_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) +dlaed3_parallel.$(PSUFFIX) : laed3_parallel.c + $(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) + +include ../../Makefile.tail diff --git a/lapack/laed3/laed3_parallel.c b/lapack/laed3/laed3_parallel.c new file mode 100644 index 000000000..e02567b69 --- /dev/null +++ b/lapack/laed3/laed3_parallel.c @@ -0,0 +1,243 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#define max(a,b) ((a) > (b) ? (a) : (b)) +#define copysign(x,y) ((y) < 0 ? ((x) < 0 ? (x) : -(x)) : ((x) < 0 ? -(x) : (x))) + +#if defined(DOUBLE) +#define LAMC3 BLASFUNC(dlamc3) +#define LAED4 BLASFUNC(dlaed4) +#define GEMM BLASFUNC(dgemm) +#define NRM2 BLASFUNC(dnrm2) +#define COPY BLASFUNC(dcopy) +#define LACPY BLASFUNC(dlacpy) +#define LASET BLASFUNC(dlaset) +#else +#define LAMC3 BLASFUNC(slamc3) +#define LAED4 BLASFUNC(slaed4) +#define GEMM BLASFUNC(sgemm) +#define NRM2 BLASFUNC(snrm2) +#define COPY BLASFUNC(scopy) +#define LACPY BLASFUNC(slacpy) +#define LASET BLASFUNC(slaset) +#endif + +FLOAT LAMC3(FLOAT *, FLOAT *); +void LAED4(blasint *, blasint *, FLOAT *, FLOAT *, FLOAT *, FLOAT *, FLOAT *, blasint *); +void LACPY(char *, blasint *, blasint *, FLOAT *, blasint *, FLOAT *, blasint *); +void LASET(char *, blasint *, blasint *, FLOAT *, FLOAT *, FLOAT *, blasint *); + +/* Table of constant values */ +static blasint c1 = 1; +static FLOAT c1f = 1.; +static FLOAT c0f = 0.; + +static void inner_laed4_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + blasint kval = args -> m; + blasint j, j_from, j_to; + FLOAT *dlamda = (FLOAT *)args -> a; + FLOAT *w = (FLOAT *)args -> b; + FLOAT *q = (FLOAT *)args -> c; + BLASLONG qdim = args -> ldc; + FLOAT *d = (FLOAT *)args -> d; + FLOAT rho = *(FLOAT *)args -> alpha; + blasint *info = &((blasint*)args -> beta)[mypos]; + + j_from = range_m[0] + 1; + j_to = range_m[1]; + + for (j = j_from; j <= j_to; j++) { + LAED4(&kval, &j, dlamda, w, &q[(j - 1) * qdim], &rho, &d[j - 1], info); + if(*info != 0) break; + } +} + +static void inner_wloop_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ + blasint kval = args -> m; + blasint i, j, i_from, i_to; + FLOAT *dlamda = (FLOAT *)args -> a; + FLOAT *w = (FLOAT *)args -> b; + FLOAT *q = (FLOAT *)args -> c; + BLASLONG qdim = args -> ldc; + i_from = range_m[0]; + i_to = range_m[1]; + for (j = 0; j < kval; j++) { + for (i = i_from; i < i_to; i++) { + if (i != j) w[i] *= q[j * qdim + i] / (dlamda[i] - dlamda[j]); + } + } +} + +/* ===================================================================== */ +blasint CNAME(blasint *k, blasint *n, blasint *n1, FLOAT *d, + FLOAT *q, blasint *ldq, FLOAT *rho, FLOAT *dlamda, + FLOAT *q2, blasint *indx, blasint *ctot, FLOAT *w, + FLOAT *s, blasint *info) +{ + FLOAT temp; + blasint kval, qdim; + blasint i, j, itmp; + blasint n2, n12, ii, n23, iq2; + blas_queue_t queue[MAX_CPU_NUMBER]; + blas_arg_t args; + BLASLONG range[MAX_CPU_NUMBER + 1]; + blasint infoarray[MAX_CPU_NUMBER]; + int width, num_cpu, mode, nthreads; + + qdim = *ldq; + kval = *k; + +/* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can */ +/* be computed with high relative accuracy (barring over/underflow). */ + + for (i = 0; i < kval; i++) { + dlamda[i] = LAMC3(&dlamda[i], &dlamda[i]) - dlamda[i]; + } + + nthreads = num_cpu_avail(4); + +#if defined(DOUBLE) + mode = BLAS_DOUBLE | BLAS_REAL; +#else + mode = BLAS_SINGLE | BLAS_REAL; +#endif + args.m = kval; + args.a = (void *)dlamda; + args.b = (void *)w; + args.c = (void *)q; + args.ldc = qdim; + args.d = (void *)d; + args.alpha = (void *)rho; + args.beta = (void *)infoarray; + num_cpu = 0; + range[0] = 0; + i = kval; + while (i > 0) { + width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); + range[num_cpu + 1] = range[num_cpu] + width; + queue[num_cpu].range_m = &range[num_cpu]; + queue[num_cpu].range_n = NULL; + queue[num_cpu].routine = inner_laed4_thread; + queue[num_cpu].args = &args; + queue[num_cpu].sa = NULL; + queue[num_cpu].sb = NULL; + queue[num_cpu].mode = mode; + queue[num_cpu].next = &queue[num_cpu + 1]; + infoarray[num_cpu] = 0; + num_cpu ++; + i -= width; + } + if (num_cpu) { + queue[num_cpu - 1].next = NULL; + exec_blas(num_cpu, queue); + } + for (i = 0; i < num_cpu; i++) { + *info = max(infoarray[i], *info); + } + +/* If the zero finder fails, the computation is terminated. */ + + if (*info != 0) { + return 0; + } + + if (kval == 2) { + for (j = 0; j < kval; j++) { + w[0] = q[j * qdim]; + w[1] = q[j * qdim + 1]; + ii = indx[0] - 1; + q[j * qdim] = w[ii]; + ii = indx[1] - 1; + q[j * qdim + 1] = w[ii]; + } + } else if (kval != 1) { + +/* Compute updated W. */ + + COPY(k, w, &c1, s, &c1); + +/* Initialize W(I) = Q(I,I) */ + + itmp = qdim + 1; + COPY(k, q, &itmp, w, &c1); + + for (i = 0; i < num_cpu; i++) { + queue[i].routine = inner_wloop_thread; + } + if (num_cpu) { + exec_blas(num_cpu, queue); + } + for (i = 0; i < kval; i++) { + temp = sqrt(-w[i]); + w[i] = copysign(temp, s[i]); + } + +/* Compute eigenvectors of the modified rank-1 modification. */ + + for (j = 0; j < kval; j++) { + for (i = 0; i < kval; i++) { + s[i] = w[i] / q[j * qdim + i]; + } + temp = NRM2(k, s, &c1); + for (i = 0; i < kval; i++) { + ii = indx[i] - 1; + q[j * qdim + i] = s[ii] / temp; + } + } + } + +/* Compute the updated eigenvectors. */ + + n2 = *n - *n1; + n12 = ctot[0] + ctot[1]; + n23 = ctot[1] + ctot[2]; + + LACPY("A", &n23, k, &q[ctot[0]], ldq, s, &n23); + iq2 = *n1 * n12; + if (n23 != 0) { + GEMM("N", "N", &n2, k, &n23, &c1f, &q2[iq2], &n2, s, &n23, &c0f, &q[*n1], ldq); + } else { + LASET("A", &n2, k, &c0f, &c0f, &q[*n1], ldq); + } + + LACPY("A", &n12, k, q, ldq, s, &n12); + if (n12 != 0) { + GEMM("N", "N", n1, k, &n12, &c1f, q2, n1, s, &n12, &c0f, q, ldq); + } else { + LASET("A", n1, k, &c0f, &c0f, q, ldq); + } + + return 0; +} diff --git a/lapack/laed3/laed3_single.c b/lapack/laed3/laed3_single.c new file mode 100644 index 000000000..b21bb9930 --- /dev/null +++ b/lapack/laed3/laed3_single.c @@ -0,0 +1,166 @@ +/*************************************************************************** +Copyright (c) 2025, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include "common.h" + +#define copysign(x,y) ((y) < 0 ? ((x) < 0 ? (x) : -(x)) : ((x) < 0 ? -(x) : (x))) + +#if defined(DOUBLE) +#define LAMC3 BLASFUNC(dlamc3) +#define LAED4 BLASFUNC(dlaed4) +#define GEMM BLASFUNC(dgemm) +#define NRM2 BLASFUNC(dnrm2) +#define COPY BLASFUNC(dcopy) +#define LACPY BLASFUNC(dlacpy) +#define LASET BLASFUNC(dlaset) +#else +#define LAMC3 BLASFUNC(slamc3) +#define LAED4 BLASFUNC(slaed4) +#define GEMM BLASFUNC(sgemm) +#define NRM2 BLASFUNC(snrm2) +#define COPY BLASFUNC(scopy) +#define LACPY BLASFUNC(slacpy) +#define LASET BLASFUNC(slaset) +#endif + +FLOAT LAMC3(FLOAT *, FLOAT *); +void LAED4(blasint *, blasint *, FLOAT *, FLOAT *, FLOAT *, FLOAT *, FLOAT *, blasint *); +void LACPY(char *, blasint *, blasint *, FLOAT *, blasint *, FLOAT *, blasint *); +void LASET(char *, blasint *, blasint *, FLOAT *, FLOAT *, FLOAT *, blasint *); + +/* Table of constant values */ +static blasint c1 = 1; +static FLOAT c1f = 1.; +static FLOAT c0f = 0.; + +/* ===================================================================== */ +blasint CNAME(blasint *k, blasint *n, blasint *n1, FLOAT *d, + FLOAT *q, blasint *ldq, FLOAT *rho, FLOAT *dlamda, + FLOAT *q2, blasint *indx, blasint *ctot, FLOAT *w, + FLOAT *s, blasint *info) +{ + FLOAT temp; + blasint kval, qdim; + blasint i, j, itmp; + blasint n2, n12, ii, n23, iq2; + + qdim = *ldq; + kval = *k; + +/* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can */ +/* be computed with high relative accuracy (barring over/underflow). */ + + for (i = 0; i < kval; i++) { + dlamda[i] = LAMC3(&dlamda[i], &dlamda[i]) - dlamda[i]; + } + + for (j = 1; j <= kval; j++) { + LAED4(k, &j, dlamda, w, &q[(j - 1) * qdim], rho, &d[j - 1], info); + if(*info != 0) break; + } + +/* If the zero finder fails, the computation is terminated. */ + + if (*info != 0) { + return 0; + } + + if (kval == 2) { + for (j = 0; j < kval; j++) { + w[0] = q[j * qdim]; + w[1] = q[j * qdim + 1]; + ii = indx[0] - 1; + q[j * qdim] = w[ii]; + ii = indx[1] - 1; + q[j * qdim + 1] = w[ii]; + } + } else if (kval != 1) { + +/* Compute updated W. */ + + COPY(k, w, &c1, s, &c1); + +/* Initialize W(I) = Q(I,I) */ + + itmp = qdim + 1; + COPY(k, q, &itmp, w, &c1); + for (j = 0; j < kval; j++) { + for (i = 0; i < j; i++) { + w[i] *= q[j * qdim + i] / (dlamda[i] - dlamda[j]); + } + for (i = j + 1; i < kval; i++) { + w[i] *= q[j * qdim + i] / (dlamda[i] - dlamda[j]); + } + } + for (i = 0; i < kval; i++) { + temp = sqrt(-w[i]); + w[i] = copysign(temp, s[i]); + } + +/* Compute eigenvectors of the modified rank-1 modification. */ + + for (j = 0; j < kval; j++) { + for (i = 0; i < kval; i++) { + s[i] = w[i] / q[j * qdim + i]; + } + temp = NRM2(k, s, &c1); + for (i = 0; i < kval; i++) { + ii = indx[i] - 1; + q[j * qdim + i] = s[ii] / temp; + } + } + } + +/* Compute the updated eigenvectors. */ + + n2 = *n - *n1; + n12 = ctot[0] + ctot[1]; + n23 = ctot[1] + ctot[2]; + + LACPY("A", &n23, k, &q[ctot[0]], ldq, s, &n23); + iq2 = *n1 * n12; + if (n23 != 0) { + GEMM("N", "N", &n2, k, &n23, &c1f, &q2[iq2], &n2, s, &n23, &c0f, &q[*n1], ldq); + } else { + LASET("A", &n2, k, &c0f, &c0f, &q[*n1], ldq); + } + + LACPY("A", &n12, k, q, ldq, s, &n12); + if (n12 != 0) { + GEMM("N", "N", n1, k, &n12, &c1f, q2, n1, s, &n12, &c0f, q, ldq); + } else { + LASET("A", n1, k, &c0f, &c0f, q, ldq); + } + + return 0; +}