Improve [SD]SYEVD performance by parallelizing [SD]LAED3pull/5356/head
@@ -11,7 +11,7 @@ set(SCLAUX | |||
la_constants.f90 | |||
sbdsdc.f | |||
sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f | |||
slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f | |||
slaed0.f slaed1.f slaed2.f slaed4.f slaed5.f slaed6.f | |||
slaed7.f slaed8.f slaed9.f slaeda.f slaev2.f slagtf.f | |||
slagts.f slamrg.f slanst.f | |||
slapy2.f slapy3.f slarnv.f | |||
@@ -31,7 +31,7 @@ set(DZLAUX | |||
dbdsdc.f | |||
dbdsvdx.f dstevx.f dstein.f | |||
dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f | |||
dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f | |||
dlaed0.f dlaed1.f dlaed2.f dlaed4.f dlaed5.f dlaed6.f | |||
dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f | |||
dlagts.f dlamrg.f dlanst.f | |||
dlapy2.f dlapy3.f dlarnv.f | |||
@@ -517,7 +517,7 @@ set(SCLAUX | |||
scombssq.c sbdsvdx.c sstevx.c sstein.c | |||
sbdsdc.c | |||
sbdsqr.c sdisna.c slabad.c slacpy.c sladiv.c slae2.c slaebz.c | |||
slaed0.c slaed1.c slaed2.c slaed3.c slaed4.c slaed5.c slaed6.c | |||
slaed0.c slaed1.c slaed2.c slaed4.c slaed5.c slaed6.c | |||
slaed7.c slaed8.c slaed9.c slaeda.c slaev2.c slagtf.c | |||
slagts.c slamrg.c slanst.c | |||
slapy2.c slapy3.c slarnv.c | |||
@@ -536,7 +536,7 @@ set(DZLAUX | |||
dbdsdc.c | |||
dbdsvdx.c dstevx.c dstein.c | |||
dbdsqr.c ddisna.c dlabad.c dlacpy.c dladiv.c dlae2.c dlaebz.c | |||
dlaed0.c dlaed1.c dlaed2.c dlaed3.c dlaed4.c dlaed5.c dlaed6.c | |||
dlaed0.c dlaed1.c dlaed2.c dlaed4.c dlaed5.c dlaed6.c | |||
dlaed7.c dlaed8.c dlaed9.c dlaeda.c dlaev2.c dlagtf.c | |||
dlagts.c dlamrg.c dlanst.c | |||
dlapy2.c dlapy3.c dlarnv.c | |||
@@ -439,4 +439,9 @@ blasint xtrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdo | |||
blasint xtrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); | |||
blasint xtrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); | |||
blasint slaed3_single(blasint *, blasint *, blasint *, float *, float *, blasint *, float *, float *, float *, blasint *, blasint *, float *, float *, blasint *); | |||
blasint dlaed3_single(blasint *, blasint *, blasint *, double *, double *, blasint *, double *, double *, double *, blasint *, blasint *, double *, double *, blasint *); | |||
blasint slaed3_parallel(blasint *, blasint *, blasint *, float *, float *, blasint *, float *, float *, float *, blasint *, blasint *, float *, float *, blasint *); | |||
blasint dlaed3_parallel(blasint *, blasint *, blasint *, double *, double *, blasint *, double *, double *, double *, blasint *, blasint *, double *, double *, blasint *); | |||
#endif |
@@ -3035,6 +3035,8 @@ typedef struct { | |||
#define NEG_TCOPY DNEG_TCOPY | |||
#define LARF_L DLARF_L | |||
#define LARF_R DLARF_R | |||
#define LAED3_SINGLE dlaed3_single | |||
#define LAED3_PARALLEL dlaed3_parallel | |||
#else | |||
#define GETF2 SGETF2 | |||
#define GETRF SGETRF | |||
@@ -3056,6 +3058,8 @@ typedef struct { | |||
#define NEG_TCOPY SNEG_TCOPY | |||
#define LARF_L SLARF_L | |||
#define LARF_R SLARF_R | |||
#define LAED3_SINGLE slaed3_single | |||
#define LAED3_PARALLEL slaed3_parallel | |||
#endif | |||
#else | |||
#ifdef XDOUBLE | |||
@@ -221,6 +221,7 @@ if (NOT NO_LAPACK) | |||
GenerateNamedObjects("lapack/lauu2.c" "" "" 0 "" "" 0 3) | |||
GenerateNamedObjects("lapack/trti2.c" "" "" 0 "" "" 0 3) | |||
endif() | |||
GenerateNamedObjects("lapack/laed3.c" "" "" 0 "" "" 0 1) | |||
endif () | |||
if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) | |||
@@ -429,8 +429,8 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) | |||
SLAPACKOBJS = \ | |||
sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ | |||
spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ | |||
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX) | |||
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX) \ | |||
slaed3.$(SUFFIX) | |||
#DLAPACKOBJS = \ | |||
# dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ | |||
@@ -440,8 +440,8 @@ SLAPACKOBJS = \ | |||
DLAPACKOBJS = \ | |||
dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ | |||
dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ | |||
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX) | |||
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX) \ | |||
dlaed3.$(SUFFIX) | |||
QLAPACKOBJS = \ | |||
qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \ | |||
@@ -2365,6 +2365,11 @@ zlarf.$(SUFFIX) zlarf.$(PSUFFIX) : larf.c | |||
xlarf.$(SUFFIX) xlarf.$(PSUFFIX) : larf.c | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
slaed3.$(SUFFIX) slaed3.$(PSUFFIX) : lapack/laed3.c | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
dlaed3.$(SUFFIX) dlaed3.$(PSUFFIX) : lapack/laed3.c | |||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||
############# BLAS EXTENSIONS ##################################### | |||
@@ -0,0 +1,87 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2025, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#if defined(DOUBLE) | |||
#define ERROR_NAME "DLAED3" | |||
#else | |||
#define ERROR_NAME "SLAED3" | |||
#endif | |||
/* ===================================================================== */ | |||
int NAME(blasint *k, blasint *n, blasint *n1, FLOAT *d, | |||
FLOAT *q, blasint *ldq, FLOAT *rho, FLOAT *dlamda, | |||
FLOAT *q2, blasint *indx, blasint *ctot, FLOAT *w, | |||
FLOAT *s, blasint *Info) | |||
{ | |||
blasint kval, nval, qdim, info; | |||
qdim = *ldq; | |||
kval = *k; | |||
nval = *n; | |||
/* Test the input parameters. */ | |||
info = 0; | |||
if (kval < 0) { | |||
info = 1; | |||
} else if (nval < kval) { | |||
info = 2; | |||
} else if (qdim < nval || qdim < 1) { | |||
info = 6; | |||
} | |||
if (info) { | |||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); | |||
*Info = - info; | |||
return 0; | |||
} | |||
/* Quick return if possible */ | |||
*Info = 0; | |||
if (kval == 0) return 0; | |||
#ifdef SMP | |||
int nthreads = num_cpu_avail(4); | |||
if (nthreads == 1) { | |||
#endif | |||
LAED3_SINGLE(k, n, n1, d, q, ldq, rho, dlamda, q2, indx, ctot, w, s, Info); | |||
#ifdef SMP | |||
} else { | |||
LAED3_PARALLEL(k, n, n1, d, q, ldq, rho, dlamda, q2, indx, ctot, w, s, Info); | |||
} | |||
#endif | |||
return 0; | |||
} |
@@ -85,7 +85,7 @@ ALLAUX_O = ilaenv.o ilaenv2stage.o ieeeck.o lsamen.o xerbla.o xerbla_array.o \ | |||
../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o | |||
ifneq "$(or $(BUILD_SINGLE),$(BUILD_COMPLEX))" "" | |||
SCLAUX = la_constants.o \ | |||
SCLAUX_O = la_constants.o \ | |||
sbdsvdx.o sstevx.o sstein.o \ | |||
sbdsdc.o \ | |||
sbdsqr.o sdisna.o slabad.o slacpy.o sladiv.o slae2.o slaebz.o \ | |||
@@ -106,7 +106,7 @@ SCLAUX = la_constants.o \ | |||
endif | |||
ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" | |||
DZLAUX = la_constants.o\ | |||
DZLAUX_O = la_constants.o\ | |||
dcombssq.o \ | |||
dbdsvdx.o dstevx.o dstein.o \ | |||
dbdsdc.o \ | |||
@@ -572,6 +572,8 @@ endif | |||
# filter out optimized codes from OpenBLAS | |||
ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o | |||
SCL_AUX_OBJS = slaed3.o | |||
DZL_AUX_OBJS = dlaed3.o | |||
SLAPACKOBJS = \ | |||
sgetrf.o sgetrs.o spotrf.o sgetf2.o \ | |||
@@ -598,6 +600,8 @@ ZLAPACKOBJS = \ | |||
zsymv.o zsyr.o zspmv.o zspr.o | |||
ALLAUX = $(filter-out $(ALL_AUX_OBJS),$(ALLAUX_O)) | |||
SCLAUX = $(filter-out $(SCL_AUX_OBJS),$(SCLAUX_O)) | |||
DZLAUX = $(filter-out $(DZL_AUX_OBJS),$(DZLAUX_O)) | |||
SLASRC = $(filter-out $(SLAPACKOBJS),$(SLASRC_O)) | |||
DLASRC = $(filter-out $(DLAPACKOBJS),$(DLASRC_O)) | |||
CLASRC = $(filter-out $(CLAPACKOBJS),$(CLASRC_O)) | |||
@@ -10,6 +10,7 @@ set(LAPACK_SOURCES | |||
potrf/potrf_L_single.c | |||
lauum/lauum_U_single.c | |||
lauum/lauum_L_single.c | |||
laed3/laed3_single.c | |||
) | |||
# add a 'z' to filename for complex version | |||
@@ -79,6 +80,7 @@ if (USE_THREAD) | |||
lauum/lauum_L_parallel.c | |||
potrf/potrf_U_parallel.c | |||
potrf/potrf_L_parallel.c | |||
laed3/laed3_parallel.c | |||
) | |||
# this has a z version | |||
@@ -2,7 +2,7 @@ TOPDIR = .. | |||
include ../Makefile.system | |||
#SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs | |||
SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri trtrs | |||
SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri trtrs laed3 | |||
FLAMEDIRS = laswp getf2 potf2 lauu2 trti2 | |||
@@ -0,0 +1,36 @@ | |||
TOPDIR = ../.. | |||
include ../../Makefile.system | |||
SBLASOBJS = slaed3_single.$(SUFFIX) | |||
DBLASOBJS = dlaed3_single.$(SUFFIX) | |||
ifdef SMP | |||
SBLASOBJS += slaed3_parallel.$(SUFFIX) | |||
DBLASOBJS += dlaed3_parallel.$(SUFFIX) | |||
endif | |||
ifeq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" | |||
SBLASOBJS= | |||
endif | |||
ifneq ($(BUILD_DOUBLE),1) | |||
DBLASOBJS= | |||
endif | |||
slaed3_single.$(SUFFIX) : laed3_single.c | |||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) | |||
dlaed3_single.$(SUFFIX) : laed3_single.c | |||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) | |||
slaed3_parallel.$(SUFFIX) : laed3_parallel.c | |||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) | |||
dlaed3_parallel.$(SUFFIX) : laed3_parallel.c | |||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) | |||
slaed3_single.$(PSUFFIX) : laed3_single.c | |||
$(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) | |||
dlaed3_single.$(PSUFFIX) : laed3_single.c | |||
$(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) | |||
slaed3_parallel.$(PSUFFIX) : laed3_parallel.c | |||
$(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) | |||
dlaed3_parallel.$(PSUFFIX) : laed3_parallel.c | |||
$(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) | |||
include ../../Makefile.tail |
@@ -0,0 +1,243 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2025, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#define max(a,b) ((a) > (b) ? (a) : (b)) | |||
#define copysign(x,y) ((y) < 0 ? ((x) < 0 ? (x) : -(x)) : ((x) < 0 ? -(x) : (x))) | |||
#if defined(DOUBLE) | |||
#define LAMC3 BLASFUNC(dlamc3) | |||
#define LAED4 BLASFUNC(dlaed4) | |||
#define GEMM BLASFUNC(dgemm) | |||
#define NRM2 BLASFUNC(dnrm2) | |||
#define COPY BLASFUNC(dcopy) | |||
#define LACPY BLASFUNC(dlacpy) | |||
#define LASET BLASFUNC(dlaset) | |||
#else | |||
#define LAMC3 BLASFUNC(slamc3) | |||
#define LAED4 BLASFUNC(slaed4) | |||
#define GEMM BLASFUNC(sgemm) | |||
#define NRM2 BLASFUNC(snrm2) | |||
#define COPY BLASFUNC(scopy) | |||
#define LACPY BLASFUNC(slacpy) | |||
#define LASET BLASFUNC(slaset) | |||
#endif | |||
FLOAT LAMC3(FLOAT *, FLOAT *); | |||
void LAED4(blasint *, blasint *, FLOAT *, FLOAT *, FLOAT *, FLOAT *, FLOAT *, blasint *); | |||
void LACPY(char *, blasint *, blasint *, FLOAT *, blasint *, FLOAT *, blasint *); | |||
void LASET(char *, blasint *, blasint *, FLOAT *, FLOAT *, FLOAT *, blasint *); | |||
/* Table of constant values */ | |||
static blasint c1 = 1; | |||
static FLOAT c1f = 1.; | |||
static FLOAT c0f = 0.; | |||
static void inner_laed4_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ | |||
blasint kval = args -> m; | |||
blasint j, j_from, j_to; | |||
FLOAT *dlamda = (FLOAT *)args -> a; | |||
FLOAT *w = (FLOAT *)args -> b; | |||
FLOAT *q = (FLOAT *)args -> c; | |||
BLASLONG qdim = args -> ldc; | |||
FLOAT *d = (FLOAT *)args -> d; | |||
FLOAT rho = *(FLOAT *)args -> alpha; | |||
blasint *info = &((blasint*)args -> beta)[mypos]; | |||
j_from = range_m[0] + 1; | |||
j_to = range_m[1]; | |||
for (j = j_from; j <= j_to; j++) { | |||
LAED4(&kval, &j, dlamda, w, &q[(j - 1) * qdim], &rho, &d[j - 1], info); | |||
if(*info != 0) break; | |||
} | |||
} | |||
static void inner_wloop_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ | |||
blasint kval = args -> m; | |||
blasint i, j, i_from, i_to; | |||
FLOAT *dlamda = (FLOAT *)args -> a; | |||
FLOAT *w = (FLOAT *)args -> b; | |||
FLOAT *q = (FLOAT *)args -> c; | |||
BLASLONG qdim = args -> ldc; | |||
i_from = range_m[0]; | |||
i_to = range_m[1]; | |||
for (j = 0; j < kval; j++) { | |||
for (i = i_from; i < i_to; i++) { | |||
if (i != j) w[i] *= q[j * qdim + i] / (dlamda[i] - dlamda[j]); | |||
} | |||
} | |||
} | |||
/* ===================================================================== */ | |||
blasint CNAME(blasint *k, blasint *n, blasint *n1, FLOAT *d, | |||
FLOAT *q, blasint *ldq, FLOAT *rho, FLOAT *dlamda, | |||
FLOAT *q2, blasint *indx, blasint *ctot, FLOAT *w, | |||
FLOAT *s, blasint *info) | |||
{ | |||
FLOAT temp; | |||
blasint kval, qdim; | |||
blasint i, j, itmp; | |||
blasint n2, n12, ii, n23, iq2; | |||
blas_queue_t queue[MAX_CPU_NUMBER]; | |||
blas_arg_t args; | |||
BLASLONG range[MAX_CPU_NUMBER + 1]; | |||
blasint infoarray[MAX_CPU_NUMBER]; | |||
int width, num_cpu, mode, nthreads; | |||
qdim = *ldq; | |||
kval = *k; | |||
/* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can */ | |||
/* be computed with high relative accuracy (barring over/underflow). */ | |||
for (i = 0; i < kval; i++) { | |||
dlamda[i] = LAMC3(&dlamda[i], &dlamda[i]) - dlamda[i]; | |||
} | |||
nthreads = num_cpu_avail(4); | |||
#if defined(DOUBLE) | |||
mode = BLAS_DOUBLE | BLAS_REAL; | |||
#else | |||
mode = BLAS_SINGLE | BLAS_REAL; | |||
#endif | |||
args.m = kval; | |||
args.a = (void *)dlamda; | |||
args.b = (void *)w; | |||
args.c = (void *)q; | |||
args.ldc = qdim; | |||
args.d = (void *)d; | |||
args.alpha = (void *)rho; | |||
args.beta = (void *)infoarray; | |||
num_cpu = 0; | |||
range[0] = 0; | |||
i = kval; | |||
while (i > 0) { | |||
width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); | |||
range[num_cpu + 1] = range[num_cpu] + width; | |||
queue[num_cpu].range_m = &range[num_cpu]; | |||
queue[num_cpu].range_n = NULL; | |||
queue[num_cpu].routine = inner_laed4_thread; | |||
queue[num_cpu].args = &args; | |||
queue[num_cpu].sa = NULL; | |||
queue[num_cpu].sb = NULL; | |||
queue[num_cpu].mode = mode; | |||
queue[num_cpu].next = &queue[num_cpu + 1]; | |||
infoarray[num_cpu] = 0; | |||
num_cpu ++; | |||
i -= width; | |||
} | |||
if (num_cpu) { | |||
queue[num_cpu - 1].next = NULL; | |||
exec_blas(num_cpu, queue); | |||
} | |||
for (i = 0; i < num_cpu; i++) { | |||
*info = max(infoarray[i], *info); | |||
} | |||
/* If the zero finder fails, the computation is terminated. */ | |||
if (*info != 0) { | |||
return 0; | |||
} | |||
if (kval == 2) { | |||
for (j = 0; j < kval; j++) { | |||
w[0] = q[j * qdim]; | |||
w[1] = q[j * qdim + 1]; | |||
ii = indx[0] - 1; | |||
q[j * qdim] = w[ii]; | |||
ii = indx[1] - 1; | |||
q[j * qdim + 1] = w[ii]; | |||
} | |||
} else if (kval != 1) { | |||
/* Compute updated W. */ | |||
COPY(k, w, &c1, s, &c1); | |||
/* Initialize W(I) = Q(I,I) */ | |||
itmp = qdim + 1; | |||
COPY(k, q, &itmp, w, &c1); | |||
for (i = 0; i < num_cpu; i++) { | |||
queue[i].routine = inner_wloop_thread; | |||
} | |||
if (num_cpu) { | |||
exec_blas(num_cpu, queue); | |||
} | |||
for (i = 0; i < kval; i++) { | |||
temp = sqrt(-w[i]); | |||
w[i] = copysign(temp, s[i]); | |||
} | |||
/* Compute eigenvectors of the modified rank-1 modification. */ | |||
for (j = 0; j < kval; j++) { | |||
for (i = 0; i < kval; i++) { | |||
s[i] = w[i] / q[j * qdim + i]; | |||
} | |||
temp = NRM2(k, s, &c1); | |||
for (i = 0; i < kval; i++) { | |||
ii = indx[i] - 1; | |||
q[j * qdim + i] = s[ii] / temp; | |||
} | |||
} | |||
} | |||
/* Compute the updated eigenvectors. */ | |||
n2 = *n - *n1; | |||
n12 = ctot[0] + ctot[1]; | |||
n23 = ctot[1] + ctot[2]; | |||
LACPY("A", &n23, k, &q[ctot[0]], ldq, s, &n23); | |||
iq2 = *n1 * n12; | |||
if (n23 != 0) { | |||
GEMM("N", "N", &n2, k, &n23, &c1f, &q2[iq2], &n2, s, &n23, &c0f, &q[*n1], ldq); | |||
} else { | |||
LASET("A", &n2, k, &c0f, &c0f, &q[*n1], ldq); | |||
} | |||
LACPY("A", &n12, k, q, ldq, s, &n12); | |||
if (n12 != 0) { | |||
GEMM("N", "N", n1, k, &n12, &c1f, q2, n1, s, &n12, &c0f, q, ldq); | |||
} else { | |||
LASET("A", n1, k, &c0f, &c0f, q, ldq); | |||
} | |||
return 0; | |||
} |
@@ -0,0 +1,166 @@ | |||
/*************************************************************************** | |||
Copyright (c) 2025, The OpenBLAS Project | |||
All rights reserved. | |||
Redistribution and use in source and binary forms, with or without | |||
modification, are permitted provided that the following conditions are | |||
met: | |||
1. Redistributions of source code must retain the above copyright | |||
notice, this list of conditions and the following disclaimer. | |||
2. Redistributions in binary form must reproduce the above copyright | |||
notice, this list of conditions and the following disclaimer in | |||
the documentation and/or other materials provided with the | |||
distribution. | |||
3. Neither the name of the OpenBLAS project nor the names of | |||
its contributors may be used to endorse or promote products | |||
derived from this software without specific prior written | |||
permission. | |||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
*****************************************************************************/ | |||
#include <stdio.h> | |||
#include "common.h" | |||
#define copysign(x,y) ((y) < 0 ? ((x) < 0 ? (x) : -(x)) : ((x) < 0 ? -(x) : (x))) | |||
#if defined(DOUBLE) | |||
#define LAMC3 BLASFUNC(dlamc3) | |||
#define LAED4 BLASFUNC(dlaed4) | |||
#define GEMM BLASFUNC(dgemm) | |||
#define NRM2 BLASFUNC(dnrm2) | |||
#define COPY BLASFUNC(dcopy) | |||
#define LACPY BLASFUNC(dlacpy) | |||
#define LASET BLASFUNC(dlaset) | |||
#else | |||
#define LAMC3 BLASFUNC(slamc3) | |||
#define LAED4 BLASFUNC(slaed4) | |||
#define GEMM BLASFUNC(sgemm) | |||
#define NRM2 BLASFUNC(snrm2) | |||
#define COPY BLASFUNC(scopy) | |||
#define LACPY BLASFUNC(slacpy) | |||
#define LASET BLASFUNC(slaset) | |||
#endif | |||
FLOAT LAMC3(FLOAT *, FLOAT *); | |||
void LAED4(blasint *, blasint *, FLOAT *, FLOAT *, FLOAT *, FLOAT *, FLOAT *, blasint *); | |||
void LACPY(char *, blasint *, blasint *, FLOAT *, blasint *, FLOAT *, blasint *); | |||
void LASET(char *, blasint *, blasint *, FLOAT *, FLOAT *, FLOAT *, blasint *); | |||
/* Table of constant values */ | |||
static blasint c1 = 1; | |||
static FLOAT c1f = 1.; | |||
static FLOAT c0f = 0.; | |||
/* ===================================================================== */ | |||
blasint CNAME(blasint *k, blasint *n, blasint *n1, FLOAT *d, | |||
FLOAT *q, blasint *ldq, FLOAT *rho, FLOAT *dlamda, | |||
FLOAT *q2, blasint *indx, blasint *ctot, FLOAT *w, | |||
FLOAT *s, blasint *info) | |||
{ | |||
FLOAT temp; | |||
blasint kval, qdim; | |||
blasint i, j, itmp; | |||
blasint n2, n12, ii, n23, iq2; | |||
qdim = *ldq; | |||
kval = *k; | |||
/* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can */ | |||
/* be computed with high relative accuracy (barring over/underflow). */ | |||
for (i = 0; i < kval; i++) { | |||
dlamda[i] = LAMC3(&dlamda[i], &dlamda[i]) - dlamda[i]; | |||
} | |||
for (j = 1; j <= kval; j++) { | |||
LAED4(k, &j, dlamda, w, &q[(j - 1) * qdim], rho, &d[j - 1], info); | |||
if(*info != 0) break; | |||
} | |||
/* If the zero finder fails, the computation is terminated. */ | |||
if (*info != 0) { | |||
return 0; | |||
} | |||
if (kval == 2) { | |||
for (j = 0; j < kval; j++) { | |||
w[0] = q[j * qdim]; | |||
w[1] = q[j * qdim + 1]; | |||
ii = indx[0] - 1; | |||
q[j * qdim] = w[ii]; | |||
ii = indx[1] - 1; | |||
q[j * qdim + 1] = w[ii]; | |||
} | |||
} else if (kval != 1) { | |||
/* Compute updated W. */ | |||
COPY(k, w, &c1, s, &c1); | |||
/* Initialize W(I) = Q(I,I) */ | |||
itmp = qdim + 1; | |||
COPY(k, q, &itmp, w, &c1); | |||
for (j = 0; j < kval; j++) { | |||
for (i = 0; i < j; i++) { | |||
w[i] *= q[j * qdim + i] / (dlamda[i] - dlamda[j]); | |||
} | |||
for (i = j + 1; i < kval; i++) { | |||
w[i] *= q[j * qdim + i] / (dlamda[i] - dlamda[j]); | |||
} | |||
} | |||
for (i = 0; i < kval; i++) { | |||
temp = sqrt(-w[i]); | |||
w[i] = copysign(temp, s[i]); | |||
} | |||
/* Compute eigenvectors of the modified rank-1 modification. */ | |||
for (j = 0; j < kval; j++) { | |||
for (i = 0; i < kval; i++) { | |||
s[i] = w[i] / q[j * qdim + i]; | |||
} | |||
temp = NRM2(k, s, &c1); | |||
for (i = 0; i < kval; i++) { | |||
ii = indx[i] - 1; | |||
q[j * qdim + i] = s[ii] / temp; | |||
} | |||
} | |||
} | |||
/* Compute the updated eigenvectors. */ | |||
n2 = *n - *n1; | |||
n12 = ctot[0] + ctot[1]; | |||
n23 = ctot[1] + ctot[2]; | |||
LACPY("A", &n23, k, &q[ctot[0]], ldq, s, &n23); | |||
iq2 = *n1 * n12; | |||
if (n23 != 0) { | |||
GEMM("N", "N", &n2, k, &n23, &c1f, &q2[iq2], &n2, s, &n23, &c0f, &q[*n1], ldq); | |||
} else { | |||
LASET("A", &n2, k, &c0f, &c0f, &q[*n1], ldq); | |||
} | |||
LACPY("A", &n12, k, q, ldq, s, &n12); | |||
if (n12 != 0) { | |||
GEMM("N", "N", n1, k, &n12, &c1f, q2, n1, s, &n12, &c0f, q, ldq); | |||
} else { | |||
LASET("A", n1, k, &c0f, &c0f, q, ldq); | |||
} | |||
return 0; | |||
} |