Improve [SD]SYEVD performance by parallelizing [SD]LAED3pull/5356/head
@@ -11,7 +11,7 @@ set(SCLAUX | |||||
la_constants.f90 | la_constants.f90 | ||||
sbdsdc.f | sbdsdc.f | ||||
sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f | sbdsqr.f sdisna.f slabad.f slacpy.f sladiv.f slae2.f slaebz.f | ||||
slaed0.f slaed1.f slaed2.f slaed3.f slaed4.f slaed5.f slaed6.f | |||||
slaed0.f slaed1.f slaed2.f slaed4.f slaed5.f slaed6.f | |||||
slaed7.f slaed8.f slaed9.f slaeda.f slaev2.f slagtf.f | slaed7.f slaed8.f slaed9.f slaeda.f slaev2.f slagtf.f | ||||
slagts.f slamrg.f slanst.f | slagts.f slamrg.f slanst.f | ||||
slapy2.f slapy3.f slarnv.f | slapy2.f slapy3.f slarnv.f | ||||
@@ -31,7 +31,7 @@ set(DZLAUX | |||||
dbdsdc.f | dbdsdc.f | ||||
dbdsvdx.f dstevx.f dstein.f | dbdsvdx.f dstevx.f dstein.f | ||||
dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f | dbdsqr.f ddisna.f dlabad.f dlacpy.f dladiv.f dlae2.f dlaebz.f | ||||
dlaed0.f dlaed1.f dlaed2.f dlaed3.f dlaed4.f dlaed5.f dlaed6.f | |||||
dlaed0.f dlaed1.f dlaed2.f dlaed4.f dlaed5.f dlaed6.f | |||||
dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f | dlaed7.f dlaed8.f dlaed9.f dlaeda.f dlaev2.f dlagtf.f | ||||
dlagts.f dlamrg.f dlanst.f | dlagts.f dlamrg.f dlanst.f | ||||
dlapy2.f dlapy3.f dlarnv.f | dlapy2.f dlapy3.f dlarnv.f | ||||
@@ -517,7 +517,7 @@ set(SCLAUX | |||||
scombssq.c sbdsvdx.c sstevx.c sstein.c | scombssq.c sbdsvdx.c sstevx.c sstein.c | ||||
sbdsdc.c | sbdsdc.c | ||||
sbdsqr.c sdisna.c slabad.c slacpy.c sladiv.c slae2.c slaebz.c | sbdsqr.c sdisna.c slabad.c slacpy.c sladiv.c slae2.c slaebz.c | ||||
slaed0.c slaed1.c slaed2.c slaed3.c slaed4.c slaed5.c slaed6.c | |||||
slaed0.c slaed1.c slaed2.c slaed4.c slaed5.c slaed6.c | |||||
slaed7.c slaed8.c slaed9.c slaeda.c slaev2.c slagtf.c | slaed7.c slaed8.c slaed9.c slaeda.c slaev2.c slagtf.c | ||||
slagts.c slamrg.c slanst.c | slagts.c slamrg.c slanst.c | ||||
slapy2.c slapy3.c slarnv.c | slapy2.c slapy3.c slarnv.c | ||||
@@ -536,7 +536,7 @@ set(DZLAUX | |||||
dbdsdc.c | dbdsdc.c | ||||
dbdsvdx.c dstevx.c dstein.c | dbdsvdx.c dstevx.c dstein.c | ||||
dbdsqr.c ddisna.c dlabad.c dlacpy.c dladiv.c dlae2.c dlaebz.c | dbdsqr.c ddisna.c dlabad.c dlacpy.c dladiv.c dlae2.c dlaebz.c | ||||
dlaed0.c dlaed1.c dlaed2.c dlaed3.c dlaed4.c dlaed5.c dlaed6.c | |||||
dlaed0.c dlaed1.c dlaed2.c dlaed4.c dlaed5.c dlaed6.c | |||||
dlaed7.c dlaed8.c dlaed9.c dlaeda.c dlaev2.c dlagtf.c | dlaed7.c dlaed8.c dlaed9.c dlaeda.c dlaev2.c dlagtf.c | ||||
dlagts.c dlamrg.c dlanst.c | dlagts.c dlamrg.c dlanst.c | ||||
dlapy2.c dlapy3.c dlarnv.c | dlapy2.c dlapy3.c dlarnv.c | ||||
@@ -439,4 +439,9 @@ blasint xtrtrs_LRN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdo | |||||
blasint xtrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); | blasint xtrtrs_LCU_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); | ||||
blasint xtrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); | blasint xtrtrs_LCN_parallel(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); | ||||
blasint slaed3_single(blasint *, blasint *, blasint *, float *, float *, blasint *, float *, float *, float *, blasint *, blasint *, float *, float *, blasint *); | |||||
blasint dlaed3_single(blasint *, blasint *, blasint *, double *, double *, blasint *, double *, double *, double *, blasint *, blasint *, double *, double *, blasint *); | |||||
blasint slaed3_parallel(blasint *, blasint *, blasint *, float *, float *, blasint *, float *, float *, float *, blasint *, blasint *, float *, float *, blasint *); | |||||
blasint dlaed3_parallel(blasint *, blasint *, blasint *, double *, double *, blasint *, double *, double *, double *, blasint *, blasint *, double *, double *, blasint *); | |||||
#endif | #endif |
@@ -3035,6 +3035,8 @@ typedef struct { | |||||
#define NEG_TCOPY DNEG_TCOPY | #define NEG_TCOPY DNEG_TCOPY | ||||
#define LARF_L DLARF_L | #define LARF_L DLARF_L | ||||
#define LARF_R DLARF_R | #define LARF_R DLARF_R | ||||
#define LAED3_SINGLE dlaed3_single | |||||
#define LAED3_PARALLEL dlaed3_parallel | |||||
#else | #else | ||||
#define GETF2 SGETF2 | #define GETF2 SGETF2 | ||||
#define GETRF SGETRF | #define GETRF SGETRF | ||||
@@ -3056,6 +3058,8 @@ typedef struct { | |||||
#define NEG_TCOPY SNEG_TCOPY | #define NEG_TCOPY SNEG_TCOPY | ||||
#define LARF_L SLARF_L | #define LARF_L SLARF_L | ||||
#define LARF_R SLARF_R | #define LARF_R SLARF_R | ||||
#define LAED3_SINGLE slaed3_single | |||||
#define LAED3_PARALLEL slaed3_parallel | |||||
#endif | #endif | ||||
#else | #else | ||||
#ifdef XDOUBLE | #ifdef XDOUBLE | ||||
@@ -221,6 +221,7 @@ if (NOT NO_LAPACK) | |||||
GenerateNamedObjects("lapack/lauu2.c" "" "" 0 "" "" 0 3) | GenerateNamedObjects("lapack/lauu2.c" "" "" 0 "" "" 0 3) | ||||
GenerateNamedObjects("lapack/trti2.c" "" "" 0 "" "" 0 3) | GenerateNamedObjects("lapack/trti2.c" "" "" 0 "" "" 0 3) | ||||
endif() | endif() | ||||
GenerateNamedObjects("lapack/laed3.c" "" "" 0 "" "" 0 1) | |||||
endif () | endif () | ||||
if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) | if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) | ||||
@@ -429,8 +429,8 @@ XBLASOBJS = $(XBLAS1OBJS) $(XBLAS2OBJS) $(XBLAS3OBJS) | |||||
SLAPACKOBJS = \ | SLAPACKOBJS = \ | ||||
sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ | sgetrf.$(SUFFIX) sgetrs.$(SUFFIX) spotrf.$(SUFFIX) sgetf2.$(SUFFIX) \ | ||||
spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ | spotf2.$(SUFFIX) slaswp.$(SUFFIX) sgesv.$(SUFFIX) slauu2.$(SUFFIX) \ | ||||
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX) | |||||
slauum.$(SUFFIX) strti2.$(SUFFIX) strtri.$(SUFFIX) strtrs.$(SUFFIX) \ | |||||
slaed3.$(SUFFIX) | |||||
#DLAPACKOBJS = \ | #DLAPACKOBJS = \ | ||||
# dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ | # dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ | ||||
@@ -440,8 +440,8 @@ SLAPACKOBJS = \ | |||||
DLAPACKOBJS = \ | DLAPACKOBJS = \ | ||||
dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ | dgetrf.$(SUFFIX) dgetrs.$(SUFFIX) dpotrf.$(SUFFIX) dgetf2.$(SUFFIX) \ | ||||
dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ | dpotf2.$(SUFFIX) dlaswp.$(SUFFIX) dgesv.$(SUFFIX) dlauu2.$(SUFFIX) \ | ||||
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX) | |||||
dlauum.$(SUFFIX) dtrti2.$(SUFFIX) dtrtri.$(SUFFIX) dtrtrs.$(SUFFIX) \ | |||||
dlaed3.$(SUFFIX) | |||||
QLAPACKOBJS = \ | QLAPACKOBJS = \ | ||||
qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \ | qgetf2.$(SUFFIX) qgetrf.$(SUFFIX) qlauu2.$(SUFFIX) qlauum.$(SUFFIX) \ | ||||
@@ -2365,6 +2365,11 @@ zlarf.$(SUFFIX) zlarf.$(PSUFFIX) : larf.c | |||||
xlarf.$(SUFFIX) xlarf.$(PSUFFIX) : larf.c | xlarf.$(SUFFIX) xlarf.$(PSUFFIX) : larf.c | ||||
$(CC) -c $(CFLAGS) $< -o $(@F) | $(CC) -c $(CFLAGS) $< -o $(@F) | ||||
slaed3.$(SUFFIX) slaed3.$(PSUFFIX) : lapack/laed3.c | |||||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||||
dlaed3.$(SUFFIX) dlaed3.$(PSUFFIX) : lapack/laed3.c | |||||
$(CC) -c $(CFLAGS) $< -o $(@F) | |||||
############# BLAS EXTENSIONS ##################################### | ############# BLAS EXTENSIONS ##################################### | ||||
@@ -0,0 +1,87 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2025, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written | |||||
permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include <stdio.h> | |||||
#include "common.h" | |||||
#if defined(DOUBLE) | |||||
#define ERROR_NAME "DLAED3" | |||||
#else | |||||
#define ERROR_NAME "SLAED3" | |||||
#endif | |||||
/* ===================================================================== */ | |||||
int NAME(blasint *k, blasint *n, blasint *n1, FLOAT *d, | |||||
FLOAT *q, blasint *ldq, FLOAT *rho, FLOAT *dlamda, | |||||
FLOAT *q2, blasint *indx, blasint *ctot, FLOAT *w, | |||||
FLOAT *s, blasint *Info) | |||||
{ | |||||
blasint kval, nval, qdim, info; | |||||
qdim = *ldq; | |||||
kval = *k; | |||||
nval = *n; | |||||
/* Test the input parameters. */ | |||||
info = 0; | |||||
if (kval < 0) { | |||||
info = 1; | |||||
} else if (nval < kval) { | |||||
info = 2; | |||||
} else if (qdim < nval || qdim < 1) { | |||||
info = 6; | |||||
} | |||||
if (info) { | |||||
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME) - 1); | |||||
*Info = - info; | |||||
return 0; | |||||
} | |||||
/* Quick return if possible */ | |||||
*Info = 0; | |||||
if (kval == 0) return 0; | |||||
#ifdef SMP | |||||
int nthreads = num_cpu_avail(4); | |||||
if (nthreads == 1) { | |||||
#endif | |||||
LAED3_SINGLE(k, n, n1, d, q, ldq, rho, dlamda, q2, indx, ctot, w, s, Info); | |||||
#ifdef SMP | |||||
} else { | |||||
LAED3_PARALLEL(k, n, n1, d, q, ldq, rho, dlamda, q2, indx, ctot, w, s, Info); | |||||
} | |||||
#endif | |||||
return 0; | |||||
} |
@@ -85,7 +85,7 @@ ALLAUX_O = ilaenv.o ilaenv2stage.o ieeeck.o lsamen.o xerbla.o xerbla_array.o \ | |||||
../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o | ../INSTALL/ilaver.o ../INSTALL/lsame.o ../INSTALL/slamch.o | ||||
ifneq "$(or $(BUILD_SINGLE),$(BUILD_COMPLEX))" "" | ifneq "$(or $(BUILD_SINGLE),$(BUILD_COMPLEX))" "" | ||||
SCLAUX = la_constants.o \ | |||||
SCLAUX_O = la_constants.o \ | |||||
sbdsvdx.o sstevx.o sstein.o \ | sbdsvdx.o sstevx.o sstein.o \ | ||||
sbdsdc.o \ | sbdsdc.o \ | ||||
sbdsqr.o sdisna.o slabad.o slacpy.o sladiv.o slae2.o slaebz.o \ | sbdsqr.o sdisna.o slabad.o slacpy.o sladiv.o slae2.o slaebz.o \ | ||||
@@ -106,7 +106,7 @@ SCLAUX = la_constants.o \ | |||||
endif | endif | ||||
ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" | ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" | ||||
DZLAUX = la_constants.o\ | |||||
DZLAUX_O = la_constants.o\ | |||||
dcombssq.o \ | dcombssq.o \ | ||||
dbdsvdx.o dstevx.o dstein.o \ | dbdsvdx.o dstevx.o dstein.o \ | ||||
dbdsdc.o \ | dbdsdc.o \ | ||||
@@ -572,6 +572,8 @@ endif | |||||
# filter out optimized codes from OpenBLAS | # filter out optimized codes from OpenBLAS | ||||
ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o | ALL_AUX_OBJS = xerbla.o ../INSTALL/lsame.o | ||||
SCL_AUX_OBJS = slaed3.o | |||||
DZL_AUX_OBJS = dlaed3.o | |||||
SLAPACKOBJS = \ | SLAPACKOBJS = \ | ||||
sgetrf.o sgetrs.o spotrf.o sgetf2.o \ | sgetrf.o sgetrs.o spotrf.o sgetf2.o \ | ||||
@@ -598,6 +600,8 @@ ZLAPACKOBJS = \ | |||||
zsymv.o zsyr.o zspmv.o zspr.o | zsymv.o zsyr.o zspmv.o zspr.o | ||||
ALLAUX = $(filter-out $(ALL_AUX_OBJS),$(ALLAUX_O)) | ALLAUX = $(filter-out $(ALL_AUX_OBJS),$(ALLAUX_O)) | ||||
SCLAUX = $(filter-out $(SCL_AUX_OBJS),$(SCLAUX_O)) | |||||
DZLAUX = $(filter-out $(DZL_AUX_OBJS),$(DZLAUX_O)) | |||||
SLASRC = $(filter-out $(SLAPACKOBJS),$(SLASRC_O)) | SLASRC = $(filter-out $(SLAPACKOBJS),$(SLASRC_O)) | ||||
DLASRC = $(filter-out $(DLAPACKOBJS),$(DLASRC_O)) | DLASRC = $(filter-out $(DLAPACKOBJS),$(DLASRC_O)) | ||||
CLASRC = $(filter-out $(CLAPACKOBJS),$(CLASRC_O)) | CLASRC = $(filter-out $(CLAPACKOBJS),$(CLASRC_O)) | ||||
@@ -10,6 +10,7 @@ set(LAPACK_SOURCES | |||||
potrf/potrf_L_single.c | potrf/potrf_L_single.c | ||||
lauum/lauum_U_single.c | lauum/lauum_U_single.c | ||||
lauum/lauum_L_single.c | lauum/lauum_L_single.c | ||||
laed3/laed3_single.c | |||||
) | ) | ||||
# add a 'z' to filename for complex version | # add a 'z' to filename for complex version | ||||
@@ -79,6 +80,7 @@ if (USE_THREAD) | |||||
lauum/lauum_L_parallel.c | lauum/lauum_L_parallel.c | ||||
potrf/potrf_U_parallel.c | potrf/potrf_U_parallel.c | ||||
potrf/potrf_L_parallel.c | potrf/potrf_L_parallel.c | ||||
laed3/laed3_parallel.c | |||||
) | ) | ||||
# this has a z version | # this has a z version | ||||
@@ -2,7 +2,7 @@ TOPDIR = .. | |||||
include ../Makefile.system | include ../Makefile.system | ||||
#SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs | #SUBDIRS = laswp getf2 getrf potf2 potrf lauu2 lauum trti2 trtri getrs | ||||
SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri trtrs | |||||
SUBDIRS = getrf getf2 laswp getrs potrf potf2 lauu2 lauum trti2 trtri trtrs laed3 | |||||
FLAMEDIRS = laswp getf2 potf2 lauu2 trti2 | FLAMEDIRS = laswp getf2 potf2 lauu2 trti2 | ||||
@@ -0,0 +1,36 @@ | |||||
TOPDIR = ../.. | |||||
include ../../Makefile.system | |||||
SBLASOBJS = slaed3_single.$(SUFFIX) | |||||
DBLASOBJS = dlaed3_single.$(SUFFIX) | |||||
ifdef SMP | |||||
SBLASOBJS += slaed3_parallel.$(SUFFIX) | |||||
DBLASOBJS += dlaed3_parallel.$(SUFFIX) | |||||
endif | |||||
ifeq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" | |||||
SBLASOBJS= | |||||
endif | |||||
ifneq ($(BUILD_DOUBLE),1) | |||||
DBLASOBJS= | |||||
endif | |||||
slaed3_single.$(SUFFIX) : laed3_single.c | |||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) | |||||
dlaed3_single.$(SUFFIX) : laed3_single.c | |||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) | |||||
slaed3_parallel.$(SUFFIX) : laed3_parallel.c | |||||
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) | |||||
dlaed3_parallel.$(SUFFIX) : laed3_parallel.c | |||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) | |||||
slaed3_single.$(PSUFFIX) : laed3_single.c | |||||
$(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) | |||||
dlaed3_single.$(PSUFFIX) : laed3_single.c | |||||
$(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) | |||||
slaed3_parallel.$(PSUFFIX) : laed3_parallel.c | |||||
$(CC) -c $(PFLAGS) -UCOMPLEX -UDOUBLE $< -o $(@F) | |||||
dlaed3_parallel.$(PSUFFIX) : laed3_parallel.c | |||||
$(CC) -c $(PFLAGS) -UCOMPLEX -DDOUBLE $< -o $(@F) | |||||
include ../../Makefile.tail |
@@ -0,0 +1,243 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2025, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written | |||||
permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include <stdio.h> | |||||
#include "common.h" | |||||
#define max(a,b) ((a) > (b) ? (a) : (b)) | |||||
#define copysign(x,y) ((y) < 0 ? ((x) < 0 ? (x) : -(x)) : ((x) < 0 ? -(x) : (x))) | |||||
#if defined(DOUBLE) | |||||
#define LAMC3 BLASFUNC(dlamc3) | |||||
#define LAED4 BLASFUNC(dlaed4) | |||||
#define GEMM BLASFUNC(dgemm) | |||||
#define NRM2 BLASFUNC(dnrm2) | |||||
#define COPY BLASFUNC(dcopy) | |||||
#define LACPY BLASFUNC(dlacpy) | |||||
#define LASET BLASFUNC(dlaset) | |||||
#else | |||||
#define LAMC3 BLASFUNC(slamc3) | |||||
#define LAED4 BLASFUNC(slaed4) | |||||
#define GEMM BLASFUNC(sgemm) | |||||
#define NRM2 BLASFUNC(snrm2) | |||||
#define COPY BLASFUNC(scopy) | |||||
#define LACPY BLASFUNC(slacpy) | |||||
#define LASET BLASFUNC(slaset) | |||||
#endif | |||||
FLOAT LAMC3(FLOAT *, FLOAT *); | |||||
void LAED4(blasint *, blasint *, FLOAT *, FLOAT *, FLOAT *, FLOAT *, FLOAT *, blasint *); | |||||
void LACPY(char *, blasint *, blasint *, FLOAT *, blasint *, FLOAT *, blasint *); | |||||
void LASET(char *, blasint *, blasint *, FLOAT *, FLOAT *, FLOAT *, blasint *); | |||||
/* Table of constant values */ | |||||
static blasint c1 = 1; | |||||
static FLOAT c1f = 1.; | |||||
static FLOAT c0f = 0.; | |||||
static void inner_laed4_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ | |||||
blasint kval = args -> m; | |||||
blasint j, j_from, j_to; | |||||
FLOAT *dlamda = (FLOAT *)args -> a; | |||||
FLOAT *w = (FLOAT *)args -> b; | |||||
FLOAT *q = (FLOAT *)args -> c; | |||||
BLASLONG qdim = args -> ldc; | |||||
FLOAT *d = (FLOAT *)args -> d; | |||||
FLOAT rho = *(FLOAT *)args -> alpha; | |||||
blasint *info = &((blasint*)args -> beta)[mypos]; | |||||
j_from = range_m[0] + 1; | |||||
j_to = range_m[1]; | |||||
for (j = j_from; j <= j_to; j++) { | |||||
LAED4(&kval, &j, dlamda, w, &q[(j - 1) * qdim], &rho, &d[j - 1], info); | |||||
if(*info != 0) break; | |||||
} | |||||
} | |||||
static void inner_wloop_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ | |||||
blasint kval = args -> m; | |||||
blasint i, j, i_from, i_to; | |||||
FLOAT *dlamda = (FLOAT *)args -> a; | |||||
FLOAT *w = (FLOAT *)args -> b; | |||||
FLOAT *q = (FLOAT *)args -> c; | |||||
BLASLONG qdim = args -> ldc; | |||||
i_from = range_m[0]; | |||||
i_to = range_m[1]; | |||||
for (j = 0; j < kval; j++) { | |||||
for (i = i_from; i < i_to; i++) { | |||||
if (i != j) w[i] *= q[j * qdim + i] / (dlamda[i] - dlamda[j]); | |||||
} | |||||
} | |||||
} | |||||
/* ===================================================================== */ | |||||
blasint CNAME(blasint *k, blasint *n, blasint *n1, FLOAT *d, | |||||
FLOAT *q, blasint *ldq, FLOAT *rho, FLOAT *dlamda, | |||||
FLOAT *q2, blasint *indx, blasint *ctot, FLOAT *w, | |||||
FLOAT *s, blasint *info) | |||||
{ | |||||
FLOAT temp; | |||||
blasint kval, qdim; | |||||
blasint i, j, itmp; | |||||
blasint n2, n12, ii, n23, iq2; | |||||
blas_queue_t queue[MAX_CPU_NUMBER]; | |||||
blas_arg_t args; | |||||
BLASLONG range[MAX_CPU_NUMBER + 1]; | |||||
blasint infoarray[MAX_CPU_NUMBER]; | |||||
int width, num_cpu, mode, nthreads; | |||||
qdim = *ldq; | |||||
kval = *k; | |||||
/* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can */ | |||||
/* be computed with high relative accuracy (barring over/underflow). */ | |||||
for (i = 0; i < kval; i++) { | |||||
dlamda[i] = LAMC3(&dlamda[i], &dlamda[i]) - dlamda[i]; | |||||
} | |||||
nthreads = num_cpu_avail(4); | |||||
#if defined(DOUBLE) | |||||
mode = BLAS_DOUBLE | BLAS_REAL; | |||||
#else | |||||
mode = BLAS_SINGLE | BLAS_REAL; | |||||
#endif | |||||
args.m = kval; | |||||
args.a = (void *)dlamda; | |||||
args.b = (void *)w; | |||||
args.c = (void *)q; | |||||
args.ldc = qdim; | |||||
args.d = (void *)d; | |||||
args.alpha = (void *)rho; | |||||
args.beta = (void *)infoarray; | |||||
num_cpu = 0; | |||||
range[0] = 0; | |||||
i = kval; | |||||
while (i > 0) { | |||||
width = blas_quickdivide(i + nthreads - num_cpu - 1, nthreads - num_cpu); | |||||
range[num_cpu + 1] = range[num_cpu] + width; | |||||
queue[num_cpu].range_m = &range[num_cpu]; | |||||
queue[num_cpu].range_n = NULL; | |||||
queue[num_cpu].routine = inner_laed4_thread; | |||||
queue[num_cpu].args = &args; | |||||
queue[num_cpu].sa = NULL; | |||||
queue[num_cpu].sb = NULL; | |||||
queue[num_cpu].mode = mode; | |||||
queue[num_cpu].next = &queue[num_cpu + 1]; | |||||
infoarray[num_cpu] = 0; | |||||
num_cpu ++; | |||||
i -= width; | |||||
} | |||||
if (num_cpu) { | |||||
queue[num_cpu - 1].next = NULL; | |||||
exec_blas(num_cpu, queue); | |||||
} | |||||
for (i = 0; i < num_cpu; i++) { | |||||
*info = max(infoarray[i], *info); | |||||
} | |||||
/* If the zero finder fails, the computation is terminated. */ | |||||
if (*info != 0) { | |||||
return 0; | |||||
} | |||||
if (kval == 2) { | |||||
for (j = 0; j < kval; j++) { | |||||
w[0] = q[j * qdim]; | |||||
w[1] = q[j * qdim + 1]; | |||||
ii = indx[0] - 1; | |||||
q[j * qdim] = w[ii]; | |||||
ii = indx[1] - 1; | |||||
q[j * qdim + 1] = w[ii]; | |||||
} | |||||
} else if (kval != 1) { | |||||
/* Compute updated W. */ | |||||
COPY(k, w, &c1, s, &c1); | |||||
/* Initialize W(I) = Q(I,I) */ | |||||
itmp = qdim + 1; | |||||
COPY(k, q, &itmp, w, &c1); | |||||
for (i = 0; i < num_cpu; i++) { | |||||
queue[i].routine = inner_wloop_thread; | |||||
} | |||||
if (num_cpu) { | |||||
exec_blas(num_cpu, queue); | |||||
} | |||||
for (i = 0; i < kval; i++) { | |||||
temp = sqrt(-w[i]); | |||||
w[i] = copysign(temp, s[i]); | |||||
} | |||||
/* Compute eigenvectors of the modified rank-1 modification. */ | |||||
for (j = 0; j < kval; j++) { | |||||
for (i = 0; i < kval; i++) { | |||||
s[i] = w[i] / q[j * qdim + i]; | |||||
} | |||||
temp = NRM2(k, s, &c1); | |||||
for (i = 0; i < kval; i++) { | |||||
ii = indx[i] - 1; | |||||
q[j * qdim + i] = s[ii] / temp; | |||||
} | |||||
} | |||||
} | |||||
/* Compute the updated eigenvectors. */ | |||||
n2 = *n - *n1; | |||||
n12 = ctot[0] + ctot[1]; | |||||
n23 = ctot[1] + ctot[2]; | |||||
LACPY("A", &n23, k, &q[ctot[0]], ldq, s, &n23); | |||||
iq2 = *n1 * n12; | |||||
if (n23 != 0) { | |||||
GEMM("N", "N", &n2, k, &n23, &c1f, &q2[iq2], &n2, s, &n23, &c0f, &q[*n1], ldq); | |||||
} else { | |||||
LASET("A", &n2, k, &c0f, &c0f, &q[*n1], ldq); | |||||
} | |||||
LACPY("A", &n12, k, q, ldq, s, &n12); | |||||
if (n12 != 0) { | |||||
GEMM("N", "N", n1, k, &n12, &c1f, q2, n1, s, &n12, &c0f, q, ldq); | |||||
} else { | |||||
LASET("A", n1, k, &c0f, &c0f, q, ldq); | |||||
} | |||||
return 0; | |||||
} |
@@ -0,0 +1,166 @@ | |||||
/*************************************************************************** | |||||
Copyright (c) 2025, The OpenBLAS Project | |||||
All rights reserved. | |||||
Redistribution and use in source and binary forms, with or without | |||||
modification, are permitted provided that the following conditions are | |||||
met: | |||||
1. Redistributions of source code must retain the above copyright | |||||
notice, this list of conditions and the following disclaimer. | |||||
2. Redistributions in binary form must reproduce the above copyright | |||||
notice, this list of conditions and the following disclaimer in | |||||
the documentation and/or other materials provided with the | |||||
distribution. | |||||
3. Neither the name of the OpenBLAS project nor the names of | |||||
its contributors may be used to endorse or promote products | |||||
derived from this software without specific prior written | |||||
permission. | |||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
*****************************************************************************/ | |||||
#include <stdio.h> | |||||
#include "common.h" | |||||
#define copysign(x,y) ((y) < 0 ? ((x) < 0 ? (x) : -(x)) : ((x) < 0 ? -(x) : (x))) | |||||
#if defined(DOUBLE) | |||||
#define LAMC3 BLASFUNC(dlamc3) | |||||
#define LAED4 BLASFUNC(dlaed4) | |||||
#define GEMM BLASFUNC(dgemm) | |||||
#define NRM2 BLASFUNC(dnrm2) | |||||
#define COPY BLASFUNC(dcopy) | |||||
#define LACPY BLASFUNC(dlacpy) | |||||
#define LASET BLASFUNC(dlaset) | |||||
#else | |||||
#define LAMC3 BLASFUNC(slamc3) | |||||
#define LAED4 BLASFUNC(slaed4) | |||||
#define GEMM BLASFUNC(sgemm) | |||||
#define NRM2 BLASFUNC(snrm2) | |||||
#define COPY BLASFUNC(scopy) | |||||
#define LACPY BLASFUNC(slacpy) | |||||
#define LASET BLASFUNC(slaset) | |||||
#endif | |||||
FLOAT LAMC3(FLOAT *, FLOAT *); | |||||
void LAED4(blasint *, blasint *, FLOAT *, FLOAT *, FLOAT *, FLOAT *, FLOAT *, blasint *); | |||||
void LACPY(char *, blasint *, blasint *, FLOAT *, blasint *, FLOAT *, blasint *); | |||||
void LASET(char *, blasint *, blasint *, FLOAT *, FLOAT *, FLOAT *, blasint *); | |||||
/* Table of constant values */ | |||||
static blasint c1 = 1; | |||||
static FLOAT c1f = 1.; | |||||
static FLOAT c0f = 0.; | |||||
/* ===================================================================== */ | |||||
blasint CNAME(blasint *k, blasint *n, blasint *n1, FLOAT *d, | |||||
FLOAT *q, blasint *ldq, FLOAT *rho, FLOAT *dlamda, | |||||
FLOAT *q2, blasint *indx, blasint *ctot, FLOAT *w, | |||||
FLOAT *s, blasint *info) | |||||
{ | |||||
FLOAT temp; | |||||
blasint kval, qdim; | |||||
blasint i, j, itmp; | |||||
blasint n2, n12, ii, n23, iq2; | |||||
qdim = *ldq; | |||||
kval = *k; | |||||
/* Modify values DLAMDA(i) to make sure all DLAMDA(i)-DLAMDA(j) can */ | |||||
/* be computed with high relative accuracy (barring over/underflow). */ | |||||
for (i = 0; i < kval; i++) { | |||||
dlamda[i] = LAMC3(&dlamda[i], &dlamda[i]) - dlamda[i]; | |||||
} | |||||
for (j = 1; j <= kval; j++) { | |||||
LAED4(k, &j, dlamda, w, &q[(j - 1) * qdim], rho, &d[j - 1], info); | |||||
if(*info != 0) break; | |||||
} | |||||
/* If the zero finder fails, the computation is terminated. */ | |||||
if (*info != 0) { | |||||
return 0; | |||||
} | |||||
if (kval == 2) { | |||||
for (j = 0; j < kval; j++) { | |||||
w[0] = q[j * qdim]; | |||||
w[1] = q[j * qdim + 1]; | |||||
ii = indx[0] - 1; | |||||
q[j * qdim] = w[ii]; | |||||
ii = indx[1] - 1; | |||||
q[j * qdim + 1] = w[ii]; | |||||
} | |||||
} else if (kval != 1) { | |||||
/* Compute updated W. */ | |||||
COPY(k, w, &c1, s, &c1); | |||||
/* Initialize W(I) = Q(I,I) */ | |||||
itmp = qdim + 1; | |||||
COPY(k, q, &itmp, w, &c1); | |||||
for (j = 0; j < kval; j++) { | |||||
for (i = 0; i < j; i++) { | |||||
w[i] *= q[j * qdim + i] / (dlamda[i] - dlamda[j]); | |||||
} | |||||
for (i = j + 1; i < kval; i++) { | |||||
w[i] *= q[j * qdim + i] / (dlamda[i] - dlamda[j]); | |||||
} | |||||
} | |||||
for (i = 0; i < kval; i++) { | |||||
temp = sqrt(-w[i]); | |||||
w[i] = copysign(temp, s[i]); | |||||
} | |||||
/* Compute eigenvectors of the modified rank-1 modification. */ | |||||
for (j = 0; j < kval; j++) { | |||||
for (i = 0; i < kval; i++) { | |||||
s[i] = w[i] / q[j * qdim + i]; | |||||
} | |||||
temp = NRM2(k, s, &c1); | |||||
for (i = 0; i < kval; i++) { | |||||
ii = indx[i] - 1; | |||||
q[j * qdim + i] = s[ii] / temp; | |||||
} | |||||
} | |||||
} | |||||
/* Compute the updated eigenvectors. */ | |||||
n2 = *n - *n1; | |||||
n12 = ctot[0] + ctot[1]; | |||||
n23 = ctot[1] + ctot[2]; | |||||
LACPY("A", &n23, k, &q[ctot[0]], ldq, s, &n23); | |||||
iq2 = *n1 * n12; | |||||
if (n23 != 0) { | |||||
GEMM("N", "N", &n2, k, &n23, &c1f, &q2[iq2], &n2, s, &n23, &c0f, &q[*n1], ldq); | |||||
} else { | |||||
LASET("A", &n2, k, &c0f, &c0f, &q[*n1], ldq); | |||||
} | |||||
LACPY("A", &n12, k, q, ldq, s, &n12); | |||||
if (n12 != 0) { | |||||
GEMM("N", "N", n1, k, &n12, &c1f, q2, n1, s, &n12, &c0f, q, ldq); | |||||
} else { | |||||
LASET("A", n1, k, &c0f, &c0f, q, ldq); | |||||
} | |||||
return 0; | |||||
} |